diff --git a/.clang-tidy b/.clang-tidy index 924095b4def280..868a22c2596029 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -20,7 +20,7 @@ bugprone-integer-division, bugprone-misplaced-widening-cast, -bugprone-move-forwarding-reference, -bugprone-multiple-statement-macro, --bugprone-narrowing-conversions, +bugprone-narrowing-conversions, -bugprone-not-null-terminated-result, -bugprone-parent-virtual-call, -bugprone-posix-return, @@ -155,7 +155,7 @@ cppcoreguidelines-avoid-c-arrays, -cppcoreguidelines-avoid-goto, cppcoreguidelines-c-copy-assignment-signature, cppcoreguidelines-explicit-virtual-functions, --cppcoreguidelines-init-variables, +cppcoreguidelines-init-variables, cppcoreguidelines-narrowing-conversions, cppcoreguidelines-no-malloc, -cppcoreguidelines-pro-type-const-cast, @@ -189,12 +189,12 @@ modernize-use-override, modernize-use-transparent-functors, -modernize-use-uncaught-exceptions, performance-faster-string-find, --performance-for-range-copy, +performance-for-range-copy, -performance-implicit-conversion-in-loop, -performance-inefficient-algorithm, performance-inefficient-string-concatenation, -performance-inefficient-vector-operation, --performance-move-const-arg, +performance-move-const-arg, -performance-move-constructor-init, -performance-no-automatic-move, performance-noexcept-move-constructor, diff --git a/.flake8 b/.flake8 index d9585ef248701d..91137a006d0885 100644 --- a/.flake8 +++ b/.flake8 @@ -26,6 +26,9 @@ per-file-ignores = # These files need tabs for testing. test/dygraph_to_static/test_error.py:E101,W191 + # Ignore compare with True in sot unittest + test/sot/test_dup_top.py:E712 + # temp ignore base directory python/paddle/base/*: E712, diff --git a/.gitmodules b/.gitmodules index 1fb3d67c6f27ca..8b06f4fb771cbb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -106,3 +106,7 @@ path = third_party/jitify url = https://github.com/NVIDIA/jitify.git ignore = dirty +[submodule "third_party/cccl"] + path = third_party/cccl + url = https://github.com/NVIDIA/cccl.git + ignore = dirty diff --git a/cmake/cinn/external/absl.cmake b/cmake/cinn/external/absl.cmake index 56befafecea214..0b3f3d685ed803 100644 --- a/cmake/cinn/external/absl.cmake +++ b/cmake/cinn/external/absl.cmake @@ -5,7 +5,7 @@ set(ABSL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/absl) set(ABSL_PREFIX_DIR ${THIRD_PARTY_PATH}/absl) set(ABSL_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -set(ABSL_REPOSITORY "https://github.com/abseil/abseil-cpp.git") +set(ABSL_REPOSITORY "${GIT_URL}/abseil/abseil-cpp.git") set(ABSL_TAG "20210324.2") set(OPTIONAL_ARGS diff --git a/cmake/cinn/external/jitify.cmake b/cmake/cinn/external/jitify.cmake index 7750934d8056c7..8e478a00176b04 100644 --- a/cmake/cinn/external/jitify.cmake +++ b/cmake/cinn/external/jitify.cmake @@ -7,7 +7,7 @@ include(ExternalProject) # clone jitify to Paddle/third_party set(JITIFY_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/jitify) -set(JITIFY_URL https://github.com/NVIDIA/jitify.git) +set(JITIFY_URL ${GIT_URL}/NVIDIA/jitify.git) set(JITIFY_TAG 57de649139c866eb83acacfe50c92ad7c6278776) ExternalProject_Add( diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index d647e9116b5868..ad414418caefee 100755 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -13,7 +13,9 @@ # limitations under the License. 
include(ExternalProject) -set(OPENSSL_USE_STATIC_LIBS ON) +if(NOT WITH_ARM) + set(OPENSSL_USE_STATIC_LIBS ON) +endif() find_package(OpenSSL REQUIRED) message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY}) @@ -38,7 +40,7 @@ include_directories(${BRPC_INCLUDE_DIR}) # clone brpc to Paddle/third_party set(BRPC_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/brpc) -set(BRPC_URL https://github.com/apache/brpc.git) +set(BRPC_URL ${GIT_URL}/apache/brpc.git) set(BRPC_TAG 1.4.0) # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args diff --git a/cmake/external/cccl.cmake b/cmake/external/cccl.cmake new file mode 100755 index 00000000000000..c4185bd41a2da7 --- /dev/null +++ b/cmake/external/cccl.cmake @@ -0,0 +1,31 @@ +include(ExternalProject) + +set(CCCL_PATH + "${THIRD_PARTY_PATH}/cccl" + CACHE STRING "A path setting for external_cccl path.") +set(CCCL_PREFIX_DIR ${CCCL_PATH}) +set(CCCL_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cccl) + +# The latest commit has bugs in windows, so we set a fix commit. +set(CCCL_TAG 1f6e4bcae0fbf1bbed87f88544d8d2161c490fc1) +execute_process(COMMAND git --git-dir=${CCCL_SOURCE_DIR}/.git + --work-tree=${CCCL_SOURCE_DIR} checkout ${CCCL_TAG}) + +set(CCCL_INCLUDE_DIR ${CCCL_SOURCE_DIR}) +message("CCCL_INCLUDE_DIR is ${CCCL_INCLUDE_DIR}") +include_directories(${CCCL_INCLUDE_DIR}) + +ExternalProject_Add( + extern_cccl + ${EXTERNAL_PROJECT_LOG_ARGS} + SOURCE_DIR ${CCCL_SOURCE_DIR} + PREFIX ${CCCL_PREFIX_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "") + +add_library(cccl INTERFACE) + +add_dependencies(cccl extern_cccl) diff --git a/cmake/external/cudnn-frontend.cmake b/cmake/external/cudnn-frontend.cmake index 16c21c8dbf26de..37625f88d9ded4 100644 --- a/cmake/external/cudnn-frontend.cmake +++ b/cmake/external/cudnn-frontend.cmake @@ -34,7 +34,7 @@ if((NOT DEFINED CUDNN_FRONTEND_NAME) OR (NOT DEFINED CUDNN_FRONTEND_URL)) "cudnn-frontend" CACHE STRING "" FORCE) set(CUDNN_FRONTEND_URL - "https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/${CUDNN_FRONTEND_VER}.tar.gz" + "${GIT_URL}/NVIDIA/cudnn-frontend/archive/refs/tags/${CUDNN_FRONTEND_VER}.tar.gz" CACHE STRING "" FORCE) endif() set(CUDNN_FRONTEND_CACHE_FILENAME "${CUDNN_FRONTEND_VER}.tar.gz") diff --git a/cmake/external/dirent.cmake b/cmake/external/dirent.cmake index 9c212a237f0a4a..7bec37d5f1b7e2 100644 --- a/cmake/external/dirent.cmake +++ b/cmake/external/dirent.cmake @@ -25,7 +25,7 @@ if((NOT DEFINED DIRENT_NAME) OR (NOT DEFINED DIRENT_URL)) "dirent" CACHE STRING "" FORCE) set(DIRENT_URL - "https://github.com/tronkko/dirent/archive/refs/tags/1.23.2.tar.gz" + "${GIT_URL}/tronkko/dirent/archive/refs/tags/1.23.2.tar.gz" CACHE STRING "" FORCE) set(DIRENT_CACHE_FILENAME "1.23.2.tar.gz") endif() diff --git a/cmake/external/jemalloc.cmake b/cmake/external/jemalloc.cmake index 1fc2d508fb7356..bdd6bfc6c00378 100644 --- a/cmake/external/jemalloc.cmake +++ b/cmake/external/jemalloc.cmake @@ -6,8 +6,7 @@ set(JEMALLOC_PROJECT "extern_jemalloc") set(JEMALLOC_BUILD ${THIRD_PARTY_PATH}/jemalloc/src/extern_jemalloc) set(JEMALLOC_PREFIX_DIR ${THIRD_PARTY_PATH}/jemalloc) set(JEMALLOC_URL - https://github.com/jemalloc/jemalloc/releases/download/5.1.0/jemalloc-5.1.0.tar.bz2 -) + ${GIT_URL}/jemalloc/jemalloc/releases/download/5.1.0/jemalloc-5.1.0.tar.bz2) set(JEMALLOC_INSTALL ${THIRD_PARTY_PATH}/install/jemalloc) set(JEMALLOC_INCLUDE_DIR ${JEMALLOC_INSTALL}/include) diff --git 
a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake index cbe951211b5a19..0f06fe09529685 100644 --- a/cmake/external/libxsmm.cmake +++ b/cmake/external/libxsmm.cmake @@ -31,9 +31,8 @@ set(LIBXSMMNOBLAS_LIB "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") file(GLOB LIBXSMM_SOURCE_FILE_LIST ${LIBXSMM_SOURCE_DIR}) list(LENGTH LIBXSMM_SOURCE_FILE_LIST RES_LEN) if(RES_LEN EQUAL 0) - execute_process( - COMMAND ${GIT_EXECUTABLE} clone -b ${LIBXSMM_TAG} - "https://github.com/hfp/libxsmm.git" ${LIBXSMM_SOURCE_DIR}) + execute_process(COMMAND ${GIT_EXECUTABLE} clone -b ${LIBXSMM_TAG} + "${GIT_URL}/hfp/libxsmm.git" ${LIBXSMM_SOURCE_DIR}) else() # check git tag execute_process( diff --git a/cmake/external/onnxruntime.cmake b/cmake/external/onnxruntime.cmake index 57969e8c76c8e9..1a2f7662fea24f 100644 --- a/cmake/external/onnxruntime.cmake +++ b/cmake/external/onnxruntime.cmake @@ -44,19 +44,19 @@ set(ONNXRUNTIME_DOWNLOAD_DIR if(WIN32) set(ONNXRUNTIME_URL - "https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-win-x64-${ONNXRUNTIME_VERSION}.zip" + "${GIT_URL}/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-win-x64-${ONNXRUNTIME_VERSION}.zip" ) set(ONNXRUNTIME_URL_MD5 f21d6bd1feef15935a5f4e1007797593) set(ONNXRUNTIME_CACHE_EXTENSION "zip") elseif(APPLE) set(ONNXRUNTIME_URL - "https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-osx-x86_64-${ONNXRUNTIME_VERSION}.tgz" + "${GIT_URL}/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-osx-x86_64-${ONNXRUNTIME_VERSION}.tgz" ) set(ONNXRUNTIME_URL_MD5 6a6f6b7df97587da59976042f475d3f4) set(ONNXRUNTIME_CACHE_EXTENSION "tgz") else() set(ONNXRUNTIME_URL - "https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz" + "${GIT_URL}/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz" ) set(ONNXRUNTIME_URL_MD5 ce3f2376854b3da4b483d6989666995a) set(ONNXRUNTIME_CACHE_EXTENSION "tgz") diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 267f8d733cbd41..5c9112a4d4e893 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -19,12 +19,16 @@ set(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) set(CBLAS_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/openblas) set(CBLAS_TAG v0.3.7) -# OpenBLAS support Raptor Lake from v0.3.22 -if(UNIX - AND NOT APPLE - AND NOT WITH_ROCM +# Why use v0.3.18? The IDG business line encountered a random openblas error, +# which can be resolved after upgrading openblas. +# And why compile when gcc>8.2? 
Please refer to +# https://github.com/spack/spack/issues/19932#issuecomment-733452619 +# v0.3.18 only support gcc>=8.3 or gcc>=7.4 +if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.2 AND NOT WITH_XPU) - set(CBLAS_TAG v0.3.23) + # We only compile with openblas 0.3.18 when gcc >= 8.3 + set(CBLAS_TAG v0.3.18) endif() if(APPLE AND WITH_ARM) @@ -42,9 +46,8 @@ endif() file(GLOB CBLAS_SOURCE_FILE_LIST ${CBLAS_SOURCE_DIR}) list(LENGTH CBLAS_SOURCE_FILE_LIST RES_LEN) if(RES_LEN EQUAL 0) - execute_process( - COMMAND ${GIT_EXECUTABLE} clone -b ${CBLAS_TAG} - "https://github.com/xianyi/OpenBLAS.git" ${CBLAS_SOURCE_DIR}) + execute_process(COMMAND ${GIT_EXECUTABLE} clone -b ${CBLAS_TAG} + "${GIT_URL}/xianyi/OpenBLAS.git" ${CBLAS_SOURCE_DIR}) else() # check git tag execute_process( diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake index 0a80c87e8e5faa..decb6c91682744 100644 --- a/cmake/external/paddle2onnx.cmake +++ b/cmake/external/paddle2onnx.cmake @@ -71,19 +71,19 @@ endif() if(WIN32) set(PADDLE2ONNX_URL - "https://github.com/PaddlePaddle/Paddle2ONNX/releases/download/v${PADDLE2ONNX_VERSION}/paddle2onnx-win-x64-${PADDLE2ONNX_VERSION}.zip" + "${GIT_URL}/PaddlePaddle/Paddle2ONNX/releases/download/v${PADDLE2ONNX_VERSION}/paddle2onnx-win-x64-${PADDLE2ONNX_VERSION}.zip" ) set(PADDLE2ONNX_URL_MD5 "122b864cb57338191a7e9ef5f607c4ba") set(PADDLE2ONNX_CACHE_EXTENSION "zip") elseif(APPLE) set(PADDLE2ONNX_URL - "https://github.com/PaddlePaddle/Paddle2ONNX/releases/download/v${PADDLE2ONNX_VERSION}/paddle2onnx-osx-x86_64-${PADDLE2ONNX_VERSION}.tgz" + "${GIT_URL}/PaddlePaddle/Paddle2ONNX/releases/download/v${PADDLE2ONNX_VERSION}/paddle2onnx-osx-x86_64-${PADDLE2ONNX_VERSION}.tgz" ) set(PADDLE2ONNX_URL_MD5 "32a4381ff8441b69d58ef0fd6fd919eb") set(PADDLE2ONNX_CACHE_EXTENSION "tgz") else() set(PADDLE2ONNX_URL - "https://github.com/PaddlePaddle/Paddle2ONNX/releases/download/v${PADDLE2ONNX_VERSION}/paddle2onnx-linux-x64-${PADDLE2ONNX_VERSION}.tgz" + "${GIT_URL}/PaddlePaddle/Paddle2ONNX/releases/download/v${PADDLE2ONNX_VERSION}/paddle2onnx-linux-x64-${PADDLE2ONNX_VERSION}.tgz" ) set(PADDLE2ONNX_URL_MD5 "3fbb074987ba241327797f76514e937f") set(PADDLE2ONNX_CACHE_EXTENSION "tgz") diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 85836a33f8c08c..0dc93d47ec92b5 100755 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -244,7 +244,7 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST) set(PROTOBUF_TAG 01a05a53f40ca2ac5f0af10c6cc0810bee39b792) else() if(WITH_PSLIB) - set(PROTOBUF_REPOSITORY "https://github.com/google/protobuf.git") + set(PROTOBUF_REPOSITORY "${GIT_URL}/google/protobuf.git") set(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") else() set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 13aaf0d760f160..297ff5b3983f21 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -88,7 +88,7 @@ # To build a unit test binary, which is an executable binary with libpaddle.so # automatically linked: # -# paddle_test(example SHARED) +# paddle_test(example SRCS example_test.cc) # # including binary directory for generated headers. 
@@ -499,12 +499,15 @@ function(cc_test_run TARGET_NAME) NAME ${TARGET_NAME} COMMAND ${cc_test_COMMAND} ${cc_test_ARGS} WORKING_DIRECTORY ${cc_test_DIR}) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT - FLAGS_cpu_deterministic=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT - FLAGS_init_allocated_mem=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT - FLAGS_cudnn_deterministic=true) + set_property( + TEST ${TARGET_NAME} + PROPERTY + ENVIRONMENT + FLAGS_cpu_deterministic=true + FLAGS_init_allocated_mem=true + FLAGS_cudnn_deterministic=true + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${PADDLE_BINARY_DIR}/python/paddle/libs:${PADDLE_BINARY_DIR}/python/paddle/base + ) # No unit test should exceed 2 minutes. if(WIN32) set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) @@ -1345,6 +1348,9 @@ function(math_library TARGET) if(WITH_GPU) if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) list(APPEND math_common_deps cub) + elseif(${CMAKE_CUDA_COMPILER_VERSION} EQUAL 12.0 + OR ${CMAKE_CUDA_COMPILER_VERSION} GREATER 12.0) + list(APPEND math_common_deps cccl) else() list(APPEND math_common_deps) endif() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index f7a6e9a696b70c..4134b31a966ed5 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -247,6 +247,14 @@ if(NOT DEFINED WITH_MKLDNN) endif() endif() +if(WIN32) + if(MSVC) + if(MSVC_VERSION LESS 1920) + set(WITH_MKLDNN OFF) + endif() + endif() +endif() + if(WIN32 OR APPLE OR NOT WITH_GPU @@ -375,6 +383,10 @@ if(WITH_GPU) if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) include(external/cub) # download cub list(APPEND third_party_deps extern_cub) + elseif(${CMAKE_CUDA_COMPILER_VERSION} EQUAL 12.0 + OR ${CMAKE_CUDA_COMPILER_VERSION} GREATER 12.0) + include(external/cccl) + list(APPEND third_party_deps extern_cccl) endif() set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" diff --git a/paddle/cinn/ast_gen_ius/ast_gen.cc b/paddle/cinn/ast_gen_ius/ast_gen.cc index 3d94f1fc8b7a01..c8be20ae3afa61 100644 --- a/paddle/cinn/ast_gen_ius/ast_gen.cc +++ b/paddle/cinn/ast_gen_ius/ast_gen.cc @@ -19,6 +19,7 @@ #include "paddle/cinn/ir/operation.h" #include "paddle/cinn/ir/tensor.h" #include "paddle/cinn/lang/compute.h" +#include "paddle/cinn/optim/replace_var_with_expr.h" namespace cinn { namespace ast_gen_ius { @@ -84,11 +85,75 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { tensor_group->MarkShareMemBuffer(tensor, init_tensor); tensor_group->CtrlDepend(tensor, init_tensor); Expr init_body = ir::Store::Make(init_tensor, init_value, axis_exprs); + // create schedule block itervars, i0,i1... 
+ std::vector<ir::Var> block_vars; + std::vector<ir::Expr> iter_values; + // reduce body and reduce init schedule block should have different objects + // for same axis so we re-create objects + std::vector<ir::Var> axis_vars = common::GenDefaultAxis(axis_len); + for (int i = 0; i < shape.size(); ++i) { + block_vars.push_back(Var(Expr(0), + shape[i], + cinn::UniqName("i" + std::to_string(i)), + /*is_reduce = */ false)); + optim::ReplaceVarWithExpr(&init_body, axis[i], block_vars[i]); + axis_vars[i]->is_reduce_axis = false; + if (shape[i] == Expr(1)) { + iter_values.push_back(Expr(0)); + } else { + iter_values.push_back(axis_vars[i]); + } + } + init_body = ir::ScheduleBlockRealize::Make( + iter_values, + ir::ScheduleBlock::Make( + block_vars, {}, {}, reduce_init_name, init_body)); // For the remaining reduce axis, make reduce body const std::vector<ir::Var>& reduce_axis = tensor->reduce_axis; ir::Expr reduce_body = ConvertReduceBody(tensor->body(), tensor, axis_exprs); + // create schedule block itervars, i0,i1... + std::vector<ir::Var> reduce_block_vars; + std::vector<ir::Expr> reduce_iter_values; + // reduce body and reduce init schedule block should have different objects + // for same axis so we re-create objects + std::vector<ir::Var> reduce_axis_vars = common::GenDefaultAxis(axis_len); + for (int i = 0; i < shape.size(); ++i) { + reduce_block_vars.push_back(Var(Expr(0), + shape[i], + cinn::UniqName("i" + std::to_string(i)), + /*is_reduce = */ false)); + reduce_axis_vars[i]->is_reduce_axis = false; + if (shape[i] == Expr(1)) { + reduce_iter_values.push_back(Expr(0)); + } else { + reduce_iter_values.push_back(axis_vars[i]); + } + } + for (int i = 0; i < reduce_axis.size(); ++i) { + int count = shape.size() + i; + reduce_block_vars.push_back( + Var(reduce_axis[i]->lower_bound, + reduce_axis[i]->upper_bound, + cinn::UniqName("i" + std::to_string(count)), + /*is_reduce = */ true)); + ir::Var reduce_axis_var = reduce_axis[i]; + reduce_axis_var->is_reduce_axis = true; + reduce_iter_values.push_back(reduce_axis_var); + } + for (int i = 0; i < axis.size(); ++i) { + optim::ReplaceVarWithExpr(&reduce_body, axis[i], reduce_block_vars[i]); + } + for (int i = axis.size(); i < reduce_block_vars.size(); ++i) { + optim::ReplaceVarWithExpr( + &reduce_body, reduce_axis[i - axis.size()], reduce_block_vars[i]); + } + + reduce_body = ir::ScheduleBlockRealize::Make( + reduce_iter_values, + ir::ScheduleBlock::Make( + reduce_block_vars, {}, {}, tensor->name, reduce_body)); for (int i = static_cast<int>(reduce_axis.size()) - 1; i >= 0; --i) { reduce_body = ir::For::Make(reduce_axis[i], reduce_axis[i]->lower_bound, @@ -114,6 +179,24 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { return body; } else { ir::Expr body = ir::Store::Make(tensor, tensor->body(), axis_exprs); + // create schedule block itervars, i0,i1... 
+ std::vector<ir::Var> block_vars; + std::vector<ir::Expr> iter_values; + std::vector<ir::Var> axis_vars = common::GenDefaultAxis(axis_len); + for (int i = 0; i < shape.size(); ++i) { + block_vars.push_back(Var( + Expr(0), shape[i], cinn::UniqName("i" + std::to_string(i)), false)); + optim::ReplaceVarWithExpr(&body, axis[i], block_vars[i]); + axis_vars[i]->is_reduce_axis = false; + if (shape[i] == Expr(1)) { + iter_values.push_back(Expr(0)); + } else { + iter_values.push_back(axis_vars[i]); + } + } + body = ir::ScheduleBlockRealize::Make( + iter_values, + ir::ScheduleBlock::Make(block_vars, {}, {}, tensor->name, body)); for (int i = static_cast<int>(axis_len) - 1; i >= 0; --i) { ir::Var loop_var = axis[i]; ir::Expr loop_extent = shape[i]; diff --git a/paddle/cinn/ast_gen_ius/tensor_group.cc b/paddle/cinn/ast_gen_ius/tensor_group.cc index e8b9c6a345e72b..34e6e5beb0f9dd 100644 --- a/paddle/cinn/ast_gen_ius/tensor_group.cc +++ b/paddle/cinn/ast_gen_ius/tensor_group.cc @@ -21,26 +21,37 @@ #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/tensor.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" +#include "paddle/cinn/poly/stage.h" namespace cinn { namespace ast_gen_ius { TensorGroup::TensorGroup(const std::vector<ir::Tensor>& tensors) { - std::set<ir::Tensor> all_tensors(tensors.begin(), tensors.end()); - - for (auto& tensor : tensors) { + for (const ir::Tensor& tensor : tensors) { output_tensor_names_.insert(tensor->name); - std::set<Expr> used_tensors = ir::ir_utils::CollectIRNodes( - tensor->body(), [](const Expr* x) { return x->as_tensor(); }); - for (const Expr& x : used_tensors) { - const ir::Tensor to_dep = x.as_tensor_ref(); - all_tensors.insert(to_dep); - this->CtrlDepend(tensor, to_dep); + this->Insert(tensor); + } +} + +void TensorGroup::ShowLog() const { + VLOG(6) << "Showing log for TensorGroup"; + for (auto& p : name_to_tensor_) { + VLOG(6) << "Tensor name = " << p.first << " depends on {"; + if (ctrl_dep_.count(p.first)) { + for (auto& dep_name : ctrl_dep_.at(p.first)) { + VLOG(6) << dep_name; + } } + VLOG(6) << "}"; } +} - for (const ir::Tensor& t : all_tensors) { - name_to_tensor_.insert({t->name, t}); +TensorGroup::TensorGroup( + const std::unordered_map<std::string, ir::Tensor>& tensor_map) { + for (const auto& map_pair : tensor_map) { + const ir::Tensor& tensor = map_pair.second; + output_tensor_names_.insert(tensor->name); + this->Insert(tensor); } } @@ -51,7 +62,23 @@ bool TensorGroup::Contain(const std::string& name) const { } void TensorGroup::Insert(const ir::Tensor& tensor) { - name_to_tensor_.insert({tensor->name, tensor}); + if (!name_to_tensor_.count(tensor->name)) { + name_to_tensor_.insert({tensor->name, tensor}); + } + + // Using set to de-duplicate + std::set<ir::Tensor> dep_tensors; + std::set<Expr> used_tensors = ir::ir_utils::CollectIRNodes( + tensor->body(), [](const Expr* x) { return x->as_tensor(); }); + for (const Expr& x : used_tensors) { + const ir::Tensor to_dep = x.as_tensor_ref(); + dep_tensors.insert(to_dep); + this->CtrlDepend(tensor, to_dep); + } + + for (const ir::Tensor& t : dep_tensors) { + this->Insert(t); + } } ir::Tensor TensorGroup::Get(const std::string& name) { @@ -72,6 +99,8 @@ std::vector<ir::Tensor> TensorGroup::GetGenFuncTopoOrder( for (const auto& dep_pair : ctrl_dep_) { const std::unordered_set<std::string>& dep_tensor_names = dep_pair.second; in_degree[dep_pair.first] = dep_tensor_names.size(); + VLOG(6) << "indegree[" << dep_pair.first + << "] = " << dep_tensor_names.size(); } std::vector<ir::Tensor> ret; @@ -95,7 +124,6 @@ std::vector<ir::Tensor> TensorGroup::GetGenFuncTopoOrder( while (!node_set.empty()) { const std::string cur = *(node_set.begin()); 
node_set.erase(node_set.begin()); - if (!input_arg_names.count(cur)) { ret.push_back(name_to_tensor_[cur]); } @@ -187,5 +215,45 @@ absl::flat_hash_map<std::string, ir::Tensor> TensorGroup::AllocateBuffers() { return name_to_tensor_; } +void StageMapShareMemory(const poly::StageMap& stages) { + absl::flat_hash_map<std::string, ir::_Tensor_*> tensor_map; + for (auto& stage : stages) { + tensor_map[stage.second->tensor()->name] = stage.second->tensor(); + } + for (auto& stage : stages) { + if (!stage.second->tensor()->buffer.defined() && + !stage.second->meta.tensors_to_share_buffer_with.empty()) { + for (auto& str : stage.second->meta.tensors_to_share_buffer_with) { + if (tensor_map[str]->buffer.defined()) { + auto edited_shape = tensor_map[str]->buffer->shape; + stage.second->tensor()->Bind(tensor_map[str]->buffer); + tensor_map[str]->buffer->shape = edited_shape; + VLOG(3) << "Stage Tensor " << stage.second->tensor()->name + << " bind buffer to " << tensor_map[str]->name << " , " + << tensor_map[str]->buffer->name; + } + } + } + } +} + +TensorGroup ConvertStageMapToTensorGroup(const poly::StageMap& stage_map) { + std::vector<ir::Tensor> stage_tensors; + std::set<ir::Tensor> reshape_tensors; + for (auto iter = stage_map.begin(); iter != stage_map.end(); ++iter) { + if (iter->second->has_expression()) { + const std::string& tensor_name = iter->first; + stage_tensors.push_back(ir::Tensor(iter->second->tensor())); + if (utils::Endswith(tensor_name, "_reshape")) { + reshape_tensors.insert(ir::Tensor(iter->second->tensor())); + } + } + } + + ast_gen_ius::TensorGroup tensor_group(stage_tensors); + StageMapShareMemory(stage_map); + return tensor_group; +} + } // namespace ast_gen_ius } // namespace cinn diff --git a/paddle/cinn/ast_gen_ius/tensor_group.h b/paddle/cinn/ast_gen_ius/tensor_group.h index c6e12690e9dcc2..d981b0f674f092 100644 --- a/paddle/cinn/ast_gen_ius/tensor_group.h +++ b/paddle/cinn/ast_gen_ius/tensor_group.h @@ -24,6 +24,7 @@ #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/poly/stage.h" namespace cinn { namespace ast_gen_ius { @@ -41,11 +42,21 @@ class TensorGroup { */ explicit TensorGroup(const std::vector<ir::Tensor>& tensors); + /** + * Constructor for a TensorGroup; the argument tensors should be output tensor + * arguments of the AST body to be generated. The dependent tensors of the + * output tensors will be collected during construction. + */ + explicit TensorGroup( + const std::unordered_map<std::string, ir::Tensor>& tensor_map); + /** * Destructor. */ ~TensorGroup(); + void ShowLog() const; + /** * Returns true if TensorGroup collection contains a tensor with input name. 
*/ @@ -119,5 +130,9 @@ class TensorGroup { std::unordered_map<std::string, std::string> share_memory_tensor_; }; +// TODO(zhhsplendid): removing stage_map requires changing all fcompute +// CINNValuePack; we will change it in the next PR +TensorGroup ConvertStageMapToTensorGroup(const poly::StageMap& stage_map); + } // namespace ast_gen_ius } // namespace cinn diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc index 64f2955f30d3dd..fbfdc7af72e9a6 100644 --- a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc +++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc @@ -190,5 +190,40 @@ ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target, return new_func; } +std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block) { + const ir::ScheduleBlockRealize* block_realize = + block.As<ir::ScheduleBlockRealize>(); + CHECK_NOTNULL(block_realize); + const ir::ScheduleBlock* block_node = + block_realize->schedule_block.As<ir::ScheduleBlock>(); + CHECK_NOTNULL(block_node); + std::vector<ir::Expr> iter_values = block_realize->iter_values; + std::vector<ir::Var> iter_vars = block_node->iter_vars; + + std::unordered_set<std::string> reduce_loop_var; + for (int i = 0; i < iter_vars.size(); ++i) { + if (iter_vars[i]->is_reduce_axis) { + ir::ir_utils::CollectIRNodesWithoutTensor( + iter_values[i], [&](const ir::Expr* x) { + if (x->as_var()) { + reduce_loop_var.insert(x->as_var_ref()->name); + } + return false; + }); + } + } + return reduce_loop_var; +} + +std::string GetBlockName(const ir::Expr block) { + const ir::ScheduleBlockRealize* block_realize = + block.As<ir::ScheduleBlockRealize>(); + CHECK_NOTNULL(block_realize); + const ir::ScheduleBlock* block_node = + block_realize->schedule_block.As<ir::ScheduleBlock>(); + CHECK_NOTNULL(block_node); + return block_node->name; +} + } // namespace auto_schedule } // namespace cinn diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.h b/paddle/cinn/auto_schedule/analysis/analyze_ir.h index 8fbdd52329f514..81d00dcb22ec3a 100644 --- a/paddle/cinn/auto_schedule/analysis/analyze_ir.h +++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.h @@ -48,5 +48,15 @@ ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target, const ir::LoweredFunc& old_func, ir::Expr& body); // NOLINT +/** + * Get loop var names of reduce axis + */ +std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block); + +/** + * Get name of a ScheduleBlock + */ +std::string GetBlockName(const ir::Expr block); + } // namespace auto_schedule } // namespace cinn diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir_test.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir_test.cc index ef408b7b7778a8..f7fffa0e0ff4b2 100644 --- a/paddle/cinn/auto_schedule/analysis/analyze_ir_test.cc +++ b/paddle/cinn/auto_schedule/analysis/analyze_ir_test.cc @@ -20,6 +20,7 @@ #include #include +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/common/context.h" #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_base.h" @@ -49,9 +50,9 @@ TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_SimpleAssign) { ir::Tensor B = lang::Compute( {M, N}, [&](Var i, Var j) { return A(i, j); }, "B"); - poly::StageMap stages = poly::CreateStages({A, B}); - std::vector<ir::LoweredFunc> funcs = lang::LowerVec( - "SimpleAssign", stages, {A, B}, {}, {}, nullptr, target, true); + ast_gen_ius::TensorGroup tensor_group({A, B}); + std::vector<ir::LoweredFunc> funcs = + lang::LowerToAstVec("SimpleAssign", {A, B}, &tensor_group, target); ASSERT_FALSE(funcs.empty()); ir::Expr ast_expr = funcs[0]->body; @@ -115,9 +116,9 @@ TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_AddDiffShape) { ir::Tensor C = 
lang::Compute( {M, N}, [&](Var i, Var j) { return A(i) + B(j); }, "C"); - poly::StageMap stages = poly::CreateStages({C}); - std::vector<ir::LoweredFunc> funcs = lang::LowerVec( - "AddDiffShape", stages, {C}, {}, {}, nullptr, target, true); + ast_gen_ius::TensorGroup tensor_group({C}); + std::vector<ir::LoweredFunc> funcs = + lang::LowerToAstVec("AddDiffShape", {C}, &tensor_group, target); ir::Expr ast_expr = funcs[0]->body; VLOG(6) << "Expr before MultiLevelTiling: "; @@ -169,9 +170,9 @@ TEST(AnalyzeIr, ContainsNodeType) { ir::Tensor B = lang::Compute( {M, N}, [&](Var i, Var j) { return A(i, j); }, "B"); - poly::StageMap stages = poly::CreateStages({A, B}); - std::vector<ir::LoweredFunc> funcs = lang::LowerVec( - "SimpleAssign", stages, {A, B}, {}, {}, nullptr, target, true); + ast_gen_ius::TensorGroup tensor_group({A, B}); + std::vector<ir::LoweredFunc> funcs = + lang::LowerToAstVec("SimpleAssign", {A, B}, &tensor_group, target); ASSERT_FALSE(funcs.empty()); ir::Expr ast_expr = funcs[0]->body; diff --git a/paddle/cinn/auto_schedule/cost_model/feature_extractor_test.cc b/paddle/cinn/auto_schedule/cost_model/feature_extractor_test.cc index 9364374156f4ab..3b51eac2600e38 100644 --- a/paddle/cinn/auto_schedule/cost_model/feature_extractor_test.cc +++ b/paddle/cinn/auto_schedule/cost_model/feature_extractor_test.cc @@ -21,6 +21,7 @@ #include #include +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/common/context.h" #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_base.h" @@ -48,9 +49,9 @@ TEST(FeatureExtractor, SimpleAssign) { ir::Tensor B = lang::Compute( {M, N}, [&](Var i, Var j) { return A(i, j); }, "B"); - poly::StageMap stages = poly::CreateStages({A, B}); - std::vector<ir::LoweredFunc> funcs = lang::LowerVec( - "SimpleAssign", stages, {A, B}, {}, {}, nullptr, target, true); + ast_gen_ius::TensorGroup tensor_group({A, B}); + std::vector<ir::LoweredFunc> funcs = + lang::LowerToAstVec("SimpleAssign", {A, B}, &tensor_group, target); ir::Expr ast_expr = funcs[0]->body; VLOG(6) << "Expr to test: " << ast_expr; @@ -109,9 +110,9 @@ TEST(FeatureExtractor, MatrixMultiply) { [&](Var i, Var j) { return lang::ReduceSum(A(i, k) * B(k, j), {k}); }, "C"); - poly::StageMap stages = poly::CreateStages({C}); - std::vector<ir::LoweredFunc> funcs = lang::LowerVec( - "MatrixMultiply", stages, {C}, {}, {}, nullptr, target, true); + ast_gen_ius::TensorGroup tensor_group({C}); + std::vector<ir::LoweredFunc> funcs = + lang::LowerToAstVec("MatrixMultiply", {C}, &tensor_group, target); std::vector<ir::Expr> vec_ast{funcs[0]->body}; ir::ModuleExpr mod_expr(vec_ast); diff --git a/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc b/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc index 5d6d1be6e0c136..5db6f8999b18a5 100644 --- a/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc +++ b/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc @@ -20,6 +20,7 @@ #include #include +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/auto_schedule/search_space/search_state.h" #include "paddle/cinn/auto_schedule/task/task_registry.h" #include "paddle/cinn/cinn.h" @@ -47,8 +48,8 @@ std::vector<ir::LoweredFunc> LowerCompute(const std::vector<int>& shape, C = Compute( domain, [&B](Var i, Var j) { return B(i, j); }, "C"); - return cinn::lang::LowerVec( - "test_func", CreateStages({A, B}), {A, B}, {}, {}, nullptr, target, true); + ast_gen_ius::TensorGroup tensor_group({A, B}); + return cinn::lang::LowerToAstVec("test_func", {A, B}, &tensor_group, target); } // Create a new IRSchedule with copied ir::LoweredFunc AST diff --git a/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc 
b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc index a6e1db2a8b20ef..0507c78ff2e1cc 100644 --- a/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc +++ b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc @@ -129,7 +129,7 @@ TEST_F(TestCooperativeProcess, Matmul) { { i0, i1 = axis.bind(((16 * i) + ((2 * i_0) + i_1)), ((16 * j) + ((8 * j_0) + j_1))) { - temp_matmul_out__reduce_init[((16 * i) + ((2 * i_0) + i_1)), ((16 * j) + ((8 * j_0) + j_1))] = 0.00000000f + temp_matmul_out__reduce_init[i0, i1] = 0.00000000f } } } @@ -181,7 +181,7 @@ TEST_F(TestCooperativeProcess, Matmul) { { i0_0, i1_0, i2 = axis.bind(((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1)), ((4 * reduce_k_0) + reduce_k_1)) { - temp_matmul_out[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))] = (temp_matmul_out[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))] + (X_reshape_shared_temp_buffer[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((4 * reduce_k_0) + reduce_k_1)] * Y_reshape_shared_temp_buffer[((4 * reduce_k_0) + reduce_k_1), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))])) + temp_matmul_out[i0_0, i1_0] = (temp_matmul_out[i0_0, i1_0] + (X_reshape_shared_temp_buffer[i0_0, i2] * Y_reshape_shared_temp_buffer[i2, i1_0])) } } } diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt index 730575b79ad894..9965046f16635c 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt @@ -8,7 +8,8 @@ gather_srcs( auto_unroll.cc multi_level_tiling.cc skip_rule.cc - auto_bind.cc) + auto_bind.cc + reduction_factoring.cc) if(WITH_TESTING) cinn_cc_library( @@ -51,3 +52,11 @@ endif() #cinn_cc_test(test_auto_inline SRCS auto_inline_test.cc DEPS cinncore auto_gen_rule_test_helper) cinn_cc_test(test_skip_rule SRCS skip_rule_test.cc DEPS cinncore) cinn_cc_test(test_auto_unroll SRCS auto_unroll_test.cc DEPS cinncore) +cinn_cc_test( + test_reduction_factoring + SRCS + reduction_factoring_test.cc + DEPS + cinncore + auto_gen_rule_test_helper + test_program_builder) diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc index 4cfef12e030e0f..e69d3069f19390 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc @@ -21,6 +21,7 @@ #include #include +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" #include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h" #include "paddle/cinn/cinn.h" @@ -59,16 +60,13 @@ TEST(AutoInline, SingleLoopInline) { ir::Tensor C = Compute( {M}, [&](Var i) { return B(i) + ir::Expr(1.f); }, "C"); - poly::StageMap stages = CreateStages({A, B, C}); + ast_gen_ius::TensorGroup tensor_group({A, B, C}); std::vector<ir::LoweredFunc> funcs = - lang::LowerVec("TestAutoInline_SingleLoopInline", - stages, - {A, C}, - {}, - {}, - nullptr, - target, - true); + lang::LowerToAstVec("TestAutoInline_SingleLoopInline", + + {A, C}, + &tensor_group, 
+ target); VLOG(6) << "Expr after lowering:"; VLOG(6) << funcs[0]->body; diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll_test.cc index 8b08d2c0658b3b..e4b0597cfeed75 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll_test.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll_test.cc @@ -17,6 +17,7 @@ #include #include +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/cinn.h" #include "paddle/cinn/lang/lower.h" @@ -38,9 +39,9 @@ TEST(AutoUnroll, Init) { #else Target target = common::DefaultHostTarget(); #endif - auto stages = CreateStages({C}); - auto funcs = cinn::lang::LowerVec( - "test_init", stages, {A, B, C}, {}, {}, nullptr, target, true); + ast_gen_ius::TensorGroup tensor_group({C}); + auto funcs = + cinn::lang::LowerToAstVec("test_init", {A, B, C}, &tensor_group, target); auto ast_expr = funcs[0]->body; ir::IRSchedule init_schedule(ir::ModuleExpr({ast_expr})); diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc index a1be2399ce6e9b..62f1bb74f4ac0e 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc @@ -21,6 +21,7 @@ #include #include +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" #include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h" #include "paddle/cinn/cinn.h" @@ -106,16 +107,9 @@ TEST(MultiLevelTile, SimpleLoops) { ir::Tensor C = Compute( {M, N}, [&](Var i, Var j) { return A(i) + B(j); }, "C"); - poly::StageMap stages = CreateStages({C}); - std::vector funcs = - lang::LowerVec("TestMultiLevelTile_SimpleLoops", - stages, - {C}, - {}, - {}, - nullptr, - target, - true); + ast_gen_ius::TensorGroup tensor_group({C}); + std::vector funcs = lang::LowerToAstVec( + "TestMultiLevelTile_SimpleLoops", {C}, &tensor_group, target); ir::Expr ast_expr = funcs[0]->body; VLOG(6) << "Expr before MultiLevelTiling: "; @@ -261,7 +255,7 @@ TEST_F(TestMultiLevelTiling, Matmul) { { i0, i1 = axis.bind(((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + ((32 * j_2) + j_3))) { - temp_matmul_out__reduce_init[((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + ((32 * j_2) + j_3))] = 0.00000000f + temp_matmul_out__reduce_init[i0, i1] = 0.00000000f } } } @@ -308,10 +302,10 @@ TEST_F(TestMultiLevelTiling, Matmul) { ScheduleBlock(temp_matmul_out_local_temp_buffer) { i0_0, i1_0, i2 = axis.bind(((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + ((32 * j_2) + j_3)), ((8 * reduce_k_0) + ((8 * reduce_k_1) + reduce_k_2))) - read_buffers(_temp_matmul_out[i(undefined:undefined), j(undefined:undefined)], _X[i(undefined:undefined), reduce_k(undefined:undefined)], _Y[reduce_k(undefined:undefined), j(undefined:undefined)]) - write_buffers(_temp_matmul_out[i(undefined:undefined), j(undefined:undefined)]) + read_buffers(_temp_matmul_out[i0_0(0:32), i1_0(0:32)], _X[i0_0(0:32), i2(0:32)], _Y[i2(0:32), i1_0(0:32)]) + write_buffers(_temp_matmul_out[i0_0(0:32), i1_0(0:32)]) { - temp_matmul_out_local_temp_buffer[((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 
* j_1) + ((32 * j_2) + j_3))] = (temp_matmul_out_local_temp_buffer[((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + ((32 * j_2) + j_3))] + (X_reshape_shared_temp_buffer[((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((8 * reduce_k_0) + ((8 * reduce_k_1) + reduce_k_2))] * Y_reshape_shared_temp_buffer[((8 * reduce_k_0) + ((8 * reduce_k_1) + reduce_k_2)), ((32 * j_1) + ((32 * j_2) + j_3))])) + temp_matmul_out_local_temp_buffer[i0_0, i1_0] = (temp_matmul_out_local_temp_buffer[i0_0, i1_0] + (X_reshape_shared_temp_buffer[i0_0, i2] * Y_reshape_shared_temp_buffer[i2, i1_0])) } } } @@ -453,7 +447,7 @@ TEST_F(TestMultiLevelTiling, Pool2d) { { i0, i1, i2, i3 = axis.bind(i, j, k, a) { - pad_temp_0[i, j, k, a] = select(((a < 17) and ((a >= 1) and ((k < 17) and (k >= 1)))), input[i, j, (-1 + k), (-1 + a)], -3.40282347e+38f) + pad_temp_0[i0, i1, i2, i3] = select(((i3 < (1 + 16)) and ((i3 >= 1) and ((i2 < (1 + 16)) and (i2 >= 1)))), input[i0, i1, (i2 - 1), (i3 - 1)], -3.40282347e+38f) } } } @@ -477,7 +471,7 @@ TEST_F(TestMultiLevelTiling, Pool2d) { { i0_0, i1_0, i2_0, i3_0 = axis.bind(((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((i_0_j_0_k_0_a_0_fused % 4) + ((4 * ((i_j_k_a_fused / 2) % 2)) + k_1)), ((4 * (i_j_k_a_fused % 2)) + a_1)) { - var_0__reduce_init[((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((4 * ((i_j_k_a_fused / 2) % 2)) + ((i_0_j_0_k_0_a_0_fused % 4) + k_1)), ((4 * (i_j_k_a_fused % 2)) + a_1)] = -3.40282347e+38f + var_0__reduce_init[i0_0, i1_0, i2_0, i3_0] = -3.40282347e+38f } } } @@ -511,10 +505,10 @@ TEST_F(TestMultiLevelTiling, Pool2d) { ScheduleBlock(var_0_local_temp_buffer) { i0_1, i1_1, i2_1, i3_1, i4, i5 = axis.bind(((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((i_0_j_0_k_0_a_0_fused % 4) + ((4 * ((i_j_k_a_fused / 2) % 2)) + k_1)), ((4 * (i_j_k_a_fused % 2)) + a_1), kernel_idx, kernel_idx_0) - read_buffers(_var_0[i(undefined:undefined), j(undefined:undefined), k(undefined:undefined), a(undefined:undefined)], _pad_temp_0[i(undefined:undefined), j(undefined:undefined)]) - write_buffers(_var_0[i(undefined:undefined), j(undefined:undefined), k(undefined:undefined), a(undefined:undefined)]) + read_buffers(_var_0[i0_1(0:2), i1_1(0:8), i2_1(0:8), i3_1(0:8)], _pad_temp_0[i0_1(0:2), i1_1(0:8)]) + write_buffers(_var_0[i0_1(0:2), i1_1(0:8), i2_1(0:8), i3_1(0:8)]) { - var_0_local_temp_buffer[((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((4 * ((i_j_k_a_fused / 2) % 2)) + ((i_0_j_0_k_0_a_0_fused % 4) + k_1)), ((4 * (i_j_k_a_fused % 2)) + a_1)] = cinn_max(var_0_local_temp_buffer[((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((i_0_j_0_k_0_a_0_fused % 4) + ((4 * ((i_j_k_a_fused / 2) % 2)) + k_1)), ((4 * (i_j_k_a_fused % 2)) + a_1)], pad_temp_0_shared_temp_buffer[((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((8 * ((i_j_k_a_fused / 2) % 2)) + ((2 * (i_0_j_0_k_0_a_0_fused % 4)) + ((2 * k_1) + kernel_idx))), ((8 * (i_j_k_a_fused % 2)) + ((2 * a_1) + kernel_idx_0))]) + var_0_local_temp_buffer[i0_1, i1_1, i2_1, i3_1] = cinn_max(var_0_local_temp_buffer[i0_1, i1_1, i2_1, i3_1], 
pad_temp_0_shared_temp_buffer[i0_1, i1_1, ((2 * i2_1) + i4), ((2 * i3_1) + i5)]) } } } @@ -533,7 +527,7 @@ TEST_F(TestMultiLevelTiling, Pool2d) { { ScheduleBlock(var_0) { - v0, v1, v2, v3 = axis.bind((((((i_j_k_a_fused / 2) / 2) / 2) + (i_0_j_0_k_0_a_0_fused / 4)) + ax0_0), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + ax1_0), (((4 * ((i_j_k_a_fused / 2) % 2)) + (i_0_j_0_k_0_a_0_fused % 4)) + ax2_0), ((4 * (i_j_k_a_fused % 2)) + ax3_0)) + v0, v1, v2, v3 = axis.bind((((((i_j_k_a_fused / 2) / 2) / 2) + (i_0_j_0_k_0_a_0_fused / 4)) + ax0_0), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + ax1_0), (((i_0_j_0_k_0_a_0_fused % 4) + (4 * ((i_j_k_a_fused / 2) % 2))) + ax2_0), ((4 * (i_j_k_a_fused % 2)) + ax3_0)) attrs(reverse_compute_at_extra_var:ax0_0,ax1_0,ax2_0,ax3_0) { var_0[v0, v1, v2, v3] = var_0_local_temp_buffer[v0, v1, v2, v3] diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc new file mode 100644 index 00000000000000..c44d067610123a --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc @@ -0,0 +1,183 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.h" + +#include + +#include "paddle/cinn/auto_schedule/analysis/analyze_ir.h" +#include "paddle/cinn/ir/ir_printer.h" +#include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/schedule/ir_schedule_util.h" +#include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/ir/utils/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_nodes_collector.h" + +namespace cinn { +namespace auto_schedule { + +bool ReductionFactoring::CanApply(const std::string& block_name, + ir::IRSchedule* ir_schedule) const { + ir::Expr block_expr = ir_schedule->GetBlock(block_name); + ir::ScheduleBlockRealize* block_realize = + block_expr.As<ir::ScheduleBlockRealize>(); + CHECK_NOTNULL(block_realize); + ir::ScheduleBlock* sch_block = + block_realize->schedule_block.As<ir::ScheduleBlock>(); + CHECK_NOTNULL(sch_block); + AnalyzeScheduleBlockReadWriteBuffer(sch_block); + + // 1. The block must have write buffer + if (sch_block->write_buffers.empty()) { + return false; + } + + // 2. The block must have at least one reduce axis + const std::vector<ir::Var>& iter_vars = sch_block->iter_vars; + bool find_reduce_axis = false; + for (int i = 0; i < iter_vars.size(); ++i) { + if (iter_vars[i]->is_reduce_axis) { + find_reduce_axis = true; + break; + } + } + if (!find_reduce_axis) { + return false; + } + + // 3. 
Each loop's body only contains one sub loop or block, except reduce_init + // block + std::vector<ir::Expr> loops = ir_schedule->GetLoops(block_name); + for (const ir::Expr& loop : loops) { + const ir::Expr& body = loop.As<ir::For>()->body; + if (body.As<ir::Block>()) { + if (body.As<ir::Block>()->stmts.size() == 1) { + if (body.As<ir::Block>()->stmts[0].As<ir::For>() == nullptr && + body.As<ir::Block>()->stmts[0].As<ir::ScheduleBlockRealize>() == + nullptr) { + return false; + } + } else if (body.As<ir::Block>()->stmts.size() == 2) { + if (body.As<ir::Block>()->stmts[0].As<ir::ScheduleBlockRealize>() == + nullptr || + !ir::IsReduceInitTensorName( + GetBlockName(body.As<ir::Block>()->stmts[0]))) { + return false; + } + if (body.As<ir::Block>()->stmts[1].As<ir::For>() == nullptr && + body.As<ir::Block>()->stmts[1].As<ir::ScheduleBlockRealize>() == + nullptr) { + return false; + } + } else { + return false; + } + } else if (body.As<ir::For>() || body.As<ir::ScheduleBlockRealize>()) { + continue; + } else { + return false; + } + } + + return true; +} + +RuleApplyType ReductionFactoring::AnalyseApplyType( + SearchState state, const std::string& block_name) const { + return this->CanApply(block_name, &(state->ir_schedule)) + ? RuleApplyType::kApply + : RuleApplyType::kCannotApply; +} + +std::vector<SearchState> ReductionFactoring::ApplyOnBlock( + SearchState state, const std::string& block_name) { + SearchState new_state = state.Copy(); + Apply(block_name, &(new_state->ir_schedule)); + return {new_state}; +} + +void ReductionFactoring::Apply(const std::string& block_name, + ir::IRSchedule* ir_schedule) { + ir::Expr block = ir_schedule->GetBlock(block_name); + std::vector<ir::Expr> all_loops = ir_schedule->GetLoops(block_name); + + std::vector<ir::Expr> new_loop_order; + size_t num_spatial_loops = 0; + size_t num_reduction_loops = 0; + // 1. Add all spatial loops + std::unordered_set<std::string> reduce_loop_var_names = + GetReduceLoopVarNames(block); + for (const ir::Expr& expr : all_loops) { + if (reduce_loop_var_names.count(expr.As<ir::For>()->loop_var->name) == 0) { + new_loop_order.push_back(expr); + ++num_spatial_loops; + } + } + // 2. Add all reduction loops + for (const ir::Expr& expr : all_loops) { + if (reduce_loop_var_names.count(expr.As<ir::For>()->loop_var->name) > 0) { + new_loop_order.push_back(expr); + ++num_reduction_loops; + } + } + if (num_reduction_loops == 0) { + return; + } + // 3. Reorder if new_loop_order differs from the original order + CHECK_EQ(all_loops.size(), new_loop_order.size()); + for (int i = 0; i < all_loops.size(); ++i) { + if (all_loops[i].As<ir::For>()->loop_var->name != + new_loop_order[i].As<ir::For>()->loop_var->name) { + ir_schedule->Reorder(new_loop_order); + break; + } + } + + // 4. Fuse all reduction loops + ir::Expr fused_reduce_loop; + VLOG(6) << "before Fuse: " << ir_schedule->GetModule().GetExprs()[0]; + if (num_reduction_loops > 1) { + std::vector<int> reduction_loop_indices; + for (int i = num_spatial_loops; i < all_loops.size(); ++i) { + reduction_loop_indices.push_back(i); + } + CHECK_EQ(reduction_loop_indices.size(), num_reduction_loops); + fused_reduce_loop = ir_schedule->Fuse(block_name, reduction_loop_indices); + } else { + all_loops = ir_schedule->GetLoops(block_name); + fused_reduce_loop = all_loops.back(); + } + // 5. Split the reduction loop into 2 parts + VLOG(6) << "before Split: " << ir_schedule->GetModule().GetExprs()[0]; + int factor = 1; + int extent = ir::GetLoopExtent(fused_reduce_loop); + for (int i = ceil(sqrt(extent)); i >= 1; --i) { + if (extent % i == 0) { + factor = i; + break; + } + } + std::vector<ir::Expr> splited_reduction_loops = + ir_schedule->Split(fused_reduce_loop, {-1, factor}); + // 6. 
Apply FactorizeReduction + VLOG(6) << "before FactorizeReduction: " + << ir_schedule->GetModule().GetExprs()[0]; + ir_schedule->FactorizeReduction(splited_reduction_loops[0], + num_spatial_loops); + VLOG(6) << "after FactorizeReduction: " + << ir_schedule->GetModule().GetExprs()[0]; +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.h new file mode 100644 index 00000000000000..889e3e94292d2d --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.h @@ -0,0 +1,59 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" +#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/schedule/ir_schedule.h" + +namespace cinn { +namespace auto_schedule { + +class ReductionFactoring : public AutoGenRule { + public: + explicit ReductionFactoring(const common::Target& target) + : AutoGenRule(target) {} + ~ReductionFactoring() = default; + + // In the future, we will no longer use this interface. + RuleApplyType Init(ir::IRSchedule* init_schedule) override { + return RuleApplyType::kCannotApply; + } + // In the future, we will no longer use this interface. + void Apply(int index) override { + LOG(FATAL) << "This is a deprecated interface, please do not use it."; + return; + } + + RuleApplyType AnalyseApplyType(SearchState state, + const std::string& block_name) const override; + + std::string GetRuleName() const override { return "ReductionFactoring"; } + + std::vector ApplyOnBlock(SearchState state, + const std::string& block_name) override; + + void Apply(const std::string& block_name, ir::IRSchedule* ir_schedule); + + private: + bool CanApply(const std::string& block_name, + ir::IRSchedule* ir_schedule) const; +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc new file mode 100644 index 00000000000000..63e808cfbd4a50 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc @@ -0,0 +1,219 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.h" + +#include +#include + +#include +#include +#include + +#include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h" +#include "paddle/cinn/ir/ir_printer.h" +#include "test/cpp/cinn/concrete_program_builder.h" + +namespace cinn { +namespace auto_schedule { + +class TestReductionFactoring : public TestAutoGenRuleBase { + public: + std::vector default_input_names = {"X"}; + std::vector default_output_names = {"out"}; + + void TestApplyOnReduce(const std::vector& shape, + const std::vector& reduce_dim, + const std::string& block_name, + const std::string& expected_ir) { + Initialize(common::DefaultHostTarget()); + auto test_program = tests::ReduceBuilder().Build( + {{"X", shape}}, {{"reduce_dim", reduce_dim}}); + // construct input parameter + ir::IRSchedule ir_schedule = MakeIRSchedule(test_program); + SearchState state(ir_schedule, 0, {}); + std::vector func_bodys = ir_schedule.GetModule().GetExprs(); + ASSERT_EQ(func_bodys.size(), 1UL); + VLOG(6) << "Original Expr:\n" << func_bodys[0]; + + // apply + ReductionFactoring reduction_factoring(target_); + ASSERT_EQ(reduction_factoring.AnalyseApplyType(state, block_name), + RuleApplyType::kApply); + auto result = reduction_factoring.ApplyOnBlock(state, block_name)[0]; + std::vector exprs = result->ir_schedule.GetModule().GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + std::stringstream ir; + ir << exprs[0]; + VLOG(6) << "ReductionFactoring applied Expr: " << exprs[0]; + + // check + const std::vector& blocks = ir_schedule.GetAllBlocks(); + CHECK_EQ(blocks.size(), 2UL); + CHECK_EQ(ir.str(), expected_ir); + } +}; + +TEST_F(TestReductionFactoring, AnalyseApplyType) { + Initialize(common::DefaultHostTarget()); + auto test_program = + tests::OpBuilder("elementwise_add").Build({{"X", {4, 5}}, {"Y", {4, 5}}}); + ir::IRSchedule ir_schedule = MakeIRSchedule(test_program); + VLOG(6) << "Original Expr:\n" << ir_schedule.GetModule().GetExprs()[0]; + SearchState state(ir_schedule, 0, {}); + ReductionFactoring reduction_factoring(target_); + EXPECT_EQ(reduction_factoring.AnalyseApplyType(state, "var_1"), + RuleApplyType::kCannotApply); +} + +TEST_F(TestReductionFactoring, ApplyOnBlock1ReduceDim) { + std::string expected_ir = R"({ + ScheduleBlock(root) + { + { + serial for (i, 0, 32) + { + serial for (reduce_k_0_0, 0, 8) + { + ScheduleBlock(var_0_rf__reduce_init) + { + vreduce_k_0_0, i0_0 = axis.bind(reduce_k_0_0, i) + var_0_rf__reduce_init[i0_0, vreduce_k_0_0] = 0.00000000f + } + serial for (reduce_k_0_1, 0, 8) + { + ScheduleBlock(var_0_rf) + { + vreduce_k_0_0, i0_0, vreduce_k_0_1 = axis.bind(reduce_k_0_0, i, reduce_k_0_1) + var_0_rf[i0_0, vreduce_k_0_0] = (var_0_rf[i0_0, vreduce_k_0_0] + X[i0_0, ((8 * vreduce_k_0_0) + vreduce_k_0_1)]) + } + } + } + } + serial for (i, 0, 32) + { + ScheduleBlock(var_0__reduce_init) + { + i0_0 = axis.bind(i) + var_0__reduce_init[i0_0] = 0.00000000f + } + serial for (reduce_k_0_0, 0, 8) + { + ScheduleBlock(var_0) + { + vreduce_k_0_0, i0_0 = axis.bind(reduce_k_0_0, i) + var_0[i0_0] = (var_0[i0_0] + var_0_rf[i0_0, vreduce_k_0_0]) + } + } + } + } + } +})"; + TestApplyOnReduce({32, 64}, {1}, "var_0", expected_ir); +} + +TEST_F(TestReductionFactoring, ApplyOnBlock2ReduceDim) { + std::string expected_ir = R"({ + ScheduleBlock(root) + { + { + serial for (i, 0, 32) + { + serial for (reduce_k_0_reduce_k_1_fused, 0, 128) + { + 
ScheduleBlock(var_0_rf__reduce_init) + { + vreduce_k_0_reduce_k_1_fused, i0_0 = axis.bind(reduce_k_0_reduce_k_1_fused, i) + var_0_rf__reduce_init[i0_0, vreduce_k_0_reduce_k_1_fused] = 0.00000000f + } + serial for (reduce_k_0_reduce_k_1_fused_0, 0, 64) + { + ScheduleBlock(var_0_rf) + { + vreduce_k_0_reduce_k_1_fused, i0_0, vreduce_k_0_reduce_k_1_fused_0 = axis.bind(reduce_k_0_reduce_k_1_fused, i, reduce_k_0_reduce_k_1_fused_0) + var_0_rf[i0_0, vreduce_k_0_reduce_k_1_fused] = (var_0_rf[i0_0, vreduce_k_0_reduce_k_1_fused] + X[i0_0, (((64 * vreduce_k_0_reduce_k_1_fused) + vreduce_k_0_reduce_k_1_fused_0) / 128), (((64 * vreduce_k_0_reduce_k_1_fused) + vreduce_k_0_reduce_k_1_fused_0) % 128)]) + } + } + } + } + serial for (i, 0, 32) + { + ScheduleBlock(var_0__reduce_init) + { + i0_0 = axis.bind(i) + var_0__reduce_init[i0_0] = 0.00000000f + } + serial for (reduce_k_0_reduce_k_1_fused, 0, 128) + { + ScheduleBlock(var_0) + { + vreduce_k_0_reduce_k_1_fused, i0_0 = axis.bind(reduce_k_0_reduce_k_1_fused, i) + var_0[i0_0] = (var_0[i0_0] + var_0_rf[i0_0, vreduce_k_0_reduce_k_1_fused]) + } + } + } + } + } +})"; + TestApplyOnReduce({32, 64, 128}, {1, 2}, "var_0", expected_ir); +} + +TEST_F(TestReductionFactoring, ApplyOnBlock3ReduceDim) { + std::string expected_ir = R"({ + ScheduleBlock(root) + { + { + serial for (i, 0, 32) + { + serial for (reduce_k_0_reduce_k_1_reduce_k_2_fused, 0, 512) + { + ScheduleBlock(var_0_rf__reduce_init) + { + vreduce_k_0_reduce_k_1_reduce_k_2_fused, i0_0 = axis.bind(reduce_k_0_reduce_k_1_reduce_k_2_fused, i) + var_0_rf__reduce_init[i0_0, vreduce_k_0_reduce_k_1_reduce_k_2_fused] = 0.00000000f + } + serial for (reduce_k_0_reduce_k_1_reduce_k_2_fused_0, 0, 512) + { + ScheduleBlock(var_0_rf) + { + vreduce_k_0_reduce_k_1_reduce_k_2_fused, i0_0, vreduce_k_0_reduce_k_1_reduce_k_2_fused_0 = axis.bind(reduce_k_0_reduce_k_1_reduce_k_2_fused, i, reduce_k_0_reduce_k_1_reduce_k_2_fused_0) + var_0_rf[i0_0, vreduce_k_0_reduce_k_1_reduce_k_2_fused] = (var_0_rf[i0_0, vreduce_k_0_reduce_k_1_reduce_k_2_fused] + X[i0_0, ((((512 * vreduce_k_0_reduce_k_1_reduce_k_2_fused) + vreduce_k_0_reduce_k_1_reduce_k_2_fused_0) / 64) / 64), ((((512 * vreduce_k_0_reduce_k_1_reduce_k_2_fused) + vreduce_k_0_reduce_k_1_reduce_k_2_fused_0) / 64) % 64), (((512 * vreduce_k_0_reduce_k_1_reduce_k_2_fused) + vreduce_k_0_reduce_k_1_reduce_k_2_fused_0) % 64)]) + } + } + } + } + serial for (i, 0, 32) + { + ScheduleBlock(var_0__reduce_init) + { + i0_0 = axis.bind(i) + var_0__reduce_init[i0_0] = 0.00000000f + } + serial for (reduce_k_0_reduce_k_1_reduce_k_2_fused, 0, 512) + { + ScheduleBlock(var_0) + { + vreduce_k_0_reduce_k_1_reduce_k_2_fused, i0_0 = axis.bind(reduce_k_0_reduce_k_1_reduce_k_2_fused, i) + var_0[i0_0] = (var_0[i0_0] + var_0_rf[i0_0, vreduce_k_0_reduce_k_1_reduce_k_2_fused]) + } + } + } + } + } +})"; + TestApplyOnReduce({32, 64, 64, 64}, {1, 2, 3}, "var_0", expected_ir); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule_test.cc index 52f38e0b65b03a..5ba15a46fef188 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule_test.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule_test.cc @@ -21,6 +21,7 @@ #include #include +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" #include "paddle/cinn/cinn.h" #include "paddle/cinn/ir/ir.h" @@ -52,9 +53,9 
@@ TEST(SkipRule, Basic) { ir::Tensor C = Compute( {M, N}, [&](Var i, Var j) { return A(i) + B(j); }, "C"); - poly::StageMap stages = CreateStages({C}); - std::vector funcs = lang::LowerVec( - "TestSkipRule_Basic", stages, {C}, {}, {}, nullptr, target, true); + ast_gen_ius::TensorGroup tensor_group({C}); + std::vector funcs = + lang::LowerToAstVec("TestSkipRule_Basic", {C}, &tensor_group, target); ir::Expr ast_expr = funcs[0]->body; VLOG(6) << "Expr before SkipRule: "; @@ -101,9 +102,9 @@ TEST(SkipRule, ApplyOnSpecificBlock) { ir::Tensor C = Compute( {M, N}, [&](Var i, Var j) { return A(i) + B(j); }, "C"); - poly::StageMap stages = CreateStages({C}); - std::vector funcs = lang::LowerVec( - "TestSkipRule_Basic", stages, {C}, {}, {}, nullptr, target, true); + ast_gen_ius::TensorGroup tensor_group({C}); + std::vector funcs = + lang::LowerToAstVec("TestSkipRule_Basic", {C}, &tensor_group, target); ir::Expr ast_expr = funcs[0]->body; VLOG(6) << "Expr before SkipRule: "; diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc index ef7f2a4ab6dc5d..11fabfe16df2f0 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc @@ -67,7 +67,8 @@ ir::IRSchedule TestAutoGenRuleBase::MakeIRSchedule( lowered_funcs_ = op_lowerer.Lower(graph->fusion_groups.front(), /*apply_op_schedule = */ apply_manual_schedule, - /*apply_group_schedule = */ apply_manual_schedule); + /*apply_group_schedule = */ apply_manual_schedule, + /*apply_pass = */ apply_manual_schedule); CHECK(!lowered_funcs_.empty()) << "lowered_funcs_ is empty"; std::vector bodys; diff --git a/paddle/cinn/auto_schedule/search_space/search_state_test.cc b/paddle/cinn/auto_schedule/search_space/search_state_test.cc index 61547d228302f3..b0f216c4895aa1 100644 --- a/paddle/cinn/auto_schedule/search_space/search_state_test.cc +++ b/paddle/cinn/auto_schedule/search_space/search_state_test.cc @@ -17,6 +17,7 @@ #include #include +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/cinn.h" #include "paddle/cinn/common/context.h" @@ -35,35 +36,18 @@ TEST(TestSearchState, SearchStateHash_Equal) { ir::Tensor C = lang::Compute( {M, N}, [&](Var i, Var j) { return A(i, j) + B(i, j); }, "C"); + ast_gen_ius::TensorGroup const_group_1({A, B}); cinn::common::Context::Global().ResetNameId(); - auto a_plus_const_funcs_1 = lang::LowerVec("A_plus_const", - poly::CreateStages({A, B}), - {A, B}, - {}, - {}, - nullptr, - target, - true); - + auto a_plus_const_funcs_1 = + lang::LowerToAstVec("A_plus_const", {A, B}, &const_group_1, target); cinn::common::Context::Global().ResetNameId(); - auto a_plus_const_funcs_2 = lang::LowerVec("A_plus_const", - poly::CreateStages({A, B}), - {A, B}, - {}, - {}, - nullptr, - target, - true); - + ast_gen_ius::TensorGroup const_group_2({A, B}); + auto a_plus_const_funcs_2 = + lang::LowerToAstVec("A_plus_const", {A, B}, &const_group_2, target); cinn::common::Context::Global().ResetNameId(); - auto a_plus_b_funcs = lang::LowerVec("A_plus_B", - poly::CreateStages({A, C}), - {A, C}, - {}, - {}, - nullptr, - target, - true); + ast_gen_ius::TensorGroup plus_group({A, C}); + auto a_plus_b_funcs = + lang::LowerToAstVec("A_plus_B", {A, C}, &plus_group, target); std::string a_plus_const_funcs_1_str = R"ROC(function A_plus_const (_A, _B) { diff --git a/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size_test.cc 
b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size_test.cc index 443c297c5e722c..2e4ecf034b740b 100644 --- a/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size_test.cc +++ b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size_test.cc @@ -17,6 +17,7 @@ #include #include +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/cinn.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" @@ -46,16 +47,13 @@ TEST(MutateTileSize, Basic) { [&](Var i, Var j) { return ReduceSum(A(i, k) * B(k, j), {k}); }, "C"); - poly::StageMap stages = CreateStages({A, B, C}); + ast_gen_ius::TensorGroup tensor_group({A, B, C}); std::vector funcs = - lang::LowerVec("TestMutateTileSize_Basic", - stages, - {A, B, C}, - {}, - {}, - nullptr, - target, - true); + lang::LowerToAstVec("TestMutateTileSize_Basic", + + {A, B, C}, + &tensor_group, + target); ir::Expr ast_expr = funcs[0]->body; VLOG(6) << "Original Expr: "; diff --git a/paddle/cinn/backends/codegen_c_test.cc b/paddle/cinn/backends/codegen_c_test.cc index 8db31b6c6007fb..caf4950cdfe8cd 100644 --- a/paddle/cinn/backends/codegen_c_test.cc +++ b/paddle/cinn/backends/codegen_c_test.cc @@ -69,7 +69,7 @@ TEST(CodeGenC, module) { ast_gen_ius::TensorGroup tensor_group({A, B, C}); auto func = lang::LowerToAst("add1", {A, B, C}, &tensor_group); - LOG(INFO) << "Huihuang debug: " << func << std::endl; + LOG(INFO) << "Func to codegen: " << func << std::endl; builder.AddFunction(func); diff --git a/paddle/cinn/backends/compiler.cc b/paddle/cinn/backends/compiler.cc index 0a64b24712f489..f63869730a11f8 100644 --- a/paddle/cinn/backends/compiler.cc +++ b/paddle/cinn/backends/compiler.cc @@ -304,6 +304,8 @@ void Compiler::CompileCudaModule(const Module& module, auto fn_kernel = cuda_module_->GetFunction(0, kernel_fn_name); CHECK(fn_kernel); + fn_ptr_.push_back(reinterpret_cast(fn_kernel)); + symbols.RegisterVar(kernel_fn_name + "_ptr_", reinterpret_cast(fn_kernel)); } diff --git a/paddle/cinn/backends/compiler.h b/paddle/cinn/backends/compiler.h index a468193d4d85a6..f269b00492a420 100644 --- a/paddle/cinn/backends/compiler.h +++ b/paddle/cinn/backends/compiler.h @@ -121,6 +121,8 @@ class Compiler final { */ void* Lookup(absl::string_view fn_name); + std::vector GetFnPtr() const { return fn_ptr_; } + private: void CompileCudaModule(const ir::Module& module, const std::string& code = ""); @@ -136,6 +138,7 @@ class Compiler final { Target target_; std::unique_ptr engine_; + std::vector fn_ptr_; #ifdef CINN_WITH_CUDA std::unique_ptr cuda_module_; #endif diff --git a/paddle/cinn/backends/ir_schedule_test.cc b/paddle/cinn/backends/ir_schedule_test.cc index 5ea30c6951d24c..2923c8dc9fe7ae 100644 --- a/paddle/cinn/backends/ir_schedule_test.cc +++ b/paddle/cinn/backends/ir_schedule_test.cc @@ -2310,6 +2310,270 @@ void test_rfactor(void* _args, int32_t num_args) ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); } +TEST(IrSchedule, factorize_reduction) { + Context::Global().ResetNameId(); + Expr M(3); + Expr N(4); + Expr K(5); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N, K}); + Var j(4, "j0"); + Var k(5, "k0"); + auto B = Compute( + {M}, + [&](Var i) { + return lang::ReduceSum(A(i, j, k), {j, k}); + }, + "B"); + + auto stages = CreateStages({A, B}); + auto func = cinn::lang::LowerVec("test_factorize_reduction", + stages, + {A, B}, + {}, + {}, + nullptr, + target, + true); + CHECK(!func.empty()); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + 
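// FactorizeReduction(loop, rf_axis) below rewrites the reduction B[i] = + // ReduceSum(A(i, j0, k0)) over {j0, k0}: it materializes a partial-sum + // tensor B_rf with the factorized loop variable placed at index rf_axis + // (0 here, giving B_rf[vj0, i0_0]) and turns B into a final reduction over + // B_rf; the returned rf tensor's buffer is then registered as a temp buffer. + 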
ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + auto loops = ir_sch.GetLoops("B"); + CHECK_EQ(loops.size(), 3U); + auto new_rf_tensor = ir_sch.FactorizeReduction(loops[1], 0); + auto* new_rf_tensor_ref = new_rf_tensor.As(); + CHECK(new_rf_tensor_ref); + CHECK(new_rf_tensor_ref->buffer.defined()); + func[0]->temp_bufs.push_back(new_rf_tensor_ref->buffer); + func[0]->PrepareBufferCastExprs(); + std::string origin = utils::GetStreamCnt(func[0]); + LOG(INFO) << origin; + EXPECT_EQ(origin, utils::Trim(R"ROC( +function test_factorize_reduction (_A, _B) +{ + ScheduleBlock(root) + { + { + serial for (i, 0, 3) + { + serial for (j0, 0, 4) + { + ScheduleBlock(B_rf__reduce_init) + { + vj0, i0_0 = axis.bind(j0, i) + B_rf__reduce_init[vj0, i0_0] = 0.00000000f + } + serial for (k0, 0, 5) + { + ScheduleBlock(B_rf) + { + vj0, i0_0, i2 = axis.bind(j0, i, k0) + B_rf[vj0, i0_0] = (B_rf[vj0, i0_0] + A[i0_0, vj0, i2]) + } + } + } + } + serial for (i, 0, 3) + { + ScheduleBlock(B__reduce_init) + { + i0_0 = axis.bind(i) + B__reduce_init[i0_0] = 0.00000000f + } + serial for (j0, 0, 4) + { + ScheduleBlock(B) + { + vj0, i0_0 = axis.bind(j0, i) + B[i0_0] = (B[i0_0] + B_rf[vj0, i0_0]) + } + } + } + } + } +} +)ROC")); +} + +TEST(IrSchedule, factorize_reduction1) { + Context::Global().ResetNameId(); + Expr M(3); + Expr N(4); + Expr K(5); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N, K}); + Var j(4, "j0"); + Var k(5, "k0"); + auto B = Compute( + {M}, + [&](Var i) { + return lang::ReduceSum(A(i, j, k), {j, k}); + }, + "B"); + + auto stages = CreateStages({A, B}); + auto func = cinn::lang::LowerVec("test_factorize_reduction", + stages, + {A, B}, + {}, + {}, + nullptr, + target, + true); + CHECK(!func.empty()); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + auto loops = ir_sch.GetLoops("B"); + CHECK_EQ(loops.size(), 3U); + auto new_rf_tensor = ir_sch.FactorizeReduction(loops[1], 1); + auto* new_rf_tensor_ref = new_rf_tensor.As(); + CHECK(new_rf_tensor_ref); + CHECK(new_rf_tensor_ref->buffer.defined()); + func[0]->temp_bufs.push_back(new_rf_tensor_ref->buffer); + func[0]->PrepareBufferCastExprs(); + std::string origin = utils::GetStreamCnt(func[0]); + LOG(INFO) << origin; + EXPECT_EQ(origin, utils::Trim(R"ROC( +function test_factorize_reduction (_A, _B) +{ + ScheduleBlock(root) + { + { + serial for (i, 0, 3) + { + serial for (j0, 0, 4) + { + ScheduleBlock(B_rf__reduce_init) + { + vj0, i0_0 = axis.bind(j0, i) + B_rf__reduce_init[i0_0, vj0] = 0.00000000f + } + serial for (k0, 0, 5) + { + ScheduleBlock(B_rf) + { + vj0, i0_0, i2 = axis.bind(j0, i, k0) + B_rf[i0_0, vj0] = (B_rf[i0_0, vj0] + A[i0_0, vj0, i2]) + } + } + } + } + serial for (i, 0, 3) + { + ScheduleBlock(B__reduce_init) + { + i0_0 = axis.bind(i) + B__reduce_init[i0_0] = 0.00000000f + } + serial for (j0, 0, 4) + { + ScheduleBlock(B) + { + vj0, i0_0 = axis.bind(j0, i) + B[i0_0] = (B[i0_0] + B_rf[i0_0, vj0]) + } + } + } + } + } +} +)ROC")); +} + +TEST(IrSchedule, factorize_reduction2) { + Context::Global().ResetNameId(); + Expr M(3); + Expr N(4); + Expr K(5); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N * K}); + Var j(4 * 5, "j0"); + auto B = Compute( + {M}, [&](Var i) { return lang::ReduceSum(A(i, j), {j}); }, "B"); + + auto stages = CreateStages({A, B}); + auto func = cinn::lang::LowerVec("test_factorize_reduction", + stages, + {A, B}, + {}, + {}, + nullptr, + target, + true); + 
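// This variant reduces over a single flattened axis of extent 20 (= 4 * 5); + // the test first splits that loop into {4, 5} and then factorizes the outer + // piece with rf_axis = 1, so the expected IR indexes the partial sums as + // B_rf[i0_0, vj0] and reads A at ((5 * vj0) + vj0_0). + 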
CHECK(!func.empty()); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + auto loops = ir_sch.GetLoops("B"); + CHECK_EQ(loops.size(), 2U); + auto splited_loops = ir_sch.Split(loops[1], {4, 5}); + CHECK_EQ(splited_loops.size(), 2U); + auto new_rf_tensor = ir_sch.FactorizeReduction(splited_loops[0], 1); + auto* new_rf_tensor_ref = new_rf_tensor.As(); + CHECK(new_rf_tensor_ref); + CHECK(new_rf_tensor_ref->buffer.defined()); + func[0]->temp_bufs.push_back(new_rf_tensor_ref->buffer); + func[0]->PrepareBufferCastExprs(); + std::string origin = utils::GetStreamCnt(func[0]); + LOG(INFO) << origin; + EXPECT_EQ(origin, utils::Trim(R"ROC( +function test_factorize_reduction (_A, _B) +{ + ScheduleBlock(root) + { + { + serial for (i, 0, 3) + { + serial for (j0, 0, 4) + { + ScheduleBlock(B_rf__reduce_init) + { + vj0, i0_0 = axis.bind(j0, i) + B_rf__reduce_init[i0_0, vj0] = 0.00000000f + } + serial for (j0_0, 0, 5) + { + ScheduleBlock(B_rf) + { + vj0, i0_0, vj0_0 = axis.bind(j0, i, j0_0) + B_rf[i0_0, vj0] = (B_rf[i0_0, vj0] + A[i0_0, ((5 * vj0) + vj0_0)]) + } + } + } + } + serial for (i, 0, 3) + { + ScheduleBlock(B__reduce_init) + { + i0_0 = axis.bind(i) + B__reduce_init[i0_0] = 0.00000000f + } + serial for (j0, 0, 4) + { + ScheduleBlock(B) + { + vj0, i0_0 = axis.bind(j0, i) + B[i0_0] = (B[i0_0] + B_rf[i0_0, vj0]) + } + } + } + } + } +} +)ROC")); +} + TEST(IrSchedule, compute_inline1) { Context::Global().ResetNameId(); Expr M(32); diff --git a/paddle/cinn/cinn.h b/paddle/cinn/cinn.h index 333bc051ead98a..e81771ba0c7e76 100644 --- a/paddle/cinn/cinn.h +++ b/paddle/cinn/cinn.h @@ -29,6 +29,7 @@ namespace cinn { +using ast_gen_ius::TensorGroup; using backends::CodeGenC; using backends::CodeGenCX86; using backends::Outputs; @@ -39,6 +40,7 @@ using lang::CallExtern; using lang::CallLowered; using lang::Compute; using lang::Lower; +using lang::LowerToAst; using lang::Placeholder; using lang::ReduceAll; using lang::ReduceAny; diff --git a/paddle/cinn/hlir/dialect/operator/CMakeLists.txt b/paddle/cinn/hlir/dialect/operator/CMakeLists.txt index dd1b708ce9fe44..570058329d0d39 100644 --- a/paddle/cinn/hlir/dialect/operator/CMakeLists.txt +++ b/paddle/cinn/hlir/dialect/operator/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(ir) +add_subdirectory(transforms) diff --git a/paddle/cinn/hlir/dialect/operator/ir/CMakeLists.txt b/paddle/cinn/hlir/dialect/operator/ir/CMakeLists.txt index 1a5857fd2cfe20..542ed6c21d0ce4 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/CMakeLists.txt +++ b/paddle/cinn/hlir/dialect/operator/ir/CMakeLists.txt @@ -1,4 +1,4 @@ -# TODO(Aurelius84): new_ir_compiler depends on pd_op_dialect and could +# TODO(Aurelius84): pir_compiler depends on pd_op_dialect and could # not found under CINN_ONLY mode if(NOT CINN_ONLY) set(CINN_DIALECT_BINARY_DIR @@ -35,6 +35,7 @@ if(NOT CINN_ONLY) COMMAND ${CMAKE_COMMAND} -E make_directory ${parsed_op_dir} COMMAND ${PYTHON_EXECUTABLE} ${cinn_op_gen_parsed_yaml_file} --op_yaml_path ${cinn_op_yaml_file} --output_path ${cinn_op_parsed_yaml_file} + DEPENDS ${cinn_op_gen_parsed_yaml_file} ${cinn_op_yaml_file} VERBATIM) add_custom_command( diff --git a/paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h b/paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h index 99e12a3d13ab45..9c6959db093e4b 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h +++ b/paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h @@ -18,8 +18,8 @@ #include #include #include -#include 
"paddle/cinn/hlir/framework/new_ir/utils.h" #include "paddle/cinn/hlir/framework/op.h" +#include "paddle/cinn/hlir/framework/pir/utils.h" #include "paddle/pir/core/attribute_base.h" #include "paddle/pir/core/operation.h" @@ -51,7 +51,7 @@ struct GroupInfo { private: void Initialize() { op_pattern_kind = hlir::framework::OpPatternKind::kElementWise; - fn_name = hlir::framework::newir::CompatibleInfo::GroupOpsName(ops); + fn_name = hlir::framework::pir::CompatibleInfo::GroupOpsName(ops); } }; @@ -77,5 +77,27 @@ struct GroupInfoAttributeStorage : public pir::AttributeStorage { ParamKey data_; }; +struct JITInfoAttributeStorage : public pir::AttributeStorage { + using ParamKey = cinn::hlir::framework::pir::CUDAJITInfo; + explicit JITInfoAttributeStorage(const ParamKey& key) : data_(key) {} + + static JITInfoAttributeStorage* Construct(const ParamKey& key) { + return new JITInfoAttributeStorage(key); + } + + static std::size_t HashValue(const ParamKey& key) { + return std::hash()(*(reinterpret_cast(key.fn_ptr))); + } + + bool operator==(const ParamKey& key) const { + return data_.fn_ptr == key.fn_ptr; + } + + const ParamKey& GetAsKey() const { return data_; } + + private: + ParamKey data_; +}; + } // namespace dialect } // namespace cinn diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc index abe2ca94b9690e..3bb572250032f4 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc +++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc @@ -15,16 +15,17 @@ #include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h" #include +#include "glog/logging.h" #include "paddle/pir/core/builtin_type.h" +#include "paddle/pir/core/enforce.h" #include "paddle/pir/core/op_base.h" +#include "paddle/pir/dialect/control_flow/ir/cf_ops.h" namespace cinn { namespace dialect { const char *GroupOp::attributes_name[GroupOp::attributes_num] = {"group_info"}; -// TODO(Aurlius84): Need to figure out how to rebuild relation info of ops outer -// GroupOp void GroupOp::Build(pir::Builder &builder, pir::OperationArgument &argument, const std::vector &output_types) { @@ -32,18 +33,33 @@ void GroupOp::Build(pir::Builder &builder, argument.output_types = output_types; } -pir::Block *GroupOp::Block() { +void GroupOp::Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + std::unique_ptr &&block) { + VLOG(4) << "Start build GroupOp"; + if (block && !block->empty()) { + IR_ENFORCE(block->back()->isa()); + auto *op = block->back(); + for (size_t i = 0; i < op->num_operands(); ++i) { + argument.AddOutput(op->operand(i).type()); + } + } + argument.AddRegion()->push_back(block.release()); +} + +pir::Block *GroupOp::block() { pir::Region ®ion = (*this)->region(0); if (region.empty()) region.emplace_back(); return region.front(); } -std::vector GroupOp::Ops() { - auto *block = this->Block(); - return std::vector(block->begin(), block->end()); +std::vector GroupOp::ops() { + auto *inner_block = this->block(); + return std::vector(inner_block->begin(), + inner_block->end()); } -void GroupOp::Verify() {} +void GroupOp::VerifySig() {} void GroupOp::Print(pir::IrPrinter &printer) { auto &os = printer.os; @@ -54,7 +70,7 @@ void GroupOp::Print(pir::IrPrinter &printer) { os << " -> "; printer.PrintOpReturnType(op); os << " {"; - for (auto &sub_op : Ops()) { + for (auto &sub_op : ops()) { os << "\n"; printer.PrintOperation(sub_op); } diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h 
index 0c8aa88e4fd2b9..ba116d52a98c01 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h +++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h @@ -33,10 +33,14 @@ class GroupOp : public pir::Op<GroupOp> { pir::OperationArgument &argument, // NOLINT const std::vector<pir::Type> &output_types); - pir::Block *Block(); - std::vector<pir::Operation *> Ops(); + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + std::unique_ptr<pir::Block> &&block); + + pir::Block *block(); + std::vector<pir::Operation *> ops(); - void Verify(); + void VerifySig(); void Print(pir::IrPrinter &printer); // NOLINT }; diff --git a/paddle/cinn/hlir/dialect/operator/ir/op_attribute.cc b/paddle/cinn/hlir/dialect/operator/ir/op_attribute.cc index 554d7357af970e..1899d5f44bee11 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/op_attribute.cc +++ b/paddle/cinn/hlir/dialect/operator/ir/op_attribute.cc @@ -19,7 +19,13 @@ namespace dialect { const GroupInfo &GroupInfoAttribute::data() const { return storage()->GetAsKey(); } + +const cinn::hlir::framework::pir::CUDAJITInfo &CUDAJITInfoAttribute::data() + const { + return storage()->GetAsKey(); +} } // namespace dialect } // namespace cinn IR_DEFINE_EXPLICIT_TYPE_ID(cinn::dialect::GroupInfoAttribute) +IR_DEFINE_EXPLICIT_TYPE_ID(cinn::dialect::CUDAJITInfoAttribute) diff --git a/paddle/cinn/hlir/dialect/operator/ir/op_attribute.h b/paddle/cinn/hlir/dialect/operator/ir/op_attribute.h index 6e92b45002785a..10bd5ebc300a47 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/op_attribute.h +++ b/paddle/cinn/hlir/dialect/operator/ir/op_attribute.h @@ -33,7 +33,22 @@ class GroupInfoAttribute : public pir::Attribute { const GroupInfo& data() const; }; +class CUDAJITInfoAttribute : public pir::Attribute { + public: + using Attribute::Attribute; + + DECLARE_ATTRIBUTE_UTILITY_FUNCTOR(CUDAJITInfoAttribute, + JITInfoAttributeStorage); + + bool operator<(const CUDAJITInfoAttribute& right) const { + return storage() < right.storage(); + } + + const cinn::hlir::framework::pir::CUDAJITInfo& data() const; +}; + } // namespace dialect } // namespace cinn IR_DECLARE_EXPLICIT_TYPE_ID(cinn::dialect::GroupInfoAttribute) +IR_DECLARE_EXPLICIT_TYPE_ID(cinn::dialect::CUDAJITInfoAttribute) diff --git a/paddle/cinn/hlir/dialect/operator/ir/op_dialect.cc b/paddle/cinn/hlir/dialect/operator/ir/op_dialect.cc index 6d2f0409f24e96..11ccd77bb109d0 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/op_dialect.cc +++ b/paddle/cinn/hlir/dialect/operator/ir/op_dialect.cc @@ -39,20 +39,31 @@ void OperatorDialect::initialize() { >(); RegisterOp<GroupOp>(); RegisterAttribute<GroupInfoAttribute>(); + RegisterAttribute<CUDAJITInfoAttribute>(); } void OperatorDialect::PrintType(pir::Type type, std::ostream &os) const {} void OperatorDialect::PrintAttribute(pir::Attribute attr, std::ostream &os) const { - os << "(" << attr.dialect().name(); - os << '.'; - if (auto group_info_attr = attr.dyn_cast<GroupInfoAttribute>()) { - const GroupInfo &data = group_info_attr.data(); - os << "GroupInfo)" - << "[" << data.fn_name << "]"; + if (attr.isa<GroupInfoAttribute>()) { + os << "(" << attr.dialect().name(); + os << '.'; + if (auto group_info_attr = attr.dyn_cast<GroupInfoAttribute>()) { + const GroupInfo &data = group_info_attr.data(); + os << "GroupInfo)" + << "[" << data.fn_name << "]"; + } + { os << "<#AttrNotImplemented>"; } + } else if (attr.isa<CUDAJITInfoAttribute>()) { + auto cuda_jit_info = attr.dyn_cast<CUDAJITInfoAttribute>(); + + os << "(" << cuda_jit_info.data().fn_ptr; + os << ')'; + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "cinn dialect only supports GroupInfo and CUDAJITInfo")); } - { os << "<#AttrNotImplemented>"; } } void OperatorDialect::PrintOperation(pir::Operation
*op, diff --git a/paddle/cinn/hlir/dialect/operator/ir/ops.yaml b/paddle/cinn/hlir/dialect/operator/ir/ops.yaml index 096d2c4e652b17..9f14c6e4066611 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/ops.yaml +++ b/paddle/cinn/hlir/dialect/operator/ir/ops.yaml @@ -1,8 +1,25 @@ -- op : add - args : (Tensor x, Tensor y) +- op : broadcast + args : (Tensor x, int64_t[] broadcast_axes, int64_t[] out_shape) output : Tensor(out) infer_meta : - func : ElementwiseInferMeta + func : CINNBroadcastInferMeta + param : [x, broadcast_axes, out_shape] kernel : - func : add - inplace : (x -> out) + func : expand + param : [x, broadcast_axes] + +- op : reduce_max + args : (Tensor x, int64_t[] axis, bool keep_dim) + output : Tensor(out) + infer_meta : + func : ReduceInferMeta + kernel : + func : frobenius_norm + +- op : reduce_sum + args : (Tensor x, int64_t[] axis, bool keep_dim) + output : Tensor(out) + infer_meta : + func : ReduceInferMeta + kernel : + func : frobenius_norm diff --git a/paddle/cinn/hlir/dialect/operator/transforms/CMakeLists.txt b/paddle/cinn/hlir/dialect/operator/transforms/CMakeLists.txt new file mode 100644 index 00000000000000..770e78d191e3dc --- /dev/null +++ b/paddle/cinn/hlir/dialect/operator/transforms/CMakeLists.txt @@ -0,0 +1,10 @@ +if(NOT CINN_ONLY) + cinn_cc_library( + op_with_group_merge_pass + SRCS + group_with_group_merge_pass.cc + op_with_group_merge_pass.cc + tensor_node.cc + DEPS + pd_op_dialect) +endif() diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_with_group_merge_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_with_group_merge_pass.cc new file mode 100644 index 00000000000000..e9c165bbcec523 --- /dev/null +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_with_group_merge_pass.cc @@ -0,0 +1,2126 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
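+// +// This file wires together the group-with-group fusion machinery: FuseHelper +// exposes pairwise fusability predicates over OpGroups, the FusePassCtx +// hierarchy hands each pass the group(s) to inspect plus MarkFusible +// callbacks, FusionPassMap is a benefit-ordered registry of fuse passes, and +// GeneralFusionMergePassHelper drives horizontal, vertical, recompute and +// input fusion to a fixed point.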
+ +#include +#include +#include + +#include "paddle/cinn/hlir/dialect/operator/transforms/op_group.h" +#include "paddle/pir/core/value.h" + +#include "paddle/cinn/hlir/dialect/operator/transforms/group_with_group_merge_pass_utils.h" +#include "paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_pass.h" + +#include "paddle/cinn/hlir/dialect/operator/transforms/group_with_group_merge_util.h" +#include "paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_util.h" +#include "paddle/phi/core/flags.h" + +PD_DECLARE_bool(enhance_vertical_fusion_with_recompute); + +namespace cinn { +namespace dialect { +namespace ir { + +using GroupPtr = std::shared_ptr; +using GroupList = std::vector; + +using Comparator = ir::Group::SharedGroupComparator; +using Hasher = ir::Group::SharedGroupHasher; + +using OpGroupPtr = ir::OpGroup; +using OpGroupList = std::vector; + +using ConditionFunction = std::function; + +class FuseHelper { + public: + virtual ~FuseHelper() = default; + + virtual bool AllOutputsSameSize(const OpGroupPtr& first, + const OpGroupPtr& second) const = 0; + + virtual bool HorizontalElementwiseFuseReduce(const OpGroupPtr& src, + const OpGroupPtr& dst) const = 0; + + virtual bool ElementwiseFuseBroadcast(const OpGroupPtr& src, + const OpGroupPtr& dst) const = 0; + + virtual bool HorizontalWithInjective(const OpGroupPtr& src, + const OpGroupPtr& dst) const = 0; + + virtual bool ElementwiseFuseReduce(const OpGroupPtr& src, + const OpGroupPtr& dst) const = 0; + + virtual bool BroadcastFuseReduce(const OpGroupPtr& src, + const OpGroupPtr& dst) const = 0; + + virtual bool InjectiveHorizontalWithReduce(const OpGroupPtr& src, + const OpGroupPtr& dst) const = 0; + + virtual bool ReduceFuseElementwise(const OpGroupPtr& src, + const OpGroupPtr& dst) const = 0; + + virtual bool ReduceFuseBroadcast(const OpGroupPtr& src, + const OpGroupPtr& dst) const = 0; + + virtual bool ReduceFuseReduce(const OpGroupPtr& src, + const OpGroupPtr& dst) const = 0; + + virtual bool IsReachable(const OpGroupPtr& lhs, + const OpGroupPtr& rhs) const = 0; + + virtual bool DetectCycleIfFuse(const OpGroupPtr& src, + const OpGroupPtr& dst) const = 0; + + virtual bool IsConsumerSetsReachable( + const OpGroupPtr& group, + const std::unordered_set& consumers) const = 0; + + protected: + FuseHelper() = default; +}; + +template +class GraphGroupFuseHelper final : public FuseHelper { + public: + explicit GraphGroupFuseHelper(const FusePassCtxT* ctx) : ctx_(ctx) {} + + bool AllOutputsSameSize(const OpGroupPtr& first, + const OpGroupPtr& second) const override; + + bool HorizontalElementwiseFuseReduce(const OpGroupPtr& src, + const OpGroupPtr& dst) const override; + + bool ElementwiseFuseBroadcast(const OpGroupPtr& src, + const OpGroupPtr& dst) const override; + + bool HorizontalWithInjective(const OpGroupPtr& src, + const OpGroupPtr& dst) const override; + + bool ElementwiseFuseReduce(const OpGroupPtr& src, + const OpGroupPtr& dst) const override; + + bool BroadcastFuseReduce(const OpGroupPtr& src, + const OpGroupPtr& dst) const override; + + bool InjectiveHorizontalWithReduce(const OpGroupPtr& src, + const OpGroupPtr& dst) const override; + + bool ReduceFuseElementwise(const OpGroupPtr& src, + const OpGroupPtr& dst) const override; + + bool ReduceFuseBroadcast(const OpGroupPtr& src, + const OpGroupPtr& dst) const override; + + bool ReduceFuseReduce(const OpGroupPtr& src, + const OpGroupPtr& dst) const override; + + bool IsReachable(const OpGroupPtr& lhs, + const OpGroupPtr& rhs) const override { + return 
IsReachableInDag(lhs, rhs) || IsReachableInDag(rhs, lhs); + } + + bool DetectCycleIfFuse(const OpGroupPtr& lhs, + const OpGroupPtr& rhs) const override { + return ReachableIfDirectEdgeIgnored(lhs, rhs) || + ReachableIfDirectEdgeIgnored(rhs, lhs); + } + + bool IsConsumerSetsReachable( + const OpGroupPtr& group, + const std::unordered_set& consumers) const override { + for (const auto& consumer : consumers) { + if (group == consumer) { + continue; + } + if (IsReachableInDag(consumer, group)) { + return true; + } + } + return false; + } + + private: + bool IsReachableInDag(const OpGroupPtr& producer, + const OpGroupPtr& consumer) const { + // const auto& MinDepth4Node = [&](const OpGroupPtr& node) { + // return node.GetGroup()->min_depth; + // }; + // const auto& MaxDepth4Node = [&](const OpGroupPtr& node) { + // return node.GetGroup()->max_depth; + // }; + // const auto& VisitNextNodes = + // [&](const OpGroupPtr& node, + // const std::function& Visit) { + // for (const auto& node_producer : node.producers()) { + // Visit(node_producer); + // } + // }; + // common::IsReachablePredicator is_reachable( + // MinDepth4Node, MaxDepth4Node, VisitNextNodes); + // return is_reachable(consumer, producer, [](OpGroupPtr) {}); + // TODO(phlrain) : support IsReachable + return false; + } + + bool ReachableIfDirectEdgeIgnored(const OpGroupPtr& producer, + const OpGroupPtr& consumer) const { + // const auto& MinDepth4Node = [&](const OpGroupPtr& node) { + // return node.GetGroup()->min_depth; + // }; + // const auto& MaxDepth4Node = [&](const OpGroupPtr& node) { + // return node.GetGroup()->max_depth; + // }; + // const auto& VisitNextNodes = + // [&](const OpGroupPtr& node, + // const std::function& Visit) { + // for (const auto& node_producer : node.producers()) { + // if (node == consumer && node_producer == producer) { + // continue; + // } + // Visit(node_producer); + // } + // }; + // common::IsReachablePredicator is_reachable( + // MinDepth4Node, MaxDepth4Node, VisitNextNodes); + // return is_reachable(consumer, producer, [](OpGroupPtr) {}); + // TODO(phlrain) : support IsReachable + return false; + } + + const FusePassCtxT* ctx_; +}; + +class FusePassCtx { + public: + virtual ~FusePassCtx() {} + + virtual const FuseHelper& fuse_helper() const = 0; + + virtual void MarkFusible(const OpGroupPtr& first, + const OpGroupPtr& second) = 0; + + protected: + FusePassCtx() = default; +}; + +class LightwareFusePassCtx : public FusePassCtx { + public: + virtual ~LightwareFusePassCtx() {} + + virtual const OpGroupPtr& PickOpGroup() const = 0; + + virtual const FuseHelper& fuse_helper() const = 0; + + virtual void MarkFusible(const OpGroupPtr& first, + const OpGroupPtr& second) = 0; + + virtual void MarkFusible(const OpGroupList& candidates) = 0; + + protected: + LightwareFusePassCtx() = default; +}; + +class GraphGroupLightwareFusePassCtx final : public LightwareFusePassCtx { + public: + GraphGroupLightwareFusePassCtx( + const OpGroupPtr& group, + const std::function& MarkFusible) + : group_(group), + MarkFusible_(MarkFusible), + fuse_helper_( + new GraphGroupFuseHelper(this)) {} + + GraphGroupLightwareFusePassCtx( + const OpGroupPtr& group, + const std::function& + MarkGroupListFusible) + : group_(group), + MarkGroupListFusible_(MarkGroupListFusible), + fuse_helper_( + new GraphGroupFuseHelper(this)) {} + + const OpGroupPtr& PickOpGroup() const override { return group_; } + + const FuseHelper& fuse_helper() const override { return *fuse_helper_; } + + void MarkFusible(const OpGroupPtr& first, const OpGroupPtr& 
second) override { + MarkFusible_(first, second); + } + + void MarkFusible(const OpGroupList& candidates) override { + MarkGroupListFusible_(candidates); + } + + private: + const OpGroupPtr& group_; + const std::function + MarkFusible_; + const std::function + MarkGroupListFusible_; + const std::unique_ptr fuse_helper_; +}; + +class InputFusePassCtx : public FusePassCtx { + public: + virtual ~InputFusePassCtx() {} + + virtual const OpGroupList& PickConsumersWithSameInputs() const = 0; + + virtual const FuseHelper& fuse_helper() const = 0; + + virtual void MarkFusible(const OpGroupPtr& first, + const OpGroupPtr& second) = 0; + + virtual void MarkFusible(const OpGroupList& candidates) = 0; + + protected: + InputFusePassCtx() = default; +}; + +class GraphGroupInputFusePassCtx final : public InputFusePassCtx { + public: + GraphGroupInputFusePassCtx( + const OpGroupList& groups, + const std::function& MarkFusible) + : groups_(groups), + MarkFusible_(MarkFusible), + fuse_helper_( + new GraphGroupFuseHelper(this)) {} + + GraphGroupInputFusePassCtx( + const OpGroupList& groups, + const std::function& + MarkGroupListFusible) + : groups_(groups), + MarkGroupListFusible_(MarkGroupListFusible), + fuse_helper_( + new GraphGroupFuseHelper(this)) {} + + const OpGroupList& PickConsumersWithSameInputs() const override { + return groups_; + } + + const FuseHelper& fuse_helper() const override { return *fuse_helper_; } + + void MarkFusible(const OpGroupPtr& first, const OpGroupPtr& second) override { + MarkFusible_(first, second); + } + + void MarkFusible(const OpGroupList& candidates) override { + MarkGroupListFusible_(candidates); + } + + private: + const OpGroupList& groups_; + const std::function + MarkFusible_; + const std::function + MarkGroupListFusible_; + const std::unique_ptr fuse_helper_; +}; + +template +bool GraphGroupFuseHelper::AllOutputsSameSize( + const OpGroupPtr& first, const OpGroupPtr& second) const { + return is_same_size(first.GetGroup(), second.GetGroup()); +} + +template +bool GraphGroupFuseHelper::HorizontalElementwiseFuseReduce( + const OpGroupPtr& src, const OpGroupPtr& dst) const { + return honrizontal_elementwise_fuse_reduce(src.GetGroup(), dst.GetGroup()); +} + +template +bool GraphGroupFuseHelper::ElementwiseFuseBroadcast( + const OpGroupPtr& src, const OpGroupPtr& dst) const { + return elementwise_fuse_broadcast(src.GetGroup(), dst.GetGroup()); +} + +template +bool GraphGroupFuseHelper::HorizontalWithInjective( + const OpGroupPtr& src, const OpGroupPtr& dst) const { + return horizontal_with_injective(src.GetGroup(), dst.GetGroup()); +} + +template +bool GraphGroupFuseHelper::ElementwiseFuseReduce( + const OpGroupPtr& src, const OpGroupPtr& dst) const { + return elementwise_fuse_reduce(src.GetGroup(), dst.GetGroup()); +} + +template +bool GraphGroupFuseHelper::BroadcastFuseReduce( + const OpGroupPtr& src, const OpGroupPtr& dst) const { + return broadcast_fuse_reduce(src.GetGroup(), dst.GetGroup()); +} + +template +bool GraphGroupFuseHelper::InjectiveHorizontalWithReduce( + const OpGroupPtr& src, const OpGroupPtr& dst) const { + return injective_horizontal_with_reduce(src.GetGroup(), dst.GetGroup()); +} + +template +bool GraphGroupFuseHelper::ReduceFuseElementwise( + const OpGroupPtr& src, const OpGroupPtr& dst) const { + return reduce_fuse_elementwise(src.GetGroup(), dst.GetGroup()); +} + +template +bool GraphGroupFuseHelper::ReduceFuseBroadcast( + const OpGroupPtr& src, const OpGroupPtr& dst) const { + return reduce_fuse_broadcast(src.GetGroup(), dst.GetGroup()); +} + 
+template +bool GraphGroupFuseHelper::ReduceFuseReduce( + const OpGroupPtr& src, const OpGroupPtr& dst) const { + return reduce_fuse_reduce(src.GetGroup(), dst.GetGroup()); +} + +template +struct HorizontalFuseUtil { + using KindKeyT = std::pair; + + static bool DetectFusabilityByKind(FusePassCtxT* ctx, + const OpGroupPtr& src, + const OpGroupPtr& dst) { + const KindKeyT kind_pair(src.kind(), dst.kind()); + const auto& map = GetConditionMap(); + const auto& iter = map.find(kind_pair); + if (iter == map.end()) { + return false; + } + auto out = iter->second(src, dst); + return out; + } + + typedef bool (*ConditionT)(const OpGroupPtr& src, const OpGroupPtr& dst); + + static const std::map& GetConditionMap() { + thread_local static std::map map(RawConditionMap()); + return map; + } + + static std::map RawConditionMap() { + return std::map{ + {{kElementWise, kElementWise}, &IsSameSize}, + {{kElementWise, kBroadcast}, &IsSameSize}, + {{kElementWise, kInjective}, &IsSameSize}, + {{kElementWise, kReduction}, &HorizontalElementwiseFuseReduce}, + + {{kBroadcast, kElementWise}, &IsSameSize}, + {{kBroadcast, kBroadcast}, &IsSameSize}, + {{kBroadcast, kInjective}, &IsSameSize}, + {{kBroadcast, kReduction}, &IsSameSize}, + + {{kInjective, kElementWise}, &IsSameSize}, + {{kInjective, kBroadcast}, &IsSameSize}, + {{kInjective, kInjective}, &IsSameSize}, + {{kInjective, kReduction}, &IsSameSize}, + + {{kReduction, kElementWise}, &HorizontalElementwiseFuseReduce}, + {{kReduction, kBroadcast}, &IsSameSize}, + {{kReduction, kInjective}, &IsSameSize}, + {{kReduction, kReduction}, &ReduceFuseReduce}, + }; + } + + static bool IsSameSize(const OpGroupPtr& src, const OpGroupPtr& dst) { + return cinn::dialect::ir::IsSameSize(src, dst); + } + + static bool HorizontalElementwiseFuseReduce(const OpGroupPtr& src, + const OpGroupPtr& dst) { + // if same shape with horizontal relation + if (IsSameSize(src, dst)) { + return true; + } + + const OpGroupPtr* ele_group = nullptr; + const OpGroupPtr* reduce_group = nullptr; + + if (src.kind() == kReduction) { + ele_group = &dst; + reduce_group = &src; + } else { + ele_group = &src; + reduce_group = &dst; + } + + size_t size_ele = + phi::product(GetMasterNode(*ele_group).outputs()[0].shape()); + + bool can_fuse = false; + reduce_group->WalkOpNodes([&](const cinn::dialect::ir::OpNode& op) { + if (op.kind() == OpPatternKind::kReduction) { + size_t size_master = phi::product(op.outputs()[0].shape()); + if (size_ele == size_master) { + can_fuse = true; + } + } + }); + + return can_fuse; + } + + static bool ReduceFuseReduce(const OpGroupPtr& src, const OpGroupPtr& dst) { + // return ctx->fuse_helper().ReduceFuseReduce(src, dst); + return reduce_fuse_reduce(src.GetGroup(), dst.GetGroup()); + } +}; + +class FusePass { + public: + virtual ~FusePass() = default; + + virtual const std::string FuseMode() const = 0; + + virtual int Benefit() const = 0; + + protected: + FusePass() = default; +}; + +class InputFusePass : public FusePass { + public: + virtual ~InputFusePass() = default; + + virtual void operator()(InputFusePassCtx* ctx) const = 0; + + const std::string FuseMode() const final { return "InputFuse"; } + + virtual int Benefit() const = 0; + + protected: + InputFusePass() = default; +}; + +class DefaultInputFusePass final : public InputFusePass { + public: + DefaultInputFusePass() : InputFusePass() {} + + int Benefit() const override { return 100; } + + void operator()(InputFusePassCtx* ctx) const override { + const auto& consumer_set = ctx->PickConsumersWithSameInputs(); + + 
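// The code below groups consumers that share the same inputs: it keeps only + // elementwise/broadcast/injective/reduction consumers, skips any candidate + // that is reachable from another candidate (to avoid creating cycles), then + // greedily packs kind-compatible candidates into lists; every list with more + // than one member is marked fusible. + 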
const std::unordered_set<OpGroupPtr> consumer_candidates = + [&]() -> std::unordered_set<OpGroupPtr> { + std::unordered_set<OpGroupPtr> consumers; + for (const auto& consumer : consumer_set) { + if (consumer.kind() == kElementWise || consumer.kind() == kBroadcast || + consumer.kind() == kInjective || consumer.kind() == kReduction) { + consumers.insert(consumer); + } + } + return consumers; + }(); + if (consumer_candidates.size() <= 1) { + return; + } + + std::vector<OpGroupList> fusionable_consumers; + for (auto& candidate : consumer_candidates) { + if (ctx->fuse_helper().IsConsumerSetsReachable(candidate, + consumer_candidates)) { + continue; + } + if (fusionable_consumers.empty()) { + fusionable_consumers.push_back({candidate}); + continue; + } + // check whether the candidate fits an existing fusionable group list + bool fusionable = false; + for (auto& groups : fusionable_consumers) { + auto& last = groups.back(); + if (!HorizontalFuseUtil<InputFusePassCtx>::DetectFusabilityByKind( + ctx, candidate, last)) { + continue; + } + groups.push_back(candidate); + fusionable = true; + break; + } + + // if it can't fuse into any existing group list, start a new one. + if (!fusionable) { + fusionable_consumers.push_back({candidate}); + } + } + + for (const auto& groups : fusionable_consumers) { + if (groups.size() > 1) { + ctx->MarkFusible(groups); + } + } + VLOG(1) << "DefaultInputFusePass Finish"; + } +}; + +class LightwareFusePass : public FusePass { + public: + virtual ~LightwareFusePass() = default; + + virtual void operator()(LightwareFusePassCtx* ctx) const = 0; + + virtual const std::string FuseMode() const = 0; + + virtual int Benefit() const = 0; + + protected: + LightwareFusePass() = default; +}; + +class HorizontalFusePass : public LightwareFusePass { + public: + virtual ~HorizontalFusePass() = default; + + virtual void operator()(LightwareFusePassCtx* ctx) const = 0; + + const std::string FuseMode() const final { return "HorizontalFuse"; } + + virtual int Benefit() const = 0; + + protected: + HorizontalFusePass() = default; +}; + +class DefaultHorizontalFusePass final : public HorizontalFusePass { + public: + DefaultHorizontalFusePass() : HorizontalFusePass() {} + + int Benefit() const override { return 100; } + + void operator()(LightwareFusePassCtx* ctx) const override { + const auto& producer = ctx->PickOpGroup(); + const std::unordered_set<OpGroupPtr> consumer_candidates = + [&]() -> std::unordered_set<OpGroupPtr> { + std::unordered_set<OpGroupPtr> consumers; + for (const auto& consumer : producer.consumers()) { + if (consumer.kind() == kElementWise || consumer.kind() == kBroadcast || + consumer.kind() == kInjective || consumer.kind() == kReduction) { + consumers.insert(consumer); + } + } + return consumers; + }(); + if (consumer_candidates.size() <= 1) { + return; + } + + std::vector<OpGroupList> fusionable_consumers; + for (auto& candidate : consumer_candidates) { + if (ctx->fuse_helper().IsConsumerSetsReachable(candidate, + consumer_candidates)) { + continue; + } + if (fusionable_consumers.empty()) { + fusionable_consumers.push_back({candidate}); + continue; + } + // check whether the candidate fits an existing fusionable group list + bool fusionable = false; + for (auto& groups : fusionable_consumers) { + auto& last = groups.back(); + if (!HorizontalFuseUtil<LightwareFusePassCtx>::DetectFusabilityByKind( + ctx, candidate, last)) { + continue; + } + groups.push_back(candidate); + fusionable = true; + break; + } + + // if it can't fuse into any existing group list, start a new one.
+ if (!fusionable) { + fusionable_consumers.push_back({candidate}); + } + } + + for (const auto& groups : fusionable_consumers) { + if (groups.size() > 1) { + // Trick for BERT, maybe not required, wait for substitution from + // unordered_set to set + if (groups.size() == 2) { + OpGroupList fuse_group; + if (groups[1].group_id().substr(0, 4) == "cast" && + groups[0].group_id() == "reshape_split") { + fuse_group.push_back(groups[1]); + fuse_group.push_back(groups[0]); + ctx->MarkFusible(fuse_group); + continue; + } + } + ctx->MarkFusible(groups); + } + } + } +}; + +class VerticalFusePass : public LightwareFusePass { + public: + virtual ~VerticalFusePass() = default; + + virtual void operator()(LightwareFusePassCtx* ctx) const = 0; + + const std::string FuseMode() const final { return "VerticalFuse"; } + + virtual int Benefit() const = 0; + + protected: + VerticalFusePass() = default; +}; + +class DefaultVerticalFusePass final : public VerticalFusePass { + public: + DefaultVerticalFusePass() : VerticalFusePass() {} + + int Benefit() const override { return 100; } + + void operator()(LightwareFusePassCtx* ctx) const override { + const auto& producer = ctx->PickOpGroup(); + const OpGroupList consumers = [&]() { + OpGroupList consumers; + for (const auto& consumer : producer.consumers()) { + consumers.push_back(consumer); + } + return consumers; + }(); + if (consumers.size() == 0) { + return; + } + + std::vector candidates; + for (size_t i = 0; i < consumers.size(); ++i) { + const auto& consumer = consumers.at(i); + if (!DetectFusabilityByKind(ctx, producer, consumer)) { + break; + } + candidates.push_back(consumer); + } + if (candidates.size() == consumers.size() && + producer.kind() == kElementWise) { + return; + } + + for (size_t i = 0; i < consumers.size(); ++i) { + const auto& consumer = consumers.at(i); + if (!DetectFusabilityByKind(ctx, producer, consumer)) { + continue; + } + if (ctx->fuse_helper().DetectCycleIfFuse(producer, consumer)) { + VLOG(4) << "Can't fuse because detect cycle"; + continue; + } + ctx->MarkFusible(producer, consumer); + } + } + + using KindKeyT = std::pair; + bool DetectFusabilityByKind(LightwareFusePassCtx* ctx, + const OpGroupPtr& src, + const OpGroupPtr& dst) const { + const KindKeyT kind_pair(src.kind(), dst.kind()); + const auto& map = GetConditionMap(); + const auto& iter = map.find(kind_pair); + if (iter == map.end()) { + return false; + } + return iter->second(ctx, src, dst); + } + + typedef bool (*ConditionT)(LightwareFusePassCtx* ctx, + const OpGroupPtr& src, + const OpGroupPtr& dst); + + static const std::map& GetConditionMap() { + thread_local static std::map map(RawConditionMap()); + return map; + } + + static std::map RawConditionMap() { + return std::map{ + {{OpPatternKind::kElementWise, kElementWise}, + &DefaultVerticalFusePass::IsSameSize}, + {{OpPatternKind::kElementWise, kBroadcast}, + &DefaultVerticalFusePass::ElementwiseFuseBroadcast}, + {{OpPatternKind::kElementWise, kInjective}, + &DefaultVerticalFusePass::HorizontalWithInjective}, + {{OpPatternKind::kElementWise, kReduction}, + &DefaultVerticalFusePass::ElementwiseFuseReduce}, + + {{OpPatternKind::kBroadcast, kElementWise}, + &DefaultVerticalFusePass::IsSameSize}, + {{OpPatternKind::kBroadcast, kBroadcast}, + &DefaultVerticalFusePass::IsSameSize}, + {{OpPatternKind::kBroadcast, kInjective}, + &DefaultVerticalFusePass::HorizontalWithInjective}, + {{OpPatternKind::kBroadcast, kReduction}, + &DefaultVerticalFusePass::BroadcastFuseReduce}, + + {{OpPatternKind::kInjective, kElementWise}, + 
&DefaultVerticalFusePass::IsSameSize}, + {{OpPatternKind::kInjective, kBroadcast}, + &DefaultVerticalFusePass::IsSameSize}, + {{OpPatternKind::kInjective, kInjective}, + &DefaultVerticalFusePass::HorizontalWithInjective}, + {{OpPatternKind::kInjective, kReduction}, + &DefaultVerticalFusePass::InjectiveHorizontalWithReduce}, + + {{OpPatternKind::kReduction, kElementWise}, + &DefaultVerticalFusePass::ReduceFuseElementwise}, + {{OpPatternKind::kReduction, kBroadcast}, + &DefaultVerticalFusePass::ReduceFuseBroadcast}, + {{OpPatternKind::kReduction, kInjective}, + &DefaultVerticalFusePass::HorizontalWithInjective}, + {{OpPatternKind::kReduction, kReduction}, + &DefaultVerticalFusePass::ReduceFuseReduce}, + }; + } + + static bool IsSameSize(LightwareFusePassCtx* ctx, + const OpGroupPtr& src, + const OpGroupPtr& dst) { + return cinn::dialect::ir::IsSameSize(src, dst); + } + + static bool ElementwiseFuseBroadcast(LightwareFusePassCtx* ctx, + const OpGroupPtr& src, + const OpGroupPtr& dst) { + return ctx->fuse_helper().ElementwiseFuseBroadcast(src, dst); + } + + static bool HorizontalWithInjective(LightwareFusePassCtx* ctx, + const OpGroupPtr& src, + const OpGroupPtr& dst) { + return ctx->fuse_helper().HorizontalWithInjective(src, dst); + } + + static bool ElementwiseFuseReduce(LightwareFusePassCtx* ctx, + const OpGroupPtr& src, + const OpGroupPtr& dst) { + return ctx->fuse_helper().ElementwiseFuseReduce(src, dst); + } + + static bool BroadcastFuseReduce(LightwareFusePassCtx* ctx, + const OpGroupPtr& src, + const OpGroupPtr& dst) { + return ctx->fuse_helper().BroadcastFuseReduce(src, dst); + } + + static bool InjectiveHorizontalWithReduce(LightwareFusePassCtx* ctx, + const OpGroupPtr& src, + const OpGroupPtr& dst) { + return ctx->fuse_helper().InjectiveHorizontalWithReduce(src, dst); + } + + static bool ReduceFuseElementwise(LightwareFusePassCtx* ctx, + const OpGroupPtr& src, + const OpGroupPtr& dst) { + return ctx->fuse_helper().ReduceFuseElementwise(src, dst); + } + + static bool ReduceFuseBroadcast(LightwareFusePassCtx* ctx, + const OpGroupPtr& src, + const OpGroupPtr& dst) { + return ctx->fuse_helper().ReduceFuseBroadcast(src, dst); + } + + static bool ReduceFuseReduce(LightwareFusePassCtx* ctx, + const OpGroupPtr& src, + const OpGroupPtr& dst) { + return ctx->fuse_helper().ReduceFuseReduce(src, dst); + } +}; + +class RecomputeFusePass : public LightwareFusePass { + public: + virtual ~RecomputeFusePass() = default; + + virtual void operator()(LightwareFusePassCtx* ctx) const = 0; + + const std::string FuseMode() const final { return "RecomputeFuse"; } + + virtual int Benefit() const = 0; + + protected: + RecomputeFusePass() = default; +}; + +class DefaultRecomputeFusePass final : public RecomputeFusePass { + public: + DefaultRecomputeFusePass() : RecomputeFusePass() {} + + int Benefit() const override { return 100; } + + void operator()(LightwareFusePassCtx* ctx) const override { + const auto& producer = ctx->PickOpGroup(); + const OpGroupList consumers = [&]() { + OpGroupList consumers; + for (const auto& consumer : producer.consumers()) { + consumers.push_back(consumer); + } + return consumers; + }(); + // Borrows unsafe_candidates and candidates concept from origin + // fusion_merge_pass + std::vector unsafe_candidates; + std::vector candidates; + for (size_t i = 0; i < consumers.size(); ++i) { + const auto& consumer = consumers.at(i); + if (!DetectFusabilityByKind(ctx, producer, consumer)) { + continue; + } + unsafe_candidates.push_back(consumer); + if 
(ctx->fuse_helper().DetectCycleIfFuse(producer, consumer)) { + continue; + } + candidates.push_back(consumer); + } + + if (!candidates.empty() && unsafe_candidates.size() == consumers.size() && + producer.kind() == kElementWise) { + for (const auto& consumer : consumers) { + ctx->MarkFusible(producer, consumer); + } + } + } + + using KindKeyT = std::pair<OpPatternKind, OpPatternKind>; + bool DetectFusabilityByKind(LightwareFusePassCtx* ctx, + const OpGroupPtr& src, + const OpGroupPtr& dst) const { + const KindKeyT kind_pair(src.kind(), dst.kind()); + const auto& map = DefaultVerticalFusePass::GetConditionMap(); + const auto& iter = map.find(kind_pair); + if (iter == map.end()) { + return false; + } + return iter->second(ctx, src, dst); + } +}; + +struct LightwareFusePassComparator { + bool operator()(const std::shared_ptr<LightwareFusePass>& lhs, + const std::shared_ptr<LightwareFusePass>& rhs) const { + return lhs->Benefit() > rhs->Benefit(); + } +}; + +struct InputFusePassComparator { + bool operator()(const std::shared_ptr<InputFusePass>& lhs, + const std::shared_ptr<InputFusePass>& rhs) const { + return lhs->Benefit() > rhs->Benefit(); + } +}; + +class FusionPassMap { + public: + static FusionPassMap& Instance() { + static FusionPassMap global_fusion_pass_map; + return global_fusion_pass_map; + } + + bool Has(const std::string& pass_name) const { + return map_.find(pass_name) != map_.end(); + } + + void Insert(const std::string& pass_name, + const std::shared_ptr<FusePass>& pass) { + CHECK(!Has(pass_name)) << "FusePass " << pass_name + << " has already been registered."; + map_.insert({pass_name, pass}); + } + + std::shared_ptr<FusePass> Get(const std::string& pass_name) const { + auto it = map_.find(pass_name); + CHECK(it != map_.end()) + << "FusePass " << pass_name << " has not been registered."; + return it->second; + } + + // fuse_mode: HorizontalFuse, VerticalFuse, RecomputeFuse + std::vector<std::shared_ptr<LightwareFusePass>> GetLightwareFusePassesByMode( + const std::string& fuse_mode) const { + CHECK(fuse_mode == "HorizontalFuse" || fuse_mode == "VerticalFuse" || + fuse_mode == "RecomputeFuse") + << "fuse_mode only supports HorizontalFuse, VerticalFuse and " + "RecomputeFuse. Please check your input mode = " + << fuse_mode; + std::set<std::shared_ptr<LightwareFusePass>, LightwareFusePassComparator> + candidate_passes; + for (const auto& iter : map_) { + if (fuse_mode == iter.second->FuseMode()) { + candidate_passes.insert( + std::dynamic_pointer_cast<LightwareFusePass>(iter.second)); + } + } + return std::vector<std::shared_ptr<LightwareFusePass>>( + candidate_passes.begin(), candidate_passes.end()); + } + + std::vector<std::shared_ptr<InputFusePass>> GetInputFusePasses() const { + std::set<std::shared_ptr<InputFusePass>, InputFusePassComparator> + candidate_passes; + for (const auto& iter : map_) { + if (iter.second->FuseMode() == "InputFuse") { + candidate_passes.insert( + std::dynamic_pointer_cast<InputFusePass>(iter.second)); + } + } + return std::vector<std::shared_ptr<InputFusePass>>(candidate_passes.begin(), + candidate_passes.end()); + } + + private: + FusionPassMap() = default; + std::unordered_map<std::string, std::shared_ptr<FusePass>> map_; + + DISABLE_COPY_AND_ASSIGN(FusionPassMap); +}; + +class Registrar { + public: + // In our design, various kinds of classes, e.g., operators and kernels, + // have their corresponding registry and registrar. The action of + // registration happens in the constructor of a global registrar variable, + // which is not referenced by the code that uses the framework, and so would + // be removed from the generated binary file by the linker. To avoid such + // removal, we add Touch to all registrar classes and make USE_OP macros + // call this method. So, as long as the caller code calls USE_OP, the global + // registrar variable won't be removed by the linker.
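+ // + // A hypothetical usage sketch (not part of this patch): a pass source file + // could pin its registration with a global variable such as + // static FusionPassRegistrar<DefaultVerticalFusePass> vertical_registrar( + // "DefaultVerticalFusePass"); + // and a USE_*-style macro that calls vertical_registrar.Touch() keeps the + // symbol alive through linking.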
+ void Touch() {} +}; + +template <typename PassClassT> +class FusionPassRegistrar final : public Registrar { + public: + explicit FusionPassRegistrar(const std::string& pass_name) { + FusionPassMap::Instance().Insert( + pass_name, std::shared_ptr<PassClassT>(new PassClassT())); + } +}; + +// Op Fusion Pass which performs op fusion. Ops are fused +// "vertically", meaning producer ops are fused into their consumers +// with the intent that the loops which compute their values will be fused in +// code generation. +class GeneralFusionMergePassHelper { + public: + explicit GeneralFusionMergePassHelper(const ::pir::Program* graph, + const GroupList& group_list) + : graph_(graph) { + fusion_groups_ = group_list; + // init input to consumers. + InitInputToConsumers(); + // init fusion group index. + InitFusionGroupsAndIndex(); + + if (!FusionPassMap::Instance().Has("DefaultHorizontalFusePass")) { + FusionPassMap::Instance().Insert( + "DefaultHorizontalFusePass", + std::make_shared<DefaultHorizontalFusePass>()); + } + if (!FusionPassMap::Instance().Has("DefaultVerticalFusePass")) { + FusionPassMap::Instance().Insert( + "DefaultVerticalFusePass", + std::make_shared<DefaultVerticalFusePass>()); + } + + if (!FusionPassMap::Instance().Has("DefaultRecomputeFusePass")) { + FusionPassMap::Instance().Insert( + "DefaultRecomputeFusePass", + std::make_shared<DefaultRecomputeFusePass>()); + } + + if (!FusionPassMap::Instance().Has("DefaultInputFusePass")) { + FusionPassMap::Instance().Insert( + "DefaultInputFusePass", std::make_shared<DefaultInputFusePass>()); + } + } + + GroupList operator()() { + // run fusion merge until no update occurs. + DoFusionMerge(); + for (auto& group : fusion_groups_) { + VLOG(3) << "Fusion Group -> " << group->group_id; + for (auto& sub_group : group->fused_sub_groups) { + VLOG(3) << " Fused Sub-Group -> " << sub_group->group_id; + } + for (const auto& producer : group->producer_groups()) { + VLOG(3) << " Producer -> " << producer->group_id; + } + for (const auto& consumer : group->consumer_groups()) { + VLOG(3) << " Consumer -> " << consumer->group_id; + } + } + return fusion_groups_; + } + + private: + void DoFusionMerge() { + VLOG(3) << "DoFusionMerge...!"; + while (DoGeneralHorizontalFusion()) { + } + while (DoGeneralVerticalFusion()) { + } + while (DoGeneralRecomputeAndVerticalFusion()) { + } + } + + bool DoGeneralHorizontalFusion() { + VLOG(3) << "DoGeneralHorizontalFusion...!"; + bool updated = false; + for (size_t idx = 0; idx < fusion_groups_.size(); ++idx) { + auto producer = fusion_groups_[idx]; + VLOG(3) << "Fusion Producer idx " << idx << " Group -> " + << producer->group_id; + // if producer is a sub group, skip it. + if (producer->belong_groups.size()) { + continue; + } + // do horizontal fusion. + updated |= GeneralHorizontalFuse(producer); + } + + if (updated) { + UpdateFusionGroup(); + } + return updated; + } + + bool DoGeneralVerticalFusion() { + VLOG(3) << "DoGeneralVerticalFusion...!"; + bool updated = false; + for (size_t idx = 0; idx < fusion_groups_.size(); ++idx) { + auto producer = fusion_groups_[idx]; + VLOG(3) << "Fusion Producer idx " << idx << " Group -> " + << producer->group_id; + // if producer is a sub group, skip it. + if (producer->belong_groups.size()) { + continue; + } + // do horizontal fusion first, then vertical fusion.
+ updated |= GeneralHorizontalFuse(producer); + updated |= GeneralVerticalFuse(producer); + } + + // fuse input consumers + updated |= GeneralInputFuse(); + + if (updated) { + UpdateFusionGroup(); + } + return updated; + } + + bool DoGeneralRecomputeAndVerticalFusion() { + VLOG(3) << "DoGeneralRecomputeAndVerticalFusion...!"; + bool updated = false; + for (size_t idx = 0; idx < fusion_groups_.size(); ++idx) { + auto producer = fusion_groups_[idx]; + VLOG(3) << "Fusion Producer idx " << idx << " Group -> " + << producer->group_id; + // if producer is a sub group, skip it. + if (producer->belong_groups.size()) { + continue; + } + // try recompute fusion first; fall back to vertical fusion. + bool recompute_success = GeneralRecomputeFuse(producer); + updated |= recompute_success; + if (!recompute_success) { + updated |= GeneralVerticalFuse(producer); + } + } + + // fuse input consumers + updated |= GeneralInputFuse(); + + if (updated) { + UpdateFusionGroup(); + } + return updated; + } + + void UpdateFusionGroup() { + VLOG(3) << "UpdateFusionGroup..."; + GroupList fusion_groups; + std::unordered_set<GroupPtr> fusion_groups_set; + // update fusion_groups_ + for (auto& group : fusion_groups_) { + if (!group->belong_groups.size()) { + fusion_groups.push_back(group); + fusion_groups_set.insert(group); + } + } + // keep groups in order + fusion_groups_.clear(); + fusion_groups_index_.clear(); + while (!fusion_groups_set.empty()) { + bool is_ring = true; + for (size_t idx = 0; idx < fusion_groups.size(); ++idx) { + auto& group = fusion_groups[idx]; + if (!group.get()) { + continue; + } + + bool exist = false; + for (const auto& producer : group->producer_groups()) { + if (fusion_groups_set.count(producer)) { + VLOG(4) << group->group_id << " " << producer->group_id; + exist = true; + break; + } + } + + if (!exist) { + fusion_groups_index_[group] = fusion_groups_.size(); + fusion_groups_.push_back(group); + fusion_groups_set.erase(group); + group.reset(); + is_ring = false; + continue; + } + } + if (is_ring) { + LOG(FATAL) << "A ring exists in the fusion groups, please check!"; + } + } + } + + std::vector<std::shared_ptr<LightwareFusePass>> RawHorizontalFusePasses() + const { + return FusionPassMap::Instance().GetLightwareFusePassesByMode( + "HorizontalFuse"); + } + + const std::vector<std::shared_ptr<LightwareFusePass>>& + GetHorizontalFusePasses() const { + thread_local static std::vector<std::shared_ptr<LightwareFusePass>> + fuse_passes = RawHorizontalFusePasses(); + return fuse_passes; + } + + void EnableFusedHorizontalGroups(LightwareFusePassCtx* ctx) const { + const auto& producer = ctx->PickOpGroup(); + if (producer.consumers().size() <= 1) { + return; + } + const auto& fuse_passes = GetHorizontalFusePasses(); + for (const auto& fuse_pass : fuse_passes) { + (*fuse_pass)(ctx); + } + } + + bool GeneralHorizontalFuse(const GroupPtr& producer) { + VLOG(3) << "GeneralHorizontalFuse handling producer : " + << producer->group_id; + const auto& GetFusableConsumerGroupLists = + [&]() -> std::vector<OpGroupList> { + std::vector<OpGroupList> tagged_lists; + const auto& MarkFusible = [&](const OpGroupList& candidates) { + tagged_lists.push_back(candidates); + }; + GraphGroupLightwareFusePassCtx fuse_ctx(ir::OpGroup(producer), + MarkFusible); + EnableFusedHorizontalGroups(&fuse_ctx); + return tagged_lists; + }; + const auto& GetFusableConsumerGroupList = [&]() -> std::vector<GroupList> { + const auto& group_lists = GetFusableConsumerGroupLists(); + if (group_lists.empty()) { + return std::vector<GroupList>{}; + } + std::vector<GroupList> ret; + for (const auto& group_list : group_lists) { + GroupList tmp; + for (const auto& group : group_list) { + tmp.push_back(group.GetGroup()); + } + ret.push_back(tmp); + } + return ret; + }; + + const auto&
+    const auto& group_lists = GetFusableConsumerGroupList();
+    if (group_lists.empty()) {
+      return false;
+    }
+    for (const auto& group_list : group_lists) {
+      HorizontalFuse(group_list);
+    }
+
+    return true;
+  }
+
+  std::vector<std::shared_ptr<InputFusePass>> RawInputFusePasses() const {
+    return FusionPassMap::Instance().GetInputFusePasses();
+  }
+
+  const std::vector<std::shared_ptr<InputFusePass>>& GetInputFusePasses()
+      const {
+    thread_local static std::vector<std::shared_ptr<InputFusePass>>
+        fuse_passes = RawInputFusePasses();
+    return fuse_passes;
+  }
+
+  void EnableFusedInputGroups(InputFusePassCtx* ctx) const {
+    const auto& fuse_passes = GetInputFusePasses();
+    for (const auto& fuse_pass : fuse_passes) {
+      (*fuse_pass)(ctx);
+    }
+  }
+
+  bool CallGeneralInputFusePass(
+      const std::unordered_set<GroupPtr>& consumers) {
+    VLOG(3) << "CallGeneralInputFusePass...!";
+    const auto& GetFusableConsumerGroupLists =
+        [&]() -> std::vector<OpGroupList> {
+      std::vector<OpGroupList> tagged_lists;
+      const auto& MarkFusible = [&](const OpGroupList& candidates) {
+        tagged_lists.push_back(candidates);
+      };
+      OpGroupList consumer_groups;
+      consumer_groups.reserve(consumers.size());
+      for (auto& consumer : consumers) {
+        consumer_groups.push_back(ir::OpGroup(consumer));
+      }
+      GraphGroupInputFusePassCtx fuse_ctx(consumer_groups, MarkFusible);
+      EnableFusedInputGroups(&fuse_ctx);
+      return tagged_lists;
+    };
+    const auto& GetFusableConsumerGroupList = [&]() -> std::vector<GroupList> {
+      const auto& group_lists = GetFusableConsumerGroupLists();
+      if (group_lists.empty()) {
+        return std::vector<GroupList>{};
+      }
+      std::vector<GroupList> ret;
+      for (const auto& group_list : group_lists) {
+        GroupList tmp;
+        for (const auto& group : group_list) {
+          tmp.push_back(group.GetGroup());
+        }
+        ret.push_back(tmp);
+      }
+      return ret;
+    };
+
+    const auto& group_lists = GetFusableConsumerGroupList();
+    if (group_lists.empty()) {
+      return false;
+    }
+    for (const auto& group_list : group_lists) {
+      HorizontalFuse(group_list);
+    }
+
+    return true;
+  }
+
+  void HorizontalFuse(const GroupList& consumers) {
+    VLOG(3) << "HorizontalFuse Groups...";
+    // create the fusion group
+    auto fused_group = std::make_shared<ir::Group>();
+    // As recompute exists, a sub-group may be used more than once.
+    std::vector<GroupPtr> repeat_sub_groups;
+    std::unordered_set<GroupPtr> sub_group_set;
+    // find the first consumer.
+    GroupPtr first_consumer(nullptr);
+    // fuse all groups into the fusion group.
+    for (const auto& consumer : consumers) {
+      VLOG(3) << "fuse consumer " << consumer->group_id
+              << " into fused_group!";
+      // update depth
+      fused_group->max_depth =
+          std::max(fused_group->max_depth, consumer->max_depth);
+      fused_group->min_depth =
+          std::min(fused_group->min_depth, consumer->min_depth);
+      // update group id
+      if (fused_group->group_id.size()) {
+        fused_group->group_id += "_" + consumer->group_id;
+      } else {
+        fused_group->group_id = consumer->group_id;
+      }
+      // set op pattern kind
+      fused_group->op_pattern_kind =
+          static_cast<int>(fused_group->op_pattern_kind) >=
+                  static_cast<int>(consumer->op_pattern_kind)
+              ?
+              fused_group->op_pattern_kind
+              : consumer->op_pattern_kind;
+      // input nodes
+      for (auto& node : consumer->input_nodes) {
+        if (fused_group->input_nodes.count(node.first)) {
+          fused_group->input_nodes[node.first] += node.second;
+        } else {
+          fused_group->input_nodes.insert(node);
+        }
+      }
+      // output nodes
+      for (auto& node : consumer->output_nodes) {
+        fused_group->output_nodes.insert(node);
+      }
+      // internal nodes
+      if (consumer->fused_sub_groups.size()) {
+        for (auto& node : consumer->internal_nodes) {
+          fused_group->internal_nodes.insert(node);
+        }
+      }
+      // master nodes
+      for (auto& node : consumer->master_nodes) {
+        if (GetOpKind(node->name()) == kReduction) {
+          fused_group->master_nodes.insert(node);
+        }
+      }
+      // insert sub group
+      if (consumer->fused_sub_groups.size()) {
+        for (auto& sub_group : consumer->fused_sub_groups) {
+          // check whether the sub group is repeated.
+          if (sub_group_set.count(sub_group)) {
+            VLOG(3) << sub_group->group_id << " is repeated!";
+            repeat_sub_groups.push_back(sub_group);
+            continue;
+          }
+          // record the sub group
+          sub_group_set.insert(sub_group);
+
+          // insert into the fused sub groups.
+          fused_group->fused_sub_groups.push_back(sub_group);
+          // update the group it belongs to
+          sub_group->belong_groups.erase(consumer);
+          sub_group->belong_groups.insert(fused_group);
+        }
+      } else {
+        fused_group->fused_sub_groups.push_back(consumer);
+      }
+      // producer groups
+      for (auto& producer : *consumer->mut_producer_groups()) {
+        fused_group->mut_producer_groups()->insert(producer);
+        // update the producer's consumers
+        producer->mut_consumer_groups()->erase(consumer);
+        producer->mut_consumer_groups()->insert(fused_group);
+      }
+      // consumer groups
+      for (auto& gconsumer : *consumer->mut_consumer_groups()) {
+        fused_group->mut_consumer_groups()->insert(gconsumer);
+        // update the consumer's producers
+        gconsumer->mut_producer_groups()->erase(consumer);
+        gconsumer->mut_producer_groups()->insert(fused_group);
+      }
+      // the group this consumer belongs to
+      consumer->belong_groups.insert(fused_group);
+
+      // find the first consumer.
+      CHECK(fusion_groups_index_.count(consumer))
+          << "Can't find consumer " << consumer->group_id
+          << " index in fusion_groups_index_!";
+      if (first_consumer.get()) {
+        if (fusion_groups_index_[consumer] <
+            fusion_groups_index_[first_consumer]) {
+          first_consumer = consumer;
+        }
+      } else {
+        first_consumer = consumer;
+      }
+    }
+
+    // if a node is an output node of a repeated sub_group, check whether it
+    // must become an internal node of the fused group instead.
+    for (auto& sub_group : repeat_sub_groups) {
+      // check each output node in sub_group.
+      for (auto& node : sub_group->output_nodes) {
+        // if the node is not an output node of fused_group,
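+        // it is only consumed inside the fused group, so it can be demoted
+        // to an internal node; repeated sub-groups would otherwise leak
+        // recomputed values as spurious group outputs.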
+        if (!fused_group->output_nodes.count(node)) {
+          fused_group->internal_nodes.insert(node);
+        }
+      }
+    }
+
+    if (static_cast<int>(kReduction) >
+        static_cast<int>((consumers.back())->op_pattern_kind)) {
+      auto consumer = consumers.back();
+
+      for (auto& node : consumer->master_nodes) {
+        fused_group->master_nodes.insert(node);
+      }
+    } else {
+      for (auto consumer = consumers.rbegin(); consumer != consumers.rend();
+           ++consumer) {
+        ::pir::Operation* master_node = nullptr;
+        for (auto& node : (*consumer)->master_nodes) {
+          if (GetOpKind(node->name()) != kReduction) {
+            master_node = node;
+            break;
+          }
+        }
+        if (master_node) {
+          // VLOG(3) << "Insert Master node : " << master_node->id()
+          //         << " into group : " << fused_group->group_id;
+          fused_group->master_nodes.insert(master_node);
+          break;
+        }
+      }
+    }
+
+    auto position = fusion_groups_index_[first_consumer];
+    fusion_groups_[position] = fused_group;
+    fusion_groups_index_[fused_group] = position;
+
+    CHECK(fused_group->output_nodes.size())
+        << "No output node is found, " << fused_group->group_id;
+  }
+
+  std::vector<std::shared_ptr<LightwareFusePass>> RawVerticalFusePasses()
+      const {
+    return FusionPassMap::Instance().GetLightwareFusePassesByMode(
+        "VerticalFuse");
+  }
+
+  const std::vector<std::shared_ptr<LightwareFusePass>>&
+  GetVerticalFusePasses() const {
+    thread_local static std::vector<std::shared_ptr<LightwareFusePass>>
+        fuse_passes = RawVerticalFusePasses();
+    return fuse_passes;
+  }
+
+  void TagVerticalGroups(LightwareFusePassCtx* ctx) const {
+    const auto& producer = ctx->PickOpGroup();
+    if (producer.consumers().size() == 0) {
+      return;
+    }
+    const auto& fuse_passes = GetVerticalFusePasses();
+    for (const auto& fuse_pass : fuse_passes) {
+      (*fuse_pass)(ctx);
+    }
+  }
+
+  bool GeneralVerticalFuse(const GroupPtr& producer) {
+    VLOG(3) << "GeneralVerticalFuse...!";
+    using GroupSets = std::vector<std::pair<OpGroupPtr, OpGroupPtr>>;
+    const auto& GetFusableConsumerOpGroupSets = [&]() -> GroupSets {
+      GroupSets tagged_sets;
+      const auto& MarkFusible = [&](const OpGroupPtr& first,
+                                    const OpGroupPtr& second) {
+        tagged_sets.push_back(std::make_pair(first, second));
+      };
+      GraphGroupLightwareFusePassCtx fuse_ctx(ir::OpGroup(producer),
+                                              MarkFusible);
+      TagVerticalGroups(&fuse_ctx);
+      return tagged_sets;
+    };
+
+    auto GetFusableConsumerGroupSet =
+        [&]() -> std::unordered_set<GroupPtr> {
+      const auto& group_sets = GetFusableConsumerOpGroupSets();
+      if (group_sets.empty()) {
+        return {};
+      }
+      std::unordered_set<GroupPtr> ret;
+      for (const auto& group_pair : group_sets) {
+        ret.insert(group_pair.second.GetGroup());
+      }
+      return ret;
+    };
+
+    bool update = false;
+    auto consumer_groups = GetFusableConsumerGroupSet();
+    if (consumer_groups.size()) {
+      SelectConsumerToFuse(producer, &consumer_groups);
+    }
+    if (consumer_groups.size() > 0) {
+      VerticalFuse(producer, consumer_groups);
+      update = true;
+    }
+    return update;
+  }
+
+  void VerticalFuse(const GroupPtr& producer,
+                    const std::unordered_set<GroupPtr>&
+                        fusionable_consumers) {
+    VLOG(3) << "VerticalFuse...!";
+    GroupList fused_groups;
+    GroupPtr master_fused_group(nullptr);
+    for (auto& consumer : fusionable_consumers) {
+      auto fused_group = std::make_shared<ir::Group>();
+      // update depth using the consumer depth.
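+      // (and the producer depth: the fused group spans both, so its depth
+      // range becomes [min(producer, consumer), max(producer, consumer)])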
+      fused_group->max_depth =
+          std::max(producer->max_depth, consumer->max_depth);
+      fused_group->min_depth =
+          std::min(producer->min_depth, consumer->min_depth);
+      // update group id
+      fused_group->group_id = producer->group_id + "_" + consumer->group_id;
+      VLOG(3) << "fuse producer " << producer->group_id << " into consumer "
+              << consumer->group_id;
+      // fuse producer into the fusion group
+      fused_group->op_pattern_kind =
+          static_cast<int>(producer->op_pattern_kind) >=
+                  static_cast<int>(consumer->op_pattern_kind)
+              ? producer->op_pattern_kind
+              : consumer->op_pattern_kind;
+      // input nodes
+      fused_group->input_nodes = producer->input_nodes;
+
+      // internal nodes
+      if (producer->fused_sub_groups.size()) {
+        for (auto& node : producer->internal_nodes) {
+          fused_group->internal_nodes.insert(node);
+        }
+      }
+      // convert the producer's output nodes to internal nodes.
+      for (auto node : producer->output_nodes) {
+        // if the node is used more than once.
+        if (consumer->input_nodes.count(node)) {
+          if (consumer->input_nodes[node] > 1 && node->num_operands() > 0) {
+            fused_group->internal_nodes.insert(node);
+          }
+        }
+      }
+      // master nodes
+      for (auto& node : producer->master_nodes) {
+        if (GetOpKind(node->name()) == kReduction) {
+          fused_group->master_nodes.insert(node);
+        }
+      }
+
+      // producer groups
+      for (auto& group : *producer->mut_producer_groups()) {
+        fused_group->mut_producer_groups()->insert(group);
+        // update the producer's producer's consumers
+        group->mut_consumer_groups()->erase(producer);
+        group->mut_consumer_groups()->insert(fused_group);
+      }
+
+      // sub groups
+      if (producer->fused_sub_groups.size()) {
+        for (auto& group : producer->fused_sub_groups) {
+          fused_group->fused_sub_groups.push_back(group);
+          // update the group it belongs to
+          group->belong_groups.erase(producer);
+          group->belong_groups.insert(fused_group);
+        }
+      } else {
+        fused_group->fused_sub_groups.push_back(producer);
+      }
+      producer->belong_groups.insert(fused_group);
+
+      // input nodes
+      for (auto& input_node : consumer->input_nodes) {
+        // if the input node is not among the producer's outputs,
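+        // it stays an external input of the fused group and keeps its use
+        // count; inputs fed by the producer become internal after fusion and
+        // are therefore skipped here.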
+        if (!producer->output_nodes.count(input_node.first)) {
+          if (fused_group->input_nodes.count(input_node.first)) {
+            fused_group->input_nodes[input_node.first] += input_node.second;
+          } else {
+            fused_group->input_nodes.insert(input_node);
+          }
+        }
+      }
+
+      // output nodes
+      for (auto& node : consumer->output_nodes) {
+        fused_group->output_nodes.insert(node);
+      }
+
+      // internal nodes
+      if (consumer->fused_sub_groups.size()) {
+        for (auto& node : consumer->internal_nodes) {
+          fused_group->internal_nodes.insert(node);
+        }
+      }
+
+      // master nodes
+      for (auto& node : consumer->master_nodes) {
+        fused_group->master_nodes.insert(node);
+      }
+
+      // producer groups
+      for (auto& group : *consumer->mut_producer_groups()) {
+        if (group.get() != producer.get()) {
+          fused_group->mut_producer_groups()->insert(group);
+          // update the consumer's producer's consumers
+          group->mut_consumer_groups()->erase(consumer);
+          group->mut_consumer_groups()->insert(fused_group);
+        }
+      }
+
+      // consumer groups
+      for (auto& group : *consumer->mut_consumer_groups()) {
+        fused_group->mut_consumer_groups()->insert(group);
+        // update the consumer's consumer's producers
+        group->mut_producer_groups()->erase(consumer);
+        group->mut_producer_groups()->insert(fused_group);
+      }
+
+      // sub groups
+      if (consumer->fused_sub_groups.size()) {
+        for (auto& sub_group : consumer->fused_sub_groups) {
+          if (std::find(fused_group->fused_sub_groups.begin(),
+                        fused_group->fused_sub_groups.end(),
+                        sub_group) == fused_group->fused_sub_groups.end()) {
+            fused_group->fused_sub_groups.push_back(sub_group);
+          }
+          // update the group it belongs to
+          sub_group->belong_groups.erase(consumer);
+          sub_group->belong_groups.insert(fused_group);
+        }
+      } else {
+        fused_group->fused_sub_groups.push_back(consumer);
+      }
+      consumer->belong_groups.insert(fused_group);
+
+      fused_groups.push_back(fused_group);
+      CHECK(fusion_groups_index_.count(consumer))
+          << "Can't find consumer " << consumer->group_id
+          << " index in fusion_groups_index_!";
+      auto position = fusion_groups_index_[consumer];
+      fusion_groups_[position] = fused_group;
+      fusion_groups_index_[fused_group] = position;
+
+      if (!master_fused_group.get()) {
+        master_fused_group = fused_group;
+      }
+      CHECK(fused_group->output_nodes.size())
+          << "No output node is found, " << fused_group->group_id;
+    }
+
+    for (auto& node : producer->output_nodes) {
+      bool be_output = true;
+      for (const auto& consumer : producer->consumer_groups()) {
+        // if the consumer is fusionable.
+        if (fusionable_consumers.count(consumer)) {
+          if (consumer->input_nodes.count(node)) {
+            be_output = false;
+          }
+          continue;
+        }
+        // if the consumer is not fusionable.
+        if (consumer->input_nodes.count(node)) {
+          be_output = true;
+          break;
+        }
+        // other nodes are treated as graph outputs.
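+        // i.e. a node read by an unfused consumer stays an output, a node
+        // read only by fused consumers is dropped, and graph outputs are
+        // restored via output_nodes_set_ below.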
+      }
+
+      if (output_nodes_set_.count(node)) {
+        be_output = true;
+      }
+
+      if (be_output) {
+        // VLOG(4) << "Insert Id " << node->id() << " Into Group "
+        //         << master_fused_group->group_id;
+        master_fused_group->output_nodes.insert(node);
+      }
+    }
+    // insert the unfusionable consumer groups
+    for (auto& consumer : *producer->mut_consumer_groups()) {
+      if (fusionable_consumers.count(consumer)) {
+        continue;
+      }
+      master_fused_group->mut_consumer_groups()->insert(consumer);
+      // update the consumer's producers
+      consumer->mut_producer_groups()->erase(producer);
+      consumer->mut_producer_groups()->insert(master_fused_group);
+    }
+  }
+
+  std::vector<std::shared_ptr<LightwareFusePass>> RawRecomputeFusePasses()
+      const {
+    return FusionPassMap::Instance().GetLightwareFusePassesByMode(
+        "RecomputeFuse");
+  }
+
+  const std::vector<std::shared_ptr<LightwareFusePass>>&
+  GetRecomputeFusePasses() const {
+    thread_local static std::vector<std::shared_ptr<LightwareFusePass>>
+        fuse_passes = RawRecomputeFusePasses();
+    return fuse_passes;
+  }
+
+  void TagRecomputeGroups(LightwareFusePassCtx* ctx) const {
+    const auto& fuse_passes = GetRecomputeFusePasses();
+    for (const auto& fuse_pass : fuse_passes) {
+      (*fuse_pass)(ctx);
+    }
+  }
+
+  bool GeneralRecomputeFuse(const GroupPtr& producer) {
+    VLOG(3) << "GeneralRecomputeFuse handling producer : "
+            << producer->group_id;
+    using GroupSets = std::set<std::pair<OpGroupPtr, OpGroupPtr>>;
+    const auto& GetFusableConsumerOpGroupSets = [&]() -> GroupSets {
+      GroupSets tagged_sets;
+      const auto& MarkFusible = [&](const OpGroupPtr& first,
+                                    const OpGroupPtr& second) {
+        tagged_sets.insert(std::make_pair(first, second));
+      };
+      GraphGroupLightwareFusePassCtx fuse_ctx(ir::OpGroup(producer),
+                                              MarkFusible);
+      TagRecomputeGroups(&fuse_ctx);
+      return tagged_sets;
+    };
+
+    auto GetFusableConsumerGroupSet =
+        [&]() -> std::unordered_set<GroupPtr> {
+      const auto& group_sets = GetFusableConsumerOpGroupSets();
+      if (group_sets.empty()) {
+        return {};
+      }
+      std::unordered_set<GroupPtr> ret;
+      for (const auto& group_pair : group_sets) {
+        ret.insert(group_pair.second.GetGroup());
+      }
+      return ret;
+    };
+
+    bool update = false;
+    auto consumer_groups = GetFusableConsumerGroupSet();
+    if (consumer_groups.size() > 0) {
+      CHECK(consumer_groups.size() == producer->mut_consumer_groups()->size())
+          << "Recompute requires fusing all consumers!";
+      RecomputeFuse(producer, consumer_groups);
+      update = true;
+    }
+    return update;
+  }
+
+  void RecomputeFuse(const GroupPtr& producer,
+                     const std::unordered_set<GroupPtr>&
+                         fusionable_consumers) {
+    VerticalFuse(producer, fusionable_consumers);
+  }
+
+  void SelectConsumerToFuse(
+      const GroupPtr& producer,
+      std::unordered_set<GroupPtr>* fusionable_consumers) {
+    // if it is a const op
+
+    // TODO(phlrain) : support constant
+    // if (is_const_group(this, producer)) {
+    if (false) {
+      std::unordered_set<GroupPtr> candidates;
+      for (auto& consumer : *fusionable_consumers) {
+        // if it can be an output node.
+        if (is_same_shape(producer, consumer)) {
+          candidates.insert(consumer);
+        } else {
+          VLOG(4) << "Fuse Producer : " << producer->group_id
+                  << " into Consumer : " << consumer->group_id;
+          consumer->group_id = producer->group_id + "_" + consumer->group_id;
+          // just merge the node into the group.
+          auto& sub_group = consumer->fused_sub_groups.front();
+          sub_group->group_id = producer->group_id + "_" + sub_group->group_id;
+          sub_group->nodes.insert(sub_group->nodes.begin(),
+                                  producer->CollectNodes()[0]);
+          sub_group->nodes_set.insert(producer->CollectNodes()[0]);
+          // remove the dependency.
+          consumer->input_nodes.erase(producer->CollectNodes()[0]);
+          consumer->mut_producer_groups()->erase(producer);
+          producer->mut_consumer_groups()->erase(consumer);
+        }
+      }
+
+      CHECK_GE(producer->consumer_groups().size(), candidates.size());
+      if (producer->consumer_groups().size() == 0 && candidates.size() == 0 &&
+          output_nodes_set_.count(producer->CollectNodes()[0]) == 0) {
+        producer->belong_groups.insert(*fusionable_consumers->begin());
+      }
+
+      *fusionable_consumers = candidates;
+      return;
+    }
+    // 1-to-1 fusion.
+    if (producer->consumer_groups().size() == 1) {
+      return;
+    }
+
+    // TODO(phlrain): support flags
+    // if (FLAGS_enhance_vertical_fusion_with_recompute) {
+    if (false) {
+      std::vector<GroupPtr> candidates;
+      for (auto& consumer : *fusionable_consumers) {
+        if (consumer->op_pattern_kind == kElementWise) {
+          candidates.push_back(consumer);
+          continue;
+        }
+
+        auto producer_output_shape = phi::vectorize(
+            GetValueShape((*producer->output_nodes.begin())->result(0)));
+
+        auto consumer_output_shape = phi::vectorize(
+            GetValueShape((*consumer->output_nodes.begin())->result(0)));
+
+        auto consumer_master_input_shape = phi::vectorize(GetValueShape(
+            (*(consumer->master_nodes.begin()))->operand_source(0)));
+
+        int producer_output_numel =
+            std::accumulate(producer_output_shape.begin(),
+                            producer_output_shape.end(),
+                            1,
+                            std::multiplies<int>());
+        int consumer_output_numel =
+            std::accumulate(consumer_output_shape.begin(),
+                            consumer_output_shape.end(),
+                            1,
+                            std::multiplies<int>());
+        int consumer_master_input_numel =
+            std::accumulate(consumer_master_input_shape.begin(),
+                            consumer_master_input_shape.end(),
+                            1,
+                            std::multiplies<int>());
+        if (producer_output_numel == consumer_output_numel) {
+          candidates.push_back(consumer);
+          continue;
+        }
+
+        if (producer->op_pattern_kind != kInjective &&
+            consumer->op_pattern_kind == kReduction &&
+            producer_output_numel == consumer_master_input_numel) {
+          candidates.push_back(consumer);
+        }
+      }
+      std::sort(candidates.begin(),
+                candidates.end(),
+                [](const auto& lhs, const auto& rhs) {
+                  return lhs->op_pattern_kind < rhs->op_pattern_kind;
+                });
+
+      fusionable_consumers->clear();
+      if (candidates.size()) {
+        fusionable_consumers->insert(*candidates.begin());
+      }
+    } else {
+      std::vector<GroupPtr> candidates;
+      for (auto& consumer : *fusionable_consumers) {
+        if (consumer->op_pattern_kind == kElementWise) {
+          candidates.push_back(consumer);
+          continue;
+        }
+
+        auto shape0 = phi::vectorize(
+            GetValueShape((*producer->output_nodes.begin())->result(0)));
+        auto shape1 = phi::vectorize(
+            GetValueShape((*consumer->output_nodes.begin())->result(0)));
+
+        if (std::accumulate(
+                shape0.begin(), shape0.end(), 1, std::multiplies<int>()) ==
+            std::accumulate(
+                shape1.begin(), shape1.end(), 1, std::multiplies<int>())) {
+          candidates.push_back(consumer);
+        }
+      }
+
+      fusionable_consumers->clear();
+      if (candidates.size()) {
+        fusionable_consumers->insert(candidates.front());
+      }
+    }
+  }
+
+  bool IsDependency(
+      const GroupPtr& producer_g,
+      const GroupPtr& consumer,
+      const std::unordered_set<GroupPtr>& consumers) {
+    std::queue<GroupPtr> candidates;
+    candidates.push(consumer);
+
+    std::unordered_set<GroupPtr> visited_set;
+    while (!candidates.empty()) {
+      // take a copy: the reference returned by front() is invalidated by
+      // pop().
+      auto candidate = candidates.front();
+      candidates.pop();
+      for (const auto& producer_and_list : candidate->producer_groups()) {
+        if (producer_and_list.get() == producer_g.get()) {
+          continue;
+        }
+        const auto& producer =
+            std::dynamic_pointer_cast<ir::Group>(producer_and_list);
+        if (consumers.count(producer)) {
+          return true;
+        }
+        if (!visited_set.count(producer)) {
+          visited_set.insert(producer);
+          candidates.push(producer);
+        }
+      }
+    }
+    return false;
+  }
+
+  bool IsDependencySimplify(
+      const GroupPtr& producer_g,
+      const GroupPtr& consumer,
+      const std::unordered_set<GroupPtr>& consumers) {
+    std::queue<GroupPtr> candidates;
+    candidates.push(consumer);
+    // check the upper depth bound.
+    int check_upper_depth = producer_g.get() ? producer_g->max_depth : INT_MAX;
+    std::unordered_set<GroupPtr> visited_set;
+    while (!candidates.empty()) {
+      auto candidate = candidates.front();
+      candidates.pop();
+      for (auto& producer_and_list : candidate->producer_groups()) {
+        if (producer_and_list.get() == producer_g.get()) {
+          continue;
+        }
+        const auto& producer =
+            std::dynamic_pointer_cast<ir::Group>(producer_and_list);
+        if (producer->min_depth > check_upper_depth) {
+          continue;
+        }
+        if (consumers.count(producer)) {
+          return true;
+        }
+        if (!visited_set.count(producer)) {
+          visited_set.insert(producer);
+          candidates.push(producer);
+        }
+      }
+    }
+    return false;
+  }
+
+  bool GeneralInputFuse() {
+    VLOG(3) << "GeneralInputFuse...!";
+    auto updated = false;
+    UpdateInputToConsumers();
+    for (auto& input_consumers : input_to_consumers_) {
+      // skip if the group set size == 1.
+      if (input_consumers.second.size() == 1) {
+        continue;
+      }
+      // do input fusion.
+      auto st = CallGeneralInputFusePass(input_consumers.second);
+      if (st) {
+        // consumers were fused; update the map.
+        UpdateInputToConsumers();
+      }
+      updated |= st;
+    }
+
+    return updated;
+  }
+
+  void UpdateInputToConsumers() {
+    for (auto& input_consumers : input_to_consumers_) {
+      auto& consumers = input_consumers.second;
+      std::unordered_set<GroupPtr> updated_consumers;
+      for (auto& consumer : consumers) {
+        std::queue<GroupPtr> fused_groups;
+        fused_groups.push(consumer);
+        while (!fused_groups.empty()) {
+          auto cur = fused_groups.front();
+          fused_groups.pop();
+          // if the group is not a sub group
+          if (cur->belong_groups.empty()) {
+            updated_consumers.insert(cur);
+          } else {
+            for (auto& belong_group : cur->belong_groups) {
+              if (belong_group->group_id == cur->group_id) {
+                updated_consumers.insert(belong_group);
+              } else {
+                fused_groups.push(belong_group);
+              }
+            }
+          }
+        }
+      }
+      consumers = updated_consumers;
+    }
+  }
+
+  void InitInputToConsumers() {
+    VLOG(3) << "InitInputToConsumers...!";
+    // init the input data node -> fusion group map.
+    for (auto& group : fusion_groups_) {
+      for (auto& node : group->nodes_set) {
+        // collect the producer node data.
+        for (size_t i = 0; i < node->num_operands(); ++i) {
+          auto in = node->operand_source(i);
+          if (in) {
+            input_to_consumers_[in].insert(group);
+          }
+        }
+      }
+    }
+  }
+
+  void InitFusionGroupsAndIndex() {
+    VLOG(3) << "InitFusionGroupsAndIndex...!";
+    // init the position of groups in the fusion groups.
+    for (size_t idx = 0; idx < fusion_groups_.size(); ++idx) {
+      auto group = fusion_groups_[idx];
+      auto belong_group = std::make_shared<ir::Group>();
+      // copy from group.
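+      // Each original group is wrapped into a fresh "belong" group that
+      // mirrors its metadata; from here on the pass only mutates the
+      // wrappers, so fusion_groups_index_ stays keyed by live groups.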
+      belong_group->max_depth = group->depth;
+      belong_group->min_depth = group->depth;
+      belong_group->group_id = group->group_id;
+      belong_group->input_nodes = group->input_nodes;
+      belong_group->output_nodes = group->output_nodes;
+      belong_group->op_pattern_kind = group->op_pattern_kind;
+      belong_group->master_nodes = group->master_nodes;
+      (*belong_group->mut_producer_groups()) = group->producer_groups();
+      (*belong_group->mut_consumer_groups()) = group->consumer_groups();
+      belong_group->fused_sub_groups.push_back(group);
+      group->belong_groups.insert(belong_group);
+      // replace the group with the fused group
+      fusion_groups_[idx] = belong_group;
+      // record the index
+      fusion_groups_index_[belong_group] = idx;
+    }
+
+    // update producers and consumers.
+    for (auto& group : fusion_groups_) {
+      std::unordered_set<GroupPtr> producers;
+      std::unordered_set<GroupPtr> consumers;
+
+      for (const auto& producer : group->producer_groups()) {
+        CHECK(producer->belong_groups.size());
+        producers.insert(*producer->belong_groups.begin());
+      }
+
+      for (auto& consumer : *group->mut_consumer_groups()) {
+        CHECK(consumer->belong_groups.size());
+        consumers.insert(*consumer->belong_groups.begin());
+      }
+      CHECK_EQ(group->producer_groups().size(), producers.size());
+      CHECK_EQ(group->consumer_groups().size(), consumers.size());
+      (*group->mut_producer_groups()) = producers;
+      (*group->mut_consumer_groups()) = consumers;
+    }
+  }
+
+  const ::pir::Program* graph_;
+  GroupList fusion_groups_;
+  std::unordered_map<GroupPtr, size_t> fusion_groups_index_;
+  std::unordered_set<::pir::Operation*> output_nodes_set_;
+  std::unordered_map<::pir::Value, std::unordered_set<GroupPtr>>
+      input_to_consumers_;
+};
+
+GroupList GeneralFusionMergePassInternal(const ::pir::Program* graph,
+                                         const GroupList& group_list) {
+  if (group_list.size() <= 1) {
+    VLOG(3) << "Don't do Fusion Merge Pass...!";
+    return group_list;
+  }
+
+  GeneralFusionMergePassHelper fusion_merge_pass_helper(graph, group_list);
+  auto res = fusion_merge_pass_helper();
+
+  return res;
+}
+
+}  // namespace ir
+}  // namespace dialect
+}  // namespace cinn
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_with_group_merge_pass_utils.h b/paddle/cinn/hlir/dialect/operator/transforms/group_with_group_merge_pass_utils.h
new file mode 100644
index 00000000000000..19ea891531b872
--- /dev/null
+++ b/paddle/cinn/hlir/dialect/operator/transforms/group_with_group_merge_pass_utils.h
@@ -0,0 +1,279 @@
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/cinn/hlir/dialect/operator/transforms/op_group.h"
+#include "paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_util.h"
+
+namespace cinn {
+namespace dialect {
+namespace ir {
+
+using OpGroupPtr = ir::OpGroup;
+using OpGroupList = std::vector<OpGroupPtr>;
+
+static cinn::dialect::ir::OpNode GetMasterNode(const OpGroupPtr& op_group) {
+  std::vector<cinn::dialect::ir::OpNode> master_nodes;
+  op_group.WalkOpNodes([&](const cinn::dialect::ir::OpNode& op) {
+    if (op.kind() == OpPatternKind::kReduction) {
+      master_nodes.push_back(op);
+    }
+  });
+  if (!master_nodes.empty()) {
+    return master_nodes.front();
+  }
+
+  op_group.WalkOpNodes([&](const cinn::dialect::ir::OpNode& op) {
+    master_nodes.push_back(op);
+  });
+  return master_nodes.back();
+}
+
+static bool IsSameSize(const OpGroupPtr& src, const OpGroupPtr& dst) {
+  cinn::dialect::ir::OpNode src_master_node = GetMasterNode(src);
+  cinn::dialect::ir::OpNode dst_master_node = GetMasterNode(dst);
+
+  auto size_0 = src_master_node.outputs()[0].shape();
+  auto size_1 = dst_master_node.outputs()[0].shape();
+
+  return phi::product(size_0) == phi::product(size_1);
+}
+
+static std::unordered_set<cinn::dialect::ir::OpNode> GetInputOps(
+    const OpGroupPtr& op_group) {
+  std::unordered_set<cinn::dialect::ir::OpNode> ops_set;
+  op_group.WalkOpNodes([&ops_set](const cinn::dialect::ir::OpNode& op_node) {
+    ops_set.insert(op_node);
+  });
+
+  std::unordered_set<cinn::dialect::ir::OpNode> input_ops;
+  op_group.WalkOpNodes([&](const cinn::dialect::ir::OpNode& op) {
+    const auto& input_tensors = op.inputs();
+    for (size_t i = 0; i < input_tensors.size(); ++i) {
+      if (!ops_set.count(input_tensors[i].producer())) {
+        input_ops.insert(input_tensors[i].producer());
+      }
+    }
+  });
+  return input_ops;
+}
+
+static std::unordered_set<cinn::dialect::ir::OpNode> GetOutputOps(
+    const OpGroupPtr& op_group) {
+  std::unordered_set<cinn::dialect::ir::OpNode> ops_set;
+  op_group.WalkOpNodes([&ops_set](const cinn::dialect::ir::OpNode& op_node) {
+    ops_set.insert(op_node);
+  });
+  std::unordered_set<cinn::dialect::ir::OpNode> output_ops;
+  op_group.WalkOpNodes([&](const cinn::dialect::ir::OpNode& op) {
+    const auto& output_tensors = op.outputs();
+    for (size_t i = 0; i < output_tensors.size(); ++i) {
+      auto& consumers = output_tensors[i].consumers();
+      for (auto it = consumers.begin(); it != consumers.end(); ++it) {
+        if (!ops_set.count(*it)) {
+          output_ops.insert(*it);
+          break;
+        }
+      }
+    }
+  });
+  return output_ops;
+}
+
+// limit the group args number to at most 512, as the args stack size is 4K.
+static bool limit_args(const OpGroupPtr& first, const OpGroupPtr& second) {
+  std::unordered_set<cinn::dialect::ir::OpNode> args;
+  for (auto& group : {first, second}) {
+    for (const auto& node : GetInputOps(group)) {
+      args.insert(node);
+    }
+    for (const auto& node : GetOutputOps(group)) {
+      args.insert(node);
+    }
+  }
+
+  if (args.size() > 512) {
+    return false;
+  } else {
+    return true;
+  }
+}
+
+bool WithoutLastDimInReduce(const phi::DDim& inshape,
+                            const std::vector<int64_t>& axes) {
+  // if the last axis is in reduce.
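+  // Worked example (illustrative values): for inshape = [16, 32, 64],
+  //   axes = [1]         -> 64 stays unreduced, sum_last_axes = 64 -> true
+  //   axes = [2] or [-1] -> the last axis is reduced -> false
+  //   axes = [1, 2]      -> the last axis is reduced -> false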
+  if (std::find(axes.begin(), axes.end(), inshape.size() - 1) != axes.end() ||
+      std::find(axes.begin(), axes.end(), -1) != axes.end()) {
+    return false;
+  }
+
+  int sum_last_axes = 1;
+  for (int idx = axes.back() + 1; idx < inshape.size(); ++idx) {
+    sum_last_axes *= inshape[idx];
+  }
+
+  if (sum_last_axes > 1) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+static int GetSharedSize(const cinn::dialect::ir::OpNode& op_node) {
+  const auto& inshape = op_node.inputs()[0].shape();
+  // const auto& axes = op_node.GetAttr<std::vector<int>>("dim");
+  // const auto& axes = op_node.Op()->attributes().at("dim").dyn_cast<...>();
+  // TODO(phlrain): get the axis vector from the attribute
+  std::vector<int64_t> axes = {1};
+  if (WithoutLastDimInReduce(inshape, axes)) {
+    int lane = 1;
+    for (int idx = axes.back() + 1; idx < inshape.size(); ++idx) {
+      lane = inshape[idx];
+    }
+    // int max_num_threads = common::DefaultNVGPUTarget().max_num_threads();
+    int max_num_threads = 1000;
+    if (lane > max_num_threads / 2) {
+      return 0;
+    }
+    int index = axes.size() - 1;
+    for (; index >= 0; --index) {
+      if (static_cast<size_t>(index + 1) < axes.size() &&
+          axes[index] != axes[index + 1] - 1) {
+        break;
+      }
+      lane *= inshape[axes[index]];
+      if (lane > max_num_threads / 2) {
+        break;
+      }
+    }
+    // if lane > (max_num_threads / 2), the loop above broke because of
+    // lane > max_num_threads / 2.
+    int axis = lane > (max_num_threads / 2) ? axes[index] : axes[index + 1];
+    if (lane <= max_num_threads) {
+      return lane * sizeof(float);
+    } else {
+      int prefix = inshape[axis];
+      int tail = lane / prefix;
+      for (int idx = max_num_threads / tail;
+           idx > ((max_num_threads / 2) / tail);
+           --idx) {
+        if (prefix % idx == 0) {
+          return idx * tail * sizeof(float);
+        }
+      }
+      int num = max_num_threads / tail;
+      return num * tail * sizeof(float);
+    }
+  }
+  return 0;
+}
+
+static bool ReduceFuseReduce(const OpGroupPtr& first,
+                             const OpGroupPtr& second) {
+  if (!limit_args(first, second)) {
+    return false;
+  }
+  std::unique_ptr<cinn::dialect::ir::OpNode> reducer_0 = nullptr;
+  first.WalkOpNodes([&](const cinn::dialect::ir::OpNode& op) {
+    if (!reducer_0 && op.kind() == kReduction) {
+      reducer_0.reset(new cinn::dialect::ir::OpNode(op));
+    }
+  });
+  CHECK(reducer_0) << "Can't find reduce op in group " << first.group_id();
+
+  std::unique_ptr<cinn::dialect::ir::OpNode> reducer_1 = nullptr;
+  second.WalkOpNodes([&](const cinn::dialect::ir::OpNode& op) {
+    if (!reducer_1 && op.kind() == kReduction) {
+      reducer_1.reset(new cinn::dialect::ir::OpNode(op));
+    }
+  });
+
+  CHECK(reducer_1) << "Can't find reduce op in group " << second.group_id();
+
+  // check whether the reduces have the same input and output shape
+  const auto& reducer_0_input_shape = reducer_0->inputs()[0].shape();
+  const auto& reducer_0_output_shape = reducer_0->outputs()[0].shape();
+
+  const auto& reducer_1_input_shape = reducer_1->inputs()[0].shape();
+  const auto& reducer_1_output_shape = reducer_1->outputs()[0].shape();
+
+  // TODO(phlrain): get the attribute from the op node
+  // auto reducer_0_reduce_dim = reducer_0->GetAttr<std::vector<int>>("dim");
+  // auto reducer_1_reduce_dim = reducer_1->GetAttr<std::vector<int>>("dim");
+
+  std::vector<int64_t> reducer_0_reduce_dim = {0};
+  std::vector<int64_t> reducer_1_reduce_dim = {0};
+
+  for (auto& dim : reducer_0_reduce_dim) {
+    // if dim == -1, set it to shape.size() - 1
+    if (dim == -1) {
+      dim = reducer_0_reduce_dim.size() - 1;
+    }
+  }
+
+  for (auto& dim : reducer_1_reduce_dim) {
+    // if dim == -1, set it to shape.size() - 1
+    if (dim == -1) {
+      dim = reducer_1_reduce_dim.size() - 1;
+    }
+  }
+
+  // check whether the shapes are the same
+  if (reducer_0_input_shape == reducer_1_input_shape &&
+      reducer_0_output_shape ==
+          reducer_1_output_shape &&
+      reducer_0_reduce_dim == reducer_1_reduce_dim) {
+    auto shared_size = 0;
+    for (auto& fusion_group : {first, second}) {
+      fusion_group.WalkOpNodes([&](const cinn::dialect::ir::OpNode& op) {
+        if (op.kind() == OpPatternKind::kReduction) {
+          shared_size += GetSharedSize(op);
+        }
+      });
+    }
+
+#define MAX_AVAILABLE_SHARED (32 * 1024)
+    if (shared_size > MAX_AVAILABLE_SHARED) {
+      return false;
+    }
+#undef MAX_AVAILABLE_SHARED
+    return true;
+  }
+
+  if (WithoutLastDimInReduce(reducer_0_input_shape, reducer_0_reduce_dim) &&
+      WithoutLastDimInReduce(reducer_1_input_shape, reducer_1_reduce_dim) &&
+      reducer_0_output_shape == reducer_1_output_shape &&
+      reducer_0_reduce_dim == reducer_1_reduce_dim) {
+    auto shared_size = 0;
+    for (auto& fusion_group : {first, second}) {
+      fusion_group.WalkOpNodes([&](const cinn::dialect::ir::OpNode& op) {
+        if (op.kind() == OpPatternKind::kReduction) {
+          shared_size += GetSharedSize(op);
+        }
+      });
+    }
+
+#define MAX_AVAILABLE_SHARED (32 * 1024)
+    if (shared_size > MAX_AVAILABLE_SHARED) {
+      return false;
+    }
+#undef MAX_AVAILABLE_SHARED
+    return true;
+  }
+
+  return false;
+}
+
+}  // namespace ir
+}  // namespace dialect
+}  // namespace cinn
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_with_group_merge_util.h b/paddle/cinn/hlir/dialect/operator/transforms/group_with_group_merge_util.h
new file mode 100644
index 00000000000000..1b8f5b6aeacd7b
--- /dev/null
+++ b/paddle/cinn/hlir/dialect/operator/transforms/group_with_group_merge_util.h
@@ -0,0 +1,638 @@
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <queue>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
+#include "paddle/pir/core/operation.h"
+#include "paddle/pir/core/value.h"
+
+#include "paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_util.h"
+
+namespace cinn {
+namespace dialect {
+namespace ir {
+
+const std::set<std::string> ConstantOps = {
+    "const_scalar", "fill_constant", "arange"};
+
+// limit the group args number to at most 512, as the args stack size is 4K.
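+// A rough budget behind the 512 limit (an assumption of this heuristic, not
+// stated in the code): every distinct input/output op of the fused group
+// becomes one pointer-sized kernel argument, so 512 args * 8 bytes = 4KB,
+// which matches the argument stack size mentioned above.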
+inline bool limit_args(const std::shared_ptr<ir::Group>& first,
+                       const std::shared_ptr<ir::Group>& second) {
+  std::unordered_set<::pir::Operation*> args;
+  for (auto& group : {first, second}) {
+    for (auto node : group->input_nodes) {
+      args.insert(node.first);
+    }
+    for (auto node : group->output_nodes) {
+      args.insert(node);
+    }
+  }
+
+  if (args.size() > 512) {
+    return false;
+  } else {
+    return true;
+  }
+}
+
+inline bool always_fuse(const std::shared_ptr<ir::Group>& first,
+                        const std::shared_ptr<ir::Group>& second) {
+  return true;
+}
+
+inline bool is_same_shape(const std::shared_ptr<ir::Group>& first,
+                          const std::shared_ptr<ir::Group>& second) {
+  if (!limit_args(first, second)) {
+    return false;
+  }
+
+  auto output_var_0 = GetValueShape((*first->master_nodes.begin())->result(0));
+  auto output_var_1 =
+      GetValueShape((*second->master_nodes.begin())->result(0));
+  return output_var_0 == output_var_1;
+}
+
+inline bool is_same_size(const std::shared_ptr<ir::Group>& first,
+                         const std::shared_ptr<ir::Group>& second) {
+  if (!limit_args(first, second)) {
+    return false;
+  }
+
+  auto output_var_0 = GetValueShape((*first->master_nodes.begin())->result(0));
+  auto output_var_1 =
+      GetValueShape((*second->master_nodes.begin())->result(0));
+  if (output_var_0 == output_var_1) {
+    return true;
+  }
+
+  auto size_0 = phi::product(output_var_0);
+  auto size_1 = phi::product(output_var_1);
+  return size_0 == size_1;
+}
+
+inline bool is_const_group(const std::shared_ptr<ir::Group>& group) {
+  return group->CollectNodes().size() == 1 &&
+         ConstantOps.count(group->CollectNodes()[0]->name());
+}
+
+inline bool elementwise_fuse_broadcast(
+    const std::shared_ptr<ir::Group>& first,
+    const std::shared_ptr<ir::Group>& second) {
+  // if the producer only includes a const op.
+  if (is_const_group(first)) {
+    return true;
+  }
+  // if same shape with horizontal relation
+  if (is_same_size(first, second)) {
+    return true;
+  }
+  // if first's output is not all in second's input
+  // NOTE: the checks below are currently disabled by the early return, so
+  // fusion is always allowed on this path.
+  for (auto output : first->output_nodes) {
+    return true;
+    if (!second->input_nodes.count(output)) {
+      return false;
+    }
+
+    // TODO(phlrain): support the output node set here
+    // if (helper->output_nodes_set_.count(output)) {
+    //   return false;
+    // }
+
+    return true;
+  }
+  // 1.compute io-size
+  // 2.compute computation-size
+  // 3.compute recompute-times
+  // 4.compute cost
+  // TODO(sunli): cost-model.
+  return true;
+}
+
+inline bool honrizontal_elementwise_fuse_reduce(
+    const std::shared_ptr<ir::Group>& first,
+    const std::shared_ptr<ir::Group>& second) {
+  std::shared_ptr<ir::Group> ele_group, reduce_group;
+  if (first->op_pattern_kind == kReduction) {
+    ele_group = second;
+    reduce_group = first;
+  } else {
+    ele_group = first;
+    reduce_group = second;
+  }
+  // if same shape with horizontal relation
+  if (is_same_size(first, second)) {
+    return true;
+  }
+
+  auto ele_node_shape =
+      GetValueShape((*ele_group->master_nodes.begin())->result(0));
+  int32_t size_ele = phi::product(ele_node_shape);
+  // TODO(phlrain): this seems extremely dangerous; why compare against
+  // multiple master nodes?
+  for (auto* master : reduce_group->master_nodes) {
+    auto master_node_shape = GetValueShape(master->result(0));
+    int32_t size_master = phi::product(master_node_shape);
+    if (size_ele == size_master) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+inline bool elementwise_fuse_reduce(const std::shared_ptr<ir::Group>& first,
+                                    const std::shared_ptr<ir::Group>& second) {
+  // if (helper->target_ == common::DefaultHostTarget()) {
+  //   return true;
+  // }
+  // if same shape with horizontal relation
+  if (is_same_size(first, second)) {
+    return true;
+  }
+
+  // if the reduce nodes are not in the consumers of the first group
+  std::queue<::pir::Operation*> candidates;
+  std::unordered_set<::pir::Operation*> first_node_set = first->NodeSet();
+  std::unordered_set<::pir::Operation*> second_node_set = second->NodeSet();
+  for (const auto& pair : second->input_nodes) {
+    if (first_node_set.find(pair.first) != first_node_set.end()) {
+      candidates.push(pair.first);
+    }
+  }
+  std::unordered_set<::pir::Operation*> visited;
+  std::unordered_set<::pir::Operation*> masters_in_consumers;
+
+  while (!candidates.empty()) {
+    ::pir::Operation* candidate = candidates.front();
+    candidates.pop();
+
+    // TODO(phlrain): why only deal with the first output?
+    auto first_output = candidate->result(0);
+    for (auto it = first_output.use_begin(); it != first_output.use_end();
+         ++it) {
+      auto consumer = (*it).owner();
+      if (visited.count(consumer)) {
+        continue;
+      }
+      if (second_node_set.find(consumer) != second_node_set.end()) {
+        visited.insert(consumer);
+        candidates.push(consumer);
+      }
+      if (second->master_nodes.count(consumer)) {
+        masters_in_consumers.insert(consumer);
+      }
+    }
+  }
+  if (!masters_in_consumers.empty()) {
+    bool flag = true;
+    auto first_node_shape =
+        GetValueShape((*first->master_nodes.begin())->result(0));
+    int32_t size_first = phi::product(first_node_shape);
+
+    for (::pir::Operation* master : masters_in_consumers) {
+      auto second_node_shape = GetValueShape(master->result(0));
+      int32_t size_second = phi::product(second_node_shape);
+      if (size_first != size_second) {
+        flag = false;
+        break;
+      }
+    }
+    if (flag) {
+      return true;
+    }
+  }
+
+  // if the reduce uses block_reduce, the producer can't be fused.
+  ::pir::Operation* reducer = nullptr;
+  for (auto& node : second->master_nodes) {
+    if (GetOpKind(node->name()) == kReduction) {
+      reducer = node;
+      break;
+    }
+  }
+  // CHECK(reducer) << "Can't find reduce op in group " << second->group_id;
+
+  // If the elementwise's output should be fetched, the output var cannot be
+  // computed inline into reduce's loop; in other words, the elementwise
+  // cannot be fused into reduce's loop. Like: group1 = {cast_0},
+  // group2 = {broadcast_0 -> elementwise_0 -> cast_1 -> reduce_max_0}
+
+  // TODO(phlrain): pass the output node set
+  // if (helper->output_nodes_set_.count(*first->master_nodes.begin())) {
+  //   return false;
+  // }
+
+  auto input_shape = GetValueShape(reducer->operand_source(0));
+  std::vector<int64_t> reduce_axes = GetVectorAttr(reducer, "axis");
+
+  // int max_num_threads = helper->target_.max_num_threads();
+  int max_num_threads = 1000;
+  // if without the last dimension in reduce.
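+  // "lane" below estimates how many elements one reduction instance keeps in
+  // flight; with the placeholder cap of 1000 threads, e.g. input_shape =
+  // [128, 1024] and reduce_axes = [0], lane = 1024 > 1000 / 2, so the
+  // producer is fusable without hitting the block-reduce path.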
+  int lane = 1;
+  if (WithoutLastDimInReduce(input_shape, reduce_axes)) {
+    for (int idx = reduce_axes.back() + 1; idx < input_shape.size(); ++idx) {
+      lane *= input_shape[idx];
+    }
+    if (lane > max_num_threads / 2) {
+      return true;
+    }
+  }
+
+  int index = reduce_axes.size() - 1;
+  for (; index >= 0; --index) {
+    if (static_cast<size_t>(index + 1) < reduce_axes.size() &&
+        reduce_axes[index] + 1 != reduce_axes[index + 1]) {
+      break;
+    }
+    lane *= input_shape[reduce_axes[index]];
+    if (lane > max_num_threads / 2) {
+      break;
+    }
+  }
+
+  if (lane <= max_num_threads) {
+    return true;
+  } else {
+    int prefix = input_shape[reduce_axes[index]];
+    int tail = lane / prefix;
+    for (int idx = max_num_threads / tail; idx > (max_num_threads / 2) / tail;
+         --idx) {
+      if (prefix % idx == 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+inline bool broadcast_fuse_reduce(const std::shared_ptr<ir::Group>& first,
+                                  const std::shared_ptr<ir::Group>& second) {
+  // if same shape with horizontal relation
+  if (is_same_size(first, second)) {
+    return true;
+  }
+  ::pir::Operation* reducer = nullptr;
+  for (auto& node : second->master_nodes) {
+    if (GetOpKind(node->name()) == kReduction) {
+      reducer = node;
+      break;
+    }
+  }
+  // CHECK(reducer) << "Can't find reduce op in group " << second->group_id;
+
+  auto input_shape = GetValueShape(reducer->operand_source(0));
+  auto input_size = phi::product(input_shape);
+
+  auto output_shape = GetValueShape((*first->master_nodes.begin())->result(0));
+  auto output_size = phi::product(output_shape);
+
+  if (input_size == output_size) {
+    return elementwise_fuse_reduce(first, second);
+  }
+  return false;
+}
+
+inline bool reduce_fuse_elementwise(const std::shared_ptr<ir::Group>& first,
+                                    const std::shared_ptr<ir::Group>& second) {
+  if (!is_same_size(first, second)) {
+    return false;
+  }
+  // if with the last axis in reduce, fusion will waste computation
+  // resources, so use a simple model to evaluate the cost.
+  // TODO(sunli): cost-model.
+  return true;
+}
+
+inline bool horizontal_relation(const std::shared_ptr<ir::Group>& first,
+                                const std::shared_ptr<ir::Group>& second,
+                                const OpPatternKind op_pattern_kind) {
+  // merge injective
+  auto merge_nodes_set = [](const std::shared_ptr<ir::Group>& group) {
+    std::unordered_set<::pir::Operation*> nodes_set = group->nodes_set;
+    for (auto& sub_group : group->fused_sub_groups) {
+      nodes_set.insert(sub_group->nodes_set.begin(),
+                       sub_group->nodes_set.end());
+    }
+    return nodes_set;
+  };
+  auto first_set = merge_nodes_set(first);
+  auto second_set = merge_nodes_set(second);
+
+  auto select_node_set = [](const std::unordered_set<::pir::Operation*>& nodes,
+                            OpPatternKind kind) {
+    std::unordered_set<::pir::Operation*> selected;
+    for (auto node : nodes) {
+      if (GetOpKind(node->name()) == kind) {
+        selected.insert(node);
+      }
+    }
+    return selected;
+  };
+  auto selected_nodes = select_node_set(second_set, op_pattern_kind);
+
+  auto check_depency = [&](::pir::Operation* node) {
+    std::queue<::pir::Operation*> candidates;
+    std::unordered_set<::pir::Operation*> visited_set;
+    candidates.push(node);
+
+    while (!candidates.empty()) {
+      auto candidate = candidates.front();
+      candidates.pop();
+      // visit all producer nodes:
+      // get all the input ops
+      for (size_t i = 0; i < candidate->num_operands(); ++i) {
+        auto producer =
+            candidate->operand_source(i).dyn_cast<::pir::OpResult>().owner();
+        // check the dependency.
+        if (first_set.count(producer)) {
+          return true;
+        }
+        // check whether the node is in the region.
+        if (!second_set.count(producer)) {
+          continue;
+        }
+        // record the visited node.
+        if (!visited_set.count(producer)) {
+          visited_set.insert(producer);
+          candidates.push(producer);
+        }
+      }
+    }
+
+    return false;
+  };
+
+  for (auto node : selected_nodes) {
+    if (check_depency(node)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+inline bool horizontal_with_injective(
+    const std::shared_ptr<ir::Group>& first,
+    const std::shared_ptr<ir::Group>& second) {
+  if (is_const_group(first)) {
+    return true;
+  }
+
+  if (!is_same_size(first, second)) {
+    return false;
+  }
+  return horizontal_relation(first, second, kInjective);
+}
+
+inline bool injective_horizontal_with_reduce(
+    const std::shared_ptr<ir::Group>& first,
+    const std::shared_ptr<ir::Group>& second) {
+  // check injective with injective.
+  if (!horizontal_relation(first, second, kInjective)) {
+    return false;
+  }
+  return elementwise_fuse_reduce(first, second);
+}
+
+inline bool reduce_fuse_broadcast(const std::shared_ptr<ir::Group>& first,
+                                  const std::shared_ptr<ir::Group>& second) {
+  // if same shape with horizontal relation
+  if (is_same_size(first, second)) {
+    return true;
+  }
+
+  // Traversing all reducers in all producers requires two types of conditions
+  // to be met. The first type is the condition that the reducer itself needs
+  // to meet, and the second type is the condition that the relationship
+  // between each reducer and its consumers with type of Broadcast needs to
+  // meet. It is required that each consumer of type Broadcast meets the same
+  // shape after broadcast as before reduce.
+  for (auto& node_in_master : first->master_nodes) {
+    if (GetOpKind(node_in_master->name()) != kReduction) {
+      continue;
+    }
+    ::pir::Operation* reducer = node_in_master;
+    // First type conditions
+    // Get some reduce information
+    auto reducer_input_shape =
+        phi::vectorize(GetValueShape(reducer->operand_source(0)));
+    auto reducer_output_shape =
+        phi::vectorize(GetValueShape(reducer->result(0)));
+    std::vector<int64_t> reduce_axes = GetVectorAttr(reducer, "axis");
+
+    auto keep_dim = false;
+    for (auto& axis : reduce_axes) {
+      if (axis == -1) {
+        axis = reducer_input_shape.size() - 1;
+      }
+    }
+    // Check if the reduce axes are continuous
+    int reduce_size = reducer_input_shape.back();
+    for (auto idx = reduce_axes.size() - 1; idx >= 1; --idx) {
+      if (reduce_axes[idx] != reduce_axes[idx - 1] + 1) {
+        return false;
+      }
+      reduce_size *= reducer_input_shape[idx - 1];
+    }
+    // Check if the reduce size exceeds the hardware limit
+    // if (helper->target_ == common::DefaultNVGPUTarget() &&
+    //     reduce_size > helper->target_.max_num_threads()) {
+    //   return false;
+    // }
+
+    // Second type conditions
+    // Find direct or indirect consumers with type of Broadcast in the
+    // second group
+    auto find_broadcasters_in_descendants = [&](::pir::Operation* producer)
+        -> std::unordered_set<::pir::Operation*> {
+      std::queue<::pir::Operation*> candidates;
+      std::unordered_set<::pir::Operation*> visited_set;
+      std::unordered_set<::pir::Operation*> broadcasters;
+      candidates.push(producer);
+
+      while (!candidates.empty()) {
+        auto candidate = candidates.front();
+        candidates.pop();
+        // TODO(phlrain): why only deal with the first output?
+        auto first_output = candidate->result(0);
+        for (auto it = first_output.use_begin(); it != first_output.use_end();
+             ++it) {
+          auto consumer = (*it).owner();
+
+          if (!visited_set.count(consumer)) {
+            visited_set.insert(consumer);
+            candidates.push(consumer);
+          }
+          if (GetOpKind(consumer->name()) == kBroadcast &&
+              second->NodeSet().find(consumer) != second->NodeSet().end()) {
+            broadcasters.insert(consumer);
+          }
+        }
+      }
+
+      return broadcasters;
+    };
+
+    // Check if each broadcast node meets the conditions
+    std::unordered_set<::pir::Operation*> broadcasters_in_consumers =
+        find_broadcasters_in_descendants(reducer);
+    for (auto broadcaster : broadcasters_in_consumers) {
+      // auto broadcaster_output_shape = absl::get<std::vector<int>>(
+      //     broadcaster->attrs.attr_store.at("out_shape"));
+      // auto broadcast_axes = absl::get<std::vector<int>>(
+      //     broadcaster->attrs.attr_store.at("broadcast_axes"));
+      // TODO(phlrain): support this here
+      std::vector<int64_t> broadcaster_output_shape =
+          GetVectorAttr(broadcaster, "out_shape");
+      std::vector<int64_t> broadcast_axes =
+          GetVectorAttr(broadcaster, "broadcast_axes");
+      for (auto& axis : broadcast_axes) {
+        if (axis == -1) {
+          axis = broadcaster_output_shape.size() - 1;
+        }
+      }
+
+      if (reducer_input_shape != broadcaster_output_shape) {
+        return false;
+      }
+
+      if (keep_dim) {
+        continue;
+      } else {
+        // if reducer_output_shape = [1]
+        if (reducer_output_shape.size() == 1 && reducer_output_shape[0] == 1) {
+          continue;
+        }
+        // check that the union [reduce_axes, broadcast_axes] covers
+        // reducer_input_shape
+        for (size_t idx = 0; idx < reducer_input_shape.size(); ++idx) {
+          if (!(std::find(broadcast_axes.begin(), broadcast_axes.end(), idx) ==
+                broadcast_axes.end()) ^
+              std::find(reduce_axes.begin(), reduce_axes.end(), idx) ==
+                  reduce_axes.end()) {
+            return false;
+          }
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+inline bool reduce_fuse_reduce(const std::shared_ptr<ir::Group>& first,
+                               const std::shared_ptr<ir::Group>& second) {
+  if (!limit_args(first, second)) {
+    return false;
+  }
+  ::pir::Operation* reducer_0 = nullptr;
+  for (auto& reducer : first->master_nodes) {
+    if (GetOpKind(reducer->name()) == kReduction) {
+      reducer_0 = reducer;
+      break;
+    }
+  }
+  // CHECK(reducer_0) << "Can't find reduce op in group " << first->group_id;
+
+  ::pir::Operation* reducer_1 = nullptr;
+  for (auto& reducer : second->master_nodes) {
+    if (GetOpKind(reducer->name()) == kReduction) {
+      reducer_1 = reducer;
+      break;
+    }
+  }
+  CHECK(reducer_1) << "Can't find reduce op in group " << second->group_id;
+  // check whether the reduces have the same input and output shape
+  auto reducer_0_input_shape = GetValueShape(reducer_0->operand_source(0));
+  auto reducer_0_output_shape = GetValueShape(reducer_0->result(0));
+
+  auto reducer_1_input_shape = GetValueShape(reducer_1->operand_source(0));
+  auto reducer_1_output_shape = GetValueShape(reducer_1->result(0));
+
+  // auto reducer_0_reduce_dim =
+  //     absl::get<std::vector<int>>(reducer_0->attrs.attr_store.at("dim"));
+  // auto reducer_1_reduce_dim =
+  //     absl::get<std::vector<int>>(reducer_1->attrs.attr_store.at("dim"));
+  // TODO(phlrain)
+  std::vector<int64_t> reducer_0_reduce_dim = GetVectorAttr(reducer_0, "axis");
+  std::vector<int64_t> reducer_1_reduce_dim = GetVectorAttr(reducer_1, "axis");
+
+  for (auto& dim : reducer_0_reduce_dim) {
+    // if dim == -1, set it to shape.size() - 1
+    if (dim == -1) {
+      dim = reducer_0_reduce_dim.size() - 1;
+    }
+  }
+
+  for (auto& dim : reducer_1_reduce_dim) {
+    // if dim == -1, set it to shape.size() - 1
+    if (dim == -1) {
+      dim = reducer_1_reduce_dim.size() - 1;
+    }
+  }
+
+  // check whether the shapes are the same
+  if (reducer_0_input_shape == reducer_1_input_shape &&
+      reducer_0_output_shape == reducer_1_output_shape &&
+      reducer_0_reduce_dim == reducer_1_reduce_dim) {
+    auto shared_size = 0;
+    for (auto& fusion_group : {first, second}) {
+      for (auto* master : fusion_group->master_nodes) {
+        if (GetOpKind(master->name()) == kReduction) {
+          shared_size += GetSharedSize(master);
+        }
+      }
+    }
+
+#define MAX_AVAILABLE_SHARED (32 * 1024)
+    if (shared_size > MAX_AVAILABLE_SHARED) {
+      return false;
+    }
+#undef MAX_AVAILABLE_SHARED
+    return true;
+  }
+
+  if (WithoutLastDimInReduce(reducer_0_input_shape, reducer_0_reduce_dim) &&
+      WithoutLastDimInReduce(reducer_1_input_shape, reducer_1_reduce_dim) &&
+      reducer_0_output_shape == reducer_1_output_shape &&
+      reducer_0_reduce_dim == reducer_1_reduce_dim) {
+    auto shared_size = 0;
+    for (auto& fusion_group : {first, second}) {
+      for (auto* master : fusion_group->master_nodes) {
+        if (GetOpKind(master->name()) == kReduction) {
+          shared_size += GetSharedSize(master);
+        }
+      }
+    }
+
+#define MAX_AVAILABLE_SHARED (32 * 1024)
+    if (shared_size > MAX_AVAILABLE_SHARED) {
+      return false;
+    }
+#undef MAX_AVAILABLE_SHARED
+    return true;
+  }
+
+  return false;
+}
+
+}  // namespace ir
+}  // namespace dialect
+}  // namespace cinn
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/op_group.h b/paddle/cinn/hlir/dialect/operator/transforms/op_group.h
new file mode 100644
index 00000000000000..87138df17be85b
--- /dev/null
+++ b/paddle/cinn/hlir/dialect/operator/transforms/op_group.h
@@ -0,0 +1,195 @@
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "paddle/cinn/hlir/dialect/operator/transforms/op_node.h"
+#include "paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_util.h"
+
+namespace cinn {
+namespace dialect {
+namespace ir {
+
+class OpGroup {
+ public:
+  explicit OpGroup(const std::shared_ptr<ir::Group>& group) : group_(group) {}
+
+  OpGroup(const OpGroup& other) = default;
+
+  using Comparator = ir::Group::SharedGroupComparator;
+  using Hasher = ir::Group::SharedGroupHasher;
+
+  class OpGroupListIterator {
+   public:
+    OpGroupListIterator(std::unordered_set<std::shared_ptr<ir::Group>,
+                                           Hasher,
+                                           Comparator>::const_iterator it)
+        : iter_(it) {}
+
+    OpGroupListIterator& operator++() {
+      ++iter_;
+      return *this;
+    }
+
+    OpGroupListIterator operator++(int) {
+      OpGroupListIterator tmp = *this;
+      ++iter_;
+      return tmp;
+    }
+
+    bool operator==(const OpGroupListIterator& other) const {
+      return iter_ == other.iter_;
+    }
+
+    bool operator!=(const OpGroupListIterator& other) const {
+      return !(*this == other);
+    }
+
+    OpGroup operator*() const { return OpGroup(*iter_); }
+
+   private:
+    std::unordered_set<std::shared_ptr<ir::Group>, Hasher, Comparator>::
+        const_iterator iter_;
+  };
+
+  class ProducerOpGroupListView {
+   public:
+    explicit ProducerOpGroupListView(const std::weak_ptr<ir::Group>& group)
+        : group_(group) {}
+
+    ProducerOpGroupListView(const ProducerOpGroupListView& other) = delete;
+    ProducerOpGroupListView(ProducerOpGroupListView&& other) = delete;
+
+    ProducerOpGroupListView& operator=(const ProducerOpGroupListView& other) =
+        delete;
+
+    using const_iterator = OpGroupListIterator;
+
+    size_t size() const {
+      CHECK(group_.lock());
+      return group_.lock()->producer_groups().size();
+    }
+
+    const_iterator begin() const {
+      CHECK(group_.lock());
+      return const_iterator(group_.lock()->producer_groups().begin());
+    }
+
+    const_iterator end() const {
+      CHECK(group_.lock());
+      return const_iterator(group_.lock()->producer_groups().end());
+    }
+
+   private:
+    const std::weak_ptr<ir::Group> group_;
+  };
+
+  class ConsumerOpGroupListView {
+   public:
+    explicit ConsumerOpGroupListView(const std::weak_ptr<ir::Group>& group)
+        : group_(group) {}
+
+    ConsumerOpGroupListView(const ConsumerOpGroupListView& other) = delete;
+    ConsumerOpGroupListView(ConsumerOpGroupListView&& other) = delete;
+
+    ConsumerOpGroupListView& operator=(const ConsumerOpGroupListView& other) =
+        delete;
+
+    using const_iterator = OpGroupListIterator;
+
+    size_t size() const {
+      CHECK(group_.lock());
+      return group_.lock()->consumer_groups().size();
+    }
+
+    const_iterator begin() const {
+      CHECK(group_.lock());
+      return const_iterator(group_.lock()->consumer_groups().begin());
+    }
+
+    const_iterator end() const {
+      CHECK(group_.lock());
+      return const_iterator(group_.lock()->consumer_groups().end());
+    }
+
+   private:
+    const std::weak_ptr<ir::Group> group_;
+  };
+
+  const std::string& group_id() const { return group_.lock()->group_id; }
+
+  OpPatternKind kind() const { return group_.lock()->kind(); }
+
+  // The WalkOpNodes function is used to traverse the op_nodes in the group
+  // and execute the VisitOpNode function for each OpNode. This function is
+  // equivalent to a for loop over the op_nodes in the graph.
+  //
+  // In order to avoid unnecessary memory copies, we provide the WalkOpNodes
+  // function instead of a function that returns all op_nodes directly.
+  //
+  // Example: get all the Reduction op_nodes in the group.
+  //   OpGroup group = ...;
+  //   std::set<cinn::dialect::ir::OpNode> reduce_op_set;
+  //   // The lambda function passed to WalkOpNodes collects reduction
+  //   // op_nodes.
+  //   auto get_reduce_op =
+  //       [&reduce_op_set](const cinn::dialect::ir::OpNode& op) {
+  //         if (op.kind() == OpPatternKind::kReduction) {
+  //           reduce_op_set.insert(op);
+  //         }
+  //       };
+  //   group.WalkOpNodes(get_reduce_op);
+  void WalkOpNodes(
+      const std::function<void(const OpNode&)>& VisitOpNode) const {
+    group_.lock()->WalkNodes(
+        [&](::pir::Operation* node) { VisitOpNode(OpNode(node)); });
+  }
+
+  ProducerOpGroupListView producers() const {
+    return ProducerOpGroupListView(group_);
+  }
+
+  ConsumerOpGroupListView consumers() const {
+    return ConsumerOpGroupListView(group_);
+  }
+
+  std::shared_ptr<ir::Group> GetGroup() const { return group_.lock(); }
+
+  bool operator==(const OpGroup& other) const {
+    return group_.lock().get() == other.group_.lock().get();
+  }
+
+  bool operator<(const OpGroup& other) const {
+    return group_.lock().get() < other.group_.lock().get();
+  }
+
+ private:
+  const std::weak_ptr<ir::Group> group_;
+};
+
+}  // namespace ir
+}  // namespace dialect
+}  // namespace cinn
+
+namespace std {
+
+template <>
+struct hash<cinn::dialect::ir::OpGroup> {
+  size_t operator()(const cinn::dialect::ir::OpGroup& obj) const {
+    return std::hash<size_t>()(
+        reinterpret_cast<size_t>(obj.GetGroup().get()));
+  }
+};
+
+}  // namespace std
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/op_node.h b/paddle/cinn/hlir/dialect/operator/transforms/op_node.h
new file mode 100644
index 00000000000000..8579d11b19bb96
--- /dev/null
+++ b/paddle/cinn/hlir/dialect/operator/transforms/op_node.h
@@ -0,0 +1,168 @@
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_util.h" +#include "paddle/cinn/hlir/dialect/operator/transforms/tensor_node.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include "paddle/pir/core/operation.h" + +namespace cinn { +namespace dialect { +namespace ir { + +class OpNode { + public: + explicit OpNode(::pir::Operation* node) + : node_(node), input_tensors_(node), output_tensors_(node) {} + + OpPatternKind kind() const { + auto kind = GetOpKind(node_->name()); + if (kind == kBroadcast) { + // As binary op was defined as broadcast, actually it should be + // element-wise. + if (node_->name() != "broadcast_to") { + return kElementWise; + } + } + return kind; + } + + class TensorListIterator { + public: + TensorListIterator(size_t index, ::pir::Operation* op) + : iter_(index), op_(op) {} + + TensorListIterator& operator++() { + ++iter_; + return *this; + } + + TensorListIterator operator++(int) { + TensorListIterator tmp = *this; + ++iter_; + return tmp; + } + + bool operator==(const TensorListIterator& other) const { + return iter_ == other.iter_; + } + + bool operator!=(const TensorListIterator& other) const { + return !(*this == other); + } + + TensorNode operator*() const { + return TensorNode(op_->operand_source(iter_)); + } + + private: + size_t iter_; + ::pir::Operation* op_; + }; + + using const_iterator = TensorListIterator; + + class InputTensorListView { + public: + explicit InputTensorListView(::pir::Operation* op) : op_(op) {} + + // InputTensorListView(const InputTensorListView& other) = delete; + // InputTensorListView(InputTensorListView&& other) = delete; + + // InputTensorListView& operator=(const InputTensorListView& other) = + // delete; + + size_t size() const { return op_->num_operands(); } + + TensorNode operator[](size_t index) const { + return TensorNode(op_->operand_source(index)); + } + + const_iterator begin() const { return const_iterator(0, op_); } + + const_iterator end() const { + return const_iterator(op_->num_operands(), op_); + } + + private: + ::pir::Operation* op_; + }; + + class OutputTensorListView { + public: + explicit OutputTensorListView(::pir::Operation* op) : op_(op) {} + + // OutputTensorListView(const OutputTensorListView& other) = delete; + // OutputTensorListView(OutputTensorListView&& other) = delete; + + // OutputTensorListView& operator=(const OutputTensorListView& other) = + // delete; + + size_t size() const { return op_->num_results(); } + + TensorNode operator[](size_t index) const { + return TensorNode(op_->result(index)); + } + + const_iterator begin() const { return const_iterator(0, op_); } + + const_iterator end() const { + return const_iterator(op_->num_results(), op_); + } + + private: + ::pir::Operation* op_; + }; + + bool operator==(const OpNode& other) const { return node_ == other.node_; } + + bool operator<(const OpNode& other) const { return node_ < other.node_; } + + const InputTensorListView& inputs() const { return input_tensors_; } + + const OutputTensorListView& outputs() const { return output_tensors_; } + + template + const T& GetAttr(const std::string& attr_name) const { + auto attr = + paddle::dialect::GetAttributeData(node_->attributes().at(attr_name)); + return PADDLE_GET_CONST(T, attr); + } + + private: + friend struct std::hash; + + ::pir::Operation* node_; + + const InputTensorListView input_tensors_; + const 
OutputTensorListView output_tensors_; +}; + +} // namespace ir +} // namespace dialect +} // namespace cinn + +namespace std { + +template <> +struct hash { + size_t operator()(const cinn::dialect::ir::OpNode& obj) const { + return std::hash()(reinterpret_cast(obj.node_)); + } +}; + +} // namespace std diff --git a/paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_pass.cc new file mode 100644 index 00000000000000..3039d81ff83a35 --- /dev/null +++ b/paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_pass.cc @@ -0,0 +1,528 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include "paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_pass.h" + +#include "paddle/phi/core/enforce.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/value.h" + +namespace cinn { +namespace dialect { +namespace ir { + +std::unordered_map OpKindMap = { + {"pd_op.add", OpPatternKind::kElementWise}, + {"pd_op.subtract", OpPatternKind::kElementWise}, + {"pd_op.multiply", OpPatternKind::kElementWise}, + {"pd_op.divide", OpPatternKind::kElementWise}, + {"pd_op.sqrt", OpPatternKind::kElementWise}, + {"pd_op.full", OpPatternKind::kElementWise}, + {"pd_op.relu", OpPatternKind::kElementWise}, + {"pd_op.exp", OpPatternKind::kElementWise}, + {"pd_op.sum", OpPatternKind::kReduction}, + {"cinn_op.reduce_sum", OpPatternKind::kReduction}, + {"cinn_op.reduce_max", OpPatternKind::kReduction}, + {"cinn_op.broadcast", OpPatternKind::kBroadcast}, +}; + +OpPatternKind GetOpKind(const std::string& op_name) { + auto found_it = OpKindMap.find(op_name); + if (found_it == OpKindMap.end()) { + throw std::runtime_error("not support op yet in op kind map"); + } + + return found_it->second; +} + +phi::DDim GetFirstInputShape(const ::pir::Operation* op) { + auto in = op->operand_source(0); + + return in.type().dyn_cast().dims(); +} + +phi::DDim GetValueShape(const ::pir::Value value) { + return value.type().dyn_cast().dims(); +} + +bool WithoutLastDimInReduce(const std::vector& inshape, + const std::vector& axes) { + // if last axis is in reduce. 
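+  // Worked example (hypothetical shapes, added for illustration only):
+  //   inshape = [16, 32, 64], axes = [1]  -> the last axis is kept and the
+  //     trailing extent 64 > 1, so the function returns true;
+  //   inshape = [16, 32, 64], axes = [2] (or axes = [-1]) -> the last axis
+  //     is reduced, so the function returns false.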
+ if (std::find(axes.begin(), axes.end(), inshape.size() - 1) != axes.end() || + std::find(axes.begin(), axes.end(), -1) != axes.end()) { + return false; + } + + int64_t sum_last_axes = 1; + for (size_t idx = axes.back() + 1; idx < inshape.size(); ++idx) { + sum_last_axes *= inshape[idx]; + } + + if (sum_last_axes > 1) { + return true; + } else { + return false; + } +} + +int GetSharedSize(::pir::Operation* node) { + auto inshape = phi::vectorize(GetValueShape(node->result(0))); + + auto axes = GetVectorAttr(node, "axis"); + + if (WithoutLastDimInReduce(inshape, axes)) { + int lane = 1; + for (size_t idx = axes.back() + 1; idx < inshape.size(); ++idx) { + lane = inshape[idx]; + } + // int max_num_threads = common::DefaultNVGPUTarget().max_num_threads(); + // todo(phlrain): get gpu max threads + int max_num_threads = 2048; + if (lane > max_num_threads / 2) { + return 0; + } + int index = axes.size() - 1; + for (; index >= 0; --index) { + if (static_cast(index + 1) < axes.size() && + axes[index] != axes[index + 1] - 1) { + break; + } + lane *= inshape[axes[index]]; + if (lane > max_num_threads / 2) { + break; + } + } + // if lane > (max_num_threads / 2),the loop break from lane > + // max_num_threads / 2. + int axis = lane > (max_num_threads / 2) ? axes[index] : axes[index + 1]; + if (lane <= max_num_threads) { + return lane * sizeof(float); + } else { + int prefix = inshape[axis]; + int tail = lane / prefix; + for (int idx = max_num_threads / tail; + idx > ((max_num_threads / 2) / tail); + --idx) { + if (prefix % idx == 0) { + return idx * tail * sizeof(float); + } + } + int num = max_num_threads / tail; + return num * tail * sizeof(float); + } + } + return 0; +} + +using ConditionFunction = + std::function; + +// Op Fusion Pass which performs Ops fusion, Ops are fused +// "vertically", meaning producing Ops are fused into their consumers +// with the intent that the loops which compute their values will be fused in +// code generation. +class OpFusionPassHelper { + public: + explicit OpFusionPassHelper(const ::pir::Program& graph) { + // init fusion relation + InitFusionRelation(); + // filter node data, create group for each node + // auto nodes_inorder = std::get<0>(graph->topological_order()); + + for (auto it = graph.block()->begin(); it != graph.block()->end(); ++it) { + auto node = *it; + local_ops_.insert(node); + } + + int index = 0; + for (auto it = graph.block()->begin(); it != graph.block()->end(); ++it) { + auto node = *it; + if (node) { + nodes_.push_back(node); + auto group = std::make_shared(); + // init group + group->nodes.push_back(node); + group->nodes_set.insert(node); + group->output_nodes.insert(node); + // input node + + for (size_t i = 0; i < node->num_operands(); ++i) { + auto input = + node->operand_source(i).dyn_cast().owner(); + if (input && (local_ops_.count(input))) { + group->input_nodes[input] = 1; + } + } + + // group type + group->op_pattern_kind = GetOpKind(node->name()); + // use current node as master node for schedule + group->master_nodes.insert(node); + + // get opration unique id + group->group_id = "id_" + std::to_string(index++); + fusion_groups_[node] = group; + } + } + // reverse node for output to input + std::reverse(nodes_.begin(), nodes_.end()); + } + + // return a vector of groups in topological order. + GroupList operator()(bool do_fusion = true) { + // do op fusion. + if (do_fusion) { + DoOpFusion(); + } + + // find all fusion group. 
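+    // Usage sketch (illustrative; variable names are hypothetical):
+    //   OpFusionPassHelper helper(program);
+    //   GroupList fused = helper();      // fuse, then collect the groups
+    //   GroupList raw = helper(false);   // one group per op, no fusion
+    // After DoOpFusion() several ops may share one group object, so the
+    // set below de-duplicates while keeping topological order.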
+ GroupList fusion_groups; + std::unordered_set groups_set; + for (auto node : nodes_) { + auto& group = fusion_groups_[node]; + if (!groups_set.count(group.get())) { + groups_set.insert(group.get()); + fusion_groups.push_back(group); + // reverse nodes order to producer->consumer. + std::reverse(group->nodes.begin(), group->nodes.end()); + } + } + + // producer consumer + for (auto& consumer : fusion_groups) { + for (auto& input_node : consumer->input_nodes) { + if (!local_ops_.count(input_node.first)) { + continue; + } + auto& producer = fusion_groups_[input_node.first]; + consumer->mut_producer_groups()->insert(producer); + producer->mut_consumer_groups()->insert(consumer); + } + } + + // init group depth. + for (auto& group : fusion_groups) { + for (const auto& consumer : group->consumer_groups()) { + // update depth. + group->depth = std::max(group->depth, consumer->depth + 1); + } + } + + // reverse to keep fusion group in order. + std::reverse(fusion_groups.begin(), fusion_groups.end()); + + return fusion_groups; + } + + private: + void DoOpFusion() { + for (auto consumer : nodes_) { + auto consumer_kind = GetOpKind(consumer->name()); + // kNonFusible op can't fuse any other op. + if (consumer_kind == kNonFusible) { + continue; + } + + // fusion op for consumer + auto consumer_fusion = fusion_groups_[consumer]; // + // check all linkin node + for (size_t i = 0; i < consumer->num_operands(); ++i) { + auto producer_data = consumer->operand_source(i); + + auto producer = producer_data.dyn_cast().owner(); + if (!local_ops_.count(producer)) { + continue; + } + + // if producer is fused. + if (consumer_fusion->nodes_set.count(producer)) { + // VLOG(3) << "Op " << producer->id() << " is fused."; + continue; + } + // if producer data is placeholder + if (!producer) { + continue; + } + // kNonFusible op can't fuse any other op. + auto producer_kind = GetOpKind(producer->name()); + if (producer_kind == kNonFusible) { + continue; + } + // VLOG(3) << "Producer Op: " << producer->id() + // << ", Op Pattern: " << producer_kind + // << " -> Consumer Op: " << consumer->id() + // << ", Op Pattern: " << consumer_kind; + bool can_fuse = true; + // checkout producer node outputs are all in fusion op + + // find all the op use by + size_t producer_data_used_num = 0; + for (auto it = producer_data.use_begin(); it != producer_data.use_end(); + ++it) { + auto consumer_node = it->owner(); + producer_data_used_num++; + // if fusion group can't find node, can't merge + if (consumer_fusion->nodes_set.find(consumer_node) == + consumer_fusion->nodes_set.end()) { + can_fuse = false; + break; + } + } + + if (!can_fuse || !CanFuse(producer, consumer)) continue; + // VLOG(3) << "Fuse Op " << producer->id() << " into Op " + // << consumer->id(); + + // fuse producer to fusion group + // TODO(phrain) : support id + // consumer_fusion->group_id = + // producer->id() + "_" + consumer_fusion->group_id; + + consumer_fusion->group_id = consumer_fusion->group_id; + consumer_fusion->nodes.push_back(producer); + consumer_fusion->nodes_set.insert(producer); + consumer_fusion->input_nodes.erase(producer); + consumer_fusion->op_pattern_kind = + static_cast(consumer_fusion->op_pattern_kind) > + static_cast(producer_kind) + ? 
consumer_fusion->op_pattern_kind + : producer_kind; + + if (producer_kind == kReduction) { + consumer_fusion->master_nodes.insert(producer); + } + + if (output_nodes_set_.count(producer)) { + // VLOG(3) << "Insert Global Output Node : " << producer->id(); + consumer_fusion->output_nodes.insert(producer); + } else if (producer_data_used_num > 1 && producer->num_operands() > 0 && + is_same_size(producer, consumer_fusion)) { + // producer is not a const value node. + consumer_fusion->internal_nodes.insert(producer); + } + + // fuse input node + + auto producer_fusion = fusion_groups_[producer]; + for (auto input_node : producer_fusion->input_nodes) { + if (consumer_fusion->input_nodes.count(input_node.first)) { + consumer_fusion->input_nodes[input_node.first] += input_node.second; + } else { + consumer_fusion->input_nodes.insert(input_node); + } + } + // update node group + fusion_groups_[producer] = consumer_fusion; + } + } + } + + void InitFusionRelation() { + // fusion relation. + // 1.kElementwise as producer + { + FusionRelation relation; + // producer -> consumer + relation.op_kind = {kElementWise, kBroadcast, kReduction, kInjective}; + // producer -> fusion + relation.fusion_op_kind = { + // horizontal or vertical relation(Elementwise + *Elementwise*). As + // has same output shape, can always fuse. + {kElementWise, always_fuse}, + // must be horizontal, as Elementwise + Broadcast is left to fusion + // merge pass. + {kBroadcast, + [](::pir::Operation* producer, const GroupPtr& consumer) -> bool { + // NOTE, producer and consumer NEVER be same size + if (is_same_size(producer, consumer)) { + return true; + } + + // NOTE, original code is below, if produer is not output node, + // result always be true + // !helper->output_nodes_set_.count(producer); + return true; + }}, + // horizontal or vertical relation, check with same output shape with + // horizontal relation or with last + // successive dimension less than 1024 for gpu. + {kReduction, horizontal_or_vertical_reduce_relation}, + // can be horizontal or can compute inline, check with same output + // shape or can compute inline. + {kInjective, horizontal_or_can_inline}, + // must be horizontal, check with same output shape. + {kOutFusible, is_same_shape}}; + fusion_relation_map_[kElementWise] = std::move(relation); + } + // 2.kBroadcast as producer + { + FusionRelation relation; + // producer -> consumer + relation.op_kind = {kElementWise, kReduction, kInjective}; + // producer -> fusion + relation.fusion_op_kind = { + // horizontal or vertical relation(Broadcast + *Elementwise*), check + // with same output shape. + {kElementWise, is_same_size}, + // must be horizontal, as Broadcast + Broadcast is not allowed. + {kBroadcast, is_same_size}, + // horizontal or vertical relation(Broadcast + Reduce). + {kReduction, horizontal_or_vertical_reduce_relation}, + // can be horizontal or can compute inline, check with same output + // shape or just one consumer. + {kInjective, horizontal_or_can_inline}, + // must be horizontal, check with same output shape. + {kOutFusible, is_same_shape}}; + fusion_relation_map_[kBroadcast] = std::move(relation); + } + // 3.kReduction as producer + { + FusionRelation relation; + // producer -> consumer + relation.op_kind = {kElementWise, kBroadcast}; + // producer -> fusion + relation.fusion_op_kind = { + // horizontal or vertical relation(Reduce + Elementwise*), check + // without last dimension in reduce. 
+ {kElementWise, is_same_size}, + // must be horizontal relation, check with same output shape and + // without last dimension in reduce. + {kBroadcast, reduce_fuse_broadcast}, + // must be horizontal relation and with same reduce attr. + {kReduction, reduce_fuse_reduce}, + // no_fuse + {kInjective, no_fuse}, + // can't fuse. + {kOutFusible, no_fuse}}; + fusion_relation_map_[kReduction] = std::move(relation); + } + // 4.kInjective + { + FusionRelation relation; + // producer -> consumer + relation.op_kind = {kElementWise, kInjective}; + // producer -> fusion + relation.fusion_op_kind = { + // can be horizontal or vertical(Injective + Elementwise), check with + // same output shape. + {kElementWise, is_same_size}, + // must be horizontal relation, check with same output shape. + {kBroadcast, horizontal_with_same_size}, + // left to fusion merge pass. + {kReduction, no_fuse}, + // must be horizontal relation, check with same output shape. + {kInjective, horizontal_or_can_inline}, + // can't fuse. + {kOutFusible, no_fuse}, + }; + fusion_relation_map_[kInjective] = std::move(relation); + } + // 5.kOutFusible + { + FusionRelation relation; + // producer -> consumer + relation.op_kind = {kElementWise, kBroadcast}; + // producer -> fusion + relation.fusion_op_kind = { + // horizontal or vertical relation, check has same shape. + {kElementWise, is_same_shape}, + // it must be horizontal relation, check has same shape. + {kBroadcast, is_same_shape}, + // can't fuse. + {kReduction, no_fuse}, + // must be horizontal relation, check has same shape. + {kInjective, is_same_shape}, + // can't fuse. + {kOutFusible, no_fuse}, + }; + fusion_relation_map_[kOutFusible] = std::move(relation); + } + } + + bool CanFuse(::pir::Operation* producer, const ::pir::Operation* consumer) { + auto& relation = fusion_relation_map_[GetOpKind(producer->name())]; + // first step: check producer can be fused into consumer + if (relation.op_kind.count(GetOpKind(consumer->name()))) { + auto& consumer_group = fusion_groups_[consumer]; + // second step: check producer can be fused into consumer group + VLOG(3) << "Call ConditionFunction, Producer Op Pattern : " + << GetOpKind(producer->name()) << " , Consumer Group Pattern : " + << consumer_group->op_pattern_kind; + return relation.fusion_op_kind[consumer_group->op_pattern_kind]( + producer, fusion_groups_[consumer]); + } + + return false; + } + std::vector<::pir::Operation*> nodes_; + std::unordered_map fusion_groups_; + std::unordered_set output_nodes_set_; + + std::vector> groups_; + + std::unordered_set local_ops_; + + struct FusionRelation { + // producer -> consumer + std::unordered_set op_kind = {}; + // producer -> fusion sonsumer + std::unordered_map fusion_op_kind = {}; + }; + std::unordered_map fusion_relation_map_; +}; + +GroupList OpFusionPassInternal(const ::pir::Program& program) { + VLOG(3) << "OpFusionPass...!"; + auto op_fusion_helper = OpFusionPassHelper(program); + auto res = op_fusion_helper(); + + for (size_t i = 0; i < res.size(); ++i) { + auto group = res[i]; + + for (size_t j = 0; j < group->nodes.size(); ++j) { + } + } + + // for (auto& group : graph->fusion_groups) { + // VLOG(3) << "Group Id : " << group->group_id; + // for (const auto& producer : group->producer_groups()) { + // VLOG(3) << " producer group -> " << producer->group_id; + // } + // for (const auto& consumer : group->consumer_groups()) { + // VLOG(3) << " consumer group -> " << consumer->group_id; + // } + // } + VLOG(3) << "OpFusionPass Finish...!"; + + return res; +} + +// void 
BuildNonFusedGroupsPassInternal(framework::Graph* graph) { +// auto op_fusion_helper = OpFusionPassHelper(graph); +// VLOG(3) << "Apply OpFusionPass to generate initial non-fusion groups"; +// graph->fusion_groups = op_fusion_helper(false); +// } + +} // namespace ir +} // namespace dialect +} // namespace cinn diff --git a/paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_pass.h b/paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_pass.h new file mode 100644 index 00000000000000..c784140c1cf363 --- /dev/null +++ b/paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_pass.h @@ -0,0 +1,34 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_util.h" +#include "paddle/pir/core/program.h" + +namespace cinn { +namespace dialect { +namespace ir { + +using GroupPtr = std::shared_ptr; +using GroupList = std::vector; + +GroupList OpFusionPassInternal(const ::pir::Program& program); + +GroupList GeneralFusionMergePassInternal(const ::pir::Program* graph, + const GroupList& group_list); + +} // namespace ir +} // namespace dialect +} // namespace cinn diff --git a/paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_util.h b/paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_util.h new file mode 100644 index 00000000000000..1ba6ba85b51588 --- /dev/null +++ b/paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_util.h @@ -0,0 +1,587 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/value.h" + +namespace cinn { +namespace dialect { +namespace ir { + +enum OpPatternKind { + // The relation between input tensor index and output tensor index is + // one-to-one correspondence. + // for example :code:`out[i, j] = input[i, j] + 1`. + // Note that the axis need to be in order. + kElementWise = 0, + // The relation between input tensor index and output tensor index is + // one-to-many correspondence. + // for example :code:`out[i, j, k] = input[i, j]`. + // Note that the axis need to be in order. 
+ kBroadcast = 1, + // Injective operator, we can always injectively map a output axis to a input + // axis. + // for example :code:`out[i, j] = input[j, i]`. + kInjective = 2, + // The relation between input tensor index and output tensor index is + // many-to-one correspondence. + // for example :code:`out[i, j] = sum(input[i, j, k]) along k`. + kReduction = 3, + // Complex operation, can still fuse one-to-one operations into its output. + kOutFusible = 4, + // Operation that cannot fuse anything. + kNonFusible = 8 +}; + +OpPatternKind GetOpKind(const std::string& op_name); + +template +std::vector GetVectorAttr(const ::pir::Operation* op, + const std::string& name) { + auto& attr_map = op->attributes(); + PADDLE_ENFORCE( + attr_map.count(name), + phi::errors::PreconditionNotMet( + "attr [%s] MUST in attribute map for [%s] op", name, op->name())); + auto& val = attr_map.at(name); + + PADDLE_ENFORCE(val.isa<::pir::ArrayAttribute>(), + phi::errors::PreconditionNotMet( + "axis Type MUST ArrayAttribute for [%s] op", op->name())); + auto array_list = val.dyn_cast<::pir::ArrayAttribute>().AsVector(); + std::vector vec_res; + if (array_list.size() > 0) { + PADDLE_ENFORCE_EQ(array_list[0].isa<::pir::Int64Attribute>(), + true, + phi::errors::Unimplemented( + "the 0th elementwise MUST be ir::Int64Attribute")); + for (size_t i = 0; i < array_list.size(); ++i) { + vec_res.push_back(array_list[i].dyn_cast<::pir::Int64Attribute>().data()); + } + } + return vec_res; +} + +struct Group { + Group() = default; + + // distance to last group. + int depth{0}; + int max_depth{0}; + int min_depth{INT_MAX}; + // group id, consisted of node's id. + std::string group_id{""}; + // global unique id. + std::string unique_id{"uniq"}; + // node in this group + std::vector<::pir::Operation*> nodes; + std::unordered_set<::pir::Operation*> nodes_set; + // input nodes of the group. + std::unordered_map<::pir::Operation*, int> input_nodes; + // output nodes of the group. + std::unordered_set<::pir::Operation*> output_nodes; + // op pattern kind. + OpPatternKind op_pattern_kind{kElementWise}; + // internal node, the output is used by multi-node. + // internal node can't use compute inline, should use buffer. + std::unordered_set<::pir::Operation*> internal_nodes; + // master node for schedule + std::unordered_set<::pir::Operation*> master_nodes; + + // fused sub-groups, used for fusion merge pass + std::vector> fused_sub_groups; + // if as sub-group, used for belong groups. + std::unordered_set> belong_groups; + + // for op lowering. 
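+  // (These are cleared and rebuilt by OpLowererImpl::Lower, see
+  //  op_lowering_impl.cc in this patch; presumably they carry the argument
+  //  names of the generated kernel.)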
+ std::vector input_names; + std::vector output_names; + + struct SharedGroupHasher { + size_t operator()(const std::shared_ptr& group) const noexcept { + return std::hash()(reinterpret_cast(group.get())); + } + }; + struct SharedGroupComparator { + bool operator()(const std::shared_ptr& first, + const std::shared_ptr& second) const noexcept { + return first.get() == second.get(); + } + }; + + std::vector<::pir::Operation*> CollectNodes() { + if (fused_sub_groups.size()) { + std::vector<::pir::Operation*> tmp_nodes; + for (auto& group : fused_sub_groups) { + tmp_nodes.insert( + tmp_nodes.end(), group->nodes.begin(), group->nodes.end()); + } + return tmp_nodes; + } else { + return nodes; + } + } + + void WalkNodes( + const std::function& VisitNode) const { + if (fused_sub_groups.size()) { + for (auto& group : fused_sub_groups) { + for (const auto& node : group->nodes) { + VisitNode(node); + } + } + } else { + for (const auto& node : nodes) { + VisitNode(node); + } + } + } + + std::unordered_set<::pir::Operation*> NodeSet() { + std::unordered_set<::pir::Operation*> node_set; + for (auto node : CollectNodes()) { + node_set.insert(node); + } + return node_set; + } + + // TODO(phlrain) : impliment GetInputNodeDatas GetOutputNodeDatas func + // std::unordered_set<::pir::Value> GetInputNodeDatas() { return {}; } + // std::unordered_set<::pir::Value> GetOutputNodeDatas() { return {}; } + + std::string GetFuncName() { return "fn_" + group_id + unique_id; } + + public: + const std::unordered_set, + SharedGroupHasher, + SharedGroupComparator>& + producer_groups() const { + return producer_groups_; + } + + const std::unordered_set, + SharedGroupHasher, + SharedGroupComparator>& + consumer_groups() const { + return consumer_groups_; + } + + std::unordered_set, + SharedGroupHasher, + SharedGroupComparator>* + mut_producer_groups() { + return &producer_groups_; + } + + std::unordered_set, + SharedGroupHasher, + SharedGroupComparator>* + mut_consumer_groups() { + return &consumer_groups_; + } + + OpPatternKind kind() const { return op_pattern_kind; } + + private: + // input groups + std::unordered_set, + SharedGroupHasher, + SharedGroupComparator> + producer_groups_; + // output grous + std::unordered_set, + SharedGroupHasher, + SharedGroupComparator> + consumer_groups_; +}; + +phi::DDim GetFirstInputShape(const ::pir::Operation* op); + +phi::DDim GetValueShape(const ::pir::Value value); + +bool WithoutLastDimInReduce(const std::vector& inshape, + const std::vector& axes); + +int GetSharedSize(::pir::Operation* node); + +inline bool always_fuse(::pir::Operation* producer, + const std::shared_ptr& consumer) { + return true; +} + +inline bool no_fuse(::pir::Operation* producer, + const std::shared_ptr& consumer) { + return false; +} + +inline bool is_same_shape(::pir::Operation* producer, + const std::shared_ptr& consumer) { + auto master_node = consumer->master_nodes.begin(); + return GetValueShape(producer->result(0)) == + GetValueShape((*master_node)->result(0)); +} + +inline bool is_same_size(::pir::Operation* producer, + const std::shared_ptr& consumer) { + auto master_node = consumer->master_nodes.begin(); + auto producer_shape = GetValueShape(producer->result(0)); + auto consumer_shape = GetValueShape((*master_node)->result(0)); + if (producer_shape == consumer_shape) { + return true; + } + auto psize = phi::product(producer_shape); + auto csize = phi::product(consumer_shape); + return psize == csize; +} + +inline bool without_last_dimension_in_reduce( + ::pir::Operation* producer, const 
std::shared_ptr<Group>& consumer) {
+  auto in_shape = phi::vectorize(GetFirstInputShape(producer));
+  auto reduce_axes = GetVectorAttr<int64_t>(producer, "axis");
+  return WithoutLastDimInReduce(in_shape, reduce_axes);
+}
+
+inline bool reduce_fuse_reduce(::pir::Operation* producer,
+                               const std::shared_ptr<Group>& consumer) {
+  ::pir::Operation* reducer = NULL;
+  for (auto* master : consumer->master_nodes) {
+    if (GetOpKind(master->name()) == kReduction) {
+      reducer = master;
+      break;
+    }
+  }
+  CHECK(reducer);  // a reduction master node must exist in the consumer group
+  // check reduce has same input shape and output shape
+  auto producer_input_shape =
+      phi::vectorize(GetValueShape(producer->operand_source(0)));
+  auto producer_output_shape =
+      phi::vectorize(GetValueShape(producer->result(0)));
+
+  auto reducer_input_shape =
+      phi::vectorize(GetValueShape(reducer->operand_source(0)));
+  auto reducer_output_shape =
+      phi::vectorize(GetValueShape(reducer->result(0)));
+
+  auto producer_reduce_dim = GetVectorAttr<int64_t>(producer, "axis");
+  auto reducer_reduce_dim = GetVectorAttr<int64_t>(reducer, "axis");
+
+  for (auto& dim : producer_reduce_dim) {
+    // if dim = -1, set as shape.size() - 1
+    if (dim < 0) {
+      dim += producer_input_shape.size();
+    }
+  }
+
+  for (auto& dim : reducer_reduce_dim) {
+    // if dim = -1, set as shape.size() - 1
+    if (dim < 0) {
+      dim += reducer_input_shape.size();
+    }
+  }
+
+  if (producer_output_shape == reducer_output_shape &&
+      producer_reduce_dim == reducer_reduce_dim) {
+    bool input_shape_same = producer_input_shape == reducer_input_shape;
+    bool without_last_dim =
+        WithoutLastDimInReduce(producer_input_shape, producer_reduce_dim) &&
+        WithoutLastDimInReduce(reducer_input_shape, reducer_reduce_dim);
+    // check shape is same
+    if (input_shape_same || without_last_dim) {
+      auto shared_size = GetSharedSize(producer);
+      for (auto* master : consumer->master_nodes) {
+        if (GetOpKind(master->name()) == kReduction) {
+          shared_size += GetSharedSize(master);
+        }
+      }
+
+      constexpr int MAX_AVAILABLE_SHARED = 32 * 1024;
+      if (shared_size > MAX_AVAILABLE_SHARED) {
+        return false;
+      }
+      return true;
+    }
+  }
+
+  return false;
+}
+
+inline bool is_horizontal_relation(::pir::Operation* producer,
+                                   const std::shared_ptr<Group>& consumer) {
+  auto check_dependency = [&](::pir::Operation* node) {
+    std::queue<::pir::Operation*> candidates;
+    std::unordered_set<::pir::Operation*> visited_set;
+    candidates.push(node);
+
+    while (!candidates.empty()) {
+      auto candidate = candidates.front();  // copy the pointer before pop()
+      candidates.pop();
+      // visit all producer node
+      for (size_t i = 0; i < candidate->num_operands(); ++i) {
+        auto tmp_node =
+            candidate->operand_source(i).dyn_cast<::pir::OpResult>().owner();
+        // check dependency.
+        if (producer == tmp_node) {
+          return true;
+        }
+        // check node is in region.
+        if (!consumer->nodes_set.count(tmp_node)) {
+          continue;
+        }
+        // record visited node.
+        if (!visited_set.count(tmp_node)) {
+          visited_set.insert(tmp_node);
+          candidates.push(tmp_node);
+        }
+      }
+    }
+
+    return false;
+  };
+
+  for (auto node : consumer->nodes_set) {
+    if (GetOpKind(node->name()) != consumer->op_pattern_kind) {
+      continue;
+    }
+    if (check_dependency(node)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+inline bool horizontal_or_vertical_reduce_relation(
+    ::pir::Operation* producer, const std::shared_ptr<Group>& consumer) {
+  // check is same shape with horizontal relation.
+  if (is_same_size(producer, consumer)) {
+    return true;
+  }
+
+  // reducer node in fusion op.
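+  // Worked example for the successive-dimension check further below
+  // (hypothetical shapes): reduce input [128, 16, 8] with axes = [1, 2].
+  // The reduce axes are contiguous and include the last one, so the
+  // successive reduce dimension is 8 * 16 = 128; on GPU targets, vertical
+  // fusion is allowed only when this value fits in one thread block
+  // (<= max_thread).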
+ ::pir::Operation* reducer = NULL; + for (auto* master : consumer->master_nodes) { + if (GetOpKind(master->name()) == kReduction) { + reducer = master; + break; + } + } + + // check producer has same shape with reducer node. + auto reduce_shape = phi::vectorize(GetFirstInputShape(reducer)); + auto reduce_axes = GetVectorAttr(reducer, "axis"); + + for (auto& axis : reduce_axes) { + // if axis = -1, set as shape.size() - 1 + if (axis < 0) { + axis += reduce_shape.size(); + } + } + + auto node_shape = phi::vectorize(GetFirstInputShape(producer)); + auto node_size = std::accumulate( + node_shape.begin(), node_shape.end(), 1, std::multiplies()); + auto reduce_size = std::accumulate( + reduce_shape.begin(), reduce_shape.end(), 1, std::multiplies()); + + // is not same size with reduce size. + if (node_size != reduce_size) { + return false; + } + // check without last axis in reduce. + if (WithoutLastDimInReduce(reduce_shape, reduce_axes)) { + return false; + } + + int succesive_reduce_dimension = reduce_shape.at(reduce_axes.back()); + for (int idx = reduce_axes.size() - 2; idx >= 0; --idx) { + if (reduce_axes[idx] == reduce_axes[idx + 1] - 1) { + succesive_reduce_dimension *= reduce_shape[reduce_axes[idx]]; + continue; + } + break; + } + + // helper->target_ == common::DefaultNVGPUTarget() + // succesive_reduce_dimension <= helper->target_.max_num_threads() + // TODO(phlrain): support is_gpu_target and max_thread + bool is_gpu_target = true; + int max_thread = 32 * 1024; + return is_gpu_target + ? (succesive_reduce_dimension <= max_thread ? true : false) + : true; +} + +inline bool horizontal_or_can_inline(::pir::Operation* producer, + const std::shared_ptr& consumer) { + // horizontal relation. + if (is_horizontal_relation(producer, consumer)) { + if (is_same_size(producer, consumer)) { + return true; + } else { + // if do broadcast, check can compute inline. + // return helper->output_nodes_set_.count(producer) == 0; + // TODO(phlrain): support output node set check + return false; + } + } + // vertical relation: 1.can compute inline + // if (helper->GetNodeData(producer)->outlinks().size() == 1 && + // helper->output_nodes_set_.count(producer) == 0) { + // return true; + // } + + // link to same node. 
+ // auto& out_links = helper->GetNodeData(producer)->outlinks(); + // for (auto link : out_links) { + // if ((*out_links.begin())->sink() != link->sink()) { + // return false; + // } + // } + + // return helper->output_nodes_set_.count(producer) == 0; + + return false; +} + +inline bool horizontal_with_same_size(::pir::Operation* producer, + const std::shared_ptr& consumer) { + return is_horizontal_relation(producer, consumer) && + is_same_size(producer, consumer); +} + +inline bool reduce_fuse_broadcast(::pir::Operation* producer, + const std::shared_ptr& consumer) { + if (is_horizontal_relation(producer, consumer)) { + if (is_same_size(producer, consumer)) { + return true; + } + return false; + } + + // if (helper->target_ != common::DefaultNVGPUTarget()) { + // return true; + // } + + auto rinput_shape = phi::vectorize(GetFirstInputShape(producer)); + auto reduce_axes = GetVectorAttr(producer, "axis"); + auto keep_dim = producer->attributes() + .at("keep_dim") + .dyn_cast<::pir::BoolAttribute>() + .data(); + for (auto& axis : reduce_axes) { + if (axis < 0) { + axis += rinput_shape.size(); + } + } + + int reduce_size = rinput_shape.back(); + for (auto idx = reduce_axes.size() - 1; idx >= 1; --idx) { + if (reduce_axes[idx] != reduce_axes[idx - 1] + 1) { + return false; + } + reduce_size *= rinput_shape[idx - 1]; + } + + // if (reduce_size > helper->target_.max_num_threads()) { + // return false; + // } + + auto routput_shape = + phi::vectorize(GetValueShape(producer->result(0))); + auto find_reducer = + [&](::pir::Operation* node, + ::pir::Operation* reducer, + const std::unordered_set<::pir::Operation*>& nodes_set) { + std::queue<::pir::Operation*> candidates; + candidates.push(node); + + while (!candidates.empty()) { + auto candidate = candidates.front(); + candidates.pop(); + + for (size_t i = 0; i < candidate->num_operands(); ++i) { + auto producer = + candidate->operand_source(i).dyn_cast().owner(); + if (producer == reducer) { + return true; + } + + if (nodes_set.count(producer)) { + candidates.push(producer); + } + } + } + + return false; + }; + + for (auto node : consumer->nodes_set) { + if (GetOpKind(node->name()) != kBroadcast) { + continue; + } + + if (!find_reducer(node, producer, consumer->nodes_set)) { + continue; + } + + auto broadcast_shape = GetVectorAttr(node, "out_shape"); + auto broadcast_axes = GetVectorAttr(node, "broadcast_axes"); + + for (auto& axis : broadcast_axes) { + if (axis < 0) { + axis += broadcast_shape.size(); + } + } + + if (rinput_shape != broadcast_shape) { + return false; + } + // if keep dim = true. + if (keep_dim) { + continue; + } else { + // if routput_shape = [1] + if (routput_shape.size() == 1 && routput_shape[0] == 1) { + continue; + } + // check [reduce_axes, axes] = {0, 1, 2, 3, 4, 5, 6, ...} + for (size_t idx = 0; idx < rinput_shape.size(); ++idx) { + // note: !x ^ y == (!x) ^ y == !(x ^ y) + if ((std::find(broadcast_axes.begin(), broadcast_axes.end(), idx) != + broadcast_axes.end()) ^ + std::find(reduce_axes.begin(), reduce_axes.end(), idx) == + reduce_axes.end()) { + return false; + } + } + continue; + } + return false; + } + return true; +} + +} // namespace ir +} // namespace dialect +} // namespace cinn diff --git a/paddle/cinn/hlir/dialect/operator/transforms/tensor_node.cc b/paddle/cinn/hlir/dialect/operator/transforms/tensor_node.cc new file mode 100644 index 00000000000000..0688b513f4497c --- /dev/null +++ b/paddle/cinn/hlir/dialect/operator/transforms/tensor_node.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2023 CINN Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/cinn/hlir/dialect/operator/transforms/tensor_node.h" + +#include "paddle/cinn/hlir/dialect/operator/transforms/op_node.h" + +namespace cinn { +namespace dialect { +namespace ir { + +OpNode TensorNode::producer() const { + return OpNode(node_data_.dyn_cast().owner()); +} + +OpNode TensorNode::ConsumerOpListView::Iterator::operator*() const { + return OpNode(iter_.owner()); +} + +} // namespace ir +} // namespace dialect +} // namespace cinn diff --git a/paddle/cinn/hlir/dialect/operator/transforms/tensor_node.h b/paddle/cinn/hlir/dialect/operator/transforms/tensor_node.h new file mode 100644 index 00000000000000..c48e476ec2a8d0 --- /dev/null +++ b/paddle/cinn/hlir/dialect/operator/transforms/tensor_node.h @@ -0,0 +1,102 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/value.h" + +namespace cinn { +namespace dialect { +namespace ir { + +class OpNode; + +class TensorNode final { + public: + TensorNode(::pir::Value value) : node_data_(value), consumers_(value) {} + + // Get the shape of tensor. + const phi::DDim& shape() const { + return node_data_.type() + .dyn_cast() + .dims(); + } + + // Input data has no producer. 
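+  // NOTE: the implementation below uses the consumer count as a proxy;
+  // strictly, a value has a producer iff it is a ::pir::OpResult (see
+  // producer() in tensor_node.cc, which dyn_casts to ::pir::OpResult and
+  // takes its owner()).
+  //
+  // Usage sketch (illustrative):
+  //   TensorNode t(value);
+  //   if (t.HasProducer()) {
+  //     OpNode def = t.producer();  // the op that computes `value`
+  //   }
+  //   for (OpNode user : t.consumers()) { /* each op reading `value` */ }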
+ bool HasProducer() const { return consumers_.size() != 0; } + + OpNode producer() const; + + class ConsumerOpListView { + public: + explicit ConsumerOpListView(pir::Value data) : node_data_(data) {} + + ConsumerOpListView(const ConsumerOpListView& other) = delete; + ConsumerOpListView(ConsumerOpListView&& other) = delete; + + ConsumerOpListView& operator=(const ConsumerOpListView& other) = delete; + + using UseIterator = ::pir::ValueUseIterator<::pir::OpOperand>; + class Iterator { + public: + explicit Iterator(UseIterator it) : iter_(it) {} + + Iterator& operator++() { + ++iter_; + return *this; + } + + Iterator operator++(int) { + Iterator tmp = *this; + ++iter_; + return tmp; + } + + bool operator==(const Iterator& other) const { + return iter_ == other.iter_; + } + + bool operator!=(const Iterator& other) const { return !(*this == other); } + + OpNode operator*() const; + + private: + UseIterator iter_; + }; + + size_t size() const { return node_data_.use_count(); } + + Iterator begin() const { return Iterator(node_data_.use_begin()); } + + Iterator end() const { return Iterator(node_data_.use_end()); } + + private: + ::pir::Value node_data_; + }; + + const ConsumerOpListView& consumers() const { return consumers_; } + + private: + ::pir::Value node_data_; + + const ConsumerOpListView consumers_; +}; + +} // namespace ir +} // namespace dialect +} // namespace cinn diff --git a/paddle/cinn/hlir/dialect/runtime/ir/CMakeLists.txt b/paddle/cinn/hlir/dialect/runtime/ir/CMakeLists.txt index 6023117faee098..c85931ad954cf3 100644 --- a/paddle/cinn/hlir/dialect/runtime/ir/CMakeLists.txt +++ b/paddle/cinn/hlir/dialect/runtime/ir/CMakeLists.txt @@ -1,4 +1,10 @@ if(NOT CINN_ONLY) - cinn_cc_library(cinn_runtime_dialect SRCS runtime_dialect.cc jit_kernel_op.cc - DEPS pir_core) + cinn_cc_library( + cinn_runtime_dialect + SRCS + runtime_dialect.cc + jit_kernel_op.cc + DEPS + cinn_op_dialect + pir_core) endif() diff --git a/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.cc b/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.cc index ed3d4a4045c595..2d8833a6acefc0 100644 --- a/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.cc +++ b/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.cc @@ -14,6 +14,8 @@ #include "paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h" +#include "paddle/cinn/hlir/dialect/operator/ir/op_attribute.h" +#include "paddle/cinn/hlir/framework/pir_compiler.h" #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/core/enforce.h" @@ -22,20 +24,22 @@ namespace dialect { const char* JitKernelOp::attributes_name[attributes_num] = {kAttrName}; -void JitKernelOp::Verify() { +void JitKernelOp::VerifySig() { VLOG(4) << "Verifying inputs, outputs and attributes for: JitKernelOp."; auto& attributes = this->attributes(); - IR_ENFORCE(attributes.count(kAttrName) > 0 && - attributes.at(kAttrName).isa<::pir::PointerAttribute>(), - "Type of attribute: instruction is not right."); + IR_ENFORCE( + attributes.count(kAttrName) > 0 && + attributes.at(kAttrName).isa(), + "Type of attribute: instruction is not right."); } -hlir::framework::Instruction* JitKernelOp::instruction() { - void* ptr = - attributes().at(kAttrName).dyn_cast<::pir::PointerAttribute>().data(); - return reinterpret_cast(ptr); +const hlir::framework::pir::CUDAJITInfo& JitKernelOp::cuda_jit_info() { + return attributes() + .at(kAttrName) + .dyn_cast() + .data(); } } // namespace dialect diff --git a/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h b/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h index 
f410e4d46c021a..0ac3d26c262b74 100644 --- a/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h +++ b/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h @@ -14,16 +14,11 @@ #pragma once +#include "paddle/cinn/hlir/framework/pir/utils.h" #include "paddle/pir/core/op_base.h" namespace cinn { -namespace hlir { -namespace framework { -class Instruction; -} // namespace framework -} // namespace hlir - namespace dialect { /* @@ -46,12 +41,12 @@ class JitKernelOp : public ::pir::Op { static const char* name() { return "cinn_runtime.jit_kernel"; } // TODO(Aurelius84): Think deeply what should contains static constexpr uint32_t attributes_num = 1; - static constexpr char* kAttrName = "instruction"; + static constexpr char* kAttrName = "jit_info"; static const char* attributes_name[attributes_num]; - hlir::framework::Instruction* instruction(); + const hlir::framework::pir::CUDAJITInfo& cuda_jit_info(); - void Verify(); + void VerifySig(); }; } // namespace dialect diff --git a/paddle/cinn/hlir/framework/CMakeLists.txt b/paddle/cinn/hlir/framework/CMakeLists.txt index 54da1e2b7dc904..c353eb3810ff89 100755 --- a/paddle/cinn/hlir/framework/CMakeLists.txt +++ b/paddle/cinn/hlir/framework/CMakeLists.txt @@ -1,4 +1,4 @@ -add_subdirectory(new_ir) +add_subdirectory(pir) core_gather_headers() gather_srcs( @@ -24,13 +24,10 @@ gather_srcs( visualize_helper.cc compile_error.cc) -# TODO(Aurelius84): new_ir_compiler depends on pd_op_dialect and could +# TODO(Aurelius84): pir_compiler depends on pd_op_dialect and could # not found under CINN_ONLY mode if(NOT CINN_ONLY) - cinn_cc_library(new_ir_compiler SRCS new_ir_compiler.cc DEPS cinnapi - pd_op_dialect) - cinn_cc_library(convert_to_dialect SRCS convert_to_dialect.cc DEPS cinnapi - cinn_op_dialect) + cinn_cc_library(pir_compiler SRCS pir_compiler.cc DEPS cinnapi pd_op_dialect) endif() if(WITH_CUDA) diff --git a/paddle/cinn/hlir/framework/convert_to_dialect.cc b/paddle/cinn/hlir/framework/convert_to_dialect.cc deleted file mode 100644 index f76b49a54555f9..00000000000000 --- a/paddle/cinn/hlir/framework/convert_to_dialect.cc +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/cinn/hlir/framework/convert_to_dialect.h" - -#include -#include - -#include "paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h" -#include "paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h" -#include "paddle/cinn/hlir/framework/program.h" -#include "paddle/pir/core/builtin_attribute.h" -#include "paddle/pir/core/program.h" - -namespace cinn { -namespace hlir { -namespace framework { - -std::unique_ptr<::pir::Program> ConvertToRuntimeDialect( - const hlir::framework::Program& program) { - ::pir::IrContext* ctx = ::pir::IrContext::Instance(); - ctx->GetOrRegisterDialect(); - auto ir_program = std::make_unique<::pir::Program>(ctx); - - std::string jit_op_name = dialect::JitKernelOp::name(); - ::pir::OpInfo op_info = ctx->GetRegisteredOpInfo(jit_op_name); - - auto& instrs = program.GetRunInstructions(); - for (auto& instr : instrs) { - std::unordered_map op_attrs{ - {dialect::JitKernelOp::kAttrName, - ::pir::PointerAttribute::get(ctx, instr.get())}, - }; - - ::pir::Operation* cinn_op = - ::pir::Operation::Create({}, op_attrs, {}, op_info); - ir_program->block()->push_back(cinn_op); - } - return std::move(ir_program); -} - -} // namespace framework -} // namespace hlir -} // namespace cinn diff --git a/paddle/cinn/hlir/framework/graph_compiler.cc b/paddle/cinn/hlir/framework/graph_compiler.cc index 2720388e7e22c0..acd4387efb7121 100644 --- a/paddle/cinn/hlir/framework/graph_compiler.cc +++ b/paddle/cinn/hlir/framework/graph_compiler.cc @@ -32,6 +32,8 @@ #include "paddle/cinn/utils/enum_string.h" #include "paddle/cinn/utils/profiler.h" +#include "paddle/cinn/ast_gen_ius/tensor_group.h" + namespace cinn { namespace hlir { namespace framework { @@ -372,14 +374,17 @@ std::vector GetFuncFromImpl( poly::StageMap stages = C.back(); std::string func_name_prefix = "fn_"; - auto funcs = lang::LowerVec(func_name_prefix + node_id, - stages, - all_arg_tensors, - {}, - {}, - nullptr, - target, - true); + + ast_gen_ius::TensorGroup tensor_group = + ast_gen_ius::ConvertStageMapToTensorGroup(stages); + auto funcs = lang::LowerToAstVec( + func_name_prefix + node_id, all_arg_tensors, &tensor_group, target); + + VLOG(4) << "Lower op: " << node_id << ", get " << funcs.size() + << " LoweredFunc:\n"; + for (auto fun : funcs) { + VLOG(4) << fun; + } std::vector schedule_inputs; for (int i = 0; i < C.size() - 1; ++i) { @@ -426,7 +431,8 @@ std::vector GetFuncFromImpl( optim::OptimizeExprGPU(&(funcs_after_schedule[i]->body)); #endif auto temp_buffers = lang::GetTempBuffers( - all_arg_tensors, stages, funcs_after_schedule[i]->body); + all_arg_tensors, tensor_group, funcs_after_schedule[i]->body); + funcs_after_schedule[i]->temp_bufs = temp_buffers; funcs_after_schedule[i] = ir::_LoweredFunc_::Make(funcs_after_schedule[i]->name, diff --git a/paddle/cinn/hlir/framework/new_ir/CMakeLists.txt b/paddle/cinn/hlir/framework/new_ir/CMakeLists.txt deleted file mode 100755 index e08baf06dbd13f..00000000000000 --- a/paddle/cinn/hlir/framework/new_ir/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -if(NOT CINN_ONLY) - core_gather_headers() - gather_srcs(cinnapi_src SRCS utils.cc op_lowering_impl.cc) -endif() diff --git a/paddle/cinn/hlir/framework/new_ir/utils.cc b/paddle/cinn/hlir/framework/new_ir/utils.cc deleted file mode 100644 index 3f938981390fbc..00000000000000 --- a/paddle/cinn/hlir/framework/new_ir/utils.cc +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/cinn/hlir/framework/new_ir/utils.h" - -namespace cinn { -namespace hlir { -namespace framework { -namespace newir { - -const std::unordered_map CompatibleInfo::OP_NAMES = { - {"pd_op.full", "fill_constant"}}; - -std::string CompatibleInfo::OpName(const ::pir::Operation& op) { - std::string name = op.name(); - if (OP_NAMES.count(name)) { - return OP_NAMES.at(name); - } - auto pos = name.find("."); - if (pos == std::string::npos) { - return name; - } - auto cinn_op_name = name.substr(pos + 1); - VLOG(4) << "GetOpName: " << name << " -> " << cinn_op_name; - return cinn_op_name; -} - -std::string CompatibleInfo::ValueName(const ::pir::Value& value) { - return CompatibleInfo::kNamePrefix + - std::to_string(std::hash<::pir::Value>()(value)); -} - -std::string CompatibleInfo::OpFuncName(const ::pir::Operation& op) { - std::string op_name = OpName(op); - std::string func_name = - cinn::common::Context::Global().NewName("fn_" + op_name); - return func_name; -} - -std::string CompatibleInfo::GroupOpsName( - const std::vector<::pir::Operation*>& ops) { - std::string name = "fn"; - for (auto* op : ops) { - std::string op_name = OpName(*op); - name += "_" + cinn::common::Context::Global().NewName(op_name); - } - return name; -} - -std::vector CompatibleInfo::InputNames(const ::pir::Operation& op, - bool allow_duplicate) { - std::vector names; - std::unordered_set repeat; - for (int i = 0; i < op.num_operands(); ++i) { - auto value = op.operand_source(i); - std::string name = CompatibleInfo::ValueName(value); - if (!allow_duplicate && repeat.count(name)) { - continue; - } - repeat.insert(name); - names.push_back(name); - } - return names; -} - -std::vector CompatibleInfo::OutputNames(::pir::Operation& op) { - std::vector names; - for (int i = 0; i < op.num_results(); ++i) { - auto value = op.result(i); - std::string name = CompatibleInfo::ValueName(value); - names.push_back(std::move(name)); - } - return names; -} - -} // namespace newir -} // namespace framework -} // namespace hlir -} // namespace cinn diff --git a/paddle/cinn/hlir/framework/op_lowering.h b/paddle/cinn/hlir/framework/op_lowering.h index b0e0ad7d97b119..8ae0d5869c1a4c 100644 --- a/paddle/cinn/hlir/framework/op_lowering.h +++ b/paddle/cinn/hlir/framework/op_lowering.h @@ -22,7 +22,7 @@ #include "paddle/cinn/hlir/framework/op_lowering_impl_base.h" #include "paddle/cinn/lang/packed_func.h" #ifndef CINN_WITH_ONLY -#include "paddle/cinn/hlir/framework/new_ir/op_lowering_impl.h" +#include "paddle/cinn/hlir/framework/pir/op_lowering_impl.h" #endif namespace cinn { @@ -40,8 +40,10 @@ class OpLowerer { std::vector Lower(const T& group, bool apply_op_schedule = true, - bool apply_group_schedule = true) { - return impl_->Lower(group, apply_op_schedule, apply_group_schedule); + bool apply_group_schedule = true, + bool apply_pass = true) { + return impl_->Lower( + group, apply_op_schedule, apply_group_schedule, apply_pass); } private: @@ -63,13 +65,13 @@ inline OpLowerer 
CreateOpLowerer( } #ifndef CINN_WITH_ONLY -template +template OpLowerer CreateOpLowerer(const Target&); template <> -inline OpLowerer CreateOpLowerer(const Target& target) { - auto* impl_base = new newir::OpLowererImpl(target); - return OpLowerer(impl_base); +inline OpLowerer CreateOpLowerer(const Target& target) { + auto* impl_base = new pir::OpLowererImpl(target); + return OpLowerer(impl_base); } #endif diff --git a/paddle/cinn/hlir/framework/op_lowering_impl.cc b/paddle/cinn/hlir/framework/op_lowering_impl.cc index 156ad756a50afe..b380ee8aaba2ee 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/op_lowering_impl.cc @@ -14,6 +14,7 @@ #include "paddle/cinn/hlir/framework/op_lowering_impl.h" +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/hlir/framework/compile_error.h" #include "paddle/cinn/hlir/framework/graph_compiler_util.h" #include "paddle/cinn/hlir/framework/op_lowering_util.h" @@ -48,7 +49,8 @@ OpLowererImpl::OpLowererImpl( std::vector OpLowererImpl::Lower(const GroupPtr& group, bool apply_op_schedule, - bool apply_group_schedule) { + bool apply_group_schedule, + bool apply_pass) { VLOG(3) << "Lowering Group : " << group->group_id << " , Op Pattern : " << group->op_pattern_kind; group->input_names.clear(); @@ -60,11 +62,13 @@ std::vector OpLowererImpl::Lower(const GroupPtr& group, return LowerGroup(group, apply_op_schedule, apply_group_schedule, + apply_pass, &OpLowererImpl::ElementwiseScheduleDetermineFunction); case framework::kReduction: return LowerGroup(group, apply_op_schedule, apply_group_schedule, + apply_pass, &OpLowererImpl::ReduceScheduleDetermineFunction); case framework::kOutFusible: LOG(FATAL) << "Group Pattern Kind kOutFusible Is Not Implemented!"; @@ -72,6 +76,7 @@ std::vector OpLowererImpl::Lower(const GroupPtr& group, return LowerGroup(group, apply_op_schedule, apply_group_schedule, + apply_pass, &OpLowererImpl::NonFusibleScheduleDetermineFunction); default: LOG(FATAL) << "Group Pattern Kind Is Unknown!"; @@ -95,6 +100,7 @@ std::vector OpLowererImpl::LowerGroup( const GroupPtr& group, bool apply_op_schedule, bool apply_group_schedule, + bool apply_pass, ScheduleDetermineFunction schedule_determine_func) { // 1.Do compute, lower and schedule for each op. VLOG(3) << "group->fused_sub_groups.size() is : " @@ -126,8 +132,12 @@ std::vector OpLowererImpl::LowerGroup( // 3.Do post-processing, // including preparing function args and temporary variables, // applying low-level optimization passes, etc. 
- return PostProcess( - group, tensor_map, do_op_schedule, &ir_sch, &group_func_arg_tensors); + return PostProcess(group, + tensor_map, + do_op_schedule, + apply_pass, + &ir_sch, + &group_func_arg_tensors); } std::vector OpLowererImpl::LowerCustomCall( @@ -221,6 +231,7 @@ std::vector OpLowererImpl::PostProcess( const GroupPtr& group, const std::unordered_map& tensor_map, bool done_op_schedule, + bool apply_pass, ir::IRSchedule* ir_sch, std::vector* group_func_arg_tensors) { // 1.Prepare function args @@ -277,9 +288,10 @@ std::vector OpLowererImpl::PostProcess( auto func_body = ir_sch->GetModule().GetExprs().at(0); #ifdef CINN_WITH_CUDA - optim::OptimizeExprGPU(&(func_body)); + if (apply_pass) { + optim::OptimizeExprGPU(&(func_body)); + } #endif - // 2.Prepare temp buffers poly::StageMap stages; auto temp_buffers = @@ -293,7 +305,9 @@ std::vector OpLowererImpl::PostProcess( func->PrepareBufferCastExprs(); } // 4.Apply low level pass - func = optim::Optimize(Expr(func), target_, false).as_lowered_func_ref(); + if (apply_pass) { + func = optim::Optimize(Expr(func), target_, false).as_lowered_func_ref(); + } return {func}; } @@ -391,16 +405,16 @@ std::vector OpLowererImpl::DoOpLower( } // 2.Do lower - std::vector funcs = lang::LowerVec("fn_" + node->id(), - tmp_stages, - *op_func_arg_tensors, - {}, - {}, - nullptr, - this->target_, - true); + ast_gen_ius::TensorGroup tensor_group = + ast_gen_ius::ConvertStageMapToTensorGroup(tmp_stages); + std::vector funcs = lang::LowerToAstVec( + "fn_" + node->id(), *op_func_arg_tensors, {&tensor_group}, this->target_); + VLOG(4) << "Lower op: " << node->op()->name << ", get " << funcs.size() << " LoweredFunc:\n"; + for (auto fun : funcs) { + VLOG(4) << fun; + } op_func_arg_tensors->clear(); for (int idx = 0; idx < pack.size() - 1; ++idx) { diff --git a/paddle/cinn/hlir/framework/op_lowering_impl.h b/paddle/cinn/hlir/framework/op_lowering_impl.h index a4c79a3268004c..99be348d5be327 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/op_lowering_impl.h @@ -56,7 +56,8 @@ class OpLowererImpl : public OpLowererImplBase { */ std::vector Lower(const GroupPtr& group, bool apply_op_schedule = true, - bool apply_group_schedule = true); + bool apply_group_schedule = true, + bool apply_pass = true); private: /** @@ -72,6 +73,7 @@ class OpLowererImpl : public OpLowererImplBase { const GroupPtr& group, bool apply_op_schedule, bool apply_group_schedule, + bool apply_pass, ScheduleDetermineFunction schedule_determine_func); /** @@ -96,6 +98,7 @@ class OpLowererImpl : public OpLowererImplBase { const GroupPtr& group, const std::unordered_map& tensor_map, bool done_op_schedule, + bool apply_pass, ir::IRSchedule* ir_sch, std::vector* group_func_arg_tensors); diff --git a/paddle/cinn/hlir/framework/op_lowering_impl_base.h b/paddle/cinn/hlir/framework/op_lowering_impl_base.h index 9f2c0e7a35dada..6479419852a2b7 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl_base.h +++ b/paddle/cinn/hlir/framework/op_lowering_impl_base.h @@ -32,10 +32,10 @@ class OpLowererImplBase { OpLowererImplBase() = default; ~OpLowererImplBase() = default; - virtual std::vector Lower( - const T& group, - bool apply_op_schedule = true, - bool apply_group_schedule = true) = 0; + virtual std::vector Lower(const T& group, + bool apply_op_schedule = true, + bool apply_group_schedule = true, + bool apply_pass = true) = 0; }; } // namespace framework diff --git a/paddle/cinn/hlir/framework/pir/CMakeLists.txt b/paddle/cinn/hlir/framework/pir/CMakeLists.txt new file 
mode 100755 index 00000000000000..775bed0d835493 --- /dev/null +++ b/paddle/cinn/hlir/framework/pir/CMakeLists.txt @@ -0,0 +1,4 @@ +if(NOT CINN_ONLY) + core_gather_headers() + gather_srcs(cinnapi_src SRCS utils.cc op_lowering_impl.cc op_mapper.cc) +endif() diff --git a/paddle/cinn/hlir/framework/new_ir/group.h b/paddle/cinn/hlir/framework/pir/group.h similarity index 94% rename from paddle/cinn/hlir/framework/new_ir/group.h rename to paddle/cinn/hlir/framework/pir/group.h index 1a67a02e58ca9a..cb6c23c4d1e59e 100644 --- a/paddle/cinn/hlir/framework/new_ir/group.h +++ b/paddle/cinn/hlir/framework/pir/group.h @@ -16,14 +16,14 @@ #include #include -#include "paddle/cinn/hlir/framework/new_ir/utils.h" #include "paddle/cinn/hlir/framework/op.h" +#include "paddle/cinn/hlir/framework/pir/utils.h" #include "paddle/pir/core/operation.h" namespace cinn { namespace hlir { namespace framework { -namespace newir { +namespace pir { using framework::OpPatternKind; // TODO(Aurelius84): Need to be replaced with CinnGroupOp @@ -53,7 +53,7 @@ struct Group { } }; -} // namespace newir +} // namespace pir } // namespace framework } // namespace hlir } // namespace cinn diff --git a/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc similarity index 93% rename from paddle/cinn/hlir/framework/new_ir/op_lowering_impl.cc rename to paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index 56282996b9e26a..19b613aac4a244 100644 --- a/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -12,17 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/cinn/hlir/framework/new_ir/op_lowering_impl.h" +#include "paddle/cinn/hlir/framework/pir/op_lowering_impl.h" #include + +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/hlir/framework/op_lowering_util.h" #include "paddle/cinn/hlir/op/external_api_registry.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/optim/transform_gpu_forloop.h" -#include "paddle/cinn/hlir/framework/new_ir/utils.h" +#include "paddle/cinn/hlir/framework/pir/utils.h" #include "paddle/cinn/lang/placeholder.h" -#include "paddle/cinn/utils/attribute_util.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/phi/core/ddim.h" @@ -31,7 +32,7 @@ PD_DECLARE_bool(cinn_use_cuda_vectorize); namespace cinn { namespace hlir { namespace framework { -namespace newir { +namespace pir { using cinn::hlir::op::ExternalApiRegistry; using common::Type; @@ -45,7 +46,7 @@ ir::Tensor GetTensor(const ::pir::Value& value) { auto dtype = type_info.dtype(); std::string input_id = CompatibleInfo::ValueName(value); return lang::CreatePlaceHolder( - in_shape, utils::ConvertIRType(dtype), input_id); + in_shape, CompatibleInfo::ConvertIRType(dtype), input_id); } std::vector CollectInputTensor( @@ -53,9 +54,8 @@ std::vector CollectInputTensor( std::vector* func_args, std::unordered_map<::pir::Value, ir::Tensor>* tensor_map) { std::vector tensors; - for (auto in_value : op->operands_source()) { + for (auto in_value : CompatibleInfo::RealOperandSources(*op)) { VLOG(4) << "input tensor name: " << CompatibleInfo::ValueName(in_value); - // NOTE(Aurelius84): Need always to create placeholder for input tensor. ir::Tensor tensor = details::GetTensor(in_value); if (!tensor_map->count(in_value)) { // record tensor. 
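The hunk above collects inputs via CompatibleInfo::RealOperandSources(*op) instead of op->operands_source(). As a hedged sketch of the intent (the free function below is illustrative, not part of the patch): reduce ops such as pd_op.sum carry their reduction axes as an extra full_int_array operand, which CINN must treat as an attribute rather than as a tensor input, so only the registered operand indices survive.

// Illustrative sketch of the operand filtering done by
// CompatibleInfo::RealOperandSources (see pir/utils.cc later in this patch).
// For pd_op.sum(x, axes), op.num_operands() == 2, but
// REGISTER_OPERAND_RULE(SumOp, 0) keeps only operand 0; the axes operand is
// folded into attrs["dim"] by AppendAttrForReduceOp in op_mapper.cc.
std::vector<::pir::Value> RealTensorInputs(const ::pir::Operation& op) {
  if (OpMapper::Instance().has(op, MapperType::OPERAND)) {
    return OpMapper::Instance().RealOprandSources(op);
  }
  return op.operands_source();  // default: every operand is a real input
}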
@@ -80,7 +80,7 @@ void CollectOutputInfo(::pir::Operation* op,
     auto type_info =
         out_value.type().dyn_cast<paddle::dialect::DenseTensorType>();
-    out_types->push_back(utils::ConvertIRType(type_info.dtype()));
+    out_types->push_back(CompatibleInfo::ConvertIRType(type_info.dtype()));
     auto out_shape = phi::vectorize(type_info.dims());
     out_shapes->push_back(std::move(out_shape));
   }
@@ -89,7 +89,7 @@ void CollectOutputInfo(::pir::Operation* op,
 NodeAttr CollectAttrs(const ::pir::Operation& op) {
   NodeAttr node_attrs;
   VLOG(4) << "op.attributes():" << op.attributes().size();
-  auto attrs = utils::ConvertAttributes(op.attributes());
+  auto attrs = CompatibleInfo::ConvertAttributes(op);
   node_attrs.node_name = CompatibleInfo::OpName(op);
   node_attrs.attr_store = std::move(attrs);
@@ -102,7 +102,8 @@ OpLowererImpl::OpLowererImpl(const Target& target) : target_(target) {}
 
 std::vector<ir::LoweredFunc> OpLowererImpl::Lower(const GroupPtr& group,
                                                   bool apply_op_schedule,
-                                                  bool apply_group_schedule) {
+                                                  bool apply_group_schedule,
+                                                  bool apply_pass) {
   VLOG(3) << "Lowering Group : " << group->group_id
           << " , Op Pattern : " << group->op_pattern_kind;
   group->input_names.clear();
@@ -334,7 +335,6 @@ std::vector<ir::LoweredFunc> OpLowererImpl::LowerOps(
     const hlir::framework::Operator* cinn_op = Operator::Get(cinn_op_name);
     auto op_impl = OpStrategy::SelectImpl(strategy[cinn_op](
         node_attrs, op_func_arg_tensors, out_types, out_shapes, this->target_));
-
     // 2.Perform the lower process of Op
     std::vector<ir::LoweredFunc> funcs =
         DoOpLower(op_impl, op, tensor_map, &op_func_arg_tensors);
@@ -381,14 +381,14 @@ std::vector<ir::LoweredFunc> OpLowererImpl::DoOpLower(
   for (int idx = 0; idx < pack.size() - 1; ++idx) {
     Expr expr = pack[idx];
     // Insert the output tensor defined by Compute into the tensor_map
-    if (pack.size() - 1 > op_results.size()) {
+    if (pack.size() - 1 > op_results.size() && post == "") {
       // Some op may output multiple temp tensors in their Compute
       // definition, but only one output in the graph, and we use id +
       // "_0"/"_1" as key.
      // FIXME(Aurelius84): It seems that the implementation is relate with
      // string name.
- // (*tensor_map)[op_results[0] + post] = expr.as_tensor_ref(); - // post = "_" + std::to_string(idx); + (*tensor_map)[op_results[idx]] = expr.as_tensor_ref(); + post = "_" + std::to_string(idx); } else { // If the number of output tensors defined by Compute is less equal than // the output node_data on the graph, then there is a one-to-one @@ -408,16 +408,17 @@ std::vector OpLowererImpl::DoOpLower( // 2.Do lower std::string lower_fn_name = CompatibleInfo::OpFuncName(*op); - std::vector funcs = lang::LowerVec(lower_fn_name, - tmp_stages, - *op_func_arg_tensors, - {}, - {}, - nullptr, - this->target_, - true); + ast_gen_ius::TensorGroup tensor_group = + ast_gen_ius::ConvertStageMapToTensorGroup(tmp_stages); + std::vector funcs = lang::LowerToAstVec( + lower_fn_name, *op_func_arg_tensors, {&tensor_group}, this->target_); VLOG(4) << "Lower op: " << lower_fn_name << ", get " << funcs.size() << " LoweredFunc:\n"; + if (VLOG_IS_ON(4)) { + for (auto fun : funcs) { + VLOG(4) << fun; + } + } op_func_arg_tensors->clear(); for (int idx = 0; idx < pack.size() - 1; ++idx) { @@ -451,7 +452,7 @@ ir::Expr OpLowererImpl::DoOpSchedule( return expr_pack[0].operator ir::Expr(); } -} // namespace newir +} // namespace pir } // namespace framework } // namespace hlir } // namespace cinn diff --git a/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.h b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h similarity index 97% rename from paddle/cinn/hlir/framework/new_ir/op_lowering_impl.h rename to paddle/cinn/hlir/framework/pir/op_lowering_impl.h index 705c1f6f8c12d7..ead590526dd407 100644 --- a/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h @@ -19,9 +19,9 @@ #include "paddle/cinn/common/target.h" #include "paddle/cinn/hlir/framework/instruction.h" -#include "paddle/cinn/hlir/framework/new_ir/group.h" #include "paddle/cinn/hlir/framework/op_lowering_impl_base.h" #include "paddle/cinn/hlir/framework/op_strategy.h" +#include "paddle/cinn/hlir/framework/pir/group.h" #include "paddle/cinn/ir/lowered_func.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/ir/schedule/ir_schedule_util.h" @@ -36,7 +36,7 @@ namespace cinn { namespace hlir { namespace framework { -namespace newir { +namespace pir { using GroupPtr = std::shared_ptr; @@ -58,7 +58,8 @@ class OpLowererImpl : public OpLowererImplBase { */ std::vector Lower(const GroupPtr& group, bool apply_op_schedule = true, - bool apply_group_schedule = true); + bool apply_group_schedule = true, + bool apply_pass = true); private: /** @@ -156,7 +157,7 @@ class OpLowererImpl : public OpLowererImplBase { Target target_; }; -} // namespace newir +} // namespace pir } // namespace framework } // namespace hlir } // namespace cinn diff --git a/paddle/cinn/hlir/framework/pir/op_mapper.cc b/paddle/cinn/hlir/framework/pir/op_mapper.cc new file mode 100644 index 00000000000000..3de2c3fbec0900 --- /dev/null +++ b/paddle/cinn/hlir/framework/pir/op_mapper.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/cinn/hlir/framework/pir/op_mapper.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" + +namespace cinn { +namespace hlir { +namespace framework { +namespace pir { + +namespace { + +void AppendAttrForReduceOp(const ::pir::Operation& op, + utils::AttributeMap& attrs) { // NOLINT + auto* source_op = + op.operand_source(/*dim_idx=*/1).dyn_cast<::pir::OpResult>().owner(); + CHECK(source_op->isa()); + const std::vector& dim_val = + source_op->attributes() + .at("value") + .dyn_cast() + .data() + .GetData(); + std::vector dim(dim_val.begin(), dim_val.end()); + attrs["dim"] = dim; +} + +} // namespace + +#define REGISTER_OPERAND_RULE(OP, args...) \ + operand_funcs_[paddle::dialect::OP::name()] = []() -> std::vector { \ + return {args}; \ + }; + +#define REGISTER_ATTR_RULE(OP, func) \ + attr_funcs_[paddle::dialect::OP::name()] = func; + +void OpMapper::RegisterMapRules() { + // max(x, dim) -> reduce_max(x) + REGISTER_OPERAND_RULE(MaxOp, 0); + REGISTER_OPERAND_RULE(SumOp, 0); + REGISTER_OPERAND_RULE(MinOp, 0); + REGISTER_OPERAND_RULE(ProdOp, 0); + REGISTER_ATTR_RULE(MaxOp, AppendAttrForReduceOp); + REGISTER_ATTR_RULE(SumOp, AppendAttrForReduceOp); + REGISTER_ATTR_RULE(MinOp, AppendAttrForReduceOp); + REGISTER_ATTR_RULE(ProdOp, AppendAttrForReduceOp); +} + +} // namespace pir +} // namespace framework +} // namespace hlir +} // namespace cinn diff --git a/paddle/cinn/hlir/framework/pir/op_mapper.h b/paddle/cinn/hlir/framework/pir/op_mapper.h new file mode 100644 index 00000000000000..0a0527cf9abf18 --- /dev/null +++ b/paddle/cinn/hlir/framework/pir/op_mapper.h @@ -0,0 +1,82 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include "paddle/cinn/utils/type_defs.h" +#include "paddle/pir/core/operation.h" + +namespace cinn { +namespace hlir { +namespace framework { +namespace pir { + +enum MapperType { + OPERAND, + ATTRIBUTE, +}; + +class OpMapper { + using OprandIndexsFunction = std::function()>; + using AppendAttrFunction = + std::function; // NOLINT + + public: + static OpMapper& Instance() { + static OpMapper instance; + return instance; + } + + bool has(const ::pir::Operation& op, MapperType type) const { + if (type == MapperType::OPERAND) { + return operand_funcs_.find(op.name()) != operand_funcs_.end(); + } else if (type == MapperType::ATTRIBUTE) { + return attr_funcs_.find(op.name()) != attr_funcs_.end(); + } + return false; + } + + std::vector<::pir::Value> RealOprandSources( + const ::pir::Operation& op) const { + CHECK(has(op, MapperType::OPERAND)) + << "Not register OprandIndexsFunction for " << op.name(); + std::vector<::pir::Value> inputs; + for (auto idx : operand_funcs_.at(op.name())()) { + inputs.push_back(op.operand_source(idx)); + } + return inputs; + } + + void AppendVariantAttrs(const ::pir::Operation& op, + utils::AttributeMap& attrs) const { // NOLINT + CHECK(has(op, MapperType::ATTRIBUTE)) + << "Not register AppendAttrFunction for " << op.name(); + attr_funcs_.at(op.name())(op, attrs); + } + + private: + OpMapper() { RegisterMapRules(); } + void RegisterMapRules(); + + std::unordered_map operand_funcs_; + std::unordered_map attr_funcs_; +}; + +} // namespace pir +} // namespace framework +} // namespace hlir +} // namespace cinn diff --git a/paddle/cinn/utils/attribute_util.h b/paddle/cinn/hlir/framework/pir/utils.cc similarity index 51% rename from paddle/cinn/utils/attribute_util.h rename to paddle/cinn/hlir/framework/pir/utils.cc index 474bc09e2c64c2..2f7b05c72fb302 100644 --- a/paddle/cinn/utils/attribute_util.h +++ b/paddle/cinn/hlir/framework/pir/utils.cc @@ -12,24 +12,102 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#pragma once +#include "paddle/cinn/hlir/framework/pir/utils.h" + #include #include -#include "paddle/cinn/common/type.h" -#include "paddle/cinn/utils/type_defs.h" +#include "paddle/cinn/hlir/framework/pir/op_mapper.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/phi/common/data_type.h" #include "paddle/pir/core/builtin_op.h" #include "paddle/pir/core/builtin_type.h" namespace cinn { -namespace utils { +namespace hlir { +namespace framework { +namespace pir { + +const std::unordered_map CompatibleInfo::OP_NAMES = { + {"pd_op.full", "fill_constant"}, + {"pd_op.sum", "reduce_sum"}, + {"pd_op.max", "reduce_max"}, + {"pd_op.add", "elementwise_add"}}; + +std::string CompatibleInfo::OpName(const ::pir::Operation& op) { + std::string name = op.name(); + if (OP_NAMES.count(name)) { + return OP_NAMES.at(name); + } + auto pos = name.find("."); + if (pos == std::string::npos) { + return name; + } + auto cinn_op_name = name.substr(pos + 1); + VLOG(4) << "GetOpName: " << name << " -> " << cinn_op_name; + return cinn_op_name; +} + +std::string CompatibleInfo::ValueName(const ::pir::Value& value) { + return CompatibleInfo::kNamePrefix + + std::to_string(std::hash<::pir::Value>()(value)); +} + +std::string CompatibleInfo::OpFuncName(const ::pir::Operation& op) { + std::string op_name = OpName(op); + std::string func_name = + cinn::common::Context::Global().NewName("fn_" + op_name); + return func_name; +} + +std::string CompatibleInfo::GroupOpsName( + const std::vector<::pir::Operation*>& ops) { + std::string name = "fn"; + for (auto* op : ops) { + std::string op_name = OpName(*op); + name += "_" + cinn::common::Context::Global().NewName(op_name); + } + return name; +} -using NewIR_AttributeMap = std::unordered_map; +std::vector CompatibleInfo::InputNames(const ::pir::Operation& op, + bool allow_duplicate) { + std::vector names; + std::unordered_set repeat; + for (int i = 0; i < op.num_operands(); ++i) { + auto value = op.operand_source(i); + std::string name = CompatibleInfo::ValueName(value); + if (!allow_duplicate && repeat.count(name)) { + continue; + } + repeat.insert(name); + names.push_back(name); + } + return names; +} + +std::vector CompatibleInfo::OutputNames(::pir::Operation& op) { + std::vector names; + for (int i = 0; i < op.num_results(); ++i) { + auto value = op.result(i); + std::string name = CompatibleInfo::ValueName(value); + names.push_back(std::move(name)); + } + return names; +} -Attribute ConvertAttribute(const ::pir::Attribute& src_attr) { - Attribute dst_attr; +std::vector<::pir::Value> CompatibleInfo::RealOperandSources( + const ::pir::Operation& op) { + if (OpMapper::Instance().has(op, MapperType::OPERAND)) { + return OpMapper::Instance().RealOprandSources(op); + } else { + return op.operands_source(); + } +} + +utils::Attribute CompatibleInfo::ConvertAttribute( + const ::pir::Attribute& src_attr) { + utils::Attribute dst_attr; if (src_attr.isa<::pir::BoolAttribute>()) { dst_attr = src_attr.dyn_cast<::pir::BoolAttribute>().data(); } else if (src_attr.isa<::pir::FloatAttribute>()) { @@ -58,8 +136,10 @@ Attribute ConvertAttribute(const ::pir::Attribute& src_attr) { return dst_attr; } -AttributeMap ConvertAttributes(const NewIR_AttributeMap& src_attrs) { - AttributeMap dst_attrs; +utils::AttributeMap CompatibleInfo::ConvertAttributes( + const ::pir::Operation& op) { + auto& src_attrs = op.attributes(); + utils::AttributeMap dst_attrs; for (auto& item : src_attrs) { VLOG(4) << "deal with " << item.first; if (item.first == 
::pir::kStopGradientAttrName) { @@ -73,6 +153,10 @@ AttributeMap ConvertAttributes(const NewIR_AttributeMap& src_attrs) { dst_attrs[item.first] = std::move(ConvertAttribute(item.second)); } } + + if (OpMapper::Instance().has(op, MapperType::ATTRIBUTE)) { + OpMapper::Instance().AppendVariantAttrs(op, dst_attrs); + } VLOG(4) << "dst_attrs.size(): " << dst_attrs.size(); return dst_attrs; } @@ -80,7 +164,7 @@ AttributeMap ConvertAttributes(const NewIR_AttributeMap& src_attrs) { #define CASE_TYPE(src, dst) \ else if (type.isa<::pir::src>()) return common::dst(); -common::Type ConvertIRType(::pir::Type type) { +common::Type CompatibleInfo::ConvertIRType(::pir::Type type) { if (type.isa<::pir::BFloat16Type>()) return common::BF16(); CASE_TYPE(Float16Type, F16) CASE_TYPE(Float32Type, F32) @@ -96,5 +180,7 @@ common::Type ConvertIRType(::pir::Type type) { LOG(FATAL) << "unknown ir::Type " << type; } -} // namespace utils +} // namespace pir +} // namespace framework +} // namespace hlir } // namespace cinn diff --git a/paddle/cinn/hlir/framework/new_ir/utils.h b/paddle/cinn/hlir/framework/pir/utils.h similarity index 76% rename from paddle/cinn/hlir/framework/new_ir/utils.h rename to paddle/cinn/hlir/framework/pir/utils.h index 953dc6672bc18f..4d84ca66f82c9e 100644 --- a/paddle/cinn/hlir/framework/new_ir/utils.h +++ b/paddle/cinn/hlir/framework/pir/utils.h @@ -16,12 +16,20 @@ #include #include #include "paddle/cinn/common/context.h" +#include "paddle/cinn/common/type.h" +#include "paddle/cinn/utils/type_defs.h" #include "paddle/pir/core/operation.h" namespace cinn { namespace hlir { namespace framework { -namespace newir { +namespace pir { + +struct CUDAJITInfo { + void* fn_ptr; + std::vector block_dims; + std::vector grid_dims; +}; struct CompatibleInfo { static constexpr char* kNamePrefix = "var_"; @@ -41,9 +49,18 @@ struct CompatibleInfo { bool allow_duplicate = false); static std::vector OutputNames(::pir::Operation& op); // NOLINT + + static std::vector<::pir::Value> RealOperandSources( + const ::pir::Operation& op); + + static utils::Attribute ConvertAttribute(const ::pir::Attribute& src_attr); + + static utils::AttributeMap ConvertAttributes(const ::pir::Operation& op); + + static common::Type ConvertIRType(::pir::Type type); }; -} // namespace newir +} // namespace pir } // namespace framework } // namespace hlir } // namespace cinn diff --git a/paddle/cinn/hlir/framework/new_ir_compiler.cc b/paddle/cinn/hlir/framework/pir_compiler.cc similarity index 74% rename from paddle/cinn/hlir/framework/new_ir_compiler.cc rename to paddle/cinn/hlir/framework/pir_compiler.cc index 2a40531196da4d..73fbd68c409ddd 100644 --- a/paddle/cinn/hlir/framework/new_ir_compiler.cc +++ b/paddle/cinn/hlir/framework/pir_compiler.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/cinn/hlir/framework/new_ir_compiler.h" +#include "paddle/cinn/hlir/framework/pir_compiler.h" #include -#include "paddle/cinn/hlir/framework/new_ir/utils.h" -#include "paddle/cinn/utils/attribute_util.h" +#include "paddle/cinn/hlir/framework/pir/utils.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/pir/core/builtin_type.h" @@ -26,22 +25,60 @@ namespace framework { // TODO(Aurelius84): Need abstract this logic to implement Proxy for // the co-existance with GraphCompiler. 
-std::unique_ptr NewIRCompiler::Build() { +std::unique_ptr PIRCompiler::Build() { m_builder_.Clear(); // NOTE(Aurelius84): Currently only support each op for one group - std::vector groups; + std::vector groups; for (auto it = program_.block()->begin(); it != program_.block()->end(); ++it) { std::vector<::pir::Operation*> ops = {*it}; - groups.push_back(std::make_shared(ops)); + groups.push_back(std::make_shared(ops)); } VLOG(4) << "Groups size: " << groups.size(); return std::move(Build(groups)); } -std::unique_ptr NewIRCompiler::Build( - const std::vector& groups) { - auto op_lowerer = CreateOpLowerer(target_); +std::vector PIRCompiler::BuildCUDAJITInfo( + const std::vector& groups) { + std::vector vec_res; + + auto op_lowerer = CreateOpLowerer(target_); + + std::vector> lowered_funcs; + for (int i = 0; i < groups.size(); ++i) { + lowered_funcs.emplace_back(op_lowerer.Lower(groups[i])); + } + + for (auto&& lowered_func : lowered_funcs) { + ProcessFunction(lowered_func); + } + + compiler_ = backends::Compiler::Create(target_); + auto build_module = m_builder_.Build(); + compiler_->Build(build_module, ""); + + auto instructions = BuildInstructions(groups); + + auto fn_ptrs = compiler_->GetFnPtr(); + + for (int idx = 0; idx < groups.size(); ++idx) { + pir::CUDAJITInfo jit_info; + jit_info.fn_ptr = fn_ptrs[idx]; + + lowered_funcs[idx][0]->cuda_axis_info.CopyBlockDimsTo( + &(jit_info.block_dims)); + + lowered_funcs[idx][0]->cuda_axis_info.CopyGridDimsTo(&(jit_info.grid_dims)); + + vec_res.push_back(jit_info); + } + + return vec_res; +} + +std::unique_ptr PIRCompiler::Build( + const std::vector& groups) { + auto op_lowerer = CreateOpLowerer(target_); std::vector> lowered_funcs; for (int i = 0; i < groups.size(); ++i) { @@ -72,7 +109,7 @@ std::unique_ptr NewIRCompiler::Build( return std::make_unique(scope_, std::move(instructions)); } -void NewIRCompiler::ProcessFunction( +void PIRCompiler::ProcessFunction( const std::vector& lowered_funcs) { for (auto&& func : lowered_funcs) { for (auto&& arg : func->args) { @@ -97,8 +134,8 @@ void NewIRCompiler::ProcessFunction( } } -std::vector> NewIRCompiler::BuildInstructions( - const std::vector& groups) { +std::vector> PIRCompiler::BuildInstructions( + const std::vector& groups) { std::vector> instructions; for (int idx = 0; idx < groups.size(); ++idx) { auto& fn_name = groups[idx]->fn_name; @@ -130,7 +167,7 @@ std::shared_ptr BuildScope(const Target& target, if (visited.count(value) > 0) return; visited.emplace(value); - std::string name = newir::CompatibleInfo::ValueName(value); + std::string name = pir::CompatibleInfo::ValueName(value); auto type_info = value.type().dyn_cast(); auto* var = scope->Var(name); auto& tensor = absl::get(*var); @@ -140,7 +177,7 @@ std::shared_ptr BuildScope(const Target& target, shape.push_back(Shape::dim_t(type_info.dims()[i])); } tensor->Resize(Shape{shape}); - tensor->set_type(utils::ConvertIRType(type_info.dtype())); + tensor->set_type(pir::CompatibleInfo::ConvertIRType(type_info.dtype())); }; for (auto it = program.block()->begin(); it != program.block()->end(); ++it) { diff --git a/paddle/cinn/hlir/framework/new_ir_compiler.h b/paddle/cinn/hlir/framework/pir_compiler.h similarity index 80% rename from paddle/cinn/hlir/framework/new_ir_compiler.h rename to paddle/cinn/hlir/framework/pir_compiler.h index 62c3d97a21a415..c567ec2c44eb29 100644 --- a/paddle/cinn/hlir/framework/new_ir_compiler.h +++ b/paddle/cinn/hlir/framework/pir_compiler.h @@ -28,11 +28,11 @@ namespace framework { // TODO(Aurelius84): Need abstract this 
logic to implement Proxy for // the co-existance with GraphCompiler. -class NewIRCompiler final { +class PIRCompiler final { public: - NewIRCompiler(const ::pir::Program& prog, - const Target& target, - const std::shared_ptr& scope) + PIRCompiler(const ::pir::Program& prog, + const Target& target, + const std::shared_ptr& scope) : program_(prog), m_builder_("NewIR", target), target_(target), @@ -40,17 +40,20 @@ class NewIRCompiler final { std::unique_ptr Build(); - std::unique_ptr Build(const std::vector& groups); + std::vector BuildCUDAJITInfo( + const std::vector& groups); + + std::unique_ptr Build(const std::vector& groups); private: - CINN_DISALLOW_COPY_AND_ASSIGN(NewIRCompiler); + CINN_DISALLOW_COPY_AND_ASSIGN(PIRCompiler); std::vector GetOpFunc(const ::pir::Operation& op, int idx); void ProcessFunction(const std::vector& lowered_funcs); std::vector> BuildInstructions( - const std::vector& groups); + const std::vector& groups); const ::pir::Program& program_; ir::Module::Builder m_builder_; diff --git a/paddle/cinn/hlir/pe/reduction.cc b/paddle/cinn/hlir/pe/reduction.cc index e38465babbb380..181c1568a36930 100644 --- a/paddle/cinn/hlir/pe/reduction.cc +++ b/paddle/cinn/hlir/pe/reduction.cc @@ -1077,6 +1077,31 @@ std::vector TwoStepBlockReduceAny(const ir::Tensor& A, Expr(false)); } +std::string CrossThreadReduceExternalFuncName(const ir::Expr& op, + const ir::Expr& tensor) { + CHECK_NOTNULL(tensor.as_tensor()); + if (op.As()) { + return "cinn_block_reduce_sum" + + Type2StrForReduce(tensor.as_tensor()->type()) + "_internal"; + } else if (op.As()) { + return "cinn_block_reduce_prod" + + Type2StrForReduce(tensor.as_tensor()->type()) + "_internal"; + } else if (op.As()) { + return "cinn_block_reduce_max" + + Type2StrForReduce(tensor.as_tensor()->type()) + "_internal"; + } else if (op.As()) { + return "cinn_block_reduce_min" + + Type2StrForReduce(tensor.as_tensor()->type()) + "_internal"; + } else if (op.As()) { + return "cinn_block_reduce_all_internal"; + } else if (op.As()) { + return "cinn_block_reduce_any_internal"; + } else { + LOG(FATAL) << "Reduce type: " << op << " Not supported yet!"; + } + return ""; +} + } // namespace pe } // namespace hlir } // namespace cinn diff --git a/paddle/cinn/hlir/pe/reduction.h b/paddle/cinn/hlir/pe/reduction.h index ceb82e8f6fe0b6..b85f01f79a05d3 100644 --- a/paddle/cinn/hlir/pe/reduction.h +++ b/paddle/cinn/hlir/pe/reduction.h @@ -467,6 +467,9 @@ std::vector TwoStepBlockReduceAny( const std::vector& axes, const bool keep_dim, const std::string& output_name = "T_Reduce_Any_out"); + +std::string CrossThreadReduceExternalFuncName(const ir::Expr& op, + const ir::Expr& tensor); } // namespace pe } // namespace hlir } // namespace cinn diff --git a/paddle/cinn/ir/schedule/factorize_reduction.h b/paddle/cinn/ir/schedule/factorize_reduction.h new file mode 100644 index 00000000000000..0973d123fd40c1 --- /dev/null +++ b/paddle/cinn/ir/schedule/factorize_reduction.h @@ -0,0 +1,408 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Used in FactorizeReduction + +#pragma once +#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/ir_base.h" +#include "paddle/cinn/ir/ir_printer.h" +#include "paddle/cinn/ir/schedule/ir_schedule_util.h" +#include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/ir/utils/ir_copy.h" +#include "paddle/cinn/lang/compute.h" +#include "paddle/cinn/optim/replace_var_with_expr.h" +#include "paddle/cinn/utils/error.h" + +namespace cinn { +namespace ir { + +// Create the new Reduction-Factorized tensor, +// only used for FactorizeReduction schedule primitive. +Tensor CreateRFTensor(const Tensor& original_tensor, + const Expr& rf_loop, + int rf_axis) { + std::string name = original_tensor->name + "_rf"; + std::vector new_shape = original_tensor->shape; + new_shape.insert(new_shape.begin() + rf_axis, rf_loop.As()->extent); + Tensor rf_tensor = _Tensor_::Make(name, + original_tensor->type(), + new_shape, + new_shape, + original_tensor->operation, + original_tensor->reduce_axis); + rf_tensor->WithBuffer("global", name, original_tensor->type()); + return rf_tensor; +} + +// Base class to create a new reduce block, +// only used for FactorizeReduction schedule primitive. +class ReduceBlockCreater { + public: + ReduceBlockCreater(const Expr& original_block, + const std::vector& original_loops, + const Expr& rf_loop, + const Expr& original_update_stmt, + const ir::Tensor& rf_tensor, + bool is_rf_block) + : original_block_(original_block), + original_loops_(original_loops), + rf_loop_(rf_loop), + original_update_stmt_(original_update_stmt), + rf_tensor_(rf_tensor), + is_rf_block_(is_rf_block) { + const ScheduleBlockRealize* block_real = + original_block_.As(); + CHECK_NOTNULL(block_real); + num_block_iters_ = block_real->iter_values.size(); + } + + void CreateBlock() { + CreateRFIter(); + for (int i = 0; i < num_block_iters_; ++i) { + CreateNormalIter(i); + } + CreateUpdateStmt(); + + std::string new_update_block_name = + original_block_.As() + ->schedule_block.As() + ->name; + if (is_rf_block_) { + new_update_block_name += "_rf"; + } + std::string new_init_block_name = + ir::GenReduceInitTensorNameOf(new_update_block_name); + VLOG(5) << "new_init_block_name = " << new_init_block_name; + + Expr init_value = rf_tensor_->GetReduceInitVal(); + const std::vector& domain = rf_tensor_->domain_without_reduce_axis(); + ir::Tensor init_tensor = lang::Compute( + domain, + [=](const std::vector& axis) { return init_value; }, + new_init_block_name); + init_tensor->Bind(rf_tensor_->buffer); + Expr init_stmt = ir::Store::Make( + init_tensor, init_value, new_update_stmt_.As()->indices); + new_init_sch_block_ = ScheduleBlock::Make( + new_init_iter_vars_, {}, {}, new_init_block_name, init_stmt); + new_init_block_realize_ = + ScheduleBlockRealize::Make(new_init_iter_values_, new_init_sch_block_); + + new_update_sch_block_ = ScheduleBlock::Make( + new_iter_vars_, {}, {}, new_update_block_name, new_update_stmt_); + new_update_block_realize_ = + ScheduleBlockRealize::Make(new_iter_values_, new_update_sch_block_); + VLOG(4) << "new_update_block_realize:\n" << new_update_block_realize_; + } + + Expr CreateLoops() { + int num_loops = original_loops_.size(); + std::vector new_loops(num_loops); + Expr body = new_update_block_realize_; + bool has_add_init_block = false; + for (int i = num_loops - 1; i >= 0; --i) { + bool is_spatial_loop = + new_spatial_loop_var_names_.count( + original_loops_[i].As()->loop_var->name) > 
0; + bool is_rf_loop = rf_loop_.As()->loop_var->name == + original_loops_[i].As()->loop_var->name; + // Skip non rf reduction loops of write back block. + if (!is_rf_block_ && !is_spatial_loop && !is_rf_loop) { + continue; + } + // Add reduce init block. + if (!has_add_init_block && is_spatial_loop) { + body = Block::Make({new_init_block_realize_, body}); + has_add_init_block = true; + } + // Add loops + Var loop_var = ir_utils::IRCopy(original_loops_[i].As()->loop_var); + Expr min = ir_utils::IRCopy(original_loops_[i].As()->min); + Expr extent = ir_utils::IRCopy(original_loops_[i].As()->extent); + body = For::Make(loop_var, + min, + extent, + original_loops_[i].As()->for_type(), + original_loops_[i].As()->device_api, + body, + original_loops_[i].As()->vectorize_info(), + original_loops_[i].As()->bind_info()); + VLOG(5) << "new body:\n" << body; + } + VLOG(4) << "new loop nest:\n" << body; + return body; + } + + private: + virtual void CreateRFIter() = 0; + virtual void CreateNormalIter(int idx) = 0; + virtual void CreateUpdateStmt() = 0; + + public: + Var rf_var_; + std::vector rf_tensor_access_indices_; + + protected: + const Expr& original_block_; + const std::vector& original_loops_; + const Expr& rf_loop_; + const Expr& original_update_stmt_; + const ir::Tensor& rf_tensor_; + std::map original_indice2new_expr_; + int num_block_iters_; + bool is_rf_block_; + + std::vector new_iter_vars_; + std::vector new_iter_values_; + std::vector new_init_iter_vars_; + std::vector new_init_iter_values_; + std::unordered_set new_spatial_loop_var_names_; + Expr new_update_stmt_; + + Expr new_update_sch_block_; + Expr new_update_block_realize_; + Expr new_init_sch_block_; + Expr new_init_block_realize_; +}; + +// Implement class for building Reduction-Factorized block, +// only used for FactorizeReduction schedule primitive. +class RFBlockCreater : public ReduceBlockCreater { + public: + RFBlockCreater(const Expr& original_block, + const std::vector& original_loops, + const Expr& rf_loop, + const Expr& original_update_stmt, + const ir::Tensor& rf_tensor, + const std::map& var2loops, + int rf_axis) + : ReduceBlockCreater(original_block, + original_loops, + rf_loop, + original_update_stmt, + rf_tensor, + true), + var2loops_(var2loops), + rf_axis_(rf_axis) {} + + private: + void CreateRFIter() override { + std::string loop_var_name = rf_loop_.As()->loop_var->name; + std::string rf_var_name = "v" + loop_var_name; + rf_var_ = Var(rf_loop_.As()->min, + rf_loop_.As()->extent, + rf_var_name, + /* is_reduce = */ false); + loop_var2block_iters_[rf_loop_.As()->loop_var] = rf_var_; + new_iter_vars_.push_back(rf_var_); + new_iter_values_.push_back(rf_loop_.As()->loop_var); + new_init_iter_vars_.push_back(rf_var_); + new_init_iter_values_.push_back(rf_loop_.As()->loop_var); + new_spatial_loop_var_names_.insert(rf_loop_.As()->loop_var->name); + VLOG(4) << "create new_rf_var = " << rf_var_ + << ", with iter value = " << new_iter_values_.back(); + } + + void CreateNormalIter(int idx) override { + Var original_iter_var = original_block_.As() + ->schedule_block.As() + ->iter_vars[idx]; + Expr original_iter_value = + original_block_.As()->iter_values[idx]; + // The original iter is either a spatial iter, or a reduction iter that + // doesn't touch the rf loop. In this case reuse the old iter var and its + // corresponding iter value. 
+ if (!original_iter_var->is_reduce_axis) { + new_iter_vars_.push_back(original_iter_var); + new_iter_values_.push_back(original_iter_value); + new_init_iter_vars_.push_back(original_iter_var); + new_init_iter_values_.push_back(original_iter_value); + ir_utils::CollectIRNodesWithoutTensor( + original_iter_value, [&](const Expr* x) { + if (x->as_var()) { + new_spatial_loop_var_names_.insert(x->as_var()->name); + } + return false; + }); + return; + } else if (!ContainVar({original_iter_value}, + rf_loop_.As()->loop_var->name)) { + new_iter_vars_.push_back(original_iter_var); + new_iter_values_.push_back(original_iter_value); + return; + } + CHECK(original_iter_var->is_reduce_axis); + + // This iter is a reduction iter and touches the rfactor loop. So we try to + // create a new iter for each loop var that appear in the original iter + // value. + std::vector vars_in_original_iter_values; + ir_utils::CollectIRNodesWithoutTensor( + original_iter_value, [&](const Expr* x) { + if (x->as_var()) { + vars_in_original_iter_values.push_back(x->as_var_ref()); + } + return false; + }); + for (const Var& loop_var : vars_in_original_iter_values) { + if (var2loops_.count(loop_var) == 0) { + continue; + } + Expr loop = var2loops_.at(loop_var); + if (loop_var2block_iters_.count(loop_var) == 0) { + Var new_iter_var(loop.As()->min, + loop.As()->extent, + "v" + loop_var->name, + /* is_reduce = */ true); + new_iter_vars_.push_back(new_iter_var); + new_iter_values_.emplace_back(loop_var); + loop_var2block_iters_[loop_var] = new_iter_var; + } + } + // Substitute the original iter values with new iter vars, + // and store the new iter values in original_indice2new_expr_, + // it will be used in Load/Store indices. + Expr new_iters = ir_utils::IRCopy(original_iter_value); + ReplaceExpr(&new_iters, loop_var2block_iters_); + original_indice2new_expr_[original_iter_var] = new_iters; + VLOG(4) << "original_indice2new_expr_[" << original_iter_var + << "] = " << new_iters; + } + + void CreateUpdateStmt() override { + rf_tensor_access_indices_ = original_update_stmt_.As()->indices; + rf_tensor_access_indices_.insert( + rf_tensor_access_indices_.begin() + rf_axis_, rf_var_); + Expr original_store_body = original_update_stmt_.As()->value; + Expr new_store_body = ir_utils::IRCopy(original_store_body); +#define REPLACE_RF_TENSOR(Op) \ + if (new_store_body.As()) { \ + auto* node = new_store_body.As(); \ + CHECK(node); \ + auto& operand = node->a(); \ + operand = Load::Make(rf_tensor_, rf_tensor_access_indices_); \ + } + + REPLACE_RF_TENSOR(Add) + REPLACE_RF_TENSOR(Mul) + REPLACE_RF_TENSOR(Max) + REPLACE_RF_TENSOR(Min) +#undef REPLACE_RF_TENSOR + + new_update_stmt_ = + ir::Store::Make(rf_tensor_, new_store_body, rf_tensor_access_indices_); + ReplaceExpr(&new_update_stmt_, original_indice2new_expr_); + VLOG(4) << "new_update_stmt of rf block: \n" << new_update_stmt_; + } + + private: + const std::map& var2loops_; + int rf_axis_; + + std::map loop_var2block_iters_; +}; + +// Implement class for building Writing-Back block, +// only used for FactorizeReduction schedule primitive. 
+class RBBlockCreater : public ReduceBlockCreater { + public: + RBBlockCreater(const Expr& original_block, + const std::vector& original_loops, + const Expr& rf_loop, + const Expr& original_update_stmt, + const ir::Tensor& rf_tensor, + const std::vector& rf_tensor_access_indices, + const Var& rf_block_rf_iter_var) + : ReduceBlockCreater(original_block, + original_loops, + rf_loop, + original_update_stmt, + rf_tensor, + false), + rf_tensor_access_indices_(rf_tensor_access_indices), + rf_block_rf_iter_var_(rf_block_rf_iter_var) {} + + private: + void CreateRFIter() override { + std::string loop_var_name = rf_loop_.As()->loop_var->name; + std::string rf_var_name = "v" + loop_var_name; + rf_var_ = Var(rf_loop_.As()->min, + rf_loop_.As()->extent, + rf_var_name, + /* is_reduce = */ true); + new_iter_vars_.push_back(rf_var_); + new_iter_values_.push_back(rf_loop_.As()->loop_var); + original_indice2new_expr_[rf_block_rf_iter_var_] = Expr(rf_var_); + VLOG(4) << "create new_rf_var = " << rf_var_ + << ", with iter value = " << new_iter_values_.back(); + } + + void CreateNormalIter(int idx) override { + Var original_iter_var = original_block_.As() + ->schedule_block.As() + ->iter_vars[idx]; + Expr original_iter_value = + original_block_.As()->iter_values[idx]; + if (!original_iter_var->is_reduce_axis) { + new_iter_vars_.push_back(original_iter_var); + new_iter_values_.push_back(original_iter_value); + new_init_iter_vars_.push_back(original_iter_var); + new_init_iter_values_.push_back(original_iter_value); + ir_utils::CollectIRNodesWithoutTensor( + original_iter_value, [&](const Expr* x) { + if (x->as_var()) { + new_spatial_loop_var_names_.insert(x->as_var()->name); + } + return false; + }); + // original_indice2new_expr_[original_iter_var] = new_iter_vars_.back(); + VLOG(4) << "create new iter var = " << new_iter_vars_.back() + << ", with iter value = " << new_iter_values_.back(); + } + } + + void CreateUpdateStmt() override { + Expr original_store_body = original_update_stmt_.As()->value; + Expr new_store_body = ir_utils::IRCopy(original_store_body); +#define REPLACE_RF_TENSOR(Op) \ + if (new_store_body.As()) { \ + auto* node = new_store_body.As(); \ + CHECK(node); \ + auto& operand = node->b(); \ + operand = Load::Make(rf_tensor_, rf_tensor_access_indices_); \ + } + + REPLACE_RF_TENSOR(Add) + REPLACE_RF_TENSOR(Mul) + REPLACE_RF_TENSOR(Max) + REPLACE_RF_TENSOR(Min) +#undef REPLACE_RF_TENSOR + + Expr original_store_tensor = original_update_stmt_.As()->tensor; + std::vector original_store_indices = + original_update_stmt_.As()->indices; + new_update_stmt_ = ir::Store::Make( + original_store_tensor, new_store_body, original_store_indices); + ReplaceExpr(&new_update_stmt_, original_indice2new_expr_); + VLOG(4) << "new_update_stmt of write back block: \n" << new_update_stmt_; + } + + private: + const std::vector& rf_tensor_access_indices_; + const Var& rf_block_rf_iter_var_; +}; + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/schedule/ir_schedule.cc b/paddle/cinn/ir/schedule/ir_schedule.cc index f17e17b73019d6..24f97b6e03d1e2 100644 --- a/paddle/cinn/ir/schedule/ir_schedule.cc +++ b/paddle/cinn/ir/schedule/ir_schedule.cc @@ -33,6 +33,7 @@ #include "paddle/cinn/ir/ir_printer.h" #include "paddle/cinn/ir/ir_visitor.h" #include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/schedule/factorize_reduction.h" #include "paddle/cinn/ir/schedule/ir_schedule_error.h" #include "paddle/cinn/ir/schedule/ir_schedule_util.h" #include "paddle/cinn/ir/utils/ir_copy.h" @@ -120,6 +121,7 @@ 
class ScheduleImpl { void ReverseComputeInline(const Expr& schedule_block); void Bind(const Expr& loop, const std::string& thread_axis); Expr Rfactor(const Expr& rf_loop, int rf_axis); + Expr FactorizeReduction(const Expr& rf_loop, int rf_axis); Expr AddUnitLoop(const Expr& block) const; void Annotate(const Expr& block, const std::string& key, const attr_t& value); void Unannotate(Expr& block, const std::string& key); // NOLINT @@ -717,6 +719,79 @@ Expr ScheduleImpl::Rfactor(const Expr& rf_loop, int rf_axis) { return rf_create.CreateRfAllStmts(); } +Expr ScheduleImpl::FactorizeReduction(const Expr& rf_loop, int rf_axis) { + std::string primitive = "FactorizeReduction"; + // Get child block of the rf_loop and check. + std::vector blocks = GetChildBlocks(rf_loop); + if (blocks.size() != 1) { + std::ostringstream os; + os << "The rf_loop is required to have only one child block, but got " + << blocks.size() << std::endl; + throw IRScheduleErrorHandler(primitive, os.str(), this->module_expr_); + } + Expr original_block = blocks.at(0); + Expr root_block = GetRootBlock(original_block); + // TODO(BiynXu): Add CheckReductionBlock() + + // Collect the loops of the block. + // Construct a map from loop var names to corresponding loops. + std::vector original_loops = this->GetLoops(original_block); + CHECK_GT(original_loops.size(), 0); + VLOG(3) << "before FactorizeReduction, original computational body of the " + "reduction is:\n" + << original_loops[0]; + std::map var2loops; + for (const Expr& loop : original_loops) { + var2loops[loop.As()->loop_var] = loop; + } + + // Get original stmt of reduction update and original store tensor. + Expr original_update_body = original_block.As() + ->schedule_block.As() + ->body; + Expr original_update_stmt; + CHECK(original_update_body.As() || original_update_body.As()); + if (original_update_body.As()) { + CHECK_EQ(original_update_body.As()->stmts.size(), 1); + original_update_stmt = original_update_body.As()->stmts[0]; + } else if (original_update_body.As()) { + original_update_stmt = original_update_body; + } + Tensor original_tensor = + original_update_stmt.As()->tensor.as_tensor_ref(); + + // Create new blocks and loops. + Tensor rf_tensor = CreateRFTensor(original_tensor, rf_loop, rf_axis); + RFBlockCreater rf_block_creater(original_block, + original_loops, + rf_loop, + original_update_stmt, + rf_tensor, + var2loops, + rf_axis); + rf_block_creater.CreateBlock(); + RBBlockCreater wb_block_creater(original_block, + original_loops, + rf_loop, + original_update_stmt, + rf_tensor, + rf_block_creater.rf_tensor_access_indices_, + rf_block_creater.rf_var_); + wb_block_creater.CreateBlock(); + + Expr rf_body = rf_block_creater.CreateLoops(); + Expr wb_body = wb_block_creater.CreateLoops(); + + Expr new_computational_body = Block::Make({rf_body, wb_body}); + + // Replace and update the AST. 
+ this->Replace(original_loops[0], new_computational_body); + VLOG(3) << "After FactorizeReduction, new computational body of the " + "reduction is:\n" + << new_computational_body; + return rf_tensor; +} + struct CacheReadRewriter : public ir::IRMutator<> { public: static Expr Rewrite(const Expr& root, CacheBlockInfo* info) { @@ -2647,6 +2722,15 @@ Expr IRSchedule::Rfactor(const Expr& rf_loop, int rf_axis) { return result; } +Expr IRSchedule::FactorizeReduction(const Expr& rf_loop, int rf_axis) { + auto result = impl_->FactorizeReduction(rf_loop, rf_axis); + trace_.Append(ScheduleDesc::Step("FactorizeReduction", + {{"rf_loop", std::vector({rf_loop})}}, + {{"rf_axis", rf_axis}}, + {result})); + return result; +} + void IRSchedule::Annotate(const Expr& block, const std::string& key, const attr_t& value) { diff --git a/paddle/cinn/ir/schedule/ir_schedule.h b/paddle/cinn/ir/schedule/ir_schedule.h index ce341c502b1fb5..4c5fc1d10f1b69 100644 --- a/paddle/cinn/ir/schedule/ir_schedule.h +++ b/paddle/cinn/ir/schedule/ir_schedule.h @@ -381,6 +381,46 @@ class IRSchedule { */ Expr Rfactor(const Expr& rf_loop, int rf_axis); + /** + * \brief Factorize the reduction block by the given loop. The block will be + * split into two blocks: reduction-factorized block and write-back block. + * @param rf_loop the reduce loop to be factorized. + * @param rf_axis The position where the new dimension is placed in the new rf + * tensor. + * @return The new created rf tensor. + * + * For example, input the block: + * \code + * for (i, 0, 10) // serial loop + * B_init[i] = 0 + * for (j, 0, 20) // reduce loop + * for (k, 0, 30) // reduce loop + * B[i] = B[i] + A[i, j, k] + * \endcode + * + * If the rf loop is j and rf_axis is 0, the transformation is + * divided into 2 steps: + * 1. get the rf block where the reduce loop j is transformed to the + * serial loop with no accumalation and a new rf tensor is created. + * The axis j will be placed in the rf_axis of the new rf_tensor. + * The rf_block is as follows: + * \code + * for (i, 0, 10) // serial loop + * for (j, 0, 20) // rf loop j is transformed to the serial loop + * rf_B_init[j, i] = 0 + * for (k, 0, 30) // reduce loop. + * rf_B[j, i] = rf_B[j, i] + A[i, j, k] + * \endcode + * 2. do reduction of the rf loop j to get the final result block: + * \code + * for (i, 0, 10) // serial loop + * B_init[i] = 0 + * for (j, 0, 20) // rf reduction loop + * B[i] = B[i] + rf_B[j, i] + * \endcode + */ + Expr FactorizeReduction(const Expr& rf_loop, int rf_axis); + /*! 
* \brief Annotate a block with a key-value pair to set as its attribute * \param block The block to be annotated diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc index 7144e1484a58c6..7a2daa3106612f 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.cc +++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc @@ -221,6 +221,14 @@ void ReplaceExpr(Expr* source, return; } +void ReplaceExpr(Expr* source, + const std::map& replacing_map) { + if (replacing_map.empty()) return; + MappingVarToExprMutator mapper(replacing_map); + mapper(source); + return; +} + std::vector ValidateFactors(const std::vector& factors, int total_extent, const ModuleExpr& module_expr) { diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.h b/paddle/cinn/ir/schedule/ir_schedule_util.h index 50515e5f3cfa94..9c9418b4d577ec 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.h +++ b/paddle/cinn/ir/schedule/ir_schedule_util.h @@ -193,7 +193,7 @@ Tensor GetReadTensor(const Expr& block, int index); int GetLoopExtent(const Expr& loop); /** - * \brief Given a vector of Exors, return whether they contain a var with + * \brief Given a vector of Exprs, return whether they contain a var with * specific name. * @param exprs The given vector of Exprs * @param var_name The name of specific var @@ -241,6 +241,15 @@ void ReplaceExpr(Expr* source, const std::vector& replaced, const std::vector& candidates); +/** + * Replace Vars in replaced to Exprs in candidates in source. + * @param source The Expr we will implement the change. + * @param replacing_map The one-to-one corresponded Vars -> Exprs to be + * replaced. + */ +void ReplaceExpr(Expr* source, + const std::map& replacing_map); + /** * Validate the factors param of Split. We will check if factors are validate * and change -1 to positive integer. 
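A hedged usage sketch of the FactorizeReduction primitive documented above, following the comment's B[i] = B[i] + A[i, j, k] example; the block name "B" and loop ordering are illustrative only, and ir_sch is assumed to be an IRSchedule over that computation.

// Illustrative only: factorize block "B" over its first reduce loop j.
std::vector<ir::Expr> loops = ir_sch.GetLoops("B");  // {i, j, k}
// The reduce loop j becomes a serial loop of an rf block computing
// rf_B[j, i] (rf_axis = 0 places j first); a write-back block then
// reduces rf_B over j into B[i].
ir::Expr rf_tensor = ir_sch.FactorizeReduction(loops[1], /*rf_axis=*/0);

Internally, both generated blocks substitute loop vars with block iter expressions through the new map-based ReplaceExpr overload added above.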
diff --git a/paddle/cinn/ir/schedule/schedule_desc.cc b/paddle/cinn/ir/schedule/schedule_desc.cc index a3ef7e72a1bc9e..e0d5f4ab217018 100644 --- a/paddle/cinn/ir/schedule/schedule_desc.cc +++ b/paddle/cinn/ir/schedule/schedule_desc.cc @@ -474,6 +474,12 @@ CINN_BUILD_STEP_KIND(Rfactor) .SetApplyFn( APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::Rfactor))); +CINN_BUILD_STEP_KIND(FactorizeReduction) + .Inputs({"rf_loop"}) + .Attrs({"rf_axis"}) + .SetApplyFn(APPLY_FUNC_UNIFORM( + FREE_FUNCTION_CONVERTER(&IRSchedule::FactorizeReduction))); + CINN_BUILD_STEP_KIND(MergeExprs) .SetApplyFn( APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::MergeExprs))); diff --git a/paddle/cinn/ir/test/collect_ir_nodes_test.cc b/paddle/cinn/ir/test/collect_ir_nodes_test.cc index d380b4475e37d8..859a35a5c0fa94 100644 --- a/paddle/cinn/ir/test/collect_ir_nodes_test.cc +++ b/paddle/cinn/ir/test/collect_ir_nodes_test.cc @@ -42,15 +42,15 @@ TEST(CollectIRNodes, basic) { auto C = Compute( {M, N}, [&](Var i, Var j) { return A(i, j) + B(i, j); }, "C"); - auto stages = CreateStages({C}); + ast_gen_ius::TensorGroup tensor_group({C}); - auto fn = Lower("fn", stages, {A, B, C}); + auto fn = LowerToAst("fn", {A, B, C}, &tensor_group); LOG(INFO) << "fn:\n" << fn; auto tensors = CollectIRNodes(fn, [](const Expr* x) { return x->as_tensor(); }); - ASSERT_EQ(tensors.size(), 5UL); + ASSERT_EQ(tensors.size(), 3UL); auto fn_body = fn.As()->body; LOG(INFO) << "fn.body:\n" << fn_body; diff --git a/paddle/cinn/ir/test/schedule_block_graph_test.cc b/paddle/cinn/ir/test/schedule_block_graph_test.cc index 80c39f493be416..20c7f03b4d235d 100644 --- a/paddle/cinn/ir/test/schedule_block_graph_test.cc +++ b/paddle/cinn/ir/test/schedule_block_graph_test.cc @@ -97,8 +97,8 @@ frontend::Program CreateReduceProgram() { TEST(ScheduleBlockGraph, elementwise) { frontend::Program program = CreateElementwiseProgram(); IRSchedule ir_sch = MakeIRSchedule(&program); - ScheduleBlockGraph sbg(ir_sch); LOG(INFO) << GetIR(ir_sch); + ScheduleBlockGraph sbg(ir_sch); LOG(INFO) << sbg.Visualize(); CHECK_EQ(sbg.BlockIdsInOrder().size(), 6); CHECK_EQ(sbg.nodes().size(), 6); @@ -138,8 +138,8 @@ TEST(ScheduleBlockGraph, elementwise) { TEST(ScheduleBlockGraph, reduce) { frontend::Program program = CreateReduceProgram(); IRSchedule ir_sch = MakeIRSchedule(&program); - ScheduleBlockGraph sbg(ir_sch); LOG(INFO) << GetIR(ir_sch); + ScheduleBlockGraph sbg(ir_sch); LOG(INFO) << sbg.Visualize(); CHECK_EQ(sbg.BlockIdsInOrder().size(), 8); CHECK_EQ(sbg.nodes().size(), 8); diff --git a/paddle/cinn/lang/lower.cc b/paddle/cinn/lang/lower.cc index 92812e65f412de..c509a1977555f2 100644 --- a/paddle/cinn/lang/lower.cc +++ b/paddle/cinn/lang/lower.cc @@ -104,7 +104,7 @@ std::vector GetTempBuffers(const std::vector& tensor_args, auto all_temp_tensors = ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { return x->as_tensor() && x->as_tensor()->buffer.defined() && - (!tensor_group.Contain(x->as_tensor()->name) && + (!tensor_group.Contain(x->as_tensor()->name) || ((!buffer_arg_names.count(x->as_tensor()->buffer->name) && !tensor_arg_names.count(x->as_tensor()->name)) || utils::Endswith(x->as_tensor()->buffer->name, "temp_buffer"))); @@ -284,15 +284,25 @@ ir::LoweredFunc LowerToAst(const std::string& name, const std::vector& tensor_args, ast_gen_ius::TensorGroup* tensor_group, const Target& target) { - // Merge the ctrl_deps with the given temp_tensors ang get a new temp_tensors + std::vector result = + LowerToAstVec(name, tensor_args, tensor_group, target); 
+ CHECK_EQ(result.size(), 1UL) << "LowerToAst contains not only 1 LoweredFunc, " + "use LowerToAstVec instead."; + return result[0]; +} + +std::vector LowerToAstVec( + const std::string& name, + const std::vector& tensor_args, + ast_gen_ius::TensorGroup* tensor_group, + const Target& target) { std::set ctrl_deps = CollectTempTensorsFromCtrlDepends(tensor_group, tensor_args); - std::vector group_vec = {tensor_group}; auto lower_instance = detail::LowerTensorGroup( name, tensor_args, {}, - group_vec, + tensor_group, std::vector(ctrl_deps.begin(), ctrl_deps.end()), target); std::vector result = lower_instance(); @@ -301,19 +311,7 @@ ir::LoweredFunc LowerToAst(const std::string& name, res->device_api = ir::DeviceAPI::GPU; } } - return result[0]; -} - -std::vector LowerToAstVec( - const std::string& name, - const std::vector& tensor_args, - std::vector tensor_groups, - const Target& target) { - std::vector ret; - for (ast_gen_ius::TensorGroup* tg : tensor_groups) { - ret.push_back(LowerToAst(name, tensor_args, tg, target)); - } - return ret; + return result; } ir::LoweredFunc Lower(const std::string& name, diff --git a/paddle/cinn/lang/lower.h b/paddle/cinn/lang/lower.h index c80d4bc769cdfe..b3f27129778b9c 100644 --- a/paddle/cinn/lang/lower.h +++ b/paddle/cinn/lang/lower.h @@ -82,7 +82,7 @@ ir::LoweredFunc LowerToAst(const std::string &name, std::vector LowerToAstVec( const std::string &name, const std::vector &tensor_args, - std::vector tensor_groups, + ast_gen_ius::TensorGroup *tensor_group, const Target &target = common::DefaultHostTarget()); std::vector GetTempBuffers( diff --git a/paddle/cinn/lang/lower_tensor_group.cc b/paddle/cinn/lang/lower_tensor_group.cc index 6bbe1017913629..f59ac4ceff52fc 100644 --- a/paddle/cinn/lang/lower_tensor_group.cc +++ b/paddle/cinn/lang/lower_tensor_group.cc @@ -41,24 +41,29 @@ LowerTensorGroup::LowerTensorGroup( const std::string& fn_name, const std::vector& tensor_args, const std::vector& scalar_args, - const std::vector& tensor_groups, + ast_gen_ius::TensorGroup* tensor_group, const std::vector& temp_tensor_args, const Target& target) : fn_name_(fn_name), tensor_args_(tensor_args), scalar_args_(scalar_args), - tensor_groups_(tensor_groups), + tensor_group_(tensor_group), temp_tensor_args_(temp_tensor_args), target_(target) {} std::vector LowerTensorGroup::operator()() { std::vector result; int num_func = 0; - for (ast_gen_ius::TensorGroup* tensor_group : tensor_groups_) { - // 1. Generate function body - ir::Expr func_body = GenerateFunctionBody(tensor_group); + + // 1. Generate function body + std::vector func_bodies = GenerateFunctionBody(tensor_group_); + for (ir::Expr& func_body : func_bodies) { + func_body = ir::ScheduleBlockRealize::Make( + {}, + ir::ScheduleBlock::Make( + {}, {}, {}, common::UniqName("root"), func_body)); // 2. Assign buffer to tensors - auto tensor_map = tensor_group->AllocateBuffers(); + auto tensor_map = tensor_group_->AllocateBuffers(); // copy the tensor(with buffer assigned) back to func's args. 
for (auto& arg : tensor_args_) { if (arg->is_placeholder_node() || arg->buffer.defined()) { @@ -195,21 +200,36 @@ std::vector LowerTensorGroup::GenerateFunctionArgumentList( return args; } -ir::Expr LowerTensorGroup::GenerateFunctionBody( +std::vector LowerTensorGroup::GenerateFunctionBody( ast_gen_ius::TensorGroup* tensor_group) { - std::vector ordered_tensors = - tensor_group->GetGenFuncTopoOrder(tensor_args_); + // TODO(zhhsplendid): GetGenFuncTopoOrder() may remove args + std::vector ordered_tensors = tensor_group->GetGenFuncTopoOrder(); + + std::vector result; std::vector bodies; for (const ir::Tensor& tensor : ordered_tensors) { - if (!tensor->is_placeholder_node()) { + VLOG(6) << "tensor_name = " << tensor->name; + if (!tensor->is_placeholder_node() && tensor->has_expression()) { + VLOG(6) << "ast_gen_ius::AstGen::Build for Tensor " << tensor; bodies.emplace_back(ast_gen_ius::AstGen::Build(tensor, tensor_group)); + + bool gpu_local = + tensor->buffer.defined() && + (tensor->buffer->memory_type == ir::MemoryType::GPUShared || + tensor->buffer->memory_type == ir::MemoryType::GPULocal); + if (target_ == common::DefaultNVGPUTarget() && !gpu_local) { + result.push_back(bodies.size() == 1 ? bodies[0] + : ir::Block::Make(bodies)); + bodies.clear(); + } } } - if (bodies.size() == 1) { - return bodies[0]; - } - return ir::Block::Make(bodies); + if (!bodies.empty()) { + result.push_back(bodies.size() == 1 ? bodies[0] : ir::Block::Make(bodies)); + bodies.clear(); + } + return result; } } // namespace detail diff --git a/paddle/cinn/lang/lower_tensor_group.h b/paddle/cinn/lang/lower_tensor_group.h index c66dc014d0f9a5..358e2d9ec953d5 100644 --- a/paddle/cinn/lang/lower_tensor_group.h +++ b/paddle/cinn/lang/lower_tensor_group.h @@ -47,13 +47,14 @@ class LowerTensorGroup { LowerTensorGroup(const std::string& fn_name, const std::vector& tensor_args, const std::vector& scalar_args, - const std::vector& tensor_groups, + ast_gen_ius::TensorGroup* tensor_group, const std::vector& temp_tensor_args = {}, const Target& target = common::DefaultHostTarget()); std::vector operator()(); - ir::Expr GenerateFunctionBody(ast_gen_ius::TensorGroup* tensor_group); + std::vector GenerateFunctionBody( + ast_gen_ius::TensorGroup* tensor_group); std::vector GenerateFunctionArgumentList(ir::Expr fn_body); @@ -62,11 +63,8 @@ class LowerTensorGroup { const std::vector& tensor_args_; const std::vector& scalar_args_; std::vector temp_tensor_args_; - std::vector tensor_groups_; + ast_gen_ius::TensorGroup* tensor_group_; Target target_; - - //! CUDA axis info for this function. 
- std::vector cuda_axis_info_; }; } // namespace detail diff --git a/paddle/cinn/lang/lower_test.cc b/paddle/cinn/lang/lower_test.cc index e97d0f596a7ea3..452b9e7afb7725 100644 --- a/paddle/cinn/lang/lower_test.cc +++ b/paddle/cinn/lang/lower_test.cc @@ -177,11 +177,18 @@ TEST(lower_to_ast, basic) { auto out = R"ROC( function cal_B (_A, _B) { - serial for (i, 0, 100) + ScheduleBlock(root) { - serial for (j, 0, 15) + serial for (i, 0, 100) { - B[i, j] = (A[i, j] + 1.00000000f) + serial for (j, 0, 15) + { + ScheduleBlock(B) + { + i0, i1 = axis.bind(i, j) + B[i0, i1] = (A[i0, i1] + 1.00000000f) + } + } } } } @@ -212,13 +219,20 @@ TEST(lower_to_ast, three_dim) { auto out = R"ROC( function cal_C (_A, _B, _C) { - serial for (i, 0, 100) + ScheduleBlock(root) { - serial for (j, 0, 15) + serial for (i, 0, 100) { - serial for (k, 0, 200) + serial for (j, 0, 15) { - C[i, j, k] = (A[i, j] * B[j, k]) + serial for (k, 0, 200) + { + ScheduleBlock(C) + { + i0, i1, i2 = axis.bind(i, j, k) + C[i0, i1, i2] = (A[i0, i1] * B[i1, i2]) + } + } } } } @@ -247,14 +261,25 @@ TEST(lower_to_ast, matmul_with_reduce_sum) { auto out = R"ROC( function matmul (_A, _B, _C) { - serial for (i, 0, 100) + ScheduleBlock(root) { - serial for (j, 0, 50) + serial for (i, 0, 100) { - C__reduce_init[i, j] = 0.00000000f - serial for (k0, 0, 20) + serial for (j, 0, 50) { - C[i, j] = (C[i, j] + (A[i, k0] * B[k0, j])) + ScheduleBlock(C__reduce_init) + { + i0, i1 = axis.bind(i, j) + C__reduce_init[i0, i1] = 0.00000000f + } + serial for (k0, 0, 20) + { + ScheduleBlock(C) + { + i0_0, i1_0, i2 = axis.bind(i, j, k0) + C[i0_0, i1_0] = (C[i0_0, i1_0] + (A[i0_0, i2] * B[i2, i1_0])) + } + } } } } diff --git a/paddle/cinn/optim/CMakeLists.txt b/paddle/cinn/optim/CMakeLists.txt index 03b8c95b74173b..7c30a6e565c431 100755 --- a/paddle/cinn/optim/CMakeLists.txt +++ b/paddle/cinn/optim/CMakeLists.txt @@ -23,7 +23,8 @@ gather_srcs( lower_intrin.cc cast_bool_to_int8.cc var_mod_simplify.cc - remove_schedule_block.cc) + remove_schedule_block.cc + replace_cross_thread_reduction.cc) if(WITH_CUDA) gather_srcs(cinnapi_src SRCS transform_gpu_forloop.cc) @@ -55,3 +56,5 @@ cinn_cc_test(test_cast_simplify SRCS cast_simplify_test.cc DEPS cinncore) cinn_cc_test(test_remove_schedule_block SRCS remove_schedule_block_test.cc DEPS cinncore) cinn_cc_test(test_unroll_loops SRCS unroll_loops_test.cc DEPS cinncore) +cinn_cc_test(test_replace_cross_thread_reduction SRCS + replace_cross_thread_reduction_test.cc DEPS cinncore) diff --git a/paddle/cinn/optim/optimize.cc b/paddle/cinn/optim/optimize.cc index 7d6dfe60744ab0..238a28ab4da1d3 100644 --- a/paddle/cinn/optim/optimize.cc +++ b/paddle/cinn/optim/optimize.cc @@ -29,6 +29,7 @@ #include "paddle/cinn/optim/map_extern_call.h" #include "paddle/cinn/optim/remove_schedule_block.h" #include "paddle/cinn/optim/replace_const_param_to_integer.h" +#include "paddle/cinn/optim/replace_cross_thread_reduction.h" #include "paddle/cinn/optim/transform_gpu_forloop.h" #include "paddle/cinn/optim/transform_polyfor_to_for.h" #include "paddle/cinn/optim/unroll_loops.h" @@ -49,6 +50,7 @@ Expr Optimize(Expr e, ReplaceConstParamToInteger(&copied); // Simplify already contains CastSimplify Simplify(&copied); + ReplaceCrossThreadReduction(&copied); UnrollLoop(&copied); VLOG(4) << "After Optimize UnrollLoop:" << copied; @@ -85,6 +87,7 @@ Expr Optimize(Expr e, ir::Module Optimize(const ir::Module& module, const Target& target) { auto copied = ir::ir_utils::IRCopy(Expr(module)); + ReplaceCrossThreadReduction(&copied); UnrollLoop(&copied); 
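(The Optimize pipeline hunk resumes below.) One consequence of the GenerateFunctionBody change above is worth spelling out: on the NVGPU target, statements accumulate into `bodies` until a tensor that is not GPU-local is emitted, and each flushed group becomes a separate function body. A standalone sketch of that flush-on-boundary pattern, using invented data rather than CINN types:

// Flush-on-boundary grouping, as in GenerateFunctionBody above: a write that
// is not GPU-local closes the current group, so each group maps to one kernel.
#include <iostream>
#include <string>
#include <vector>

int main() {
  struct Stmt {
    std::string name;
    bool gpu_local;
  };
  std::vector<Stmt> stmts = {{"B__reduce_init", true},
                             {"B", false},   // closes group 1
                             {"C", false}};  // closes group 2
  std::vector<std::vector<std::string>> groups;
  std::vector<std::string> bodies;
  for (const Stmt& s : stmts) {
    bodies.push_back(s.name);
    if (!s.gpu_local) {  // global write: flush the accumulated body
      groups.push_back(bodies);
      bodies.clear();
    }
  }
  if (!bodies.empty()) groups.push_back(bodies);  // trailing remainder
  std::cout << "kernel bodies: " << groups.size() << "\n";  // prints 2
  return 0;
}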
  VectorizeLoops(&copied, Target());
  VLOG(10) << "After VectorizeLoops:" << copied.as_module_ref();
diff --git a/paddle/cinn/optim/replace_cross_thread_reduction.cc b/paddle/cinn/optim/replace_cross_thread_reduction.cc
new file mode 100644
index 00000000000000..5102e8bc6468ff
--- /dev/null
+++ b/paddle/cinn/optim/replace_cross_thread_reduction.cc
@@ -0,0 +1,153 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * This file implements the pass that replaces a cross-thread reduction with an external call.
+ */
+#include "paddle/cinn/optim/replace_cross_thread_reduction.h"
+#include <unordered_set>
+
+#include "paddle/cinn/common/common.h"
+#include "paddle/cinn/hlir/pe/reduction.h"
+#include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/ir/ir_mutator.h"
+#include "paddle/cinn/ir/ir_printer.h"
+#include "paddle/cinn/ir/schedule/ir_schedule_util.h"
+#include "paddle/cinn/lang/compute.h"
+
+namespace cinn {
+namespace optim {
+
+struct CrossThreadReductionReplacer : public ir::IRMutator<> {
+  void operator()(ir::Expr* expr) { Visit(expr); }
+
+ private:
+  bool CanReplace(const ir::ScheduleBlockRealize* block_realize) {
+    const ir::ScheduleBlock* schedule_block =
+        block_realize->schedule_block.As<ir::ScheduleBlock>();
+    CHECK_NOTNULL(schedule_block);
+
+    if (block_realize->schedule_block.As<ir::ScheduleBlock>()->name.substr(
+            0, 4) == "root") {
+      return false;
+    }
+
+    const std::vector<ir::Expr>& iter_values = block_realize->iter_values;
+    const std::vector<ir::Var>& iter_vars = schedule_block->iter_vars;
+    ir::Expr body = schedule_block->body;
+
+    std::unordered_set<std::string> reduce_var_names;
+    for (int i = 0; i < iter_values.size(); ++i) {
+      if (!iter_vars[i]->is_reduce_axis) {
+        continue;
+      }
+      ir::ir_utils::CollectIRNodesWithoutTensor(
+          iter_values[i], [&](const ir::Expr* x) {
+            if (x->as_var()) {
+              reduce_var_names.insert(x->as_var()->name);
+            }
+            return false;
+          });
+    }
+
+    std::vector<int> thread_binded_reduce_loop_indices;
+    for (int i = 0; i < cur_loops_.size(); ++i) {
+      if (reduce_var_names.count(cur_loops_[i].As<ir::For>()->loop_var->name) >
+          0) {
+        if (cur_loops_[i].As<ir::For>()->is_gpu_thread_binded()) {
+          if (ir::GetLoopExtent(cur_loops_[i]) > 1024) {
+            return false;
+          }
+          thread_binded_reduce_loop_indices.push_back(i);
+        }
+      }
+    }
+    if (thread_binded_reduce_loop_indices.size() == 0 ||
+        thread_binded_reduce_loop_indices.back() != cur_loops_.size() - 1) {
+      return false;
+    }
+    for (int i = 1; i < thread_binded_reduce_loop_indices.size(); ++i) {
+      if (thread_binded_reduce_loop_indices[i - 1] + 1 !=
+          thread_binded_reduce_loop_indices[i]) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  void Visit(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+  void Visit(const ir::ScheduleBlockRealize* expr, ir::Expr* op) override {
+    if (!CanReplace(expr)) {
+      VLOG(6) << "Can't replace cross thread reduction: " << *op;
+      IRMutator::Visit(expr, op);
+      return;
+    }
+    VLOG(6) << "Can replace cross thread reduction: " << *op;
+
+    const ir::ScheduleBlock* schedule_block =
+        expr->schedule_block.As<ir::ScheduleBlock>();
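(The Visit body continues below.) The CanReplace guard above accepts a reduction block only when every thread-bound reduce loop has extent at most 1024 and the thread-bound reduce loops form a contiguous run ending at the innermost loop. A standalone version of that index check, with hypothetical inputs rather than CINN loop nodes:

// Same shape as the index checks in CanReplace: the thread-bound reduce
// loops must be contiguous and must end at the innermost loop.
#include <cassert>
#include <vector>

bool TrailingAndContiguous(const std::vector<int>& indices, int num_loops) {
  if (indices.empty() || indices.back() != num_loops - 1) return false;
  for (size_t i = 1; i < indices.size(); ++i) {
    if (indices[i - 1] + 1 != indices[i]) return false;
  }
  return true;
}

int main() {
  assert(TrailingAndContiguous({2, 3}, 4));   // innermost and contiguous: ok
  assert(!TrailingAndContiguous({1, 3}, 4));  // gap between reduce loops
  assert(!TrailingAndContiguous({1, 2}, 4));  // does not reach the innermost
  return 0;
}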
+    CHECK_NOTNULL(schedule_block);
+    ir::Expr original_update_body = schedule_block->body;
+    ir::Expr original_update_stmt;
+    CHECK(original_update_body.As<ir::Block>() ||
+          original_update_body.As<ir::Store>());
+    if (original_update_body.As<ir::Block>()) {
+      CHECK_EQ(original_update_body.As<ir::Block>()->stmts.size(), 1);
+      original_update_stmt = original_update_body.As<ir::Block>()->stmts[0];
+    } else if (original_update_body.As<ir::Store>()) {
+      original_update_stmt = original_update_body;
+    }
+
+#define REPLACE_TO_EXTERNAL_CALL(Op)                                    \
+  if (original_update_stmt.As<ir::Store>()->value.As<Op>()) {           \
+    auto* node = original_update_stmt.As<ir::Store>()->value.As<Op>();  \
+    CHECK(node);                                                        \
+    auto& operand = node->b();                                          \
+    std::string reduce_func_name =                                      \
+        hlir::pe::CrossThreadReduceExternalFuncName(                    \
+            original_update_stmt.As<ir::Store>()->value,                \
+            operand.As<ir::Load>()->tensor);                            \
+    original_update_stmt.As<ir::Store>()->value =                       \
+        lang::CallExtern(reduce_func_name, {node->b()});                \
+  }
+
+    REPLACE_TO_EXTERNAL_CALL(ir::Add)
+    REPLACE_TO_EXTERNAL_CALL(ir::Mul)
+    REPLACE_TO_EXTERNAL_CALL(ir::Max)
+    REPLACE_TO_EXTERNAL_CALL(ir::Min)
+    REPLACE_TO_EXTERNAL_CALL(ir::And)
+    REPLACE_TO_EXTERNAL_CALL(ir::Or)
+#undef REPLACE_TO_EXTERNAL_CALL
+
+    VLOG(6) << "Replace cross thread reduction: " << *op;
+
+    IRMutator::Visit(expr, op);
+  }
+
+  void Visit(const ir::For* expr, ir::Expr* op) override {
+    cur_loops_.push_back(*op);
+    IRMutator::Visit(expr, op);
+    cur_loops_.pop_back();
+  }
+
+ private:
+  std::vector<ir::Expr> cur_loops_;
+};
+
+void ReplaceCrossThreadReduction(Expr* e) { CrossThreadReductionReplacer()(e); }
+
+}  // namespace optim
+}  // namespace cinn
diff --git a/paddle/fluid/operators/random_crop_op.cu b/paddle/cinn/optim/replace_cross_thread_reduction.h
similarity index 54%
rename from paddle/fluid/operators/random_crop_op.cu
rename to paddle/cinn/optim/replace_cross_thread_reduction.h
index 33182dff93fa42..5bc0d2828d6b21 100644
--- a/paddle/fluid/operators/random_crop_op.cu
+++ b/paddle/cinn/optim/replace_cross_thread_reduction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,16 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/random_crop_op.h"
+/**
+ * This file declares the pass that replaces a cross-thread reduction with an external call.
+ */
+#pragma once
+#include
 
-namespace ops = paddle::operators;
+#include "paddle/cinn/common/common.h"
+#include "paddle/cinn/ir/ir.h"
 
-PD_REGISTER_STRUCT_KERNEL(random_crop,
-                          GPU,
-                          ALL_LAYOUT,
-                          ops::RandomCropKernel,
-                          float,
-                          int,
-                          double,
-                          uint8_t,
-                          int16_t) {}
+namespace cinn {
+namespace optim {
+
+/**
+ * Replaces a cross-thread reduction with an external call.
+ */
+void ReplaceCrossThreadReduction(Expr* e);
+
+}  // namespace optim
+}  // namespace cinn
diff --git a/paddle/cinn/optim/replace_cross_thread_reduction_test.cc b/paddle/cinn/optim/replace_cross_thread_reduction_test.cc
new file mode 100644
index 00000000000000..03f72ed7d6f4b8
--- /dev/null
+++ b/paddle/cinn/optim/replace_cross_thread_reduction_test.cc
@@ -0,0 +1,85 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/cinn/optim/replace_cross_thread_reduction.h" + +#include + +#include +#include + +#include "paddle/cinn/cinn.h" +#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/ir_printer.h" +#include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/utils/string.h" + +namespace cinn { +namespace optim { + +TEST(CrossThreadReductionReplacer, basic) { +#ifdef CINN_WITH_CUDA + Context::Global().ResetNameId(); + Placeholder A("A", {Expr(64), Expr(128)}); + Target target = common::DefaultNVGPUTarget(); + Module::Builder builder("reduce_sum", target); + Var reduce_j(128, "reduce_j"); + ir::Tensor B = Compute( + {Expr(64)}, + [&](Var i) { return lang::ReduceSum(A(i, reduce_j), {reduce_j}); }, + "B"); + ast_gen_ius::TensorGroup tensor_group({A, B}); + auto func = lang::LowerToAst("reduce_sum", {A, B}, &tensor_group); + VLOG(6) << "original func\n" << func; + + ir::ModuleExpr mod_expr({func->body}); + ir::IRSchedule ir_sch(mod_expr); + + ir_sch.Bind(ir_sch.GetLoops("B")[0], "blockIdx.x"); + ir_sch.Bind(ir_sch.GetLoops("B")[1], "threadIdx.x"); + + ir::Expr new_func = ir_sch.GetModule().GetExprs()[0]; + VLOG(6) << "After Bind: " << new_func; + + ReplaceCrossThreadReduction(&new_func); + VLOG(6) << "After ReplaceCrossThreadReduction: " << new_func; + + EXPECT_EQ(utils::GetStreamCnt(new_func), utils::Trim(R"ROC({ + ScheduleBlock(root) + { + thread_bind[blockIdx.x] for (i, 0, 64) + { + ScheduleBlock(B__reduce_init) + { + i0 = axis.bind(i) + B__reduce_init[i0] = 0.00000000f + } + thread_bind[threadIdx.x] for (reduce_j, 0, 128) + { + ScheduleBlock(B) + { + i0_0, i1 = axis.bind(i, reduce_j) + B[i0_0] = cinn_block_reduce_sum_fp32_internal(A[i0_0, i1]) + } + } + } + } +} +)ROC")); +#endif +} + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/pybind/ir/ir_api.cc b/paddle/cinn/pybind/ir/ir_api.cc index ffbfd3375bf751..2170f360f50627 100644 --- a/paddle/cinn/pybind/ir/ir_api.cc +++ b/paddle/cinn/pybind/ir/ir_api.cc @@ -843,6 +843,8 @@ void BindIrContext(py::module *m) { .def_static("MakeThenContext", []() { return IRContext(new ThenContextNode()); }); + m->def("link_to_parent_context", &pybind::LinkToParentContext); + py::class_ ir_builder(*m, "IRBuilder"); ir_builder.def(py::init<>()) .def("EnterWithContext", &IRBuilder::EnterWithContext) diff --git a/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh b/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh index a7e4dc6e1de1a3..2aefb019eac731 100644 --- a/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh +++ b/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh @@ -474,11 +474,11 @@ __device__ inline bool cinn_any(const bool left, const bool right) { return left tmp_val = __shfl_sync(mask, tmp_val, 0, 32); \ return tmp_val; \ } else { \ - tmp_val = cinn_##REDUCE_TYPE(tmp_val, __shfl_down_sync(mask, tmp_val, 16, 32)); \ - tmp_val = cinn_##REDUCE_TYPE(tmp_val, __shfl_down_sync(mask, tmp_val, 8, 32)); \ - tmp_val = cinn_##REDUCE_TYPE(tmp_val, __shfl_down_sync(mask, tmp_val, 4, 32)); \ - tmp_val = cinn_##REDUCE_TYPE(tmp_val, __shfl_down_sync(mask, 
tmp_val, 2, 32)); \ - tmp_val = cinn_##REDUCE_TYPE(tmp_val, __shfl_down_sync(mask, tmp_val, 1, 32)); \ + tmp_val = cinn_##REDUCE_TYPE(tmp_val, __shfl_xor_sync(mask, tmp_val, 16, 32)); \ + tmp_val = cinn_##REDUCE_TYPE(tmp_val, __shfl_xor_sync(mask, tmp_val, 8, 32)); \ + tmp_val = cinn_##REDUCE_TYPE(tmp_val, __shfl_xor_sync(mask, tmp_val, 4, 32)); \ + tmp_val = cinn_##REDUCE_TYPE(tmp_val, __shfl_xor_sync(mask, tmp_val, 2, 32)); \ + tmp_val = cinn_##REDUCE_TYPE(tmp_val, __shfl_xor_sync(mask, tmp_val, 1, 32)); \ return tmp_val; \ } \ } @@ -530,25 +530,22 @@ __device__ inline float cinn_warp_reduce_avg_fp32(const float *buf, int offset, #define CINN_BLOCK_REDUCE_INTERNAL_IMPL(TYPE, value, init_value, cinn_warp_shuffle_internal) \ int warp_id = threadIdx.x / 32; \ - __shared__ TYPE tmp[32]; \ - if (warp_id == 0) { \ - tmp[threadIdx.x] = init_value; \ - } \ TYPE tmp_val = cinn_warp_shuffle_internal(value); \ if (blockDim.x <= 32) { \ return tmp_val; \ } \ + __shared__ TYPE tmp[32]; \ + if (warp_id == 0) { \ + tmp[threadIdx.x] = init_value; \ + } \ __syncthreads(); \ - if (threadIdx.x % 32 == 0) { \ + if ((threadIdx.x & 31) == 0) { \ tmp[warp_id] = tmp_val; \ } \ __syncthreads(); \ if (warp_id == 0) { \ tmp_val = tmp[threadIdx.x]; \ - tmp_val = cinn_warp_shuffle_internal(tmp_val); \ - if (threadIdx.x == 0) { \ - tmp[0] = tmp_val; \ - } \ + tmp[threadIdx.x] = cinn_warp_shuffle_internal(tmp_val); \ } \ __syncthreads(); \ return tmp[0]; diff --git a/paddle/fluid/distributed/auto_parallel/dist_attr.cc b/paddle/fluid/distributed/auto_parallel/dist_attr.cc index b7bb47b3b859e9..e6c31d06e21c2d 100644 --- a/paddle/fluid/distributed/auto_parallel/dist_attr.cc +++ b/paddle/fluid/distributed/auto_parallel/dist_attr.cc @@ -83,7 +83,7 @@ OperatorDistAttr& OperatorDistAttr::operator=( void OperatorDistAttr::initialize(const OpDesc* op) { if (op == nullptr) return; - for (std::string name : op->InputArgumentNames()) { + for (std::string const& name : op->InputArgumentNames()) { VarDesc* input = op->Block()->FindVarRecursive(name); VLOG(4) << "[OperatorDistAttr create input dist attr] " << name; if (input == nullptr || op->Type() == "create_py_reader") { @@ -92,7 +92,7 @@ void OperatorDistAttr::initialize(const OpDesc* op) { input_dist_attrs_[name] = TensorDistAttr(get_tensor_shape(input)); } } - for (std::string name : op->OutputArgumentNames()) { + for (std::string const& name : op->OutputArgumentNames()) { VarDesc* output = op->Block()->FindVarRecursive(name); VLOG(4) << "[OperatorDistAttr create output dist attr] " << name; if (output == nullptr) { diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc index e8ef88c03032b9..09b4d6a2189b7a 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc @@ -57,7 +57,7 @@ std::unordered_map ShardingMergeForTensors( const bool merge_conflicts) { std::unordered_map axis_to_dim_map; std::unordered_map dim_to_axis_map; - int64_t merge_dim; + int64_t merge_dim = 0; for (auto& pair : tensor_axes_to_dim_pairs) { for (size_t i = 0; i < pair.second.size(); ++i) { diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/cross_entropy_with_softmax_spmd_rule.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/cross_entropy_with_softmax_spmd_rule.cc index f38a4b2f533b31..5cc6cf4e5e1376 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/cross_entropy_with_softmax_spmd_rule.cc +++ 
b/paddle/fluid/distributed/auto_parallel/spmd_rules/cross_entropy_with_softmax_spmd_rule.cc @@ -34,7 +34,7 @@ CrossEntropyWithSoftmaxSPMDRule::InferForward( input_specs_size)); auto x_shape = input_specs[0].shape(); - int x_ndim = x_shape.size(); + int x_ndim = static_cast(x_shape.size()); auto x_dist_attr_src = input_specs[0].dist_attr(); std::vector x_dims_mapping_src = x_dist_attr_src.dims_mapping(); @@ -176,8 +176,8 @@ CrossEntropyWithSoftmaxSPMDRule::InferBackward( const std::vector& output_specs, const paddle::framework::AttributeMap& attrs) { // step0: verify input args based on cross_entropy_with_softmax logic - int64_t ninputs = input_specs.size(); - int64_t noutputs = output_specs.size(); + int64_t ninputs = static_cast(input_specs.size()); + int64_t noutputs = static_cast(output_specs.size()); PADDLE_ENFORCE_EQ( ninputs, 2, @@ -194,7 +194,7 @@ CrossEntropyWithSoftmaxSPMDRule::InferBackward( // step1: build Einsum Notation std::vector x_shape = input_specs[0].shape(); - int64_t x_ndim = x_shape.size(); + int64_t x_ndim = static_cast(x_shape.size()); std::vector label_shape = input_specs[1].shape(); int axis = ExtractAttr("axis", attrs); @@ -205,7 +205,7 @@ CrossEntropyWithSoftmaxSPMDRule::InferBackward( // normalize axis if (axis < 0) { - axis = x_ndim + axis; + axis = static_cast(x_ndim + axis); } std::string alphabet = diff --git a/paddle/fluid/distributed/collective/process_group_custom.cc b/paddle/fluid/distributed/collective/process_group_custom.cc index f6d0c761ec7972..64dce7b4c6b116 100644 --- a/paddle/fluid/distributed/collective/process_group_custom.cc +++ b/paddle/fluid/distributed/collective/process_group_custom.cc @@ -32,23 +32,57 @@ PD_DECLARE_bool(use_stream_safe_cuda_allocator); namespace paddle { namespace distributed { +static std::mutex g_unfinished_xccl_task_events_mutex; +static std::list> + g_unfinished_xccl_task_events; + ProcessGroupCustom::XCCLTask::XCCLTask(const Place& place, int rank, CommType comm_type, bool sync_op, bool use_calc_stream) : TaskStream(rank, comm_type, sync_op, use_calc_stream), - task_place_(place) { - comm_event_.Init(place); + task_place_(place), + comm_event_(std::make_unique()) { + comm_event_->Init(task_place_); +} + +ProcessGroupCustom::XCCLTask::XCCLTask( + const std::vector& places, + int rank, + CommType CommType, + const std::vector& inputs) + : TaskStream(rank, inputs, CommType), + task_place_(places[0]), + comm_event_(std::make_unique()) { + comm_event_->Init(task_place_); } -ProcessGroupCustom::XCCLTask::~XCCLTask() = default; +ProcessGroupCustom::XCCLTask::~XCCLTask() { + if (!IsCompleted()) { + std::lock_guard lock(g_unfinished_xccl_task_events_mutex); + g_unfinished_xccl_task_events.push_back(std::move(comm_event_)); + } +} -bool ProcessGroupCustom::XCCLTask::IsCompleted() { return comm_event_.Query(); } +bool ProcessGroupCustom::XCCLTask::IsCompleted() { + return comm_event_->Query(); +} void ProcessGroupCustom::XCCLTask::UpdateWaitChain( const phi::DeviceContext& ctx) { - comm_event_.Record( + { + std::lock_guard lock(g_unfinished_xccl_task_events_mutex); + for (auto iter = g_unfinished_xccl_task_events.begin(); + iter != g_unfinished_xccl_task_events.end();) { + if ((*iter)->Query()) { + iter = g_unfinished_xccl_task_events.erase(iter); + } else { + iter++; + } + } + } + comm_event_->Record( reinterpret_cast(ctx).GetStream().get()); } @@ -62,7 +96,7 @@ bool ProcessGroupCustom::XCCLTask::Wait(std::chrono::milliseconds timeout) { const auto* calc_ctx = reinterpret_cast( 
platform::DeviceContextPool::Instance().Get(task_place_)); - calc_ctx->GetStream()->WaitEvent(&comm_event_); + calc_ctx->GetStream()->WaitEvent(comm_event_.get()); if (IsBlockCPUInWait()) { // If we use the work to do barrier, we should block cpu @@ -494,7 +528,7 @@ void ProcessGroupCustom::CreateXCCLEnvCache(const Place& place, << ", place: " << place_key; phi::distributed::CommContextManager::CreateXCCLCommContext( - store_, std::to_string(gid_), place.GetDeviceType(), rank_, size_); + store_, std::to_string(gid_), place, rank_, size_); auto* calc_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); @@ -590,15 +624,6 @@ std::shared_ptr ProcessGroupCustom::CreateTask( places, rank, comm_type, inputs); } -ProcessGroupCustom::XCCLTask::XCCLTask( - const std::vector& places, - int rank, - CommType CommType, - const std::vector& inputs) - : TaskStream(rank, inputs, CommType), task_place_(places[0]) { - comm_event_.Init(places[0]); -} - // create XCCLManager cache for places_key void ProcessGroupCustom::CreateXCCLManagerCache( const std::string& places_key, const std::vector& places) { @@ -676,7 +701,7 @@ std::shared_ptr ProcessGroupCustom::Collective( { std::lock_guard lock(mutex_); if (place_to_comm_ctx_.find(key) == place_to_comm_ctx_.end()) { - CreateXCCLManagerCache(key, places); + CreateXCCLEnvCache(places[0], key); } } diff --git a/paddle/fluid/distributed/collective/process_group_custom.h b/paddle/fluid/distributed/collective/process_group_custom.h index c60d185c9e4808..13970b2e349a0e 100644 --- a/paddle/fluid/distributed/collective/process_group_custom.h +++ b/paddle/fluid/distributed/collective/process_group_custom.h @@ -61,8 +61,8 @@ class ProcessGroupCustom final : public ProcessGroupWithStream { private: bool block_cpu_in_wait_{false}; - phi::event::Event comm_event_; // event on comm stream Place task_place_; + std::unique_ptr comm_event_; // event on comm stream }; public: diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 9a048b64e49838..80609b9fd68289 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -571,7 +571,7 @@ void EagerReducer::InitializeGroups( tensor_locator.inside_group_index = inside_group_index++; variable_locators_[var_index] = tensor_locator; } - group.tensor_indices_ = std::move(tensor_indices_); + group.tensor_indices_ = tensor_indices_; groups_.emplace_back(std::move(group)); VLOG(3) << "The Group[" << group_index << "]:" << groups_.back(); @@ -757,9 +757,7 @@ void EagerReducer::AddDistHook(size_t var_index) { auto *autograd_meta = tensors_[var_index].get_autograd_meta(); auto &grad_tensor = static_cast(autograd_meta)->Grad(); - if (!HasGrad(var_index)) { - group_tensor.ShareDataWith(phi::DenseTensor()); - } else { + if (HasGrad(var_index)) { auto grad_dense_tensor = *(std::dynamic_pointer_cast(grad_tensor.impl())); group_tensor.ShareDataWith(grad_dense_tensor); @@ -987,6 +985,7 @@ void EagerReducer::ProcessUnusedDenseVars() { opts.reduce_op = ReduceOp::SUM; std::vector reduce_tensors = {global_used_vars_}; std::vector in_out; + in_out.reserve(reduce_tensors.size()); for (auto &t : reduce_tensors) { in_out.push_back(*std::dynamic_pointer_cast(t.impl())); } @@ -1083,6 +1082,7 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group, // all_reduce std::vector reduce_tensors = {group->dense_contents_}; std::vector in_out; + in_out.reserve(reduce_tensors.size()); for (auto &t : reduce_tensors) { 
in_out.push_back(*std::dynamic_pointer_cast(t.impl())); } @@ -1168,6 +1168,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, opts.reduce_op = ReduceOp::SUM; std::vector reduce_tensors = {rows_num_tensor}; std::vector in_out; + in_out.reserve(reduce_tensors.size()); for (auto &t : reduce_tensors) { in_out.push_back(*std::dynamic_pointer_cast(t.impl())); } @@ -1216,6 +1217,8 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, std::vector dst_rows_tensors = {dst_rows_tensor}; std::vector in; std::vector out; + in.reserve(src_rows_tensors.size()); + out.reserve(dst_rows_tensors.size()); for (auto &t : src_rows_tensors) { in.push_back(*std::dynamic_pointer_cast(t.impl())); } @@ -1247,6 +1250,8 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, std::vector dst_value_tensors = {dst_value_tensor}; std::vector src_dense; std::vector dst_dense; + src_dense.reserve(src_value_tensors.size()); + dst_dense.reserve(dst_value_tensors.size()); for (auto &t : src_value_tensors) { src_dense.push_back( *std::dynamic_pointer_cast(t.impl())); diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 6dc25faa80b4be..82a3514f2791f9 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -121,7 +121,7 @@ void Carrier::CopyParameters( const framework::ProgramDesc& program, const std::vector& inference_root_scope_vars) { std::map inference_root_scope_var_map; - for (auto var_name : inference_root_scope_vars) { + for (auto const& var_name : inference_root_scope_vars) { inference_root_scope_var_map.insert({var_name, 1}); } for (size_t i = 0; i < program.Size(); ++i) { @@ -392,6 +392,7 @@ void Carrier::CreateInterceptors( } } + cores.reserve(microbatch_scopes_.size()); for (framework::Scope* scope : microbatch_scopes_) { cores.push_back(std::make_shared( place_, task_node->program()->Block(0), scope, execution_config)); diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 5bf026661d5146..7817b9bc0e9dfe 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -85,7 +85,7 @@ InterceptorMessage ComputeInterceptor::PrepareVarsMsg() { ready_msg.set_message_type(DATA_WITH_VARS); ready_msg.set_scope_idx(cur_scope_id_); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - for (auto iter : node_->vars_to_dtype()) { + for (auto const& iter : node_->vars_to_dtype()) { VarList* vars = ready_msg.add_vars_list(); const auto& var_name = iter.first; vars->set_name(var_name); diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index 2a6da1b437a1b6..dc89c551fdc711 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -47,7 +47,7 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, const platform::Place &place) { VLOG(3) << "Loading data from DistModelTensor for " << input_data.name; framework::DDim dims = phi::make_ddim(input_data.shape); - void *input_tensor_ptr; + void *input_tensor_ptr = nullptr; if (input_data.dtype == DistModelDataType::INT64) { input_tensor_ptr = input_tensor->mutable_data(dims, place); } else if (input_data.dtype == DistModelDataType::FLOAT32) { @@ -295,7 +295,7 @@ void 
DistModel::InsertCommOp(std::string tmp_var_name, << ". The ring id is: " << ring_id << ". The group has: " << nranks << " ranks. Current rank in the group is: " << rank << ". The endpoint is: " << endpoint << ". Peer endpoints are: "; - for (auto ep : peer_endpoints) { + for (const auto &ep : peer_endpoints) { ss << ep << ", "; } VLOG(3) << ss.str(); diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index 8daf0636ce890a..99dd6175787e86 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -82,7 +82,7 @@ void PreventVarsDelete( for (const auto& pair : *unused_vars) { const framework::OperatorBase* op = pair.first; std::vector cur_unused = pair.second; - for (auto name : vars_not_gc) { + for (auto const& name : vars_not_gc) { auto iter = std::find(cur_unused.begin(), cur_unused.end(), name); if (iter != cur_unused.end()) { VLOG(3) << "Removing var: [" << name @@ -165,7 +165,7 @@ void FleetExecutor::Init( while_block_vars = GetUnusedVarsAfterWhile( program_desc, task_node, inference_root_scope_vars); VLOG(3) << "Vars will be gced after while op"; - for (auto var : while_block_vars) { + for (auto const& var : while_block_vars) { VLOG(3) << var; } task_node->SetWhileBlockVars(while_block_vars); diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index 0dd44c2318eec1..aaae9761330254 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -1,157 +1,115 @@ set_source_files_properties( table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old( +cc_test( table_test - SRCS - table_test.cc - DEPS - common_table - table - ps_framework_proto - ${COMMON_DEPS} - ${RPC_DEPS}) + SRCS table_test.cc + DEPS common_table table ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) set_source_files_properties( dense_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old( +cc_test( dense_table_test - SRCS - dense_table_test.cc - DEPS - common_table - table - ps_framework_proto - ${COMMON_DEPS} - ${RPC_DEPS}) + SRCS dense_table_test.cc + DEPS common_table table ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) set_source_files_properties( barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old( +cc_test( barrier_table_test - SRCS - barrier_table_test.cc - DEPS - common_table - table - ps_framework_proto - ${COMMON_DEPS}) + SRCS barrier_table_test.cc + DEPS common_table table ps_framework_proto ${COMMON_DEPS}) set_source_files_properties( brpc_service_dense_sgd_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old( +cc_test( brpc_service_dense_sgd_test - SRCS - brpc_service_dense_sgd_test.cc - DEPS - scope - ps_service - table - ps_framework_proto - ${COMMON_DEPS}) + SRCS brpc_service_dense_sgd_test.cc + DEPS scope ps_service table ps_framework_proto ${COMMON_DEPS}) set_source_files_properties( brpc_service_sparse_sgd_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old( +cc_test( brpc_service_sparse_sgd_test - SRCS - brpc_service_sparse_sgd_test.cc - DEPS - scope - ps_service - table - ps_framework_proto - ${COMMON_DEPS}) + SRCS brpc_service_sparse_sgd_test.cc + DEPS scope ps_service table ps_framework_proto ${COMMON_DEPS}) set_source_files_properties( brpc_utils_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) 
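(The CMake test migration continues below.) On the recurring `for (auto x : ...)` to `for (auto const& x : ...)` fixes in the hunks above: a by-value loop variable copies every element, which is pure overhead when the body only reads. A self-contained demonstration with an invented copy-counting type:

// Counts copy-constructions to show what iterating by value costs.
#include <iostream>
#include <vector>

struct Probe {
  Probe() = default;
  Probe(const Probe&) { ++copies; }
  static int copies;
};
int Probe::copies = 0;

int main() {
  std::vector<Probe> v(3);
  for (auto p : v) {  // by value: one copy per element
    (void)p;
  }
  std::cout << Probe::copies << "\n";  // 3
  for (auto const& p : v) {  // by reference: no copies
    (void)p;
  }
  std::cout << Probe::copies << "\n";  // still 3
  return 0;
}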
-cc_test_old( +cc_test( brpc_utils_test - SRCS - brpc_utils_test.cc - DEPS - brpc_utils - scope - phi - sendrecv_rpc - ps_service - ${COMMON_DEPS} - ${RPC_DEPS}) + SRCS brpc_utils_test.cc + DEPS brpc_utils + scope + phi + sendrecv_rpc + ps_service + ${COMMON_DEPS} + ${RPC_DEPS}) set_source_files_properties( graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old( +cc_test( graph_node_test - SRCS - graph_node_test.cc - DEPS - scope - ps_service - table - ps_framework_proto - ${COMMON_DEPS}) + SRCS graph_node_test.cc + DEPS scope ps_service table ps_framework_proto ${COMMON_DEPS}) set_source_files_properties( graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old( +cc_test( graph_node_split_test - SRCS - graph_node_split_test.cc - DEPS - scope - ps_service - table - ps_framework_proto - ${COMMON_DEPS}) + SRCS graph_node_split_test.cc + DEPS scope ps_service table ps_framework_proto ${COMMON_DEPS}) set_source_files_properties( graph_table_sample_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old( +cc_test( graph_table_sample_test - SRCS - graph_table_sample_test.cc - DEPS - table - ps_framework_proto - ${COMMON_DEPS}) + SRCS graph_table_sample_test.cc + DEPS table ps_framework_proto ${COMMON_DEPS}) set_source_files_properties( feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old( +cc_test( feature_value_test - SRCS - feature_value_test.cc - DEPS - table - common_table - sendrecv_rpc - ${COMMON_DEPS}) + SRCS feature_value_test.cc + DEPS table common_table sendrecv_rpc ${COMMON_DEPS}) set_source_files_properties( sparse_sgd_rule_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old(sparse_sgd_rule_test SRCS sparse_sgd_rule_test.cc DEPS - ${COMMON_DEPS} table) +cc_test( + sparse_sgd_rule_test + SRCS sparse_sgd_rule_test.cc + DEPS ${COMMON_DEPS} table) set_source_files_properties( ctr_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old(ctr_accessor_test SRCS ctr_accessor_test.cc DEPS ${COMMON_DEPS} - table) +cc_test( + ctr_accessor_test + SRCS ctr_accessor_test.cc + DEPS ${COMMON_DEPS} table) set_source_files_properties( ctr_dymf_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old(ctr_dymf_accessor_test SRCS ctr_dymf_accessor_test.cc DEPS - ${COMMON_DEPS} table) +cc_test( + ctr_dymf_accessor_test + SRCS ctr_dymf_accessor_test.cc + DEPS ${COMMON_DEPS} table) set_source_files_properties( memory_sparse_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old(memory_sparse_table_test SRCS memory_sparse_table_test.cc DEPS - ${COMMON_DEPS} table) +cc_test( + memory_sparse_table_test + SRCS memory_sparse_table_test.cc + DEPS ${COMMON_DEPS} table) set_source_files_properties( memory_geo_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old(memory_sparse_geo_table_test SRCS memory_geo_table_test.cc DEPS - ${COMMON_DEPS} table) +cc_test( + memory_sparse_geo_table_test + SRCS memory_geo_table_test.cc + DEPS ${COMMON_DEPS} table) diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 96210c16dd9ef5..ab155de79feedd 100755 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -75,3 +75,7 @@ cc_library( generated_op autograd_meta hook_utils) +# FIXME(Aurelius84): It seems utils library is depended in cycle, but +# CMake only find it twice to deal cycle depend problem. 
If it is still
+# not found, an ld error will be raised.
+set_target_properties(utils PROPERTIES LINK_INTERFACE_MULTIPLICITY 3)
diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc
index ed83bb29714ffe..2e4489fdcc12ee 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc
@@ -74,6 +74,12 @@ MultiplyGradNode::operator()(
   // Runtime check if we need next grad
   bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph;
 
+  // Set DistAttr of Out Tensor for semi-auto parallel
+  if (IsRunAutoParallel()) {
+    egr::EagerUtils::SetGradOutputDistAttr(
+        out_metas, {0, 1}, api_output_0, api_output_1);
+  }
+
   // Inplace Check
 
   // Inplace Strategy
diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
index eb5cd7fb1242da..c2613dffa201d5 100644
--- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc
+++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -410,7 +410,7 @@ static std::pair<std::string, std::string> GetAttrType(
       ret = "std::vector<std::string>";
       if (is_arg) ret += "&";
       val += "{";
-      for (auto x : PADDLE_GET_CONST(std::vector<std::string>, attr)) {
+      for (auto const& x : PADDLE_GET_CONST(std::vector<std::string>, attr)) {
         val += "\"" + x + "\"" + ",";
       }
       if (val.size() > 1) val.pop_back();
@@ -1238,7 +1238,7 @@ static std::string GenerateGradNodeCreationContent(
       bool found_target_name = false;
       for (const auto& iter : op_base_infos) {
         const auto& grad_outs_slot_map = iter.GetGradOutsSlotnameMap();
-        for (auto iter : grad_outs_slot_map) {
+        for (auto const& iter : grad_outs_slot_map) {
           if ((!found_target_name) && (input_name == iter.second)) {
             const char* SET_GRAD_OUT_META_TEMPLATE =
                 "      grad_node->SetGradOutMeta(%s, %d);\n";
@@ -1256,7 +1256,7 @@ static std::string GenerateGradNodeCreationContent(
       bool found_target_name = false;
       for (const auto& iter : op_base_infos) {
         const auto& grad_outs_slot_map = iter.GetGradOutsSlotnameMap();
-        for (auto iter : grad_outs_slot_map) {
+        for (auto const& iter : grad_outs_slot_map) {
           if ((!found_target_name) && (input_name == iter.second)) {
             const char* SET_GRAD_OUT_META_TEMPLATE =
                 "      grad_node->SetGradOutMeta(%s, %d);\n";
@@ -1877,6 +1877,18 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
   trace_op_body_str += trace_op_str;
   trace_op_body_str += "\n";
 
+  // [Generation] Log memory information
+  const char* LOG_MEMORY_INFO_TEMPLATE =
+      "  // Log memory information\n"
+      "  "
+      "paddle::memory::LogDeviceMemoryStats(egr::Controller::Instance()."
+ "GetExpectedPlace(), \"%s\");\n"; + std::string log_memory_info_str = + paddle::string::Sprintf(LOG_MEMORY_INFO_TEMPLATE, op_type); + + trace_op_body_str += log_memory_info_str; + trace_op_body_str += "\n"; + VLOG(6) << "Generated AttrMap & TraceOp"; // [Generation] Convert output VarBase to Vector/Tensor @@ -2142,7 +2154,7 @@ static std::string GenerateSingleOpBase( // [Generation] Get Full Zero std::string fill_zero_str = ""; if (ops_to_fill_zero_for_empty_grads.count(fwd_op_type)) { - for (auto iter : grad_ins) { + for (auto const& iter : grad_ins) { const std::string& grad_input_name = iter.first; if (grad_ins_grad_slotname_map.count(grad_input_name)) { size_t fwd_output_position = fwd_outputs_name_pos_map.at( @@ -2189,7 +2201,7 @@ static std::string GenerateSingleOpBase( "backward_inplace_tensor" + std::to_string(*outs_size); bool process_backward_inplace = false; std::string ins_contents_str = ""; - for (auto iter : grad_ins) { + for (auto const& iter : grad_ins) { const std::string& grad_input_name = iter.first; if (grad_ins_fwd_slotname_map.count(grad_input_name)) { @@ -2293,7 +2305,7 @@ static std::string GenerateSingleOpBase( paddle::string::Sprintf(BWD_INS_MAP_TEMPLATE, ins_name, ins_contents_str); generated_grad_function_body += ins_map_str; - for (auto iter : grad_ins) { + for (auto const& iter : grad_ins) { const std::string& grad_input_name = iter.first; if (grad_ins_fwd_slotname_map.count(grad_input_name)) { @@ -2335,7 +2347,7 @@ static std::string GenerateSingleOpBase( VLOG(6) << "Generated Ins Map"; // [Generation] Get Outs Map std::string outs_contents_str = ""; - for (auto iter : grad_outs) { + for (auto const& iter : grad_outs) { const std::string& grad_output_name = iter.first; if (grad_outs_slotname_map.count(grad_output_name)) { @@ -2440,7 +2452,7 @@ static std::string GenerateSingleOpBase( generated_grad_function_body += outs_map_str; generated_grad_function_body += outs_contents_str; generated_grad_function_body += "\n"; - for (auto iter : grad_outs) { + for (auto const& iter : grad_outs) { const std::string& grad_output_name = iter.first; if (grad_outs_slotname_map.count(grad_output_name)) { @@ -2498,7 +2510,7 @@ static std::string GenerateSingleOpBase( "%s[\"%s\"][0]);\n" " };\n"; std::string backward_inplace_map_str = ""; - for (auto iter : backward_inplace_map) { + for (auto const& iter : backward_inplace_map) { std::string backward_inplace_input_name = iter.first; std::string backward_inplace_output_name = iter.second; backward_inplace_map_str += paddle::string::Sprintf( @@ -2553,7 +2565,7 @@ static std::string GenerateSingleOpBase( // [Generation] Get Return std::string outputs_str = ""; size_t num_appended_outputs = 0; - for (auto iter : grad_outs) { + for (auto const& iter : grad_outs) { const std::string& grad_out_name = iter.first; const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); @@ -2594,7 +2606,7 @@ static std::string GenerateSingleOpBase( /* Handle Special Case: "PullSparseOp", etc For returns, append "GradOut" to the very end of return list. 
 */
-  for (auto iter : grad_outs) {
+  for (auto const& iter : grad_outs) {
     const std::string& grad_out_name = iter.first;
     const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name);
 
@@ -2968,6 +2980,7 @@ static std::string GenerateDygraphHFileIncludes() {
       "#pragma once\n"
       "#include \"glog/logging.h\"\n"
       "#include \"paddle/fluid/eager/autograd_meta.h\"\n"
+      "#include \"paddle/fluid/memory/stats.h\"\n"
       "#include \"paddle/phi/api/all.h\"\n"
       "#include \"paddle/fluid/eager/utils.h\"\n"
      "#include \"paddle/fluid/imperative/tracer.h\"\n"
diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
index 073c5588b1eb86..ab0e3d6a3e3700 100644
--- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
@@ -270,6 +270,8 @@ class {} : public egr::GradNodeBase {{
 {}
   // Forward API Call
+{}
+  // Log memory information
 {}
   // Check NaN and Inf if needed
 {}
@@ -320,6 +322,8 @@ class {} : public egr::GradNodeBase {{
   // Before log info
 {}
   // Forward API Call
+{}
+  // Log memory information
 {}
   // Check NaN and Inf if needed
 {}
@@ -412,6 +416,7 @@ class {} : public egr::GradNodeBase {{
 #include "paddle/fluid/prim/api/all.h"
 #include "paddle/fluid/prim/utils/utils.h"
 #include "paddle/phi/core/flags.h"
+#include "paddle/fluid/memory/stats.h"
 #include "paddle/phi/api/lib/data_transform.h"
 PHI_DECLARE_bool(check_nan_inf);
 {}
@@ -1742,6 +1747,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced):
         forward_call_str = f"{indent}{api_out_type} api_result = paddle::experimental::{namespace}{function_name}({inputs_call_args_str_tmp});"
         dygraph_event_str = f"{indent}paddle::platform::RecordEvent dygraph_entrance_record_event(\"{forward_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);\n"
+        log_memory_info_str = f"{indent}paddle::memory::LogDeviceMemoryStats(egr::Controller::Instance().GetExpectedPlace(), \"{forward_api_name}\");"
         forward_ad_function_name = GetDygraphForwardFunctionName(
             forward_api_name
         )
@@ -1828,6 +1834,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced):
             forward_api_name,
             before_log_str,
             forward_call_str,
+            log_memory_info_str,
             check_nan_inf_str,
             get_outputs_str,
             forward_api_name,
@@ -1854,6 +1861,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced):
             node_creation_pre_contiguous_str,
             node_creation_before_call_str,
             forward_call_str,
+            log_memory_info_str,
             check_nan_inf_str,
             get_outputs_str,
             outputs_autograd_meta_str,
diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc
index 685ff8d4b72975..8aa2f64ccb2ec5 100644
--- a/paddle/fluid/eager/backward.cc
+++ b/paddle/fluid/eager/backward.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/eager/backward.h"
 
 #include "paddle/fluid/eager/general_grad.h"
+#include "paddle/fluid/memory/stats.h"
 #include "paddle/phi/kernels/autotune/switch_autotune.h"
 
 namespace egr {
@@ -85,7 +86,7 @@ void EnforceGradNodeHasInput(GradNodeBase* node) {
 
 void DuplicateCheck(const std::vector<paddle::Tensor>& inputs, bool is_input) {
   std::unordered_set<AutogradMeta*> visisted_ins;
   std::string msg = is_input ?
"inputs" : "outputs"; - for (auto in : inputs) { + for (auto const& in : inputs) { AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in); PADDLE_ENFORCE_EQ( visisted_ins.count(auto_grad_meta), @@ -111,6 +112,8 @@ std::vector RunBackward( const std::vector& no_grad_vars = {}) { VLOG(3) << "Start Backward"; + auto place = egr::Controller::Instance().GetExpectedPlace(); + std::queue force_sequential_nodes_forward_queue = egr::Controller::Instance().GetForceSequentialNodes(); std::deque force_sequential_nodes_queue; @@ -378,9 +381,9 @@ std::vector RunBackward( auto add_next_node_func = [&node_in_degree_map, &queue](GradNodeBase* next_node) { if (dynamic_cast(next_node)) { - queue.push_front(std::move(next_node)); + queue.push_front(next_node); } else { - queue.push_back(std::move(next_node)); + queue.push_back(next_node); } }; if (node_in_degree_map[next_node] == 0) { @@ -405,6 +408,7 @@ std::vector RunBackward( } } } + paddle::memory::LogDeviceMemoryStats(place, std::string((*node).name())); } VLOG(7) << "Run Backward Final hook size: " diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index af914d3ae3c791..5643c0e69391f0 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -154,7 +154,7 @@ static void ConstructFwdAndBwdMap( << "'s No." << j << " attrs: " << attrs_names[j] << " related to No." << i << " grad_attrs: " << grad_attrs_names[i]; - in_out_map[op_type][1][4][j] = i; + in_out_map[op_type][1][4][j] = i; // NOLINT } } } @@ -190,12 +190,12 @@ RunCustomOpNode::operator()(paddle::small_vector, } } - for (auto it : fwd_outs) { + for (auto it : fwd_outs) { // NOLINT VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first; tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second)); } - for (auto it : fwd_ins) { + for (auto it : fwd_ins) { // NOLINT // NOTE(HongyuJia): returned tensor maybe un-defined tensor when inputs // optional VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first; @@ -406,12 +406,12 @@ RunCustomOpDoubleGradNode::operator()( } } - for (auto it : fwd_outs) { + for (auto it : fwd_outs) { // NOLINT VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first; tmp_ins[it.first] = RunCustomOpDoubleGradNode::Recover(&(it.second)); } - for (auto it : fwd_ins) { + for (auto it : fwd_ins) { // NOLINT VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first; tmp_ins[it.first] = RunCustomOpDoubleGradNode::Recover(&(it.second)); } diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 3ceeda65c8e611..41f8f8eb4c3fbd 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -277,6 +277,12 @@ void GradNodeBase::SetGradOutMeta(const paddle::Tensor& fwd_in, meta.SetTensorMeta(dense_tensor.meta()); meta.SetPlace(fwd_in.place()); // Set DistAttr + // Forward input DistTensor could be uninitialized. 
+ PADDLE_ENFORCE_NE( + dist_tensor->dist_attr().empty(), + true, + phi::errors::InvalidArgument( + "The forward input DistTensor's dist attr is empty.")); meta.SetDistAttr(dist_tensor->dist_attr()); SetIsRunAutoParallel(true); } else { diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 5051dd39d9819f..34469f875198b7 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -89,11 +89,11 @@ void GradTensorHolder::CopyValueFromTensor(size_t slot_id, auto init_grad = paddle::experimental::full(t.shape(), 1, t.dtype(), t.place()); auto global_dense_t = - static_cast(init_grad.impl().get()); + std::static_pointer_cast(init_grad.impl()); auto dist_t = static_cast(t.impl().get()); init_grad.set_impl(std::make_shared( - *global_dense_t, dist_t->dist_attr())); + global_dense_t, dist_t->dist_attr())); buffer_[slot_id][rank] = init_grad; } else { PADDLE_THROW(paddle::platform::errors::Fatal( diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 0ac940ab496e29..83e4424a212514 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -32,6 +32,7 @@ #include "paddle/pir/core/value.h" PHI_DECLARE_bool(enable_new_ir_in_executor); +PHI_DECLARE_bool(print_ir); namespace details { using Tensor = paddle::Tensor; @@ -191,6 +192,12 @@ static auto GetNameFromValue(const ::pir::Block *block, .dyn_cast() .AsString(); value2name[op->operand(0).source()] = name; + } else if (!is_input && op->name() == "builtin.shadow_output") { + name = op->attributes() + .at("output_name") + .dyn_cast() + .AsString(); + value2name[op->operand(0).source()] = name; } else if (is_input && op->name() == "builtin.get_parameter") { name = op->attributes() .at("parameter_name") @@ -463,12 +470,13 @@ inline void NewIRRunProgramAPI( auto *backward_program = backward_global_block->GetParentOp()->GetParentProgram(); - if (VLOG_IS_ON(4)) { + if (FLAGS_print_ir) { std::ostringstream print_stream; + print_stream << "ForwardProgram is :\n"; forward_program->Print(print_stream); - print_stream << "\n"; + print_stream << "BackwardProgram is:\n"; backward_program->Print(print_stream); - VLOG(4) << print_stream.str(); + std::cout << "Program (fwd | bwd): \n" << print_stream.str() << std::endl; } VLOG(10) << is_test << program_id; diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index eb2dcca4d3b314..28ca8636720dcd 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -28,6 +28,38 @@ #include "paddle/fluid/framework/variable.h" namespace egr { + +void SetGradOutputDistAttrIter::visit_element(paddle::Tensor* element, + const GradSlotMeta& meta) { + if (element == nullptr) { + VLOG(4) << "The input element is nullptr when calling " + "SetGradOutputDistAttrIter."; + return; + } + // Here the element is empty or defined DistTensor + VLOG(4) << "The input element is set DistTensor impl when calling " + "SetGradOutputDistAttrIter."; + element->set_impl(std::make_shared( + phi::DDim(), meta.DistAttr())); +} + +void SetGradOutputDistAttrIter::visit(paddle::Tensor* element) { + if (!out_meta_[out_indexes_[cur_pos_]].empty()) { + visit_element(element, out_meta_[out_indexes_[cur_pos_]][0]); + } + cur_pos_++; +} + +void SetGradOutputDistAttrIter::visit( + const std::vector& elements) { + if (!out_meta_[out_indexes_[cur_pos_]].empty()) { + for (size_t i = 0; i < elements.size(); ++i) { + 
visit_element(elements.at(i), out_meta_[out_indexes_[cur_pos_]][i]); + } + } + cur_pos_++; +} + /** * Implementation of Eager Utils. **/ diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index c1fe208c4c72a8..8dd950be0cbe22 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -97,41 +97,9 @@ class SetGradOutputDistAttrIter : public IterHelper { : out_meta_(out_meta), out_indexes_{out_indexes} {} private: - void visit_element(paddle::Tensor* element, const GradSlotMeta& meta) { - if (element == nullptr) { - return; - } - if (meta.DistAttr().empty()) { - return; - } - if (element->defined()) { - if (element->is_dist_tensor()) { - PADDLE_THROW(phi::errors::Unimplemented( - "Unsupport set defined dist tensor now.")); - } else { - // Only deal with dist tensor here - return; - } - } else { - element->set_impl(std::make_shared( - phi::DDim(), meta.DistAttr())); - } - } - void visit(paddle::Tensor* element) override { - if (!out_meta_[out_indexes_[cur_pos_]].empty()) { - visit_element(element, out_meta_[out_indexes_[cur_pos_]][0]); - } - cur_pos_++; - } - - void visit(const std::vector& elements) override { - if (!out_meta_[out_indexes_[cur_pos_]].empty()) { - for (size_t i = 0; i < elements.size(); ++i) { - visit_element(elements.at(i), out_meta_[out_indexes_[cur_pos_]][i]); - } - } - cur_pos_++; - } + void visit_element(paddle::Tensor* element, const GradSlotMeta& meta); + void visit(paddle::Tensor* element) override; + void visit(const std::vector& elements) override; const paddle::small_vector, kSlotSmallVectorSize>& out_meta_; diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 8814935e3fceb5..81075e0c5fb5bd 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -110,7 +110,7 @@ static void RunKernelFunc( // tensor here. custom_vec_in.emplace_back(paddle::Tensor()); } - kernel_ctx.EmplaceBackInputs(std::move(custom_vec_in)); + kernel_ctx.EmplaceBackInputs(custom_vec_in); } else { // inputs Tensor if (ctx.HasInput(in_name)) { // general Tensor inputs auto* x = ctx.Input(in_name); @@ -231,7 +231,7 @@ static void RunKernelFunc( custom_t.set_impl(std::make_shared(*out)); custom_vec_out.emplace_back(custom_t); } - kernel_ctx.EmplaceBackOutputs(std::move(custom_vec_out)); + kernel_ctx.EmplaceBackOutputs(custom_vec_out); } else { // handle inplace optional outputs = None case if (!ctx.HasOutput(out_name)) { @@ -318,7 +318,7 @@ static void RunKernelFunc( } } } catch (platform::EnforceNotMet& exception) { - throw std::move(exception); + throw exception; } catch (std::exception& ex) { PADDLE_THROW(platform::errors::External("%s", ex.what())); } catch (...) 
{ diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 902cd0f39369a2..19c5196d2f933a 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -1808,7 +1808,7 @@ int PaddleBoxDataFeed::Next() { output_pv_channel_->Get(pv_instance); pv_vec.push_back(pv_instance); ++index; - consume_pv_channel_->Put(std::move(pv_instance)); + consume_pv_channel_->Put(pv_instance); } this->batch_size_ = index; VLOG(3) << "pv_batch_size_=" << this->batch_size_ @@ -2448,9 +2448,9 @@ bool SlotRecordInMemoryDataFeed::ParseOneInstance(const std::string& line, } // parse_logkey std::string log_key = std::string(str + pos, len); - uint64_t search_id; - uint32_t cmatch; - uint32_t rank; + uint64_t search_id = 0; + uint32_t cmatch = 0; + uint32_t rank = 0; parser_log_key(log_key, &search_id, &cmatch, &rank); rec->ins_id_ = log_key; diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 18ed0bb6e901aa..6c66188567717f 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -138,7 +138,7 @@ std::vector DatasetImpl::GetSlots() { } } std::cout << "dataset use slots: "; - for (auto s : use_slots_) { + for (auto const& s : use_slots_) { std::cout << s << " | "; } std::cout << " end " << std::endl; @@ -216,7 +216,7 @@ template std::vector DatasetImpl::GetReaders() { std::vector ret; ret.reserve(readers_.size()); - for (auto i : readers_) { + for (auto const& i : readers_) { ret.push_back(i.get()); } return ret; @@ -1533,7 +1533,7 @@ void MultiSlotDataset::MergeByInsId() { break; } local_uint64.insert(slot); - rec.uint64_feasigns_.push_back(std::move(feature)); + rec.uint64_feasigns_.push_back(feature); } if (has_conflict_slot) { break; @@ -1550,7 +1550,7 @@ void MultiSlotDataset::MergeByInsId() { break; } local_float.insert(slot); - rec.float_feasigns_.push_back(std::move(feature)); + rec.float_feasigns_.push_back(feature); } if (has_conflict_slot) { break; diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 0615df45b76793..eec3439cf04316 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -52,11 +52,11 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( const std::vector &local_exec_scopes, const std::vector &places, std::vector graphs) - : strategy_(std::move(strategy)), - local_scopes_(std::move(local_scopes)), + : strategy_(strategy), + local_scopes_(local_scopes), local_exec_scopes_(local_exec_scopes), pool_(places.size() >= 2 ? 
new ::ThreadPool(places.size()) : nullptr), - places_(std::move(places)), + places_(places), graphs_(std::move(graphs)) { VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index b71c476a2c95e2..27be4b77176350 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -113,7 +113,7 @@ void FetchOpHandle::WaitAndMergeCPUFetchVars() const { } } else { auto &val = PADDLE_GET(FetchUnmergedList, *data_); - val.at(offset_) = std::move(tensors_); + val.at(offset_) = tensors_; } } diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc index 79b43b1b501db6..0aae1ce6b60d73 100644 --- a/paddle/fluid/framework/details/gather_op_handle.cc +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -45,7 +45,7 @@ void GatherOpHandle::RunImpl() { in_var_handles.size(), places_.size())); - VarHandle *out_var_handle; + VarHandle *out_var_handle = nullptr; { auto out_var_handles = DynamicCast(this->Outputs()); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/framework/details/multi_devices_helper.cc b/paddle/fluid/framework/details/multi_devices_helper.cc index 4849ca34e3e956..d2379c2c49a19d 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.cc +++ b/paddle/fluid/framework/details/multi_devices_helper.cc @@ -176,7 +176,7 @@ static bool IsDataParallelInferenceGraphImpl( } bool IsDataParallelInferenceGraph(const ir::Graph &graph) { - size_t place_num; + size_t place_num = 0; std::unordered_map op_to_dev_idx; return IsDataParallelInferenceGraphImpl(graph, &op_to_dev_idx, &place_num); } @@ -196,7 +196,7 @@ bool IsDataParallelInferenceGraph(const ir::Graph &graph) { */ std::vector> TrySeparateToMultipleSingleDeviceGraphs( ir::Graph *graph) { - size_t place_num; + size_t place_num = 0; std::unordered_map op_to_dev_idx; if (!IsDataParallelInferenceGraphImpl(*graph, &op_to_dev_idx, &place_num)) { return {}; diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 42f97e975ed3c2..b917c161193fbe 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -105,8 +105,8 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( const std::vector &local_exec_scopes, const std::vector &places, std::vector> graphs) - : strategy_(std::move(strategy)), - local_scopes_(std::move(local_scopes)), + : strategy_(strategy), + local_scopes_(local_scopes), pool_(places.size() >= 2 ? 
new ::ThreadPool(places.size()) : nullptr), places_(places), graphs_(std::move(graphs)), @@ -297,6 +297,7 @@ FetchResultType ParallelSSAGraphExecutor::Run( for (size_t i = 0; i < lodtensorarray_ptrs[0]->size(); ++i) { phi::DenseTensor var; std::vector ptrs; + ptrs.reserve(lodtensor_ptrs.size()); for (auto &lodtensorarray_ptr : lodtensorarray_ptrs) { ptrs.push_back(&(lodtensorarray_ptr->at(i))); } diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 7acf425fd77f30..fe43126ca8abe4 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -63,7 +63,7 @@ void ReduceOpHandle::RunImpl() { in_var_handles.size(), places_.size())); - VarHandle *out_var_handle; + VarHandle *out_var_handle = nullptr; { auto out_var_handles = DynamicCast(outputs_); diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 7ee7fa82250a99..9d275b0fd4c2e1 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -36,7 +36,7 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( std::vector var_infos, std::vector places, std::unique_ptr &&underlying_executor) - : strategy_(std::move(strategy)), + : strategy_(strategy), underlying_executor_(std::move(underlying_executor)), local_scopes_(std::move(local_scopes)), local_exec_scopes_(std::move(local_exec_scopes)), diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index ce3fe004c40bb8..0397f87f6649ef 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -128,7 +128,7 @@ inline FetchResultType ThreadedSSAGraphExecutor::RunImpl( run_all_ops(ready_ops); // 2. Find ready variable - bool timeout; + bool timeout = false; auto cur_ready_vars = ready_vars->PopAll(1, &timeout); if (timeout) { for (auto &run_op_future : run_op_futures_) { diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 64a2efcfdccda2..50a16d8f686e78 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -62,6 +62,8 @@ message MpConfig { optional bool mp_skip_c_identity= 6 [default = false ]; // Support fused_linear_param_grad_add in ColumnParallelLinear. Only works when mp_async_allreduce is true. 
optional bool mp_fused_linear_param_grad_add= 7 [default = false ]; + // Broadcast mp input data + optional bool need_broadcast_data=8 [default = true]; } message PpConfig { diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index e69a25bb32781a..c9bd59f912d7a3 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -135,7 +135,7 @@ void DownpourWorker::CollectLabelInfo(size_t table_idx) { static_cast(table_idx))); TableParameter table; - for (auto i : param_.sparse_table()) { + for (auto const& i : param_.sparse_table()) { if (i.table_id() == table_id) { table = i; break; @@ -191,7 +191,7 @@ void DownpourWorker::FillSparseValue(size_t table_idx) { static_cast(table_idx))); TableParameter table; - for (auto i : param_.sparse_table()) { + for (auto const& i : param_.sparse_table()) { if (i.table_id() == table_id) { table = i; break; @@ -485,7 +485,7 @@ void DownpourWorker::TrainFilesWithProfiler() { double push_sparse_time = 0.0; double push_dense_time = 0.0; double copy_table_time = 0.0; - int cur_batch; + int cur_batch = 0; int batch_cnt = 0; uint64_t total_inst = 0; timeline.Start(); @@ -513,7 +513,7 @@ void DownpourWorker::TrainFilesWithProfiler() { uint64_t tid = static_cast( param_.program_config(0).pull_sparse_table_id(i)); TableParameter table; - for (auto j : param_.sparse_table()) { + for (auto const& j : param_.sparse_table()) { if (j.table_id() == tid) { table = j; break; @@ -599,7 +599,7 @@ void DownpourWorker::TrainFilesWithProfiler() { uint64_t tid = static_cast( param_.program_config(0).push_sparse_table_id(i)); TableParameter table; - for (auto i : param_.sparse_table()) { + for (auto const& i : param_.sparse_table()) { if (i.table_id() == tid) { table = i; break; @@ -804,7 +804,7 @@ void DownpourWorker::TrainFiles() { platform::SetNumThreads(1); device_reader_->Start(); int batch_cnt = 0; - int cur_batch; + int cur_batch = 0; while ((cur_batch = device_reader_->Next()) > 0) { if (copy_table_config_.need_copy()) { if (batch_cnt % copy_table_config_.batch_num() == 0) { @@ -819,7 +819,7 @@ void DownpourWorker::TrainFiles() { uint64_t tid = static_cast( param_.program_config(0).pull_sparse_table_id(i)); TableParameter table; - for (auto j : param_.sparse_table()) { + for (auto const& j : param_.sparse_table()) { if (j.table_id() == tid) { table = j; break; @@ -936,7 +936,7 @@ void DownpourWorker::TrainFiles() { uint64_t tid = static_cast( param_.program_config(0).push_sparse_table_id(i)); TableParameter table; - for (auto i : param_.sparse_table()) { + for (auto const& i : param_.sparse_table()) { if (i.table_id() == tid) { table = i; break; diff --git a/paddle/fluid/framework/downpour_worker_opt.cc b/paddle/fluid/framework/downpour_worker_opt.cc index 68c774965aeabf..d7d8a7ff883cdd 100644 --- a/paddle/fluid/framework/downpour_worker_opt.cc +++ b/paddle/fluid/framework/downpour_worker_opt.cc @@ -262,7 +262,7 @@ void DownpourWorkerOpt::CreateThreadOperatorsWithRerank( uint64_t tid = static_cast(param_.program_config(0).pull_sparse_table_id(i)); TableParameter table; - for (auto j : param_.sparse_table()) { + for (auto const& j : param_.sparse_table()) { if (j.table_id() == tid) { table = j; break; @@ -307,7 +307,7 @@ void DownpourWorkerOpt::TrainFiles() { platform::SetNumThreads(1); device_reader_->Start(); int batch_cnt = 0; - int cur_batch; + int cur_batch = 0; std::future pull_async_status; std::string async_wait_name = ""; for (int i = 0; i < 
param_.program_config(0).pull_sparse_table_id_size(); @@ -315,7 +315,7 @@ void DownpourWorkerOpt::TrainFiles() { uint64_t tid = static_cast(param_.program_config(0).pull_sparse_table_id(i)); TableParameter table; - for (auto j : param_.sparse_table()) { + for (auto const& j : param_.sparse_table()) { if (j.table_id() == tid) { table = j; break; @@ -344,7 +344,7 @@ void DownpourWorkerOpt::TrainFiles() { uint64_t tid = static_cast( param_.program_config(0).pull_sparse_table_id(i)); TableParameter table; - for (auto j : param_.sparse_table()) { + for (auto const& j : param_.sparse_table()) { if (j.table_id() == tid) { table = j; break; @@ -455,7 +455,7 @@ void DownpourWorkerOpt::TrainFiles() { uint64_t tid = static_cast( param_.program_config(0).push_sparse_table_id(i)); TableParameter table; - for (auto i : param_.sparse_table()) { + for (auto const& i : param_.sparse_table()) { if (i.table_id() == tid) { table = i; break; diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index 5613a8dbf155e0..2e1eb0a58fe5a5 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -353,6 +353,10 @@ std::shared_ptr CreateNewIRInterpreterCoreInfoToCache( return core; } +bool TensorSortHelper(const paddle::Tensor &t1, const paddle::Tensor &t2) { + return t1.name() < t2.name(); +} + std::unique_ptr<::pir::Program> ConstructFowardIrProgram( const paddle::framework::BlockDesc *forward_global_block, const paddle::framework::BlockDesc *backward_global_block, @@ -398,7 +402,9 @@ std::unique_ptr<::pir::Program> ConstructFowardIrProgram( } std::set input_param_names; - for (auto ¶m : params) { + auto sorted_params = params; + std::sort(sorted_params.begin(), sorted_params.end(), TensorSortHelper); + for (auto ¶m : sorted_params) { auto &name = param.name(); auto p = param.place().GetType(); @@ -515,6 +521,8 @@ std::unique_ptr<::pir::Program> ConstructBackwardIrProgram( for (auto &t : x_grad) { param_grad_names.push_back(t->name()); } + + std::sort(param_grad_names.begin(), param_grad_names.end()); for (auto &name : param_grad_names) { if (name == "@EMPTY@") { continue; diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index 08d681ae6411fe..3d32781216402c 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -126,7 +126,7 @@ void HogwildWorker::SetZero(phi::DenseTensor *tensor, void HogwildWorker::BindingDataFeedMemory() { const std::vector &input_feed = device_reader_->GetUseSlotAlias(); - for (auto name : input_feed) { + for (auto const &name : input_feed) { device_reader_->AddFeedVar(thread_scope_->FindVar(name), name); } } @@ -239,7 +239,7 @@ void HogwildWorker::TrainFilesWithProfiler() { platform::Timer timeline; double total_time = 0.0; double read_time = 0.0; - int cur_batch; + int cur_batch = 0; int batch_cnt = 0; if (thread_id_ == 0) { quit_flag_.store(false); @@ -372,7 +372,7 @@ void HogwildWorker::TrainFiles() { int total_batch_num = 0; // how to accumulate fetched values here device_reader_->Start(); - int cur_batch; + int cur_batch = 0; int batch_cnt = 0; if (thread_id_ == 0) { quit_flag_.store(false); @@ -471,7 +471,7 @@ void HogwildWorker::PrintFetchVars() { } if (thread_id_ == 0 && batch_num_ % batch_per_print == 0) { - time_t curtime; + time_t curtime = 0; time(&curtime); std::array mbstr; std::strftime(mbstr.data(), diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc 
index 38cc88f7ec9360..4c41bc27f1730e 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -505,8 +505,7 @@ CompatInferMetaContext::OptionalInputsBetween(size_t start, size_t end) const { result.emplace_back(in.initialized() ? &in : nullptr); } - return paddle::optional>( - std::move(result)); + return paddle::optional>(result); } return paddle::none; } @@ -637,11 +636,11 @@ CompatInferMetaContext BuildInferMetaContext(InferShapeContext* ctx, if (ctx->IsRuntime()) { Variable* var = PADDLE_GET_CONST(Variable*, infershape_input[0]); infer_meta_context.EmplaceBackAttr( - std::move(framework::MakePhiScalarFromVar(*var))); + framework::MakePhiScalarFromVar(*var)); } else { phi::Scalar tensor_scalar(-1); tensor_scalar.SetFromTensor(true); - infer_meta_context.EmplaceBackAttr(std::move(tensor_scalar)); + infer_meta_context.EmplaceBackAttr(tensor_scalar); } } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/cos_sim_op.cu b/paddle/fluid/framework/init_default_kernel_signature_map.h similarity index 61% rename from paddle/fluid/operators/cos_sim_op.cu rename to paddle/fluid/framework/init_default_kernel_signature_map.h index 82174a246757e8..a6b6400dd19f59 100644 --- a/paddle/fluid/operators/cos_sim_op.cu +++ b/paddle/fluid/framework/init_default_kernel_signature_map.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -11,10 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/cos_sim_op.h" -namespace ops = paddle::operators; +#pragma once -PD_REGISTER_STRUCT_KERNEL(cos_sim, GPU, ALL_LAYOUT, ops::CosSimKernel, float) {} -PD_REGISTER_STRUCT_KERNEL( - cos_sim_grad, GPU, ALL_LAYOUT, ops::CosSimGradKernel, float) {} +#include "paddle/utils/test_macros.h" + +// The implementation of InitDefaultKernelSignatureMap is in phi_utils.cc +namespace paddle { +namespace framework { +TEST_API void InitDefaultKernelSignatureMap(); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/io/crypto/cipher.cc b/paddle/fluid/framework/io/crypto/cipher.cc index 2001e8a416a1a1..03e0cb4d0eb273 100644 --- a/paddle/fluid/framework/io/crypto/cipher.cc +++ b/paddle/fluid/framework/io/crypto/cipher.cc @@ -24,8 +24,8 @@ namespace framework { std::shared_ptr<Cipher> CipherFactory::CreateCipher( const std::string& config_file) { std::string cipher_name; - int iv_size; - int tag_size; + int iv_size = 0; + int tag_size = 0; std::unordered_map<std::string, std::string> config; if (!config_file.empty()) { config = CipherUtils::LoadConfig(config_file); diff --git a/paddle/fluid/framework/io/crypto/cipher_utils.cc b/paddle/fluid/framework/io/crypto/cipher_utils.cc index c10da1ce6706cf..42d6223b729af5 100644 --- a/paddle/fluid/framework/io/crypto/cipher_utils.cc +++ b/paddle/fluid/framework/io/crypto/cipher_utils.cc @@ -72,7 +72,7 @@ std::unordered_map<std::string, std::string> CipherUtils::LoadConfig( "make sure input filename is available.", config_file)); std::unordered_map<std::string, std::string> ret; - char c; + char c = 0; std::string line; std::istringstream iss; while (std::getline(fin, line)) { diff --git a/paddle/fluid/framework/io/crypto/cipher_utils_test.cc b/paddle/fluid/framework/io/crypto/cipher_utils_test.cc index 356c919cbcbe8c..ee4453bcaab676 100644 --- a/paddle/fluid/framework/io/crypto/cipher_utils_test.cc +++ b/paddle/fluid/framework/io/crypto/cipher_utils_test.cc @@ -46,19 +46,19 @@ TEST(CipherUtils, load_config) { EXPECT_TRUE(CipherUtils::GetValue(config, "key_str", &out_str)); EXPECT_EQ(out_str, std::string("ciphername")); - int out_int; + int out_int = 0; EXPECT_TRUE(CipherUtils::GetValue(config, "key_int", &out_int)); EXPECT_EQ(out_int, 1); - bool out_bool; + bool out_bool = false; EXPECT_TRUE(CipherUtils::GetValue(config, "key_bool", &out_bool)); EXPECT_EQ(out_bool, true); - bool out_bool1; + bool out_bool1 = false; EXPECT_TRUE(CipherUtils::GetValue(config, "key_bool1", &out_bool1)); EXPECT_EQ(out_bool1, false); - bool out_bool2; + bool out_bool2 = false; EXPECT_TRUE(CipherUtils::GetValue(config, "key_bool2", &out_bool2)); EXPECT_EQ(out_bool2, false); } diff --git a/paddle/fluid/framework/ir/add_support_int8_pass.cc b/paddle/fluid/framework/ir/add_support_int8_pass.cc index 21b45d1b1fa388..5dedfe59f6900a 100644 --- a/paddle/fluid/framework/ir/add_support_int8_pass.cc +++ b/paddle/fluid/framework/ir/add_support_int8_pass.cc @@ -61,8 +61,8 @@ void AddSupportInt8Pass::ApplyImpl(ir::Graph* graph) const { // scale for one output for (auto out_node : quant_op->outputs) { for (auto out_op_node : out_node->outputs) { - for (auto name : out_op_node->Op()->InputNames()) { - for (auto input_name : out_op_node->Op()->Input(name)) { + for (auto const& name : out_op_node->Op()->InputNames()) { + for (auto const& input_name : out_op_node->Op()->Input(name)) { if (out_op_node->Op()->HasAttr("Input_scale_" + input_name)) { for (size_t i = 0; i < quanted_op_desc->OutputNames().size(); i++) { diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index 698de5d90c256d..14f42b129effa5 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -184,7 +184,6 @@ void AutoMixedPrecisionPass::SetDefaultBlacklist() const { "log", "mean", "sum", - "cos_sim", "softmax_with_cross_entropy", "sigmoid_cross_entropy_with_logits", "c_softmax_with_cross_entropy", diff --git a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc index b4307f5ce758d4..44cb004fec1729 100644 --- a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc +++ b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc @@ -189,7 +189,7 @@ class CoalesceGradTensorPass : public ir::Pass { const { if (params_grads.empty()) return true; auto dtype = GetDtypeOfVar(vars_info, params_grads.front().second); - for (auto p_g : params_grads) { + for (auto const &p_g : params_grads) { auto next_dtype = GetDtypeOfVar(vars_info, p_g.second); if (next_dtype != dtype) { return false; diff --git a/paddle/fluid/framework/ir/constant_folding_pass.cc b/paddle/fluid/framework/ir/constant_folding_pass.cc index b32b0bb04b94cf..1bcec1e5a898c1 100644 --- a/paddle/fluid/framework/ir/constant_folding_pass.cc +++ b/paddle/fluid/framework/ir/constant_folding_pass.cc @@ -93,7 +93,7 @@ void ConstantFoldingPass::ApplyImpl(ir::Graph *graph) const { map[out_node->Name()] = 0; } // Forbid other node in graph having the same name with nodes in map - for (auto iter : map) { + for (auto const &iter : map) { for (auto node : graph->Nodes()) { if (node->IsVar() && node->Name() == iter.first) { map[node->Name()]++; diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index 6b21bfa5defc9d..aa15b2696d7a12 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -371,8 +371,8 @@ void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const { bool mkldnn_with_bias = is_mkldnn && has_bias; // Create eltwise_y (conv bias) variable - phi::DenseTensor* eltwise_y_in_tensor; - Node* eltwise_y_in_node; + phi::DenseTensor* eltwise_y_in_tensor = nullptr; + Node* eltwise_y_in_node = nullptr; if (!mkldnn_with_bias) { VarDesc eltwise_y_in_desc( patterns::PDNodeName("fuse_conv_bn", conv_type() + "_eltwise_y_in")); diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc index 7cd069eea91a81..2f420bc858e37f 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc @@ -59,7 +59,7 @@ void TestMain(const std::string& conv_type) { auto* in = layers.data("in", {1, 3, 20, 20}); auto* filters = layers.data("filters", {3, 3, 2, 2}, true); auto* bias_0 = layers.data("bias_0", {3}, true); - VarDesc* conv_out; + VarDesc* conv_out = nullptr; if (conv_type == "conv_transpose") { conv_out = layers.conv2d_transpose(in, filters, bias_0); } else { diff --git a/paddle/fluid/framework/ir/delete_cast_op_pass.cc b/paddle/fluid/framework/ir/delete_cast_op_pass.cc index 6d4224982f79b4..59fd42241e0d4b 100644 --- a/paddle/fluid/framework/ir/delete_cast_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_cast_op_pass.cc @@ -84,7 +84,7 @@ static std::vector FindOpNodeWithInputName( if (!node->IsOp()) continue; auto inputs = node->Op()->Inputs(); bool find_input = false; - for (auto input : inputs) { + for (auto const& input : inputs) { 
auto input_names = input.second; if (std::count(input_names.begin(), input_names.end(), input_name) > 0) { find_input = true; @@ -103,7 +103,7 @@ static std::vector FindOpNodeWithOutputName( if (!node->IsOp()) continue; auto outputs = node->Op()->Outputs(); bool find_output = false; - for (auto output : outputs) { + for (auto const& output : outputs) { auto output_names = output.second; if (std::count(output_names.begin(), output_names.end(), output_name) > 0) { diff --git a/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc b/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc index 11d1339f35d249..17f0c642a60d18 100644 --- a/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc +++ b/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc @@ -53,6 +53,7 @@ VarDesc* AddWriteToArray(BlockDesc* block, OpDesc* op = block->AppendOp(); op->SetType("write_to_array"); std::vector x_names; + x_names.reserve(x.size()); for (auto k : x) { x_names.push_back(k->Name()); } diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index cb6a6e1d5d9dc4..286f7f08cdfc97 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -122,7 +122,7 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { platform::errors::InvalidArgument( "Input scale tensor's place should be CPU.")); - float input_scale; + float input_scale = NAN; if (input_scale_tensor.dtype() == phi::DataType::FLOAT32) { const float* input_scale_data = input_scale_tensor.data(); input_scale = input_scale_data[0]; diff --git a/paddle/fluid/framework/ir/delete_repeated_ops_pass.cc b/paddle/fluid/framework/ir/delete_repeated_ops_pass.cc index ea0de9759f3a01..1917fb56f13ae9 100644 --- a/paddle/fluid/framework/ir/delete_repeated_ops_pass.cc +++ b/paddle/fluid/framework/ir/delete_repeated_ops_pass.cc @@ -36,7 +36,7 @@ namespace ir { bool HasOutVarName(Node* op_node, std::string name) { auto* op_desc = op_node->Op(); auto outputs = op_desc->Outputs(); - for (auto iter : outputs) { + for (auto const& iter : outputs) { auto out_names = iter.second; if (std::count(out_names.begin(), out_names.end(), name) > 0) { return true; @@ -155,7 +155,7 @@ void DeleteRepeatedOpsPass::DeleteRepeatedOps( } } - for (auto iter : ops_map) { + for (auto const& iter : ops_map) { auto ops = iter.second; auto* first_op_out = ops[0]->outputs[0]; auto first_op_out_name = first_op_out->Name(); diff --git a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc index 0b09d1b30f40af..cf5c9a2c94cf9b 100644 --- a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc @@ -45,7 +45,8 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { if (n->IsOp()) { auto* op = n->Op(); if (op->Type() == "dequantize_linear") { - Node *weight_var_node, *calcu_op_node, *while_op_node; + Node *weight_var_node = nullptr, *calcu_op_node = nullptr, + *while_op_node = nullptr; Node *dequantized_weight_var_node = nullptr, *scale_var_node = nullptr; // 1. 
Judge whether for dequant weight and find // weight_var_node/scale_var_node diff --git a/paddle/fluid/framework/ir/fuse_adamw_op_pass.cc b/paddle/fluid/framework/ir/fuse_adamw_op_pass.cc index 071e85b3a3303c..7e6eda7ac139e1 100644 --- a/paddle/fluid/framework/ir/fuse_adamw_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_adamw_op_pass.cc @@ -24,6 +24,7 @@ namespace ir { std::vector GetNodeNames(const std::vector &node_vector) { std::vector out_vector; + out_vector.reserve(node_vector.size()); for (auto i : node_vector) { out_vector.emplace_back(i->Name()); } diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index 2b3f64927f212b..048b33a649f94d 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -326,8 +326,8 @@ void FuseBatchNormActPass::ReLinkNodes(Graph *graph, IR_OP_VAR_LINK(fused_op, out); } - nodes2delete.insert(std::move(op_1)); - nodes2delete.insert(std::move(op_2)); + nodes2delete.insert(op_1); + nodes2delete.insert(op_2); GraphSafeRemoveNodes(graph, nodes2delete); } diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc index ceae7bb4beb9ba..2a24c5476a5010 100644 --- a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc @@ -322,9 +322,9 @@ void FuseBatchNormAddActPass::ReLinkNodes(Graph *graph, IR_OP_VAR_LINK(fused_op, out); } - nodes2delete.insert(std::move(op_1)); - nodes2delete.insert(std::move(op_2)); - nodes2delete.insert(std::move(op_3)); + nodes2delete.insert(op_1); + nodes2delete.insert(op_2); + nodes2delete.insert(op_3); GraphSafeRemoveNodes(graph, nodes2delete); } diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc index 60267177b7a2b3..3c550ca84042d2 100644 --- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc @@ -352,7 +352,7 @@ void FuseElewiseAddActPass::RemoveIntermediateOut(Graph *graph) const { if (out->Name() == intermediate_out_args[0]) { if (out->outputs.empty()) { cur_node->outputs = this->RemoveNode(out, cur_node->outputs); - need_removed_nodes.insert(std::move(out)); + need_removed_nodes.insert(out); cur_node->Op()->SetAttr("save_intermediate_out", false); } } @@ -373,7 +373,7 @@ void FuseElewiseAddActPass::RemoveIntermediateOut(Graph *graph) const { out->outputs.empty()) { cur_node->Op()->SetOutput(GradVarName("IntermediateOut"), {}); cur_node->outputs = this->RemoveNode(out, cur_node->outputs); - need_removed_nodes.insert(std::move(out)); + need_removed_nodes.insert(out); } } } @@ -439,8 +439,8 @@ void FuseElewiseAddActPass::ReLinkNodes(Graph *graph, IR_OP_VAR_LINK(fused_op, out); } - nodes2delete.insert(std::move(op_1)); - nodes2delete.insert(std::move(op_2)); + nodes2delete.insert(op_1); + nodes2delete.insert(op_2); GraphSafeRemoveNodes(graph, nodes2delete); } @@ -485,8 +485,8 @@ void FuseElewiseAddActPass::ReLinkNodes2(Graph *graph, IR_OP_VAR_LINK(fused_op, out); } - nodes2delete.insert(std::move(op_1)); - nodes2delete.insert(std::move(op_2)); + nodes2delete.insert(op_1); + nodes2delete.insert(op_2); GraphSafeRemoveNodes(graph, nodes2delete); } diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc index 2f92a58ba3a77e..0ba4ef378a5cbe 100644 --- a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc +++ 
b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc @@ -83,7 +83,7 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearFwd(ir::Graph *graph, auto matmul_op_desc = matmul_op->Op(); if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape)) return; - bool trans_x, trans_y; + bool trans_x = false, trans_y = false; GetTransposeAttrsFromOp(*matmul_op_desc, &trans_x, &trans_y); OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); @@ -168,7 +168,7 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearActFwd( auto activation = act_op->Op()->Type(); - bool trans_x, trans_y; + bool trans_x = false, trans_y = false; GetTransposeAttrsFromOp(*matmul_op_desc, &trans_x, &trans_y); OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); @@ -291,7 +291,7 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph, auto matmul_grad_op_desc = matmul_grad_op->Op(); if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape)) return; - bool trans_x, trans_y; + bool trans_x = false, trans_y = false; GetTransposeAttrsFromOp(*matmul_grad_op_desc, &trans_x, &trans_y); OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); @@ -319,10 +319,12 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph, auto ele_add_grad_op_role_val = details::GetOpRoleVarsOrEmpty(*(ele_add_grad_op->Op())); std::vector fused_gemm_epilogue_grad_op_role_var; - for (auto i : matmul_grad_op_role_val) { + fused_gemm_epilogue_grad_op_role_var.reserve( + matmul_grad_op_role_val.size()); + for (auto const &i : matmul_grad_op_role_val) { fused_gemm_epilogue_grad_op_role_var.push_back(i); } - for (auto i : ele_add_grad_op_role_val) { + for (auto const &i : ele_add_grad_op_role_val) { fused_gemm_epilogue_grad_op_role_var.push_back(i); } fused_gemm_epilogue_grad_op_desc.SetAttr( @@ -430,7 +432,7 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd( auto activation_grad = act_grad_op->Op()->Type(); - bool trans_x, trans_y; + bool trans_x = false, trans_y = false; GetTransposeAttrsFromOp(*matmul_grad_op_desc, &trans_x, &trans_y); OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); @@ -455,10 +457,12 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd( auto ele_add_grad_op_role_val = details::GetOpRoleVarsOrEmpty(*(ele_add_grad_op->Op())); std::vector fused_gemm_epilogue_grad_op_role_var; - for (auto i : matmul_grad_op_role_val) { + fused_gemm_epilogue_grad_op_role_var.reserve( + matmul_grad_op_role_val.size()); + for (auto const &i : matmul_grad_op_role_val) { fused_gemm_epilogue_grad_op_role_var.push_back(i); } - for (auto i : ele_add_grad_op_role_val) { + for (auto const &i : ele_add_grad_op_role_val) { fused_gemm_epilogue_grad_op_role_var.push_back(i); } fused_gemm_epilogue_grad_op_desc.SetAttr( diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index d2996fe4c64b39..4a9e316f30b2b8 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -192,7 +192,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { // Pass pre-condition check: check dtype of fusing vars auto fusing_var_dtype = GetDtypeOfVar(vars_info, aux_var_map.at(kParam).front()); - for (auto vars : aux_var_map) { + for (auto const &vars : aux_var_map) { for (auto &var_name : vars.second) { if (fusing_var_dtype != 
GetDtypeOfVar(vars_info, var_name)) { // Note(chenweihang): Currently the fuse_optimizer_ops strategy @@ -204,7 +204,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { // Pass pre-condition check: gradients generated op kernel auto fusing_grad_var_names = aux_var_map.at(kGrad); - for (auto grad_var_name : fusing_grad_var_names) { + for (auto const &grad_var_name : fusing_grad_var_names) { if (!GradGeneratedOpKernelCheck(vars_info, grad_var_name)) { // Note(chenweihang): Currently the fuse_optimizer_ops strategy is risky // when gradient generated operator with kernel just support CPU or @@ -336,7 +336,7 @@ bool FuseOptimizerOpPass::GradGeneratedOpKernelCheck( } } } - for (auto op_type : check_op_set) { + for (auto const &op_type : check_op_set) { if (!OpWithKernelSupportCPUAndGPU(op_type)) { return false; } @@ -365,6 +365,7 @@ void FuseOptimizerOpPass::GradientsFilter( } } std::vector sorted_ops; + sorted_ops.reserve(new_grad_idx.size()); for (size_t i : new_grad_idx) { sorted_ops.emplace_back(opt_nodes->at(i)); } diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc index 61d22a4b6d056f..32515b0b0eb105 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc @@ -303,7 +303,7 @@ std::string CodeGenerator::EmitParameters( output_args.push_back(args_str); } } - for (auto args : output_args) { + for (auto const& args : output_args) { ret << args; if (index != output_args.size() - 1) { ret << ", "; diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc index edd4d455df8a6c..d87ef1c1e39eac 100644 --- a/paddle/fluid/framework/ir/fusion_group/operation.cc +++ b/paddle/fluid/framework/ir/fusion_group/operation.cc @@ -21,7 +21,7 @@ namespace framework { namespace ir { namespace fusion_group { -OperationMap* OperationMap::map = nullptr; +OperationMap *OperationMap::map = nullptr; OperationMap::OperationMap() { InsertUnaryElementwiseOperations(); @@ -31,7 +31,7 @@ OperationMap::OperationMap() { std::unordered_set OperationMap::Find(int type) { std::unordered_set res; - for (auto& t : operations_) { + for (auto &t : operations_) { if (t.second.type == type) { res.insert(t.first); } @@ -60,15 +60,15 @@ void OperationMap::Insert(int type, // grad_inputs = inputs + outputs + grad of outputs std::vector grad_input_names = input_names; - for (auto name : output_names) { + for (auto const &name : output_names) { grad_input_names.push_back(name); } - for (auto name : output_names) { + for (auto const &name : output_names) { grad_input_names.push_back(GradVarName(name)); } // grad_output = grad of inputs std::vector grad_output_names; - for (auto name : input_names) { + for (auto const &name : input_names) { grad_output_names.push_back(GradVarName(name)); } Operation grad_op(type, diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc index e0ab584ee32256..821bed7e6d53df 100644 --- a/paddle/fluid/framework/ir/generate_pass.cc +++ b/paddle/fluid/framework/ir/generate_pass.cc @@ -286,7 +286,7 @@ GraphPatternDetector::handle_t GetGenerateDelete( for (const std::unique_ptr& pdnode : pattern.nodes()) { remove_nodes.emplace(subgraph.at(pdnode.get())); } - for (auto iter : var_node_maps) { + for (auto const& iter : var_node_maps) { remove_nodes.erase(iter.second); } GraphSafeRemoveNodes(graph, remove_nodes); @@ -424,7 +424,7 @@ 
GraphPatternDetector::handle_t GetGenerateRewrite( for (const std::unique_ptr& pdnode : pattern.nodes()) { remove_nodes.emplace(subgraph.at(pdnode.get())); } - for (auto iter : var_node_maps) { + for (auto const& iter : var_node_maps) { for (auto& node : iter.second) { remove_nodes.erase(node); } diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 5d7054721db53a..30a001777bd587 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -164,7 +164,7 @@ std::vector TopologySortOperations(const Graph &graph) { "Generated graph shouldn't contain cycle.")); std::unordered_set visited; std::vector ret; - for (auto adj : adj_list) { + for (auto const &adj : adj_list) { if (visited.find(adj.first) == visited.end()) { SortHelper(adj_list, adj.first, &visited, &ret); } @@ -449,7 +449,7 @@ std::vector TopologySortGraphByDescOrder(const Graph &graph) { "Generated graph shouldn't contain cycle.")); std::unordered_set visited; std::vector ret; - for (auto adj : adj_list) { + for (auto const &adj : adj_list) { if (visited.find(adj.first) == visited.end()) { SortHelper(adj_list, adj.first, &visited, &ret); } @@ -502,6 +502,7 @@ static OpDesc *ReplaceScaleLossGradOp(const Node &node, OpDesc *desc) { // TODO(Ruibiao) : Set OpDeviceAttrName when needed std::vector output_names; + output_names.reserve(node.outputs.size()); for (auto out : node.outputs) { output_names.emplace_back(out->Name()); } @@ -741,7 +742,7 @@ template static void GetGraphVarDesc(const Graph &graph, const std::unordered_set &nodes, std::vector *vars) { - for (T node : nodes) { + for (T const &node : nodes) { if (node->IsVar() && node->Var() && node->GetVarNodeBlockId() == graph.GetBlockId()) { vars->emplace_back(*node->Var()->Proto()); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index 4f430ba4041d69..3f68f5c6dd72b4 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -149,7 +149,7 @@ void GraphVizPass::ApplyImpl(ir::Graph* graph) const { } } } - decltype(op_attrs)* attr; + decltype(op_attrs)* attr = nullptr; if (marked_nodes.count(n)) { attr = &marked_var_attrs; } else if (const_cast(n)->Var() && diff --git a/paddle/fluid/framework/ir/inplace_op_var_pass.cc b/paddle/fluid/framework/ir/inplace_op_var_pass.cc index 5bbe980daaba7e..7648fd0c89a26c 100644 --- a/paddle/fluid/framework/ir/inplace_op_var_pass.cc +++ b/paddle/fluid/framework/ir/inplace_op_var_pass.cc @@ -85,12 +85,12 @@ std::vector InplaceOpVarPass::GetControlFlowVarNames( for (auto* node : graph->Nodes()) { if (!node->IsOp() || control_flow_ops_.count(node->Op()->Type()) == 0) continue; - for (auto in_names : node->Op()->Inputs()) { + for (auto const& in_names : node->Op()->Inputs()) { auto var_names = in_names.second; control_flow_var_names.insert( control_flow_var_names.end(), var_names.begin(), var_names.end()); } - for (auto out_names : node->Op()->Outputs()) { + for (auto const& out_names : node->Op()->Outputs()) { auto var_names = out_names.second; control_flow_var_names.insert( control_flow_var_names.end(), var_names.begin(), var_names.end()); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc index 40525a14141a6a..0398117e08b8fb 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc +++ 
b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc @@ -86,7 +86,7 @@ static void SplitIntoLoDTensorAndNonLoDTensorVars( lod_tensors->clear(); other_vars->clear(); - for (auto &op_vars_pair : m) { + for (auto const &op_vars_pair : m) { for (auto var_name : op_vars_pair.second) { auto *var_desc = TryGetLatestVarDesc( vars[op_vars_pair.first->GetScopeIdx()].at(var_name)); @@ -247,7 +247,7 @@ void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const { op->GetScope(), op->GetScopeIdx(), op->GetPlace(), - std::move(var_info), + var_info, gcs.at(places[op->GetScopeIdx()]).get()); auto it = std::find_if( diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index a4cc550938495d..9c60a665de0021 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -123,6 +123,7 @@ class ReferenceCountPassTestHelper { std::vector LastLivedOps(const std::string &name) const { auto &ops = last_live_ops_of_vars_[0].at(name).ops(); std::vector ret; + ret.reserve(ops.size()); for (auto *op : ops) { ret.emplace_back(op->GetOp()); } diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc index eee24e01a328b3..1738259d60f004 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc @@ -412,7 +412,7 @@ std::unordered_set ComputePropagateScalesMkldnnPass::UpdateScales( auto out_iter = var_quant_scales->find(op_node->Op()->Output("Out")[0]); if (out_iter != var_quant_scales->end()) { std::vector input_names = op_node->Op()->Input("X"); - for (auto input_name : input_names) { + for (auto const& input_name : input_names) { auto concat_in_iter = var_quant_scales->find(input_name); if (concat_in_iter == var_quant_scales->end()) (*var_quant_scales)[input_name] = out_iter->second; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 9c9ea82445d60b..8d8504708f0373 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -349,14 +349,14 @@ bool CPUQuantizePass::AreScalesPresentForVarNames( bool present = true; if (var_quant_scales_->empty()) { auto& scales = Get("quant_var_scales"); - for (auto name : names) { + for (auto const& name : names) { if (scales.find(name) == scales.end()) { present = false; LogScaleIsMissingForVarName(name); } } } else { - for (auto name : names) { + for (auto const& name : names) { if (var_quant_scales_->find(name) == var_quant_scales_->end()) { present = false; LogScaleIsMissingForVarName(name); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index 052c26ba8e2681..c9461060e443f2 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -202,7 +202,7 @@ void CPUQuantizeSquashPass::DequantQuantSquash( if (dequant_scale == quant_scale && dequant_shift == quant_shift) { // squash dequantize-quantize to nothing auto quant_out_var_name = quant_out->Name(); - for (auto 
input_name : next_op_desc->InputNames()) { + for (auto const& input_name : next_op_desc->InputNames()) { auto& input_names = next_op_desc->MutableInputs()->at(input_name); std::replace(input_names.begin(), input_names.end(), diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc index d007ef16d33ec2..47c76289d187c4 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc @@ -27,7 +27,7 @@ using string::PrettyLogDetail; void FuseFCActOneDNNPass::ApplyImpl(Graph *graph) const { auto act_types = GetSupportedActivations(); - for (auto act_type : act_types) FuseFCAct(graph, act_type); + for (auto const &act_type : act_types) FuseFCAct(graph, act_type); } void FuseFCActOneDNNPass::FuseFCAct(Graph *graph, diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 655183dc712c02..0087886c1c8d7b 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -224,7 +224,7 @@ void QuantDequantMkldnnPass::CollectOutputScalesFromAttr( auto var_name_map = op_desc->Outputs(); for (auto& item : var_name_map) { - for (auto var_name : item.second) { + for (auto const& var_name : item.second) { var_quant_scales->insert(std::make_pair(var_name, scale_v)); } } diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index d5d3804e75ca30..e35e5d297db9b9 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -130,7 +130,7 @@ void BatchMergePass::ApplyImpl(ir::Graph* graph) const { auto node = forward_backward_ops[node_idx]; OpDesc repeated_op(*(node->Op()), node->Op()->Block()); // 3. rename grad outputs to current repeat. - for (auto outname : repeated_op.OutputArgumentNames()) { + for (auto const& outname : repeated_op.OutputArgumentNames()) { if (grad_names.find(outname) != grad_names.end()) { std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i); repeated_op.RenameOutput(outname, new_gname); @@ -244,11 +244,12 @@ void BatchMergePass::ApplyImpl(ir::Graph* graph) const { // 5. 
create GRAD merge op node: sum(repeat.0...repeat.n) -> // scale(1/num_repeats) - for (auto kv : grad_repeated_map) { + for (auto const& kv : grad_repeated_map) { OpDesc sum_op; sum_op.SetType("sum"); std::vector repeated_grad_names; std::vector param_grad_op_role_var; + repeated_grad_names.reserve(kv.second.size()); for (auto r : kv.second) { repeated_grad_names.push_back(r->Var()->Name()); } diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc index 6b4e786a5aae9e..dc18979260f928 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc @@ -51,7 +51,7 @@ class FuseAllReduceOpPass : public ir::Pass { size_t num_of_all_reduce = params_grads.size(); std::unordered_set grads; grads.reserve(num_of_all_reduce); - for (auto p_g : params_grads) { + for (auto const &p_g : params_grads) { grads.insert(p_g.second); } diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index 35f4e4830d882b..85f62c4a293fce 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -351,7 +351,7 @@ bool MultiDevSSAGraphBuilderBase::NeedCollectiveForGrad( // NOTE: This is for the case that all gradients should add collective ops for (auto *node : ops) { if (node->Op()->Type() != "allreduce") continue; - for (auto in_name : node->Op()->InputArgumentNames()) { + for (auto const &in_name : node->Op()->InputArgumentNames()) { if (in_name == grad_name) { return false; } @@ -862,7 +862,7 @@ int BalanceVarSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { size_t BalanceVarSSAGraphBuilder::GetAppropriateDeviceID( const std::vector &var_names) const { int64_t numel_sum = 0; - for (auto var_name : var_names) { + for (auto const &var_name : var_names) { if (all_vars_.find(var_name) == all_vars_.end()) continue; auto var_desc = all_vars_.at(var_name); PADDLE_ENFORCE_NOT_NULL(var_desc, @@ -1137,6 +1137,7 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { details::BuildStrategy::ReduceStrategy::kAllReduce && node->inputs[0]->Name().find(".block") == std::string::npos) { std::vector input_var_names; + input_var_names.reserve(node->inputs.size()); for (ir::Node *n : node->inputs) { input_var_names.push_back(n->Name()); } @@ -1162,6 +1163,7 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { } } else if (node->Op()->Type() == "recv") { std::vector output_var_names; + output_var_names.reserve(node->inputs.size()); for (ir::Node *n : node->outputs) { output_var_names.push_back(n->Name()); } @@ -1245,6 +1247,8 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, int op_dev_id = -1; std::vector input_var_names; std::vector output_var_names; + input_var_names.reserve(node->inputs.size()); + output_var_names.reserve(node->outputs.size()); for (ir::Node *input : node->inputs) { input_var_names.push_back(input->Name()); } diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index a950ec191a4bf7..0fd3a71754f6d9 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ 
b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -273,9 +273,9 @@ PDNode* MultiHeadMatmulPattern::operator()() { auto* mul0_out_var = pattern->NewNode(mul0_out_repr())->assert_is_ops_output(mul_ops); - decltype(mul0) eltadd0; - decltype(mul0) eltadd0_b_var; - decltype(mul0) eltadd0_out_var; + decltype(mul0) eltadd0 = nullptr; + decltype(mul0) eltadd0_b_var = nullptr; + decltype(mul0) eltadd0_out_var = nullptr; mul0_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); @@ -353,9 +353,9 @@ PDNode* MultiHeadMatmulPattern::operator()() { auto* mul1_out_var = pattern->NewNode(mul1_out_repr())->assert_is_ops_output(mul_ops); - decltype(mul1) eltadd1; - decltype(mul1) eltadd1_b_var; - decltype(mul1) eltadd1_out_var; + decltype(mul1) eltadd1 = nullptr; + decltype(mul1) eltadd1_b_var = nullptr; + decltype(mul1) eltadd1_out_var = nullptr; mul1_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); eltadd1 = pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add"); @@ -389,9 +389,9 @@ PDNode* MultiHeadMatmulPattern::operator()() { auto* mul2_out_var = pattern->NewNode(mul2_out_repr())->assert_is_ops_output(mul_ops); - decltype(mul2) eltadd2; - decltype(mul2) eltadd2_b_var; - decltype(mul2) eltadd2_out_var; + decltype(mul2) eltadd2 = nullptr; + decltype(mul2) eltadd2_b_var = nullptr; + decltype(mul2) eltadd2_out_var = nullptr; mul2_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); eltadd2 = pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add"); @@ -465,9 +465,9 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { auto* mul0_out_var = pattern->NewNode(mul0_out_repr())->assert_is_ops_output(matmul_ops); - decltype(mul0) eltadd0; - decltype(mul0) eltadd0_b_var; - decltype(mul0) eltadd0_out_var; + decltype(mul0) eltadd0 = nullptr; + decltype(mul0) eltadd0_b_var = nullptr; + decltype(mul0) eltadd0_out_var = nullptr; mul0_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); @@ -539,9 +539,9 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { auto* mul1_out_var = pattern->NewNode(mul1_out_repr())->assert_is_ops_output(matmul_ops); - decltype(mul1) eltadd1; - decltype(mul1) eltadd1_b_var; - decltype(mul1) eltadd1_out_var; + decltype(mul1) eltadd1 = nullptr; + decltype(mul1) eltadd1_b_var = nullptr; + decltype(mul1) eltadd1_out_var = nullptr; mul1_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); eltadd1 = pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add"); @@ -575,9 +575,9 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { auto* mul2_out_var = pattern->NewNode(mul2_out_repr())->assert_is_ops_output(matmul_ops); - decltype(mul2) eltadd2; - decltype(mul2) eltadd2_b_var; - decltype(mul2) eltadd2_out_var; + decltype(mul2) eltadd2 = nullptr; + decltype(mul2) eltadd2_b_var = nullptr; + decltype(mul2) eltadd2_out_var = nullptr; mul2_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); eltadd2 = pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add"); diff --git a/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc index c974f5fafd68b3..be5fad23fd6e2d 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc @@ -53,9 +53,9 @@ PDNode* MultiHeadMatmulRoformerPattern::operator()() { auto* mul0_out_var = pattern->NewNode(mul0_out_repr())->assert_is_ops_output(matmul_ops); - decltype(mul0) eltadd0; - decltype(mul0) 
eltadd0_b_var; - decltype(mul0) eltadd0_out_var; + decltype(mul0) eltadd0 = nullptr; + decltype(mul0) eltadd0_b_var = nullptr; + decltype(mul0) eltadd0_out_var = nullptr; mul0_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); @@ -165,9 +165,9 @@ PDNode* MultiHeadMatmulRoformerPattern::operator()() { auto* mul1_out_var = pattern->NewNode(mul1_out_repr())->assert_is_ops_output(matmul_ops); - decltype(mul1) eltadd1; - decltype(mul1) eltadd1_b_var; - decltype(mul1) eltadd1_out_var; + decltype(mul1) eltadd1 = nullptr; + decltype(mul1) eltadd1_b_var = nullptr; + decltype(mul1) eltadd1_out_var = nullptr; mul1_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); eltadd1 = pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add"); @@ -232,9 +232,9 @@ PDNode* MultiHeadMatmulRoformerPattern::operator()() { auto* mul2_out_var = pattern->NewNode(mul2_out_repr())->assert_is_ops_output(matmul_ops); - decltype(mul2) eltadd2; - decltype(mul2) eltadd2_b_var; - decltype(mul2) eltadd2_out_var; + decltype(mul2) eltadd2 = nullptr; + decltype(mul2) eltadd2_b_var = nullptr; + decltype(mul2) eltadd2_out_var = nullptr; mul2_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); eltadd2 = pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add"); diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc index a0693e8a394338..d4e8a1683ed18a 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc @@ -114,7 +114,7 @@ TEST(SeqPoolConcatFusePass, basic) { std::vector({"j"})); std::unique_ptr graph(new ir::Graph(prog)); - int before, after; + int before = 0, after = 0; graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after); // Remove 10 Nodes: op1, op2, op3, d, e, f, g, h, i, concat_op // Add 1 Node: fusion_seqpool_concat @@ -168,7 +168,7 @@ TEST(SeqPoolConcatFusePass, advanced) { std::vector({"h"})); std::unique_ptr graph(new ir::Graph(prog)); - int before, after; + int before = 0, after = 0; graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after); // Remove 7 Nodes: op1, op2, c, d, e, f concat_op // Add 1 Node: fusion_seqpool_concat @@ -204,7 +204,7 @@ TEST(SeqPoolConcatFusePass, more_inputs) { for (int num : {1, 2, 10}) { ProgramDesc prog = BuildProgramDesc(num); std::unique_ptr graph(new ir::Graph(prog)); - int before, after; + int before = 0, after = 0; graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after); // Remove Nodes: n * (seqpool_op, out, out_unused), and concat_op // Add Node: fusion_seqpool_concat op diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc index 16296d83dae1c1..eeef9c73db3d71 100644 --- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc @@ -145,7 +145,7 @@ void SeqPoolCVMConcatFusePass::ApplyImpl(ir::Graph* graph) const { std::vector subgraph_ins_name; std::unordered_set marked_nodes; - Node* cvm_input_of_cvm; + Node* cvm_input_of_cvm = nullptr; Node* concat_out_var = concat_node->outputs[0]; GraphPatternDetector::handle_t handler = diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc index f3adab84d3a3da..390a6fc0706dfc 100644 --- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc +++ 
b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc @@ -151,7 +151,7 @@ TEST(SeqPoolCVMConcatFusePass, basic) { std::vector({"m"})); std::unique_ptr graph(new ir::Graph(prog)); - int before, after; + int before = 0, after = 0; graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after); // Remove 16 Nodes: op1, op2, op3, op4, op5, op6, d, e, f, g, h, i, j, k, l, // concat_op @@ -219,7 +219,7 @@ TEST(SeqPoolCVMConcatFusePass, advanced) { std::vector({"j"})); std::unique_ptr graph(new ir::Graph(prog)); - int before, after; + int before = 0, after = 0; graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after); // Remove 11 Nodes: op1, op2, op4, op5, c, d, e, f, h, i, concat_op // Add 1 Node: fusion_seqpool_cvm_concat @@ -265,7 +265,7 @@ TEST(SeqPoolCVMConcatFusePass, more_inputs) { for (int num : {1, 2, 10}) { ProgramDesc prog = BuildProgramDesc(num); std::unique_ptr graph(new ir::Graph(prog)); - int before, after; + int before = 0, after = 0; graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after); // Remove Nodes: n * (seqpool_op, seqpool_out, out_unused, cvm_op, cvm_out), // and concat_op diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc index 9b3eb12c3eef7b..b300dcd76119c9 100644 --- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc @@ -68,7 +68,7 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, return nullptr; } for (auto* var : x->inputs) { - for (auto name : x->Op()->Input(arg_name)) { + for (auto const& name : x->Op()->Input(arg_name)) { if (var->Name() == name) { return var; } diff --git a/paddle/fluid/framework/ir/subgraph_detector.cc b/paddle/fluid/framework/ir/subgraph_detector.cc index a6576203235923..d15a117e1a38a0 100644 --- a/paddle/fluid/framework/ir/subgraph_detector.cc +++ b/paddle/fluid/framework/ir/subgraph_detector.cc @@ -234,6 +234,7 @@ void FlexibleDFS(const std::vector &source, } FNode; std::vector stack; + stack.reserve(source.size()); for (auto &node : source) { stack.push_back(FNode{node, false}); } diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc index 2fc711979194a6..828418597e623a 100644 --- a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc @@ -39,13 +39,6 @@ class SyncBatchNormPass : public Pass { if (op->Type() == "batch_norm_grad") { op->SetType("sync_batch_norm_grad"); } - // process synchronize in inplace_abn - if (op->Type() == "inplace_abn") { - op->SetAttr("use_sync_bn", true); - } - if (op->Type() == "inplace_abn_grad") { - op->SetAttr("use_sync_bn", true); - } } } } diff --git a/paddle/fluid/framework/ir/transfer_layout_elim_pass.cc b/paddle/fluid/framework/ir/transfer_layout_elim_pass.cc index daf4e8ca3204a0..da39950280320d 100644 --- a/paddle/fluid/framework/ir/transfer_layout_elim_pass.cc +++ b/paddle/fluid/framework/ir/transfer_layout_elim_pass.cc @@ -61,6 +61,7 @@ void TransferLayoutElimPass::PutTranferlayoutAfterOp( // group_norm has 3 inputs, but we do not need there is a transfer_layout // before Bias and Scale so we extract useful_var1s from op_node->inputs. 
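[Editorial aside, not part of the patch] Many hunks here, including the one that continues right after this note, add a reserve() call immediately before a push_back loop, addressing clang-tidy's performance-inefficient-vector-operation. A minimal standalone sketch of the pattern, using hypothetical names rather than Paddle APIs:

#include <string>
#include <vector>

// Collects names into a vector. reserve() performs a single up-front
// allocation, so the push_back loop never has to reallocate and move the
// elements it has already stored (roughly O(log n) growth steps otherwise).
std::vector<std::string> CollectNames(const std::vector<std::string>& src) {
  std::vector<std::string> out;
  out.reserve(src.size());        // one allocation instead of repeated growth
  for (const auto& name : src) {  // const& mirrors the for-range-copy fixes:
    out.push_back(name);          // no per-iteration std::string copy
  }
  return out;
}

The same reasoning explains the widespread `for (auto x : c)` to `for (auto const& x : c)` changes in this diff (performance-for-range-copy): iterating by value copies each element just to read it.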
std::vector useful_var1s; + useful_var1s.reserve(op_node->inputs.size()); for (auto var1 : op_node->inputs) { // if (var1->inputs.size() >= 1 && // var1->inputs[0]->Op()->Type() == "transfer_layout") { diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index 64f2801bf0220e..6774a6baae0230 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -69,6 +69,7 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse( GraphPatternDetector gpd; std::vector input_nodes; + input_nodes.reserve(times); for (int i = 0; i < times; i++) { input_nodes.push_back(gpd.mutable_pattern() ->NewNode("x" + std::to_string(i)) @@ -166,6 +167,7 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse( int concat_axis = PADDLE_GET_CONST(int, concat_op->Op()->GetAttr("axis")); std::string output_name = concat_out->Name(); + input_names.reserve(times); for (int i = 0; i < times; i++) { input_names.push_back(nodes[i * kNumFields]->Name()); } diff --git a/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc index 6c6174b9267016..251cf1b02e9d80 100644 --- a/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc @@ -254,6 +254,7 @@ void TrtDeleteWeightQuantDequantLinearOpPass::ApplyImpl( float* weight_scale_data = weight_scale_tensor->data(); auto weight_scale_nums = weight_scale_tensor->numel(); + weight_scale.reserve(weight_scale_nums); for (int i = 0; i < weight_scale_nums; i++) { weight_scale.push_back(weight_scale_data[i] / static_cast(range)); } diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 32675d5fa09c1b..96cff2521dfe7c 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -235,7 +235,7 @@ void SerializeToStream(std::ostream &os, void SerializeToStream(std::ostream &os, const phi::DenseTensor &tensor) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - const platform::DeviceContext *dev_ctx; + const platform::DeviceContext *dev_ctx = nullptr; auto place = tensor.place(); dev_ctx = pool.Get(place); SerializeToStream(os, tensor, *dev_ctx); @@ -243,7 +243,7 @@ void SerializeToStream(std::ostream &os, const phi::DenseTensor &tensor) { void DeserializeFromStream(std::istream &os, phi::DenseTensor *tensor) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - const platform::DeviceContext *dev_ctx; + const platform::DeviceContext *dev_ctx = nullptr; dev_ctx = pool.Get(platform::CPUPlace()); DeserializeFromStream(os, tensor, *dev_ctx); } @@ -255,7 +255,7 @@ void DeserializeFromStream(std::istream &is, const std::vector &shape) { { // the 1st field, unit32_t version for DenseTensor - uint32_t version; + uint32_t version = 0; is.read(reinterpret_cast(&version), sizeof(version)); PADDLE_ENFORCE_EQ(paddle::framework::IsTensorVersionSupported(version), true, @@ -271,7 +271,7 @@ void DeserializeFromStream(std::istream &is, } { // the 2st field, LoD information - uint64_t lod_level; + uint64_t lod_level = 0; is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); auto &lod = *tensor->mutable_lod(); lod.resize(lod_level); @@ -286,7 +286,7 @@ void DeserializeFromStream(std::istream &is, const 
platform::DeviceContext &dev_ctx) { { // the 1st field, unit32_t version for DenseTensor - uint32_t version; + uint32_t version = 0; is.read(reinterpret_cast(&version), sizeof(version)); PADDLE_ENFORCE_EQ(paddle::framework::IsTensorVersionSupported(version), true, @@ -302,12 +302,12 @@ void DeserializeFromStream(std::istream &is, } { // the 2st field, LoD information - uint64_t lod_level; + uint64_t lod_level = 0; is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); auto &lod = *tensor->mutable_lod(); lod.resize(lod_level); for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size; + uint64_t size = 0; is.read(reinterpret_cast(&size), sizeof(size)); std::vector tmp(size / sizeof(size_t)); is.read(reinterpret_cast(tmp.data()), diff --git a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc index e7efc1f10c3243..e9df08d4698e28 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc @@ -53,6 +53,23 @@ void InterpreterCoreFastGarbageCollector::Add(Variable* var) { for (auto& t : *tensor_arr) { Add(t.MoveMemoryHolder()); } + } else if (var->IsType()) { + Add(var->GetMutable() + ->mutable_indices() + ->MoveMemoryHolder()); + Add(var->GetMutable() + ->mutable_values() + ->MoveMemoryHolder()); + } else if (var->IsType()) { + Add(var->GetMutable() + ->mutable_cols() + ->MoveMemoryHolder()); + Add(var->GetMutable() + ->mutable_crows() + ->MoveMemoryHolder()); + Add(var->GetMutable() + ->mutable_values() + ->MoveMemoryHolder()); } else if (var->IsType>()) { // NOTE(@xiongkun03) conditional_op / while_op will create a STEP_SCOPE // refer to executor.cc to see what old garbage collector does. diff --git a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt index 0623499975b6fb..8621c158a23e22 100644 --- a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt @@ -1,8 +1,9 @@ cc_library( instruction_base SRCS instruction_base.cc phi_kernel_instruction.cc - legacy_kernel_instruction.cc cond_instruction.cc instruction_util.cc - DEPS phi framework_proto) + legacy_kernel_instruction.cc cond_instruction.cc while_instruction.cc + instruction_util.cc + DEPS pir_adaptor phi framework_proto) if(WITH_CINN AND NOT CINN_ONLY) cc_library( diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc index 8841103213400d..e549b243f87ec4 100644 --- a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc @@ -17,102 +17,111 @@ #include "paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h" #include "paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h" #include "paddle/cinn/hlir/framework/instruction.h" +#include "paddle/cinn/hlir/framework/pir_compiler.h" +#include "paddle/cinn/runtime/cuda/cuda_util.h" +#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" #include "paddle/fluid/framework/paddle2cinn/transform_type.h" namespace paddle { namespace framework { -// TODO(Aurelius84): Think deeply what's the responsibility is it. -// Currently it assumes CinnLaunchContext role. 
-class JitContext { +class CinnJitInstruction::FnPtrImpl { + using CUDAJITInfo = cinn::hlir::framework::pir::CUDAJITInfo; + public: - cinn_buffer_t* GetCinnBufferOfVar(const std::string& name) { - auto res = paddle2argument_.find(name); - PADDLE_ENFORCE_NE( - res, - paddle2argument_.end(), - platform::errors::NotFound( - "Variable(%s) not found in compilation result", name)); - return static_cast(res->second); - } + explicit FnPtrImpl(const CUDAJITInfo& cuda_jit_info) + : cuda_jit_info_(cuda_jit_info) {} + void Run(const std::vector& kernel_args, void* stream) { + func_args_.clear(); + ptr_storage_.resize(kernel_args.size()); + for (size_t i = 0; i < kernel_args.size(); ++i) { + ptr_storage_[i] = kernel_args[i]->data(); + func_args_.push_back(ptr_storage_.data() + i); + } - // NOTE(Aurelius84): Before running each instruction, we should share Tensor - // memory from paddle scope with cinn_buffer_t from cinn scope including - // inputs and outputs. - void ShareMemToCinn(const std::string& var_name, - const phi::Place& place, - Scope* scope) { - cinn_buffer_t* buffer = GetCinnBufferOfVar(var_name); - auto* tensor = scope->GetVar(var_name)->GetMutable(); - // TODO(Aurelius84): Maybe we should consider to unify the Scope - // structure between paddle and cinn, so that we don't need to develop - // the glue code. - buffer->memory = reinterpret_cast(tensor->mutable_data( - place, paddle2cinn::TransToPaddleDataType(buffer->type))); + CUDA_DRIVER_CALL( + cuLaunchKernel(static_cast(cuda_jit_info_.fn_ptr), + cuda_jit_info_.grid_dims[0], + cuda_jit_info_.grid_dims[1], + cuda_jit_info_.grid_dims[2], + cuda_jit_info_.block_dims[0], + cuda_jit_info_.block_dims[1], + cuda_jit_info_.block_dims[2], + 0, // share memory + static_cast(stream), + func_args_.data(), + nullptr)) } - // TODO(Aurelius84): Add logic to parse stream for different device. - void* GetStream() { return nullptr; } - private: - // because a cinn_pod_value_t does not own a cinn_buffer_t object, - // an extra stroage is necessary to keep those objects and they can - // not be released until the runtime program finish execution. - std::vector> hold_buffers_; - // this map saves all execution arguments with their cinn names as key, - // and it is passed to the Execute interface of a cinn runtime program. - std::map name2argument_; - // this map saves all execution arguments with paddle variables as key, - // this map conbine name2argument_ and paddle2cinn_varmap_ - std::map paddle2argument_; -}; + CUDAJITInfo cuda_jit_info_; -// TODO(Aurelius84): Impl should hold JitContext instance to -// deliver the device context for 'instr->Run' and responsible -// to deal with inner buffer_t shareing between framework::Scope -// and cinn::Scope. -class CinnJitInstruction::Impl { - using Instruction = cinn::hlir::framework::Instruction; - - public: - explicit Impl(Instruction* instr) : instr_(instr) {} - // TODO(Aurelus84): Support to specify name2podargs and stream arguments. 
- void Run() { - PADDLE_ENFORCE_NOT_NULL( - instr_, platform::errors::NotFound("instr_ should not be NULL")); - instr_->Run(/*name2podargs=*/nullptr, - false, - /*stream=*/nullptr, - /*use_cache=*/true); - } - const Instruction* pointer() const { return instr_; } - - private: - Instruction* instr_{nullptr}; + std::vector ptr_storage_; + std::vector func_args_; }; -CinnJitInstruction::CinnJitInstruction(size_t id, - const platform::Place& place, - ::pir::Operation* op, - Scope* scope) +CinnJitInstruction::CinnJitInstruction( + size_t id, + const platform::Place& place, + ::pir::Operation* op, + const ValueExecutionInfo& value_exec_info) : InstructionBase(id, place) { - // TODO(Aurelius84): We shall simplify members of JitKernelOp to make it - // only hold related function ptrs. Impl is the real runtime data structure - // responsible to construct hlir::framework::Instruction. auto jit_kernel_op = op->dyn_cast(); - impl_ = std::make_shared(jit_kernel_op.instruction()); + fn_ptr_impl_ = std::make_shared(jit_kernel_op.cuda_jit_info()); op_ = op; + + place_ = place; + + InitInputsOutputsIds(op, value_exec_info); + + for (size_t i = 0; i < op->num_operands(); ++i) { + auto in = op->operand_source(i); + + auto var_name = value_exec_info.GetVarName(in); + + auto tensor = value_exec_info.GetScope() + ->Var(var_name) + ->GetMutable(); + + tensor_args_.push_back(tensor); + } + + dev_ctx_ = phi::DeviceContextPool::Instance().Get(place_); + + for (size_t i = 0; i < op->num_results(); ++i) { + pir::Value result = op->result(i); + auto var_name = value_exec_info.GetVarName(result); + + auto tensor = value_exec_info.GetScope() + ->Var(var_name) + ->GetMutable(); + + tensor_args_.push_back(tensor); + + out_tensor_ = tensor; + + auto alloc_tensor_type = + result.type().dyn_cast(); + tensor->set_type( + paddle::dialect::TransToPhiDataType(alloc_tensor_type.dtype())); + tensor->Resize(alloc_tensor_type.dims()); + } } void CinnJitInstruction::Run() { - VLOG(6) << "Run cinn jit_kernel_op : " << Name(); - impl_->Run(); + auto gpu_ctx = static_cast(dev_ctx_); + + auto stream = gpu_ctx->stream(); + for (size_t i = 0; i < tensor_args_.size(); ++i) { + gpu_ctx->Alloc(tensor_args_[i], tensor_args_[i]->dtype()); + } + + fn_ptr_impl_->Run(tensor_args_, static_cast(stream)); } const std::string& CinnJitInstruction::Name() const { - // TODO(Aurelius84): Consider the case for instrucitons constaning - // multipule function ptrs and function names. - return impl_->pointer()->function_name(); + static const std::string name = "cinn_jit"; + return name; } } // namespace framework diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h index 5f5e4f74e88848..ceb4014f044a6f 100644 --- a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h @@ -30,7 +30,7 @@ class CinnJitInstruction : public InstructionBase { CinnJitInstruction(size_t id, const platform::Place& place, ::pir::Operation* op, - Scope* scope); + const ValueExecutionInfo& value_exec_info); // TODO(Aurelius84): Only implement core interface and need implement GC and // Event logic. 
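The new FnPtrImpl::Run shown above is the core of this rewrite: instead of driving a cinn::hlir::framework::Instruction, it launches the JIT-compiled function directly through the CUDA driver API. The double indirection, where ptr_storage_ holds each tensor's raw buffer address and func_args_ holds a pointer to each slot, exists because cuLaunchKernel expects kernelParams to be an array of pointers to the argument values. A standalone sketch of that marshalling, with assumed names, not the patch's code:

```cpp
#include <cuda.h>
#include <vector>

// Sketch of kernel-argument marshalling for the CUDA driver API.
// cuLaunchKernel's kernelParams is an array of pointers to the arguments,
// so a device buffer address d must be passed as &d, not as d itself.
CUresult LaunchCompiled(CUfunction fn, CUstream stream,
                        std::vector<void*> device_ptrs,  // raw buffer addresses
                        unsigned gx, unsigned gy, unsigned gz,
                        unsigned bx, unsigned by, unsigned bz) {
  std::vector<void*> kernel_params;
  kernel_params.reserve(device_ptrs.size());
  for (auto& ptr : device_ptrs) {
    kernel_params.push_back(&ptr);  // pointer *to* the device pointer
  }
  return cuLaunchKernel(fn, gx, gy, gz, bx, by, bz,
                        /*sharedMemBytes=*/0, stream,
                        kernel_params.data(), /*extra=*/nullptr);
}
```

Keeping the argument addresses in stable storage matters: the patch resizes ptr_storage_ up front for exactly this reason, so the pointers handed to the driver are not invalidated by reallocation. It is also why CinnJitInstruction::Run allocates every output tensor before the launch; the driver writes through raw pointers and performs no lazy allocation on the instruction's behalf.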
@@ -41,8 +41,17 @@ class CinnJitInstruction : public InstructionBase { ::pir::Operation* Operation() const override { return op_; } private: - class Impl; - std::shared_ptr impl_{nullptr}; + class FnPtrImpl; + + std::shared_ptr fn_ptr_impl_{nullptr}; + + platform::Place place_; + + phi::DeviceContext* dev_ctx_; + + phi::DenseTensor* out_tensor_; + + std::vector tensor_args_; ::pir::Operation* op_{nullptr}; // not owned }; diff --git a/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc index 5d958d72665058..2422597ece0d1a 100644 --- a/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc @@ -34,225 +34,138 @@ #include "paddle/pir/core/value.h" #include "paddle/fluid/framework/new_executor/instruction/instruction_util.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" namespace paddle { namespace framework { -std::vector GetYiedOpInputs(pir::Block* block) { - std::vector vec_res; - for (auto op : (*block)) { - if (op->name() == "cf.yield") { - for (size_t i = 0; i < op->num_operands(); ++i) { - vec_res.push_back(op->operand_source(i)); - } - } - } - - return vec_res; -} - -void GetInputIds( - pir::Operation* op, - Scope* inner_scope, - const std::unordered_map<::pir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name, - std::unordered_map>* input_ids) { - for (size_t i = 0; i < op->num_operands(); i++) { - pir::Value value = op->operand_source(i); - if (value) { - PADDLE_ENFORCE_NE( - value_2_var_name.find(value), - value_2_var_name.end(), - phi::errors::PreconditionNotMet( - "input should in name map, [%d] 'th input of [%s] op", - i, - "if op")); - std::vector inputs_id = GetValueIds(value, - inner_scope, - value_2_var_name, - var_name_2_id, - variable_2_var_name); - input_ids->emplace(value, inputs_id); - } - } -} - -void GetOutsideOpInputs( - pir::Block* block, - Scope* inner_scope, - const std::unordered_map<::pir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name, - std::unordered_map>* input_ids) { - std::unordered_set inner_outputs; - for (auto op : (*block)) { - for (size_t i = 0; i < op->num_results(); ++i) { - inner_outputs.insert(op->result(i)); - } - } - - for (auto op : (*block)) { - for (size_t i = 0; i < op->num_operands(); ++i) { - pir::Value value = op->operand_source(i); - if (value && (!inner_outputs.count(value))) { - PADDLE_ENFORCE_NE( - value_2_var_name.find(value), - value_2_var_name.end(), - phi::errors::PreconditionNotMet( - "input should in name map, [%d] 'th input of [%s] op", - i, - "if op")); - std::vector inputs_id = GetValueIds(value, - inner_scope, - value_2_var_name, - var_name_2_id, - variable_2_var_name); - - input_ids->emplace(value, inputs_id); - } - } - } -} - -CondInstruction::CondInstruction( - size_t id, - const platform::Place& place, - pir::Operation* op, - Scope* scope, - Scope* local_scope, - ValueExecutionInfo* parent_exe_info, - const std::map& sub_blocks) +CondInstruction::CondInstruction(size_t id, + const platform::Place& place, + pir::Operation* op, + ValueExecutionInfo* value_exec_info) : InstructionBase(id, place) { - op_ = op; - VLOG(6) << "finish process dist attributes"; - - SetKernelType(AnalyseOpFuncType(op, place)); - VLOG(6) << "finish process 
analyse kernel type"; - - Scope* inner_scope = local_scope == nullptr ? scope : local_scope; - - VLOG(6) << "finish process inputs outputs index"; - PADDLE_ENFORCE( op->isa(), phi::errors::PreconditionNotMet("Cond instruction only support if op")); - auto if_op = op->dyn_cast(); + op_ = op; - for (size_t i = 0; i < if_op.num_results(); ++i) { - if_op_outputs_.push_back(inner_scope->GetVar( - parent_exe_info->GetValue2VarName().at(if_op.result(i)))); - } + SetKernelType(AnalyseOpFuncType(op, place)); + VLOG(6) << "finish process analyse kernel type"; auto cond_value = if_op.operand_source(0); - auto var_name = parent_exe_info->GetValue2VarName().at(cond_value); - cond_var = inner_scope->FindVar(var_name); + cond_var_ = value_exec_info->GetScope()->FindVar( + value_exec_info->GetValue2VarName().at(cond_value)); + for (size_t i = 0; i < if_op.num_results(); ++i) { + output_vars_.push_back(value_exec_info->GetScope()->GetVar( + value_exec_info->GetValue2VarName().at(if_op.result(i)))); + } + VLOG(6) << "finish process cond_var and output_vars"; + // NOTE(zhangbo): IfOp sub_block's inputs include two kind of value: one is + // OpOperand of IfOp, and the other is external Values used in true_block or + // false_block. auto true_branch_block = if_op.true_block(); auto false_branch_block = if_op.false_block(); + std::unordered_map> inputs; + GetInputIds(op, *value_exec_info, &inputs); + auto true_outside_inputs = + GetOutsideOpInputs(true_branch_block, *value_exec_info, &inputs); + auto false_outside_inputs = + GetOutsideOpInputs(false_branch_block, *value_exec_info, &inputs); + SetInputs(inputs); - auto true_branch_yied_inputs = GetYiedOpInputs(true_branch_block); - auto false_branch_yied_inputs = GetYiedOpInputs(false_branch_block); + std::unordered_map> outputs; + for (size_t i = 0; i < op->num_results(); i++) { + pir::Value value = op->result(i); + if (value && value.type()) { + PADDLE_ENFORCE_EQ( + value_exec_info->HasValue(value), + true, + phi::errors::PreconditionNotMet( + "input should in name map, [%d] 'th input of [%s] op", + i, + "if op")); + outputs.emplace(value, GetValueIds(value, *value_exec_info)); + } + } + SetOutputs(outputs); + VLOG(6) << "finish process inputs outputs index"; - auto true_scope = sub_blocks.at(true_branch_block); - true_branch_inter = + Scope* true_scope = &(value_exec_info->GetScope()->NewScope()); + true_branch_inter_ = new NewIRInterpreter(place, {}, true_branch_block, true_scope, - parent_exe_info->NewChild(true_scope), + value_exec_info->NewChild(true_scope), {}); std::set true_skip_gc_names_set; - for (auto value : true_branch_yied_inputs) { - true_skip_gc_names_.push_back(true_branch_inter->GetNameByValue(value)); - true_skip_gc_names_set.insert(true_branch_inter->GetNameByValue(value)); + for (auto value : GetYiedOpInputs(true_branch_block)) { + true_branch_outputs_.push_back(true_branch_inter_->GetNameByValue(value)); + true_skip_gc_names_.push_back(true_branch_inter_->GetNameByValue(value)); + true_skip_gc_names_set.insert(true_branch_inter_->GetNameByValue(value)); } - true_branch_inter->SetSkipGcVars(true_skip_gc_names_set); + // NOTE(zhangbo): According to the concept of control flow, child scopes + // should not control the lifecycle of parent scope variables. 
+ for (auto value : true_outside_inputs) { + true_skip_gc_names_.push_back(true_branch_inter_->GetNameByValue(value)); + true_skip_gc_names_set.insert(true_branch_inter_->GetNameByValue(value)); + } + true_branch_inter_->SetSkipGcVars(true_skip_gc_names_set); + VLOG(6) << "finish process true branch interpreter"; - auto false_scope = sub_blocks.at(false_branch_block); - false_branch_inter = + Scope* false_scope = &(value_exec_info->GetScope()->NewScope()); + false_branch_inter_ = new NewIRInterpreter(place, {}, false_branch_block, false_scope, - parent_exe_info->NewChild(false_scope), + value_exec_info->NewChild(false_scope), {}); std::set false_skip_gc_names_set; - for (auto value : false_branch_yied_inputs) { - false_skip_gc_names_.push_back(false_branch_inter->GetNameByValue(value)); - false_skip_gc_names_set.insert(false_branch_inter->GetNameByValue(value)); + for (auto value : GetYiedOpInputs(false_branch_block)) { + false_branch_outputs_.push_back(false_branch_inter_->GetNameByValue(value)); + false_skip_gc_names_.push_back(false_branch_inter_->GetNameByValue(value)); + false_skip_gc_names_set.insert(false_branch_inter_->GetNameByValue(value)); } - false_branch_inter->SetSkipGcVars(false_skip_gc_names_set); - - // the true branch and false branch input will be the if_op inputs - - std::unordered_map> inputs; - GetInputIds(op, - inner_scope, - parent_exe_info->GetValue2VarName(), - parent_exe_info->GetVarName2Id(), - parent_exe_info->GetVar2VarName(), - &inputs); - GetOutsideOpInputs(true_branch_block, - inner_scope, - parent_exe_info->GetValue2VarName(), - parent_exe_info->GetVarName2Id(), - parent_exe_info->GetVar2VarName(), - &inputs); - - GetOutsideOpInputs(false_branch_block, - inner_scope, - parent_exe_info->GetValue2VarName(), - parent_exe_info->GetVarName2Id(), - parent_exe_info->GetVar2VarName(), - &inputs); - SetInputs(inputs); + for (auto value : false_outside_inputs) { + false_skip_gc_names_.push_back(false_branch_inter_->GetNameByValue(value)); + false_skip_gc_names_set.insert(false_branch_inter_->GetNameByValue(value)); + } + false_branch_inter_->SetSkipGcVars(false_skip_gc_names_set); + VLOG(6) << "finish process false branch interpreter"; +} - std::unordered_map> outputs; - for (size_t i = 0; i < op->num_results(); i++) { - pir::Value value = op->result(i); - if (value && value.type()) { - PADDLE_ENFORCE_NE( - parent_exe_info->GetValue2VarName().find(value), - parent_exe_info->GetValue2VarName().end(), - phi::errors::PreconditionNotMet( - "input should in name map, [%d] 'th input of [%s] op", - i, - "if op")); - std::vector outputs_id = - GetValueIds(value, - inner_scope, - parent_exe_info->GetValue2VarName(), - parent_exe_info->GetVarName2Id(), - parent_exe_info->GetVar2VarName()); - outputs.emplace(value, outputs_id); - } +CondInstruction::~CondInstruction() { + if (true_branch_inter_ != nullptr) { + delete true_branch_inter_; + } + if (false_branch_inter_ != nullptr) { + delete false_branch_inter_; } - SetOutputs(outputs); } void CondInstruction::CopyBranchOutput( const std::vector& var_names, const NewIRInterpreter* inter) { for (size_t i = 0; i < var_names.size(); ++i) { - auto* inner_var = inter->local_scope()->GetVar(var_names[i]); + auto* inner_var = inter->InnerScope()->GetVar(var_names[i]); - if_op_outputs_[i]->GetMutable()->ShareDataWith( + output_vars_[i]->GetMutable()->ShareDataWith( inner_var->Get()); } } void CondInstruction::Run() { - if (cond_var->Get().data()[0]) { - true_branch_inter->Run({}, false); - CopyBranchOutput(true_skip_gc_names_, 
true_branch_inter); + DeviceContext().Wait(); + if (cond_var_->Get().data()[0]) { + true_branch_inter_->Run({}, false); + CopyBranchOutput(true_branch_outputs_, true_branch_inter_); } else { - false_branch_inter->Run({}, false); - CopyBranchOutput(false_skip_gc_names_, false_branch_inter); + false_branch_inter_->Run({}, false); + CopyBranchOutput(false_branch_outputs_, false_branch_inter_); } // copy ouptut diff --git a/paddle/fluid/framework/new_executor/instruction/cond_instruction.h b/paddle/fluid/framework/new_executor/instruction/cond_instruction.h index 75eb7d0ece04f5..469c0ed0ae1ab8 100644 --- a/paddle/fluid/framework/new_executor/instruction/cond_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/cond_instruction.h @@ -29,14 +29,12 @@ class ValueExecutionInfo; class CondInstruction : public InstructionBase { public: - CondInstruction( - size_t id, - const platform::Place& place, - ::pir::Operation* op, - Scope* scope, - Scope* local_scope, - ValueExecutionInfo* parent_exe_info, - const std::map& sub_blocks); + CondInstruction(size_t id, + const platform::Place& place, + ::pir::Operation* op, + ValueExecutionInfo* value_exe_info); + + ~CondInstruction(); void Run() override; @@ -48,19 +46,27 @@ class CondInstruction : public InstructionBase { void CopyBranchOutput(const std::vector& var_names, const NewIRInterpreter* inter); + ::pir::Operation* op_; + std::string cond_name_{"cond_instruction"}; - Variable* cond_var; + Variable* cond_var_; + + std::vector output_vars_; + + NewIRInterpreter* true_branch_inter_; - std::vector if_op_outputs_; + NewIRInterpreter* false_branch_inter_; - NewIRInterpreter* true_branch_inter; - NewIRInterpreter* false_branch_inter; + std::vector true_branch_outputs_; + std::vector false_branch_outputs_; + + // TODO(zhangbo): Currently, only the output of IfOp is included. In the + // future, need to consider how to support IfGradOp using IfOp value. 
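Note that CopyBranchOutput above is not a deep copy: ShareDataWith only rebinds the output variable's buffer handle to the tensor produced inside the branch scope, roughly like this plain-C++ sketch of the aliasing (not Paddle's implementation):

```cpp
#include <memory>
#include <vector>

// The "copy" only rebinds the destination's buffer handle; no element moves.
struct Tensor {
  std::shared_ptr<std::vector<float>> holder;
  void ShareDataWith(const Tensor& src) { holder = src.holder; }
};

int main() {
  Tensor branch_out;
  branch_out.holder = std::make_shared<std::vector<float>>(4, 1.0f);

  Tensor if_op_out;
  if_op_out.ShareDataWith(branch_out);  // O(1), aliases the same storage
  (*if_op_out.holder)[0] = 2.0f;        // visible through branch_out too
  return 0;
}
```

That aliasing is also why the yielded names stay in the skip-gc lists declared next: the shared buffers must outlive the branch interpreter's run.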
std::vector true_skip_gc_names_; - std::vector false_skip_gc_names_; - ::pir::Operation* op_; + std::vector false_skip_gc_names_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc index a6d2f5a201b38a..62419acffc099f 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/new_executor/instruction/instruction_util.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" +#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" @@ -214,31 +215,25 @@ void InstructionBase::SetOutputs( } void InstructionBase::InitInputsOutputsIds( - ::pir::Operation* op, - Scope* inner_scope, - const std::unordered_map& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name) { + ::pir::Operation* op, const ValueExecutionInfo& value_exec_info) { auto op_attributes = op->attributes(); - auto op_name = - op_attributes.at("op_name").dyn_cast().AsString(); + std::string op_name; + if (op_attributes.count("op_name ")) { + op_name = + op_attributes.at("op_name").dyn_cast().AsString(); + } std::unordered_map> inputs; for (size_t i = 0; i < op->num_operands(); i++) { pir::Value value = op->operand_source(i); if (value) { - PADDLE_ENFORCE_NE( - value_2_var_name.find(value), - value_2_var_name.end(), + PADDLE_ENFORCE_EQ( + value_exec_info.HasValue(value), + true, phi::errors::PreconditionNotMet( "input should in name map, [%d] 'th input of [%s] op", i, op_name)); - std::vector inputs_id = GetValueIds(value, - inner_scope, - value_2_var_name, - var_name_2_id, - variable_2_var_name); + std::vector inputs_id = GetValueIds(value, value_exec_info); inputs.emplace(value, inputs_id); } } @@ -248,18 +243,14 @@ void InstructionBase::InitInputsOutputsIds( for (size_t i = 0; i < op->num_results(); i++) { pir::Value value = op->result(i); if (value && value.type()) { - PADDLE_ENFORCE_NE( - value_2_var_name.find(value), - value_2_var_name.end(), + PADDLE_ENFORCE_EQ( + value_exec_info.HasValue(value), + true, phi::errors::PreconditionNotMet( "input should in name map, [%d] 'th input of [%s] op", i, op_name)); - std::vector outputs_id = GetValueIds(value, - inner_scope, - value_2_var_name, - var_name_2_id, - variable_2_var_name); + std::vector outputs_id = GetValueIds(value, value_exec_info); outputs.emplace(value, outputs_id); } } @@ -269,8 +260,7 @@ void InstructionBase::InitInputsOutputsIds( std::string InstructionBase::DebugStringEx( const paddle::framework::Scope* scope, - const std::unordered_map<::pir::Value, std::string>& value_2_var_name) - const { + ValueExecutionInfo* value_exe_info) const { std::stringstream ss; ss << "Op(" << Name() << "), inputs:{"; @@ -280,7 +270,7 @@ std::string InstructionBase::DebugStringEx( auto& input = *it; bool is_no_need_buffer_var = (!no_need_buffer_vars.empty() && no_need_buffer_vars.count(input.first) > 0); - auto var_name = value_2_var_name.at(input.first); + auto var_name = value_exe_info->GetVarName(input.first); ss << var_name; if (scope) { if (!VarInited(*scope, var_name)) { @@ -308,7 +298,7 @@ std::string InstructionBase::DebugStringEx( ss << "}, outputs:{"; for (auto it = Outputs().begin(); 
it != Outputs().end();) { auto& output = *it; - auto var_name = value_2_var_name.at(output.first); + auto var_name = value_exe_info->GetVarName(output.first); ss << var_name; if (scope) { if (!VarInited(*scope, var_name)) { diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index 7a77e8e8fae859..5dd7ff3e4d2a5d 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -28,6 +28,7 @@ class Value; namespace paddle { namespace framework { +class ValueExecutionInfo; using SchedulingPriority = int64_t; @@ -139,19 +140,12 @@ class InstructionBase { virtual ::pir::Operation* Operation() const = 0; - void InitInputsOutputsIds( - ::pir::Operation* op, - Scope* inner_scope, - const std::unordered_map<::pir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name); + void InitInputsOutputsIds(::pir::Operation* op, + const ValueExecutionInfo& value_exec_info); // if scope is not null, also show dimensions of arguments - virtual std::string DebugStringEx( - const paddle::framework::Scope* scope, - const std::unordered_map<::pir::Value, std::string>& value_2_var_name) - const; + virtual std::string DebugStringEx(const paddle::framework::Scope* scope, + ValueExecutionInfo* value_exe_info) const; protected: size_t id_; diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc index dfafd442815372..4066bc7afb3dc6 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc @@ -20,6 +20,8 @@ #include #include "paddle/fluid/framework/new_executor/new_executor_defs.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/event.h" #include "paddle/pir/core/builtin_attribute.h" @@ -29,6 +31,7 @@ #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" #include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" #include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" +#include "paddle/pir/dialect/control_flow/ir/cf_ops.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" @@ -40,22 +43,17 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace framework { -std::vector GetValueIds( - pir::Value value, - Scope* inner_scope, - const std::unordered_map& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name) { +std::vector GetValueIds(pir::Value value, + const ValueExecutionInfo& value_exec_info) { std::vector ids; - auto& var_name = value_2_var_name.at(value); - ids.push_back(var_name_2_id.at(var_name)); + ids.push_back(value_exec_info.GetVarId(value)); // NOTE(zhangbo): Value maybe a VariableRefArray - auto var = inner_scope->FindVar(var_name); + auto var = + value_exec_info.GetScope()->FindVar(value_exec_info.GetVarName(value)); if (var->IsType()) { auto& var_array = var->Get(); for (auto item : var_array) { - ids.push_back(var_name_2_id.at(variable_2_var_name.at(item))); + 
ids.push_back(value_exec_info.GetVarId(item)); } } return ids; @@ -147,43 +145,111 @@ OpFuncType AnalyseOpFuncType(pir::Operation* op, const platform::Place& place) { return OpFuncType::kCpuSync; } - auto kernel_key = op->attributes() - .at("kernel_key") - .dyn_cast() - .data(); - if (phi::TransToPhiPlace(kernel_key.backend()).GetType() == - phi::AllocationType::CPU) { - return OpFuncType::kCpuSync; - } - PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place), true, phi::errors::Fatal("Unsupported current place %s", place)); + auto& op_attributes = op->attributes(); + + if ((op->dialect()->name().compare(paddle::dialect::KernelDialect::name()) == + 0) && + (op_attributes.count("kernel_key") > 0)) { + auto kernel_key = op_attributes.at("kernel_key") + .dyn_cast() + .data(); + if (phi::TransToPhiPlace(kernel_key.backend()).GetType() == + phi::AllocationType::CPU) { + return OpFuncType::kCpuSync; + } + } + // Some GPU OPs do not launch CUDA Kernel, but spend a lot of time on CPU // computing. They execute serially in device thread and block CUDA kernel // launching in other GPU OPs. To improve performance, set them as kGpuSync // and so that they would be dispatched to host thread. - auto& op_attributes = op->attributes(); - auto op_name = - op_attributes.at("op_name").dyn_cast().AsString(); - if (op_name == "pd_op.coalesce_tensor" && - (!platform::is_xpu_place(place) || - op->attribute("persist_output").data() == false) && - op->attribute("set_constant").data() == false && - op->attribute("copy_data").data() == false) { - return OpFuncType::kGpuSync; + if ((op->dialect()->name() == "pd_kernel") && + (op_attributes.count("op_name") > 0)) { + auto op_name = + op_attributes.at("op_name").dyn_cast().AsString(); + if (op_name == "pd_op.coalesce_tensor" && + (!platform::is_xpu_place(place) || + op->attribute("persist_output").data() == false) && + op->attribute("set_constant").data() == false && + op->attribute("copy_data").data() == false) { + return OpFuncType::kGpuSync; + } + + if (platform::is_gpu_place(place) && op_name == "pd_op.memcpy_d2h") { + return OpFuncType::kGpuSync; + } + + if (op_name.compare(paddle::dialect::ShapeOp::name()) == 0) { + return OpFuncType::kGpuSync; + } } - // for memcpy explicitly called by user - if (platform::is_gpu_place(place) && op_name == "pd_op.memcpy_d2h") { - return OpFuncType::kGpuSync; + return OpFuncType::kGpuAsync; +} + +std::vector GetYiedOpInputs(pir::Block* block) { + std::vector vec_res; + + if (block && !block->empty() && block->back()->isa()) { + auto* op = block->back(); + for (size_t i = 0; i < op->num_operands(); ++i) { + vec_res.emplace_back(op->operand_source(i)); + } } + return vec_res; +} - if (op_name == "pd_op.shape") { - return OpFuncType::kGpuSync; +void GetInputIds(pir::Operation* op, + const ValueExecutionInfo& value_exec_info, + std::unordered_map>* input_ids) { + for (size_t i = 0; i < op->num_operands(); i++) { + pir::Value value = op->operand_source(i); + if (value && value.type()) { + PADDLE_ENFORCE_EQ( + value_exec_info.HasValue(value), + true, + phi::errors::PreconditionNotMet( + "input should in name map, [%d] 'th input of [%s] op", + i, + "if op")); + input_ids->emplace(value, GetValueIds(value, value_exec_info)); + } } - return OpFuncType::kGpuAsync; +} + +std::vector GetOutsideOpInputs( + pir::Block* block, + const ValueExecutionInfo& value_exec_info, + std::unordered_map>* input_ids) { + std::unordered_set inner_outputs; + for (auto op : (*block)) { + for (size_t i = 0; i < op->num_results(); ++i) { + 
inner_outputs.insert(op->result(i)); + } + } + + std::vector outside_op_inputs; + for (auto op : (*block)) { + for (size_t i = 0; i < op->num_operands(); ++i) { + pir::Value value = op->operand_source(i); + if (value && (!inner_outputs.count(value))) { + PADDLE_ENFORCE_EQ( + value_exec_info.HasValue(value), + true, + phi::errors::PreconditionNotMet( + "input should in name map, [%d] 'th input of [%s] op", + i, + op->name())); + input_ids->emplace(value, GetValueIds(value, value_exec_info)); + outside_op_inputs.push_back(value); + } + } + } + return outside_op_inputs; } } // namespace framework diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.h b/paddle/fluid/framework/new_executor/instruction/instruction_util.h index c555a101d8366d..8304b134e05341 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.h @@ -28,13 +28,10 @@ namespace paddle { namespace framework { -std::vector GetValueIds( - pir::Value value, - Scope* inner_scope, - const std::unordered_map<::pir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name); +class ValueExecutionInfo; + +std::vector GetValueIds(pir::Value value, + const ValueExecutionInfo& value_exec_info); platform::DeviceContext* ParseDeviceContext( pir::Operation* op, @@ -46,5 +43,16 @@ platform::DeviceContext* ParseDeviceContext( OpFuncType AnalyseOpFuncType(::pir::Operation* op, const platform::Place& place); +std::vector GetYiedOpInputs(pir::Block* block); + +void GetInputIds(pir::Operation* op, + const ValueExecutionInfo& value_exec_info, + std::unordered_map>* input_ids); + +std::vector GetOutsideOpInputs( + pir::Block* block, + const ValueExecutionInfo& value_exec_info, + std::unordered_map>* input_ids); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc index 748c7e603f7d70..6a8ecd09c4cece 100644 --- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc @@ -36,13 +36,8 @@ LegacyKernelInstruction::LegacyKernelInstruction( size_t id, const platform::Place& place, pir::Operation* op, - Scope* scope, - Scope* local_scope, - const std::unordered_map& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name) - : InstructionBase(id, place) { + const ValueExecutionInfo& value_exec_info) + : InstructionBase(id, place), value_exec_info_(value_exec_info) { auto& op_attributes = op->attributes(); auto op_name = op_attributes.at("op_name").dyn_cast().AsString(); @@ -99,18 +94,13 @@ LegacyKernelInstruction::LegacyKernelInstruction( VLOG(6) << "finish process yaml_info_parser"; if (infer_meta_interface_) { - pir::BuildPhiContext< + BuildPhiContext< phi::InferMetaContext, phi::MetaTensor, phi::MetaTensor, paddle::small_vector, paddle::small_vector, - false>(op, - value_2_var_name, - scope, - local_scope, - yaml_info_parser, - &infer_meta_context_); + false>(op, value_exec_info_, yaml_info_parser, &infer_meta_context_); } VLOG(6) << "finish process infer meta context"; @@ -126,10 +116,10 @@ LegacyKernelInstruction::LegacyKernelInstruction( phi_kernel_->IsValid(), true, "not found kernel for [%s]", kernel_name); VLOG(6) << "finish process select kernel: " << 
kernel_name; - Scope* inner_scope = local_scope == nullptr ? scope : local_scope; + const Scope* inner_scope = value_exec_info_.GetScope(); + + operator_base_ = BuildOperatorBase(op, value_exec_info_, yaml_info_parser); - operator_base_ = pir::BuildOperatorBase( - op, value_2_var_name, yaml_info_parser, variable_2_var_name, inner_scope); paddle::framework::VariableValueMap in_map; paddle::framework::VariableValueMap out_map; auto dev_ctx = phi::DeviceContextPool::Instance().Get( @@ -137,14 +127,11 @@ LegacyKernelInstruction::LegacyKernelInstruction( runtime_context_ = std::make_shared( paddle::framework::RuntimeContext(in_map, out_map)); - pir::BuildRuntimeContext(op, - value_2_var_name, - scope, - local_scope, - yaml_info_parser, - runtime_context_.get()); + BuildRuntimeContext( + op, value_exec_info, yaml_info_parser, runtime_context_.get()); + kernel_context_ = new paddle::framework::ExecutionContext( - *operator_base_, *local_scope, *dev_ctx, *(runtime_context_.get())); + *operator_base_, *inner_scope, *dev_ctx, *(runtime_context_.get())); VLOG(6) << "finish process kernel context"; SetDeviceContext( @@ -156,8 +143,7 @@ LegacyKernelInstruction::LegacyKernelInstruction( GetStreamPriority())); VLOG(6) << "finish process device context"; - InitInputsOutputsIds( - op, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name); + InitInputsOutputsIds(op, value_exec_info); VLOG(6) << "finish process inputs outputs index"; auto& no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds(); @@ -180,12 +166,12 @@ LegacyKernelInstruction::~LegacyKernelInstruction() { } void LegacyKernelInstruction::Run() { + VLOG(6) << "Run op " << legacy_op_name_ << " infer meta."; if (infer_meta_interface_) { infer_meta_interface_->infer_meta_(&(infer_meta_context_)); } - VLOG(6) << "Run op " << legacy_op_name_ << " infer meta."; - (*(phi_kernel_))((kernel_context_)); VLOG(6) << "Run op " << legacy_op_name_ << " kernel."; + (*(phi_kernel_))((kernel_context_)); } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h index 9c6fbd9b7d8070..1ccbc8ebc01585 100644 --- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h @@ -18,25 +18,19 @@ namespace pir { class Operation; -class Value; } // namespace pir namespace paddle { namespace framework { class Scope; +class ValueExecutionInfo; class LegacyKernelInstruction : public InstructionBase { public: - LegacyKernelInstruction( - size_t id, - const platform::Place& place, - ::pir::Operation* op, - Scope* scope, - Scope* local_scope, - const std::unordered_map<::pir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name); + LegacyKernelInstruction(size_t id, + const platform::Place& place, + ::pir::Operation* op, + const ValueExecutionInfo& value_exec_info); ~LegacyKernelInstruction(); phi::Kernel* PhiKernel() const { return phi_kernel_; } @@ -70,6 +64,8 @@ class LegacyKernelInstruction : public InstructionBase { phi::Kernel* phi_kernel_{nullptr}; // not owned ::pir::Operation* op_{nullptr}; // not owned + + const ValueExecutionInfo& value_exec_info_; // not owned }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc 
b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc index e779fb52f26e48..3f93161a363faf 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc @@ -40,13 +40,8 @@ PhiKernelInstruction::PhiKernelInstruction( size_t id, const platform::Place& place, pir::Operation* op, - Scope* scope, - Scope* local_scope, - const std::unordered_map& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name) - : InstructionBase(id, place) { + const ValueExecutionInfo& value_exec_info) + : InstructionBase(id, place), value_exec_info_(value_exec_info) { auto op_attributes = op->attributes(); auto op_name = op_attributes.at("op_name").dyn_cast().AsString(); @@ -103,18 +98,13 @@ PhiKernelInstruction::PhiKernelInstruction( VLOG(6) << "finish process yaml_info_parser"; if (infer_meta_interface_) { - pir::BuildPhiContext< + BuildPhiContext< phi::InferMetaContext, phi::MetaTensor, phi::MetaTensor, paddle::small_vector, paddle::small_vector, - false>(op, - value_2_var_name, - scope, - local_scope, - yaml_info_parser, - &infer_meta_context_); + false>(op, value_exec_info_, yaml_info_parser, &infer_meta_context_); } VLOG(6) << "finish process infer meta context"; @@ -130,17 +120,14 @@ PhiKernelInstruction::PhiKernelInstruction( phi_kernel_->IsValid(), true, "not found kernel for [%s]", kernel_name); VLOG(6) << "finish process select kernel"; - pir::BuildPhiContext, - paddle::small_vector, - true>(op, - value_2_var_name, - scope, - local_scope, - yaml_info_parser, - &kernel_context_); + BuildPhiContext, + paddle::small_vector, + true>( + op, value_exec_info_, yaml_info_parser, &kernel_context_); + kernel_context_.SetDeviceContext(phi::DeviceContextPool::Instance().Get( phi::TransToPhiPlace(kernel_key.backend()))); VLOG(6) << "finish process kernel context"; @@ -154,9 +141,7 @@ PhiKernelInstruction::PhiKernelInstruction( GetStreamPriority())); VLOG(6) << "finish process device context"; - Scope* inner_scope = local_scope == nullptr ? 
scope : local_scope; - InitInputsOutputsIds( - op, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name); + InitInputsOutputsIds(op, value_exec_info); VLOG(6) << "finish process inputs outputs index"; auto& no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds(); diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h index 96484f435a9f74..41539300c45037 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h @@ -23,20 +23,14 @@ class Operation; namespace paddle { namespace framework { class Scope; -class Value; +class ValueExecutionInfo; class PhiKernelInstruction : public InstructionBase { public: - PhiKernelInstruction( - size_t id, - const platform::Place& place, - ::pir::Operation* op, - Scope* scope, - Scope* local_scope, - const std::unordered_map<::pir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name); + PhiKernelInstruction(size_t id, + const platform::Place& place, + ::pir::Operation* op, + const ValueExecutionInfo& value_exec_info); ~PhiKernelInstruction(); @@ -71,6 +65,8 @@ class PhiKernelInstruction : public InstructionBase { std::string phi_op_name_; ::pir::Operation* op_{nullptr}; // not owned + + const ValueExecutionInfo& value_exec_info_; // not owned }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/instruction/while_instruction.cc b/paddle/fluid/framework/new_executor/instruction/while_instruction.cc new file mode 100644 index 00000000000000..b511ad1f602320 --- /dev/null +++ b/paddle/fluid/framework/new_executor/instruction/while_instruction.cc @@ -0,0 +1,160 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
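The new while_instruction.cc below realizes the loop documented in its header: outputs start as aliases of the inputs, then each iteration feeds the current outputs to the body block, runs the child interpreter once, and reads the fresh condition and outputs back out. A self-contained sketch of that control flow (member names follow the diff; the bool flag on the interpreter's Run is assumed here to mean need_fetch):

```cpp
// Stand-in members model the real interpreter, condition variable and
// output variables of WhileInstruction.
struct WhileSketch {
  bool cond = true;
  int iters = 0;

  void CopyInputsToOutputs() {}    // 'output' = 'input'
  void PassArgsToBodyBlock() {}    // body args alias the outputs' buffers
  void RunBodyOnce() { ++iters; }  // stands in for body_inter_->Run({}, false)
  void GetValueFromBodyBlock() {   // refresh cond and outputs from the body
    cond = iters < 3;              // pretend the body flips cond after 3 steps
  }

  void Run() {
    CopyInputsToOutputs();
    while (cond) {
      PassArgsToBodyBlock();
      RunBodyOnce();
      GetValueFromBodyBlock();
    }
  }
};

int main() { WhileSketch{}.Run(); }
```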
+ +#include "paddle/fluid/framework/new_executor/instruction/while_instruction.h" + +#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" +#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" +#include "paddle/fluid/framework/new_executor/new_ir_interpreter.h" +#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/pir/dialect/operator/interface/infermeta.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/core/type_defs.h" + +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/value.h" + +#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" + +namespace paddle { +namespace framework { + +WhileInstruction::WhileInstruction(size_t id, + const platform::Place& place, + pir::Operation* op, + Scope* scope, + Scope* local_scope, + ValueExecutionInfo* parent_exe_info) + : InstructionBase(id, place) { + op_ = op; + VLOG(6) << "finish process dist attributes"; + + SetKernelType(AnalyseOpFuncType(op, place)); + VLOG(6) << "finish process analyse kernel type"; + + Scope* inner_scope = local_scope == nullptr ? scope : local_scope; + + VLOG(6) << "finish process inputs outputs index"; + + PADDLE_ENFORCE(op->isa(), + phi::errors::PreconditionNotMet( + "While instruction only support While op")); + + auto while_op = op->dyn_cast(); + + cond_var_ = inner_scope->GetVar( + parent_exe_info->GetValue2VarName().at(while_op.operand_source(0))); + for (size_t i = 1; i < while_op.num_operands(); ++i) { + inputs_.push_back(inner_scope->GetVar( + parent_exe_info->GetValue2VarName().at(while_op.operand_source(i)))); + } + + for (size_t i = 0; i < while_op.num_results(); ++i) { + outputs_.push_back(inner_scope->GetVar( + parent_exe_info->GetValue2VarName().at(while_op.result(i)))); + } + + body_block_ = while_op.body_block(); + auto body_block_outputs = GetYiedOpInputs(body_block_); + + Scope* body_scope = &(parent_exe_info->GetScope()->NewScope()); + auto body_exe_info = parent_exe_info->NewChild(body_scope); + for (size_t i = 0; i < body_block_->args_size(); ++i) { + auto var_name = "body_block_arg_" + std::to_string(i); + body_scope->Var(var_name); + body_exe_info->Add(body_block_->argument(i), var_name); + } + body_inter_ = std::unique_ptr(new NewIRInterpreter( + place, {}, body_block_, body_scope, body_exe_info, {})); + + std::set body_skip_gc_names_set; + for (auto value : body_block_outputs) { + body_skip_gc_names_.push_back(body_inter_->GetNameByValue(value)); + body_skip_gc_names_set.insert(body_inter_->GetNameByValue(value)); + } + body_inter_->SetSkipGcVars(body_skip_gc_names_set); + + std::unordered_map> inputs; + GetInputIds(op, *parent_exe_info, &inputs); + + SetInputs(inputs); + + std::unordered_map> outputs; + for (size_t i = 0; i < op->num_results(); i++) { + pir::Value value = op->result(i); + if (value && value.type()) { + PADDLE_ENFORCE_NE( + 
parent_exe_info->GetValue2VarName().find(value), + parent_exe_info->GetValue2VarName().end(), + phi::errors::PreconditionNotMet( + "output should in name map, [%d] 'th output of [%s] op", + i, + "while op")); + std::vector<int> outputs_id = GetValueIds(value, *parent_exe_info); + outputs.emplace(value, outputs_id); + } + } + SetOutputs(outputs); +} + +void WhileInstruction::CopyInputsToOutputs() { + for (size_t i = 0; i < outputs_.size(); ++i) { + outputs_[i]->GetMutable<phi::DenseTensor>()->ShareDataWith( + inputs_[i]->Get<phi::DenseTensor>()); + } +} + +void WhileInstruction::PassArgsToBodyBlock() { + for (size_t i = 0; i < body_block_->args_size(); ++i) { + auto block_arg = body_block_->argument(i); + auto var_name = body_inter_->GetNameByValue(block_arg); + auto* inner_var = body_inter_->local_scope()->GetVar(var_name); + inner_var->GetMutable<phi::DenseTensor>()->ShareDataWith( + outputs_[i]->Get<phi::DenseTensor>()); + } +} + +void WhileInstruction::GetValueFromBodyBlock() { + cond_var_->GetMutable<phi::DenseTensor>()->ShareDataWith( + body_inter_->local_scope() + ->GetVar(body_skip_gc_names_[0]) + ->Get<phi::DenseTensor>()); + for (size_t i = 0; i < outputs_.size(); ++i) { + auto& out_var_name = body_skip_gc_names_[i + 1]; + auto* out_var = body_inter_->local_scope()->GetVar(out_var_name); + outputs_[i]->GetMutable<phi::DenseTensor>()->ShareDataWith( + out_var->Get<phi::DenseTensor>()); + } +} +void WhileInstruction::Run() { + CopyInputsToOutputs(); + while (cond_var_->Get<phi::DenseTensor>().data<bool>()[0]) { + PassArgsToBodyBlock(); + body_inter_->Run({}, false); + GetValueFromBodyBlock(); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/instruction/while_instruction.h b/paddle/fluid/framework/new_executor/instruction/while_instruction.h new file mode 100644 index 00000000000000..d486c8206c5026 --- /dev/null +++ b/paddle/fluid/framework/new_executor/instruction/while_instruction.h @@ -0,0 +1,77 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" + +namespace ir { +class Operation; +} // namespace ir + +namespace paddle { +namespace framework { +class Scope; +class Value; +class NewIRInterpreter; +class ValueExecutionInfo; + +/// The execution semantics of the while op ['output' = while_op('cond', 'input')] +/// is: +/// 'output' = 'input'; +/// while('cond') { +/// 'cond', 'output' = body_block('output'); +/// } +class WhileInstruction : public InstructionBase { + public: + WhileInstruction(size_t id, + const platform::Place& place, + ::pir::Operation* op, + Scope* scope, + Scope* local_scope, + ValueExecutionInfo* parent_exe_info); + + void Run() override; + + const std::string& Name() const override { return name_; } + + ::pir::Operation* Operation() const override { return op_; } + + private: + // 'output' = 'input' + void CopyInputsToOutputs(); + + // Pass arguments to body_block for execution. + void PassArgsToBodyBlock(); + + // Get return value from body_block after each execution.
+ void GetValueFromBodyBlock(); + + std::string name_{"while_instruction"}; + + Variable* cond_var_; + + std::vector<Variable*> inputs_; + std::vector<Variable*> outputs_; + + std::unique_ptr<NewIRInterpreter> body_inter_; + std::vector<std::string> body_skip_gc_names_; + + ::pir::Block* body_block_; + + ::pir::Operation* op_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc index 0baa62f8a4dcdb..0d3af1e55c2a01 100644 --- a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc @@ -508,7 +508,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, const std::string var_name = argument_names[i]; Variable* var = arguments->at(i); - const phi::DenseTensor* tensor_in; + const phi::DenseTensor* tensor_in = nullptr; if (var->IsType<phi::DenseTensor>() || var->IsType<phi::SelectedRows>()) { tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var); diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index de77780abc3e53..4ce8c411a10b25 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -42,7 +42,7 @@ namespace interpreter { size_t CountDownstreamMap( const std::map<size_t, std::set<size_t>>& downstream_map) { size_t count = 0; - for (auto pair : downstream_map) { + for (auto const& pair : downstream_map) { count += pair.second.size(); } return count; @@ -50,7 +50,7 @@ size_t CountDownstreamMap( const std::string StringizeDownstreamMap( const std::map<size_t, std::set<size_t>>& downstream_map) { std::ostringstream oss; - for (auto pair : downstream_map) { + for (auto const& pair : downstream_map) { oss << pair.first << " -> "; std::copy(pair.second.begin(), pair.second.end(), @@ -144,7 +144,7 @@ void DependencyBuilder::AddDependencyForCoalesceTensorOp() { auto outputs = instructions_->at(op_idx).Outputs().at("Output"); auto is_read = [](const Instruction& inst, size_t var_id) -> bool { - for (auto pair : inst.Inputs()) { + for (auto const& pair : inst.Inputs()) { for (size_t item : pair.second) { if (item == var_id) { return true; @@ -155,7 +155,7 @@ }; auto is_write = [](const Instruction& inst, size_t var_id) -> bool { - for (auto pair : inst.Outputs()) { + for (auto const& pair : inst.Outputs()) { for (size_t item : pair.second) { if (item == var_id) { return true; diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index b2fbed43f02fab..8c51e310b054c5 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/distributed/auto_parallel/dist_attr.h" #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" #include "paddle/fluid/framework/new_executor/interpreter/data_transfer.h" #include "paddle/fluid/framework/new_executor/interpreter/execution_config.h" @@ -45,10 +46,6 @@ #ifdef PADDLE_WITH_CUSTOM_DEVICE #include "paddle/phi/backends/device_manager.h" #endif -PADDLE_DEFINE_EXPORTED_bool( -
new_executor_log_memory_stats, - false, - "Log memory stats after each op runs, just used for debug."); PHI_DECLARE_bool(use_mkldnn); PHI_DECLARE_bool(check_nan_inf); @@ -228,7 +225,9 @@ bool var_can_be_deleted(const std::string& name, const BlockDesc& block) { return type == proto::VarType::LOD_TENSOR || type == proto::VarType::SELECTED_ROWS || - type == proto::VarType::LOD_TENSOR_ARRAY; + type == proto::VarType::LOD_TENSOR_ARRAY || + type == proto::VarType::SPARSE_COO || + type == proto::VarType::SPARSE_CSR; } std::unordered_map> following_ops( - ops.begin() + i + 1, ops.end()); + ops.begin() + static_cast(i) + 1, ops.end()); HandleOperatorBase(place, ops[i], &op_func_node, @@ -896,7 +893,7 @@ void BuildOpFuncList(const platform::Place& place, // avoid overwriting valid data if (static_build && original_tensor->initialized()) { const phi::Place& target_place = transformed_tensor->place(); - platform::DeviceContext* dev_ctx_for_copy; + platform::DeviceContext* dev_ctx_for_copy = nullptr; if (target_place.GetType() != AllocationType::CPU) { dev_ctx_for_copy = pool.Get(target_place); } else { @@ -936,7 +933,7 @@ void BuildOpFuncList(const platform::Place& place, } } catch (platform::EnforceNotMet& ex) { framework::InsertCallStackInfo(op_type, op->Attrs(), &ex); - throw std::move(ex); + throw ex; } catch (platform::EOFException&) { std::rethrow_exception(std::current_exception()); } catch (std::exception& ex) { @@ -985,7 +982,7 @@ void BuildOpFuncList(const platform::Place& place, // gc--------------------------------------------- auto iter = unused_var_map.find(op); if (iter == unused_var_map.end()) { - interpreter::LogDeviceMemoryStats(place); + memory::LogDeviceMemoryStats(place, op_type); continue; } @@ -1004,11 +1001,38 @@ void BuildOpFuncList(const platform::Place& place, if (var->IsType()) { garbages->emplace_back( var->GetMutable()->MoveMemoryHolder()); + } else if (var->IsType()) { + garbages->emplace_back(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder()); + var->GetMutable()->mutable_rows()->clear(); + } else if (var->IsType()) { + auto* tensor_arr = var->GetMutable(); + for (auto& t : *tensor_arr) { + garbages->emplace_back(t.MoveMemoryHolder()); + } + } else if (var->IsType()) { + garbages->emplace_back(var->GetMutable() + ->mutable_indices() + ->MoveMemoryHolder()); + garbages->emplace_back(var->GetMutable() + ->mutable_values() + ->MoveMemoryHolder()); + } else if (var->IsType()) { + garbages->emplace_back(var->GetMutable() + ->mutable_cols() + ->MoveMemoryHolder()); + garbages->emplace_back(var->GetMutable() + ->mutable_crows() + ->MoveMemoryHolder()); + garbages->emplace_back(var->GetMutable() + ->mutable_values() + ->MoveMemoryHolder()); } } delete garbages; // free mem - interpreter::LogDeviceMemoryStats(place); + memory::LogDeviceMemoryStats(place, op_type); } } @@ -1024,101 +1048,38 @@ void BuildOpFuncList(const platform::Place& place, if (var->IsType()) { garbages->emplace_back( var->GetMutable()->MoveMemoryHolder()); + } else if (var->IsType()) { + garbages->emplace_back(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder()); + var->GetMutable()->mutable_rows()->clear(); + } else if (var->IsType()) { + auto* tensor_arr = var->GetMutable(); + for (auto& t : *tensor_arr) { + garbages->emplace_back(t.MoveMemoryHolder()); + } + } else if (var->IsType()) { + garbages->emplace_back(var->GetMutable() + ->mutable_indices() + ->MoveMemoryHolder()); + garbages->emplace_back(var->GetMutable() + ->mutable_values() + ->MoveMemoryHolder()); + } else if 
(var->IsType()) { + garbages->emplace_back(var->GetMutable() + ->mutable_cols() + ->MoveMemoryHolder()); + garbages->emplace_back(var->GetMutable() + ->mutable_crows() + ->MoveMemoryHolder()); + garbages->emplace_back(var->GetMutable() + ->mutable_values() + ->MoveMemoryHolder()); } } delete garbages; } -void BuildOpFuncList( - const platform::Place& place, - pir::Block* block, - std::vector* vec_func_list, - framework::Scope* scope, - framework::Scope* local_scope, - const std::unordered_map& value_2_name_map, - const ExecutionConfig& execution_config) { - vec_func_list->reserve(block->size()); - pir::IrContext* ctx = pir::IrContext::Instance(); - - ctx->GetOrRegisterDialect(); - - for (auto op : *block) { - OpFuncNode op_func_node; - auto attr_map = op->attributes(); - - auto op_name = - attr_map.at("op_name").dyn_cast().AsString(); - op_func_node.phi_op_name_ = op_name; - - if (GetSpecialOpNames().count(op_name)) { - VLOG(6) << "skip process " << op_name; - continue; - } - - pir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name); - - auto impl = - op_info.GetInterfaceImpl(); - - op_func_node.infer_meta_interface_ = - op_info.GetInterfaceImpl(); - - VLOG(6) << "op name" << op_func_node.phi_op_name_; - dialect::OpYamlInfoParser op_yaml_info_parser(impl->get_op_info_()); - if (op_func_node.infer_meta_interface_) { - pir::BuildPhiContext< - phi::InferMetaContext, - phi::MetaTensor, - phi::MetaTensor, - paddle::small_vector, - paddle::small_vector, - false>(op, - value_2_name_map, - scope, - local_scope, - op_yaml_info_parser, - &(op_func_node.infer_meta_context_)); - } - - auto kernel_name = - attr_map.at("kernel_name").dyn_cast().AsString(); - auto kernel_key = attr_map.at("kernel_key") - .dyn_cast() - .data(); - - VLOG(6) << "finish process infer meta context"; - auto t1 = phi::KernelFactory::Instance().SelectKernelOrThrowError( - kernel_name, kernel_key); - op_func_node.phi_kernel_ = new phi::Kernel(t1.kernel); - - PADDLE_ENFORCE_EQ(op_func_node.phi_kernel_->IsValid(), - true, - "not found kernel for [%s]", - kernel_name); - - pir::BuildPhiContext, - paddle::small_vector, - true>(op, - value_2_name_map, - scope, - local_scope, - op_yaml_info_parser, - &(op_func_node.kernel_context_)); - - VLOG(6) << "finish process kernel context"; - op_func_node.kernel_context_.SetDeviceContext( - phi::DeviceContextPool::Instance().Get( - phi::TransToPhiPlace(kernel_key.backend()))); - op_func_node.dev_ctx_ = phi::DeviceContextPool::Instance().Get( - phi::TransToPhiPlace(kernel_key.backend())); - - vec_func_list->emplace_back(op_func_node); - } -} - void BuildVariableScope(const framework::BlockDesc& block, const ExecutionConfig& execution_config, VariableScope* var_scope) { @@ -1166,21 +1127,6 @@ void BuildVariableScope(const framework::BlockDesc& block, } } -void LogDeviceMemoryStats(const platform::Place& place) { - if (FLAGS_new_executor_log_memory_stats && platform::is_gpu_place(place)) { - VLOG(0) << "memory_allocated: " - << static_cast(memory::DeviceMemoryStatCurrentValue( - "Allocated", place.device)) / - 1024 / 1024 - << " MB"; - VLOG(0) << "max_memory_allocated: " - << static_cast(memory::DeviceMemoryStatPeakValue( - "Allocated", place.device)) / - 1024 / 1024 - << " MB"; - } -} - void SetDeviceCommContext(framework::OperatorBase* operator_base, platform::DeviceContext* dev_ctx) { if (operator_base->HasAttr("ring_id")) { @@ -1230,7 +1176,7 @@ std::unordered_set GetSpecialOpNames() { "builtin.set_parameter", "builtin.get_parameter", "pd_op.data", - "pd_op.shadow_output", + 
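The garbage-collection branches added in the two blocks above repeat one pattern per payload type: queue every buffer the variable owns, then free them in bulk after the op's last use. A condensed sketch of the sparse cases, with the template arguments (stripped in this rendering) restored on the assumption that they name the phi sparse tensor types:

// Sketch only: a SparseCooTensor owns indices + values buffers, while a
// SparseCsrTensor owns cols + crows + values buffers.
if (var->IsType<phi::SparseCooTensor>()) {
  auto* coo = var->GetMutable<phi::SparseCooTensor>();
  garbages->emplace_back(coo->mutable_indices()->MoveMemoryHolder());
  garbages->emplace_back(coo->mutable_values()->MoveMemoryHolder());
} else if (var->IsType<phi::SparseCsrTensor>()) {
  auto* csr = var->GetMutable<phi::SparseCsrTensor>();
  garbages->emplace_back(csr->mutable_cols()->MoveMemoryHolder());
  garbages->emplace_back(csr->mutable_crows()->MoveMemoryHolder());
  garbages->emplace_back(csr->mutable_values()->MoveMemoryHolder());
}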
"builtin.shadow_output", }; } diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h index 010d11318b4321..57343c61f6621f 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h @@ -104,22 +104,14 @@ void BuildOpFuncList(const platform::Place& place, bool use_local_scope = true, bool static_build = false); -void BuildOpFuncList( - const platform::Place& place, - ::pir::Block* block, - std::vector* vec_func_list, - framework::Scope* scope, - framework::Scope* local_scope, - const std::unordered_map<::pir::Value, std::string>& value_2_name_map, - const ExecutionConfig& execution_config); - void BuildVariableScope(const framework::BlockDesc& block, const ExecutionConfig& execution_config, VariableScope* var_scope); void BuildId2VarName(const std::map& var_name_2_id, std::unordered_map* id_2_var_name); -void LogDeviceMemoryStats(const platform::Place& place); +void LogDeviceMemoryStats(const platform::Place& place, + const std::string& op_name); void SetDeviceCommContext(framework::OperatorBase* operator_base, platform::DeviceContext* dev_ctx); diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.cc b/paddle/fluid/framework/new_executor/interpreter/static_build.cc index 3751ee0a03db44..bebeb142d473f1 100644 --- a/paddle/fluid/framework/new_executor/interpreter/static_build.cc +++ b/paddle/fluid/framework/new_executor/interpreter/static_build.cc @@ -54,7 +54,6 @@ std::set OpsCanSkipedFakeAllocInStaticBuild = { "nop"}; std::set StaticBuildBlackList = { - "batch_norm" /*: to handle reserve_space output*/, "cinn_instruction_run" /*: to handle subgraph infermeta*/, "cinn_launch" /*: to handle subgraph infermeta*/, "run_program" /*: to handle scope output*/, @@ -206,6 +205,14 @@ bool TensorShouldBeFakeInitialized(const OperatorBase& op, } } + if (op_type == "batch_norm" && parameter_name == "ReserveSpace") { + if (dynamic_cast(&op)->kernel_type()->place_ == + phi::CPUPlace()) { + VLOG(2) << "Skip fake initialization for: " << parameter_name; + return false; + } + } + if (op_type == "coalesce_tensor" && parameter_name == "Output") { VLOG(2) << "Skip fake initialization for: " << parameter_name; return false; @@ -250,6 +257,12 @@ bool TensorShouldBeFakeInitialized(const OperatorBase& op, } } + if ((op_type == "flatten" || op_type == "flatten_contiguous_range") && + parameter_name == "XShape") { + VLOG(2) << "Skip fake initialization for: " << parameter_name; + return false; + } + if (op_type == "segment_pool" && parameter_name == "SummedIds") { return op.Attr("pooltype") == "MEAN" && dynamic_cast(&op) @@ -317,7 +330,7 @@ void FakeInitializeTensor(const platform::DeviceContext& dev_ctx, // set place if (tensor->initialized()) { // avoid overwriting valid data - platform::DeviceContext* dev_ctx_for_copy; + platform::DeviceContext* dev_ctx_for_copy = nullptr; if (place.GetType() != AllocationType::CPU) { dev_ctx_for_copy = platform::DeviceContextPool::Instance().Get(place); } else { @@ -856,6 +869,8 @@ void FakeInitializeOutputsForFunctionKernel( dtype = InferDTypeFromAttr(op, runtime_ctx, "dtype"); } else if (op_type == "bincount" || op_type == "reduce_sum_grad") { dtype = GetInputDType(runtime_ctx, "X"); + } else if (op_type == "dequantize_linear") { + dtype = GetInputDType(runtime_ctx, "Scale"); } else if (op_type == "lamb") { bool multi_precision = op.Attr("multi_precision"); dtype = 
GetInputDType(runtime_ctx, "Moment1"); diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index bbbaf4c0dd75f2..3f356270e05702 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -431,6 +431,7 @@ void analyse_event_info_for_two_instructions( if (has_data_dependency( instructions[cur_instr_id], instructions[next_instr_id]) || + !run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty() || instructions[next_instr_id]->OpBase()->Type() == "depend") { waiter_instr_ids->insert(next_instr_id); return; @@ -490,6 +491,7 @@ void analyse_event_info_for_two_instructions< if (has_data_dependency( instructions[cur_instr_id], instructions[next_instr_id]) || + !run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty() || instructions[next_instr_id]->Name() == "pd_op.depend") { waiter_instr_ids->insert(next_instr_id); return; diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index ac9b826e6a5002..e527b9d254b8ce 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -29,6 +29,8 @@ #include "paddle/fluid/platform/profiler/supplement_tracing.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_context.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" #ifdef PADDLE_WITH_DNNL #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -42,11 +44,14 @@ #include "paddle/fluid/framework/new_executor/instruction/cond_instruction.h" #include "paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h" #include "paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h" +#include "paddle/fluid/framework/new_executor/instruction/while_instruction.h" #include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/pir/core/builtin_attribute.h" @@ -114,7 +119,7 @@ NewIRInterpreter::NewIRInterpreter( std::stringstream ss; ss << this; - ::pir::BuildScope(*ir_block_, ss.str(), &sub_blocks_, value_exe_info_.get()); + BuildScope(*ir_block_, ss.str(), value_exe_info_.get()); } NewIRInterpreter::NewIRInterpreter( @@ -176,7 +181,7 @@ NewIRInterpreter::NewIRInterpreter( std::stringstream ss; ss << this; - ::pir::BuildScope(*ir_block_, ss.str(), &sub_blocks_, value_exe_info_.get()); + BuildScope(*ir_block_, ss.str(), value_exe_info_.get()); } NewIRInterpreter::~NewIRInterpreter() { @@ -234,7 +239,8 @@ void NewIRInterpreter::reset_scope(Scope* new_scope) { scope_ = new_scope; for (size_t i = 0; i < value_exe_info_->GetVarList().size(); i++) { const auto& var_name = value_exe_info_->GetNameById(static_cast(i)); - value_exe_info_->ResetVarList(i, new_scope->FindVar(var_name)); + value_exe_info_->ResetVarList(static_cast(i), + new_scope->FindVar(var_name)); } // The index should be assured valid, cause 
the InterpreterCore may not be // fully built, but was still cached and used. For example, see unit test @@ -379,12 +385,12 @@ std::string NewIRInterpreter::GetDepsString() const { bool NewIRInterpreter::HasLocalScope() const { return local_scope_ != nullptr; } -Scope* NewIRInterpreter::InnerScope() { +Scope* NewIRInterpreter::InnerScope() const { return local_scope_ != nullptr ? local_scope_ : scope_; } std::string NewIRInterpreter::GetNameByValue(::pir::Value value) const { - return value_exe_info_->GetValue2VarName().at(value); + return value_exe_info_->GetVarName(value); } void NewIRInterpreter::UpdateSyncOpNum() { @@ -461,7 +467,7 @@ void NewIRInterpreter::UpdateNcclOpNum() { "pd_op.global_gather_grad", "pd_op.distributed_fused_lamb_grad", "pd_op.margin_cross_entropy_grad", - "pd_op.margin_cross_entropy_grad_" + "pd_op.margin_cross_entropy_grad_", "pd_op.sync_batch_norm_grad", "pd_op.data_norm_grad", "pd_op.class_center_sample_grad", @@ -558,20 +564,23 @@ void NewIRInterpreter::BuildInstruction() { VLOG(6) << "Build Instruction for op: " << op_idx; if (op->dialect()->name() == "builtin") { if (interpreter::GetSpecialOpNames().count(op->name())) { - VLOG(6) << "skip process " << op->name(); + VLOG(6) << "skip process builtin dialect op: " << op->name(); continue; } } else if (op->dialect()->name() == "cf") { + VLOG(6) << "skip process cf dialect op: " << op->name(); continue; } else if (op->dialect()->name() == "pd_op") { - vec_instruction_base_.emplace_back( - std::make_unique(op_idx++, - place_, - op, - scope_, - local_scope_, - value_exe_info_.get(), - sub_blocks_)); + if (op->isa()) { + vec_instruction_base_.emplace_back(std::make_unique( + op_idx++, place_, op, value_exe_info_.get())); + } else if (op->isa()) { + vec_instruction_base_.emplace_back(std::make_unique( + op_idx++, place_, op, scope_, local_scope_, value_exe_info_.get())); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Now only support pd_kernel and cinn dialect.")); + } } else if (op->dialect()->name() == "pd_kernel") { auto op_name = op->attributes() .at("op_name") @@ -583,33 +592,19 @@ void NewIRInterpreter::BuildInstruction() { } VLOG(6) << "process " << op_name; - if (op->name().compare(paddle::dialect::LegacyKernelOp::name()) == 0) { + if (op->isa()) { vec_instruction_base_.emplace_back( std::make_unique( - op_idx++, - place_, - op, - scope_, - local_scope_, - value_exe_info_->GetValue2VarName(), - value_exe_info_->GetVarName2Id(), - value_exe_info_->GetVar2VarName())); + op_idx++, place_, op, *(value_exe_info_.get()))); } else { vec_instruction_base_.emplace_back( std::make_unique( - op_idx++, - place_, - op, - scope_, - local_scope_, - value_exe_info_->GetValue2VarName(), - value_exe_info_->GetVarName2Id(), - value_exe_info_->GetVar2VarName())); + op_idx++, place_, op, *(value_exe_info_.get()))); } #ifdef PADDLE_WITH_CINN } else if (op->dialect()->name() == "cinn_runtime") { - vec_instruction_base_.emplace_back( - std::make_unique(op_idx++, place_, op, scope_)); + vec_instruction_base_.emplace_back(std::make_unique( + op_idx++, place_, op, *(value_exe_info_.get()))); #endif } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -632,7 +627,7 @@ std::string NewIRInterpreter::DebugValueInfo() { PADDLE_ENFORCE((bool)kv.first, platform::errors::PreconditionNotMet( "vlaue(%s) should not be nullptr", kv.second)); - PADDLE_ENFORCE(value_exe_info_->GetVarName2Id().count(kv.second) > 0, + PADDLE_ENFORCE(value_exe_info_->HasVar(kv.second), platform::errors::PreconditionNotMet( "var(%s) should exist in 
var_name_2_id_", kv.second)); auto* var = InnerScope()->FindVar(kv.second); @@ -641,8 +636,7 @@ std::string NewIRInterpreter::DebugValueInfo() { platform::errors::PreconditionNotMet( "var(%s) should exist in scope (%p)", kv.second, InnerScope())); os << kv.first.impl() << " -> " << kv.second << " -> " - << value_exe_info_->GetVarName2Id().at(kv.second) << " -> " << var - << "\n"; + << value_exe_info_->GetVarId(kv.first) << " -> " << var << "\n"; } return os.str(); } @@ -810,6 +804,18 @@ void NewIRInterpreter::RecordStreamForGC(InstructionBase* instr) { for (auto& tensor : *tensor_arr) { TensorRecordStream(tensor); } + } else if (var->IsType()) { + TensorRecordStream( + *(var->GetMutable()->mutable_indices())); + TensorRecordStream( + *(var->GetMutable()->mutable_values())); + } else if (var->IsType()) { + TensorRecordStream( + *(var->GetMutable()->mutable_cols())); + TensorRecordStream( + *(var->GetMutable()->mutable_crows())); + TensorRecordStream( + *(var->GetMutable()->mutable_values())); } else if (var->IsType>()) { // do nothing } else { @@ -850,6 +856,7 @@ void NewIRInterpreter::CheckGC(InstructionBase* instr) { } void NewIRInterpreter::CalculateLastLiveOps() { + VLOG(4) << "NewIRInterpreter(): " << this << " start CalculateLastLiveOps"; // calculate last_live_ops_ for (size_t op_idx = 0; op_idx < vec_instruction_base_.size(); ++op_idx) { InstructionBase* instr = vec_instruction_base_[op_idx].get(); @@ -875,13 +882,20 @@ void NewIRInterpreter::CalculateLastLiveOps() { gc_check_vars.insert(var_id); } } + VLOG(4) << "get gc check vars for: " << instr->Name(); for (auto var_id : gc_check_vars) { Scope* inner_scope = InnerScope(); paddle::framework::Variable* var = inner_scope->FindVar( value_exe_info_->GetNameById(static_cast(var_id))); + PADDLE_ENFORCE_NOT_NULL( + var, + platform::errors::NotFound("Var(id=%d) should not be nullptr.", + static_cast(var_id))); if (var->IsType() || var->IsType() || - var->IsType()) { + var->IsType() || + var->IsType() || + var->IsType()) { last_live_ops_[var_id].insert(op_idx); } else { VLOG(4) << "not clear " @@ -890,6 +904,7 @@ void NewIRInterpreter::CalculateLastLiveOps() { << framework::ToTypeName(var->Type()); } } + VLOG(4) << "update last_live_ops for: " << instr->Name(); } // clear the last_live_ops list for all vars in skip_gc_vars for (const std::string& skip_gc_var : execution_config_.skip_gc_vars) { @@ -899,7 +914,7 @@ void NewIRInterpreter::CalculateLastLiveOps() { VLOG(8) << "Skip gc for var: " << skip_gc_var; } } - VLOG(4) << "calculate last_live_ops_"; + VLOG(4) << "clear the last_live_ops list for all vars in skip_gc_vars"; // shrink, find the downstream op that has no other op in the // downstream list happens before it @@ -940,6 +955,7 @@ void NewIRInterpreter::CalculateLastLiveOps() { last_live_ops_[i] = minumum_last_live_ops; var_ref_count_[i] = static_cast(last_live_ops_[i].size()); } + VLOG(4) << "shrink the last_live_ops list for all vars in skip_gc_vars"; for (auto& dep : *dependecy_count_) { deps_.emplace_back(std::make_shared(dep)); @@ -948,6 +964,7 @@ void NewIRInterpreter::CalculateLastLiveOps() { refs_.emplace_back(std::make_shared( var_ref_count_[i], value_exe_info_->GetVarList()[i])); } + VLOG(4) << "done CalculateLastLiveOps"; } void NewIRInterpreter::ConstructEventForJitInput() { @@ -1401,8 +1418,7 @@ void NewIRInterpreter::RunInstructionBase(InstructionBase* instr_node) { : "kGpuAsync")) << " runs on " << platform::GetCurrentThreadName(); VLOG(4) << place_ << " " - << instr_node->DebugStringEx(scope_, - 
value_exe_info_->GetValue2VarName()); + << instr_node->DebugStringEx(scope_, value_exe_info_.get()); if (!instr_node->IsArtificial()) { instr_node->Run(); @@ -1424,11 +1440,10 @@ void NewIRInterpreter::RunInstructionBase(InstructionBase* instr_node) { : "kGpuAsync")) << " runs on " << platform::GetCurrentThreadName(); VLOG(4) << place_ << " " - << instr_node->DebugStringEx(scope_, - value_exe_info_->GetValue2VarName()); + << instr_node->DebugStringEx(scope_, value_exe_info_.get()); CheckGC(instr_node); VLOG(4) << "done CheckGC"; - interpreter::LogDeviceMemoryStats(place_); + memory::LogDeviceMemoryStats(place_, instr_node->Name()); } VLOG(5) << "after run kernel"; instr_node->RecordEvent(place_); @@ -1500,6 +1515,9 @@ void NewIRInterpreter::SolvePersisableVarNames() { ::pir::Value value = kv.first; const std::string& var_name = kv.second; ::pir::OpResult result = value.dyn_cast<::pir::OpResult>(); + if (!result) { + continue; + } auto* defining_op = result.owner(); if (defining_op->HasAttribute(kAttrIsPersisable)) { auto is_persisables = diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h index 04a149bb6d6928..3a128791cdfce6 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -78,6 +78,8 @@ class NewIRInterpreter : public InterpreterBaseImpl { const Scope* local_scope() const override; + Scope* InnerScope() const; + const platform::Place& GetPlace() const override { return place_; } void SetOutputHooks(const std::vector& hookfuncs) override { @@ -115,8 +117,6 @@ class NewIRInterpreter : public InterpreterBaseImpl { // scope bool HasLocalScope() const; - Scope* InnerScope(); - // For log and debug std::string GetDepsString() const; @@ -216,8 +216,6 @@ class NewIRInterpreter : public InterpreterBaseImpl { // value execution info std::shared_ptr value_exe_info_; - std::map sub_blocks_; - std::vector var_ref_count_; interpreter::NewIrDependencyBuilder ir_dependency_builder_; diff --git a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc index 2789c7b62bff53..3ae75ffd870088 100644 --- a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc +++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc @@ -13,37 +13,36 @@ // limitations under the License. 
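The pir_adaptor_util hunks that follow rework ValueExecutionInfo: NewChild() now copies the parent's value/name/id maps, and lookups go through the new HasVar/HasValue/GetVarName/GetVarId helpers, which return "" and -1 as not-found sentinels. A short usage sketch of the resulting API (the function and its parameter names are illustrative, not from this PR):

// Illustrative caller; `info` and `val` are placeholder names.
void DumpMapping(const paddle::framework::ValueExecutionInfo& info,
                 ::pir::Value val) {
  if (!info.HasValue(val)) return;                // no variable bound yet
  const std::string name = info.GetVarName(val);  // "" when unmapped
  const int id = info.GetVarId(val);              // -1 when unmapped
  VLOG(6) << val.impl() << " -> " << name << " -> " << id;
}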
#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" -#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" -#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" -#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" -#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" -#include "paddle/fluid/pir/dialect/operator/utils/utils.h" -#include "paddle/phi/core/meta_tensor.h" -#include "paddle/pir/core/builtin_attribute.h" -#include "paddle/pir/core/builtin_op.h" -#include "paddle/pir/core/ir_context.h" -#include "paddle/pir/core/program.h" -#include "paddle/pir/core/utils.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_ref_array.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable_helper.h" -#include "paddle/phi/core/kernel_context.h" - -#include "paddle/fluid/framework/string_array.h" -#include "paddle/fluid/framework/tensor_ref_array.h" #include "paddle/fluid/ir_adaptor/translator/op_compat_info.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_context.h" +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/utils.h" #include "glog/logging.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" namespace paddle { namespace framework { @@ -51,6 +50,11 @@ std::shared_ptr ValueExecutionInfo::NewChild(Scope* scope) { std::shared_ptr info = std::make_shared(scope); info->parent_ = this; + info->value_2_var_name_ = this->value_2_var_name_; + info->var_2_var_name_ = this->var_2_var_name_; + info->var_name_2_id_ = this->var_name_2_id_; + info->id_2_var_name_ = this->id_2_var_name_; + info->var_list_ = this->var_list_; return info; } @@ -125,10 +129,88 @@ std::string ValueExecutionInfo::GetNameById(int id) const { } return ""; } -} // namespace framework -} // namespace paddle -namespace pir { +const std::unordered_map<::pir::Value, std::string>& +ValueExecutionInfo::GetValue2VarName() const { + return value_2_var_name_; +} + +void ValueExecutionInfo::AddValue2VarName(::pir::Value value, + const std::string& var_name) { + value_2_var_name_.emplace(value, var_name); +} + +const std::unordered_map& +ValueExecutionInfo::GetVar2VarName() const { + return var_2_var_name_; +} + +const std::map& ValueExecutionInfo::GetVarName2Id() const { + return var_name_2_id_; +} + +const std::unordered_map& ValueExecutionInfo::GetId2VarName() + const { + return 
id_2_var_name_; +} + +const std::vector& ValueExecutionInfo::GetVarList() const { + return var_list_; +} + +void ValueExecutionInfo::ResetVarList(int id, Variable* var) { + var_list_[id] = var; +} + +bool ValueExecutionInfo::HasVar(const std::string& var_name) const { + auto it = var_name_2_id_.find(var_name); + if (it != var_name_2_id_.end()) { + return true; + } + return false; +} + +bool ValueExecutionInfo::HasValue(::pir::Value value) const { + auto it = value_2_var_name_.find(value); + if (it != value_2_var_name_.end()) { + return true; + } + return false; +} + +std::string ValueExecutionInfo::GetVarName(::pir::Value value) const { + auto it = value_2_var_name_.find(value); + if (it != value_2_var_name_.end()) { + return it->second; + } + return ""; +} + +std::string ValueExecutionInfo::GetVarName(const Variable* var) const { + auto it = var_2_var_name_.find(var); + if (it != var_2_var_name_.end()) { + return it->second; + } + return ""; +} + +int ValueExecutionInfo::GetVarId(::pir::Value value) const { + auto var_name = GetVarName(value); + auto it = var_name_2_id_.find(var_name); + if (it != var_name_2_id_.end()) { + return it->second; + } + return -1; +} + +int ValueExecutionInfo::GetVarId(const Variable* var) const { + auto var_name = GetVarName(var); + auto it = var_name_2_id_.find(var_name); + if (it != var_name_2_id_.end()) { + return it->second; + } + return -1; +} const std::unordered_set SpecialOps = {"pd_op.feed", "pd_op.fetch", @@ -138,34 +220,34 @@ const std::unordered_set SpecialOps = {"pd_op.feed", "builtin.slice", "builtin.split", "pd_op.data", - "pd_op.shadow_output", - "pd_op.if"}; - -using VariableNameMap = - std::unordered_map; - -paddle::framework::Variable* CreateVar( - pir::Value value, - const std::string& var_name_prefix, - bool force_persisable, - paddle::framework::ValueExecutionInfo* value_exe_info) { - Operation* def_op = value.dyn_cast().owner(); + "builtin.shadow_output", + "pd_op.if", + "pd_op.while"}; + +Variable* CreateVar(pir::Value value, + const std::string& var_name_prefix, + bool force_persisable, + ValueExecutionInfo* value_exe_info) { + pir::Operation* def_op = value.dyn_cast().owner(); bool is_persisable = false; - if (def_op->isa<::pir::SetParameterOp>()) { + if (def_op->isa<::pir::GetParameterOp>()) { is_persisable = true; + } else if (def_op->HasAttribute(kAttrIsPersisable)) { + is_persisable = def_op->attribute(kAttrIsPersisable) + .dyn_cast() + .AsVector()[value.dyn_cast().index()] + .dyn_cast() + .data(); } - paddle::framework::Variable* var = nullptr; - + Variable* var = nullptr; std::string name = var_name_prefix + "_inner_var_" + std::to_string(value_exe_info->GetVar2VarName().size()); if (force_persisable || is_persisable) { VLOG(6) << "Create var: " << name << " in scope " << value_exe_info->GetScope()->root(); - var = const_cast( - value_exe_info->GetScope()->root()) - ->Var(name); + var = const_cast(value_exe_info->GetScope()->root())->Var(name); } else { VLOG(6) << "Create var: " << name << " in scope " << value_exe_info->GetScope(); @@ -177,20 +259,19 @@ paddle::framework::Variable* CreateVar( return var; } -void CheckInputVars( - pir::Operation* op, - const std::string& op_name, - const std::unordered_map& value_2_var_name) { +void CheckInputVars(pir::Operation* op, + const std::string& op_name, + ValueExecutionInfo* execution_info) { size_t input_num = op->num_operands(); if (input_num > 0) { for (size_t i = 0; i < input_num; ++i) { auto value = op->operand_source(i); if (IsInvalid(value)) { - PADDLE_ENFORCE_NE( - 
value_2_var_name.find(value), - value_2_var_name.end(), + PADDLE_ENFORCE_EQ( + execution_info->HasValue(value), + true, phi::errors::PreconditionNotMet( - "input should in name map, [%d] 'th input of [%s] op", + "The [%d]-th input of op [%s] should be in execution_info.", i, op_name)); } @@ -200,13 +281,13 @@ void CheckInputVars( void BuildValue(pir::Value value, const std::string& var_name_prefix, - paddle::framework::ValueExecutionInfo* value_exe_info) { + ValueExecutionInfo* value_exe_info) { if (!IsInvalid(value)) { VLOG(8) << "Value is not invalid, so skip build a variable."; return; } - paddle::framework::Variable* var = nullptr; + Variable* var = nullptr; auto& value_2_var_name = value_exe_info->GetValue2VarName(); if (value_2_var_name.find(value) != value_2_var_name.end()) { var = value_exe_info->GetScope()->FindVar(value_2_var_name.at(value)); @@ -220,7 +301,7 @@ void BuildValue(pir::Value value, } else if (value.type().isa()) { var->GetMutable(); } else if (value.type().isa()) { - auto tensor_array = var->GetMutable(); + auto tensor_array = var->GetMutable(); for (size_t i = 0; i < value.type().dyn_cast().size(); i++) { PADDLE_ENFORCE(value.type() @@ -235,16 +316,15 @@ void BuildValue(pir::Value value, tensor_array->emplace_back(var_i); } } else { - PADDLE_THROW(phi::errors::PreconditionNotMet( - "Output only support DenseTensorType or VectorType")); + PADDLE_THROW( + phi::errors::PreconditionNotMet("Output only supports DenseTensorType, " + "SelectedRowsType, or VectorType")); } } -void HandleForSpecialOp( - pir::Operation* op, - const std::string& var_name_prefix, - std::map* sub_blocks, - paddle::framework::ValueExecutionInfo* value_exe_info) { +void HandleForSpecialOp(pir::Operation* op, + const std::string& var_name_prefix, + ValueExecutionInfo* value_exe_info) { std::string op_name = op->name(); if (op->attributes().count("op_name")) { op_name = @@ -257,8 +337,7 @@ void HandleForSpecialOp( op->attributes().at("name").dyn_cast().AsString(); auto fetch_var_name = fetch_src_name + "@fetch"; - auto* var = const_cast( - value_exe_info->GetScope()->root()) + auto* var = const_cast(value_exe_info->GetScope()->root()) ->Var(fetch_var_name); var->GetMutable(); auto value = op->result(0); @@ -274,7 +353,7 @@ void HandleForSpecialOp( std::string name = op->attributes().at("name").dyn_cast().AsString(); - paddle::framework::Variable* var = value_exe_info->GetScope()->Var(name); + Variable* var = value_exe_info->GetScope()->Var(name); PADDLE_ENFORCE(var, paddle::platform::errors::InvalidArgument( "The variable %s shoud exist", name)); @@ -285,7 +364,7 @@ void HandleForSpecialOp( if (op_name == "builtin.combine") { auto out_value = op->result(0); - paddle::framework::Variable* var = nullptr; + Variable* var = nullptr; auto& value_2_var_name = value_exe_info->GetValue2VarName(); if (value_2_var_name.find(out_value) != value_2_var_name.end()) { var = value_exe_info->GetScope()->FindVar(value_2_var_name.at(out_value)); } else { var = CreateVar(out_value, var_name_prefix, false, value_exe_info); } - auto tensor_array = var->GetMutable(); + auto tensor_array = var->GetMutable(); // clear tensor array tensor_array->clear(); size_t input_num = op->num_operands(); @@ -326,7 +405,7 @@ void HandleForSpecialOp( "SetParamer param name should not equal with var name")); if (value_exe_info->GetScope()->root()->FindVar(param_name) == nullptr) { - const_cast(value_exe_info->GetScope()->root()) + const_cast(value_exe_info->GetScope()->root()) ->Rename(orig_name, 
param_name); VLOG(6) << "set_parameter rename var: " << orig_name << " -> " << param_name; @@ -334,18 +413,19 @@ void HandleForSpecialOp( value_exe_info->Rename(value, param_name, orig_name); } - - if (op_name == "pd_op.shadow_output") { - VLOG(6) << "Handle for pd_op.shadow_ouptut"; - auto var_name = - op->attributes().at("name").dyn_cast().AsString(); + if (op_name.compare(pir::ShadowOutputOp::name()) == 0) { + VLOG(6) << "Handle for builtin.shadow_output"; + auto var_name = op->attributes() + .at("output_name") + .dyn_cast() + .AsString(); auto value = op->operand_source(0); // change opreand name to param_name auto orig_name = value_exe_info->GetValue2VarName().at(value); - if (value_exe_info->GetScope()->root()->FindVar(var_name) == nullptr) { - const_cast(value_exe_info->GetScope()->root()) + if (value_exe_info->GetScope()->FindVar(var_name) == nullptr) { + const_cast(value_exe_info->GetScope()) ->Rename(orig_name, var_name); } @@ -376,7 +456,7 @@ void HandleForSpecialOp( op->attributes().at("index").dyn_cast().data(); auto in_var = value_exe_info->GetScope()->FindVar( value_exe_info->GetValue2VarName().at(in_value)); - auto variable_array = in_var->Get(); + auto variable_array = in_var->Get(); PADDLE_ENFORCE_EQ( value_exe_info->GetVar2VarName().count(variable_array[index]), @@ -400,7 +480,7 @@ void HandleForSpecialOp( auto in_var = value_exe_info->GetScope()->FindVar( value_exe_info->GetValue2VarName().at(in_value)); - auto variable_array = in_var->Get(); + auto variable_array = in_var->Get(); for (uint64_t idx = 0; idx < variable_array.size(); ++idx) { auto out_value = op->result(idx); @@ -418,29 +498,25 @@ void HandleForSpecialOp( if (op_name == "pd_op.if") { auto if_op = op->dyn_cast(); - - auto true_block = if_op.true_block(); - - auto false_block = if_op.false_block(); - - auto& true_branch_scope = value_exe_info->GetScope()->NewScope(); - sub_blocks->emplace(true_block, &true_branch_scope); - - auto& false_branch_scope = value_exe_info->GetScope()->NewScope(); - sub_blocks->emplace(false_block, &false_branch_scope); - for (size_t i = 0; i < if_op->num_results(); ++i) { - // auto true_value = true_yeid_op->operand_source(i); - auto if_op_out_value = if_op->result(i); BuildValue(if_op_out_value, var_name_prefix, value_exe_info); } } + + if (op_name == "pd_op.while") { + auto while_op = op->dyn_cast(); + + for (size_t i = 0; i < while_op->num_results(); ++i) { + auto while_op_out_value = while_op->result(i); + BuildValue(while_op_out_value, var_name_prefix, value_exe_info); + } + } } void HandleForInplaceOp(pir::Operation* op, const std::string& var_name_prefix, - paddle::framework::ValueExecutionInfo* value_exe_info) { + ValueExecutionInfo* value_exe_info) { if (op->num_results() < 1) return; pir::IrContext* ctx = pir::IrContext::Instance(); std::string op_name = op->name(); @@ -466,8 +542,7 @@ void HandleForInplaceOp(pir::Operation* op, const std::string& inplace_name = yaml_parser.InplaceName(value_name); pir::Value inplace_value = op->operand_source(yaml_parser.InputName2Id().at(inplace_name)); - std::string var_name = - value_exe_info->GetValue2VarName().at(inplace_value); + std::string var_name = value_exe_info->GetVarName(inplace_value); VLOG(4) << "inplace: " << value_name << " -> " << inplace_name << " (var: " << var_name << ")"; value_exe_info->AddValue2VarName(value, var_name); @@ -476,8 +551,7 @@ void HandleForInplaceOp(pir::Operation* op, pir::Value view_value = op->operand_source(yaml_parser.InputName2Id().at(view_name)); // const std::string& var_name = 
value_2_var_name->at(view_value); - const std::string& var_name = - value_exe_info->GetValue2VarName().at(view_value); + std::string var_name = value_exe_info->GetVarName(view_value); VLOG(4) << "view: " << value_name << " -> " << view_name << " (var: " << var_name << ")"; value_exe_info->AddValue2VarName(value, var_name); @@ -491,13 +565,11 @@ void HandleForInplaceOp(pir::Operation* op, // is created in inner_scope. void BuildScope(const pir::Block& block, const std::string& var_name_prefix, - std::map* sub_blocks, - paddle::framework::ValueExecutionInfo* value_exe_info) { + ValueExecutionInfo* value_exe_info) { VLOG(4) << "***** [before build] scope" << "(" << value_exe_info->GetScope() << ") ******\n" - << paddle::framework::GenScopeTreeDebugInfo( - const_cast( - value_exe_info->GetScope()->root())); + << GenScopeTreeDebugInfo( + const_cast(value_exe_info->GetScope()->root())); for (auto op : block) { std::string op_name = op->name(); @@ -509,11 +581,11 @@ void BuildScope(const pir::Block& block, } VLOG(4) << "build op:" << op_name; if (SpecialOps.count(op_name)) { - HandleForSpecialOp(op, var_name_prefix, sub_blocks, value_exe_info); + HandleForSpecialOp(op, var_name_prefix, value_exe_info); continue; } - CheckInputVars(op, op_name, value_exe_info->GetValue2VarName()); + CheckInputVars(op, op_name, value_exe_info); if (op->num_results() < 1) continue; if (op->attributes().count("is_inplace") != 0 && @@ -532,22 +604,16 @@ void BuildScope(const pir::Block& block, VLOG(4) << "***** [after build] scope" << "(" << value_exe_info->GetScope() << ") ******\n" - << paddle::framework::GenScopeTreeDebugInfo( - const_cast( - value_exe_info->GetScope()->root())); + << GenScopeTreeDebugInfo( + const_cast(value_exe_info->GetScope()->root())); } -void BuildRuntimeContext( - pir::Operation* op, - const std::unordered_map& name_map, - paddle::framework::Scope* scope, - paddle::framework::Scope* local_scope, - const paddle::dialect::OpYamlInfoParser& op_yaml_info, - paddle::framework::RuntimeContext* runtime_ctx) { - paddle::framework::Scope* inner_scope = - local_scope != nullptr ? 
local_scope : scope; - VLOG(6) << "BuildPhiContext in scope[" << scope << "] inner_scope[" - << inner_scope << "]"; +void BuildRuntimeContext(pir::Operation* op, + const ValueExecutionInfo& value_exec_info, + const paddle::dialect::OpYamlInfoParser& op_yaml_info, + RuntimeContext* runtime_ctx) { + const Scope* inner_scope = value_exec_info.GetScope(); + VLOG(6) << "BuildPhiContext in scope[" << inner_scope << "]"; auto& vec_kernel_fn_tensor_params = op_yaml_info.TensorParams(true); @@ -571,7 +637,7 @@ void BuildRuntimeContext( } auto legacy_attr_name = op_normalizer.GetLegacyArgName(fluid_op_name, name); - auto in_var_name = name_map.at(ptr); + auto in_var_name = value_exec_info.GetVarName(ptr); VLOG(6) << "ctx->EmplaceBackInput: " << name << "\t" << in_var_name; PADDLE_ENFORCE_NOT_NULL(inner_scope->FindVar(in_var_name), phi::errors::PreconditionNotMet( @@ -591,7 +657,7 @@ void BuildRuntimeContext( continue; } - auto in_var_name = name_map.at(ptr); + auto in_var_name = value_exec_info.GetVarName(ptr); VLOG(6) << "ctx->EmplaceBackOutput: " << name << "\t" << in_var_name; PADDLE_ENFORCE_NOT_NULL(inner_scope->FindVar(in_var_name), @@ -605,11 +671,11 @@ void BuildRuntimeContext( type.isa()) { runtime_ctx->outputs[legacy_arg_name] = {var}; } else if (type.isa()) { - auto var_ref = var->Get(); - std::vector vec_tmp; + auto var_ref = var->Get(); + std::vector vec_tmp; vec_tmp.reserve(var_ref.size()); for (size_t k = 0; k < var_ref.size(); ++k) { - vec_tmp.push_back(const_cast(var_ref[k])); + vec_tmp.push_back(const_cast(var_ref[k])); } runtime_ctx->outputs[legacy_arg_name] = vec_tmp; } else { @@ -620,13 +686,10 @@ void BuildRuntimeContext( } } -std::shared_ptr BuildOperatorBase( +std::shared_ptr BuildOperatorBase( pir::Operation* op, - const std::unordered_map& name_map, - const paddle::dialect::OpYamlInfoParser& op_yaml_info, - const std::unordered_map& - variable_2_var_name, - const paddle::framework::Scope* scope) { + const ValueExecutionInfo& value_exec_info, + const paddle::dialect::OpYamlInfoParser& op_yaml_info) { paddle::framework::VariableNameMap in_name_map; paddle::framework::VariableNameMap out_name_map; paddle::framework::AttributeMap attr_map; @@ -638,6 +701,8 @@ std::shared_ptr BuildOperatorBase( auto& op_normalizer = paddle::translator::OpNameNormalizer::instance(); + auto scope = value_exec_info.GetScope(); + // build inputs for (auto& name : vec_kernel_fn_tensor_params) { PADDLE_ENFORCE_EQ( @@ -653,8 +718,9 @@ std::shared_ptr BuildOperatorBase( << name; continue; } - VLOG(6) << "Push back inputs to VariableNameMap : " << name_map.at(ptr); - in_name_map[legacy_attr_name].push_back(name_map.at(ptr)); + VLOG(6) << "Push back inputs to VariableNameMap : " + << value_exec_info.GetVarName(ptr); + in_name_map[legacy_attr_name].push_back(value_exec_info.GetVarName(ptr)); } // build attribute @@ -745,18 +811,17 @@ std::shared_ptr BuildOperatorBase( if (ptr.type().isa() || ptr.type().isa()) { - out_name_map[legacy_arg_name].push_back(name_map.at(ptr)); - VLOG(6) << "Push back outputs to VariableNameMap : " << name_map.at(ptr); + out_name_map[legacy_arg_name].push_back(value_exec_info.GetVarName(ptr)); + VLOG(6) << "Push back outputs to VariableNameMap : " + << value_exec_info.GetVarName(ptr); } else if (ptr.type().isa()) { - auto var = scope->FindVar(name_map.at(ptr)); - auto var_ref = var->Get(); + auto var = scope->FindVar(value_exec_info.GetVarName(ptr)); + auto var_ref = var->Get(); for (size_t k = 0; k < var_ref.size(); ++k) { - PADDLE_ENFORCE(variable_2_var_name.count(var_ref[k]), - 
"Variable MUST in variable_2_var_name map"); out_name_map[legacy_arg_name].push_back( - variable_2_var_name.at(var_ref[k])); + value_exec_info.GetVarName(var_ref[k])); VLOG(6) << "Push back outputs to VariableNameMap : " - << variable_2_var_name.at(var_ref[k]); + << value_exec_info.GetVarName(var_ref[k]); } } else { PADDLE_THROW(phi::errors::Unimplemented( @@ -765,12 +830,13 @@ std::shared_ptr BuildOperatorBase( } } - auto& op_info = paddle::framework::OpInfoMap::Instance().Get(fluid_op_name); + auto& op_info = OpInfoMap::Instance().Get(fluid_op_name); auto ptr = op_info.Creator()(fluid_op_name, in_name_map, out_name_map, attr_map); - std::shared_ptr res(ptr); + std::shared_ptr res(ptr); return res; } -} // namespace pir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h index 821f21ebddae89..ce0484567b64f0 100644 --- a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h +++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h @@ -14,31 +14,29 @@ #pragma once -#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" -#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" -#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" -#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" -#include "paddle/fluid/pir/dialect/operator/utils/utils.h" -#include "paddle/phi/core/meta_tensor.h" -#include "paddle/pir/core/builtin_attribute.h" -#include "paddle/pir/core/ir_context.h" -#include "paddle/pir/core/program.h" -#include "paddle/pir/core/utils.h" - #include "paddle/fluid/framework/new_executor/interpreter/execution_config.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable_helper.h" -#include "paddle/phi/core/kernel_context.h" - -#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/core/kernel_context.h" +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/program.h" #include "paddle/pir/core/type_name.h" +#include "paddle/pir/core/utils.h" #include "glog/logging.h" @@ -46,13 +44,17 @@ namespace paddle { namespace framework { class CondInstruction; +class WhileInstruction; class ValueExecutionInfo { public: + friend class CondInstruction; + friend class WhileInstruction; + explicit ValueExecutionInfo(Scope* scope) : scope_(scope) {} const ValueExecutionInfo* Parent() const { return parent_; } - Scope* GetScope() { return scope_; } + Scope* GetScope() const { return scope_; } void Add(::pir::Value value, std::string var_name); @@ -62,33 +64,32 @@ class ValueExecutionInfo { std::string 
GetNameById(int id) const; - const std::unordered_map<::pir::Value, std::string>& GetValue2VarName() - const { - return value_2_var_name_; - } + const std::unordered_map<::pir::Value, std::string>& GetValue2VarName() const; - void AddValue2VarName(::pir::Value value, const std::string& var_name) { - value_2_var_name_.emplace(value, var_name); - } + void AddValue2VarName(::pir::Value value, const std::string& var_name); const std::unordered_map& - GetVar2VarName() const { - return var_2_var_name_; - } + GetVar2VarName() const; - const std::map& GetVarName2Id() const { - return var_name_2_id_; - } + const std::map& GetVarName2Id() const; - const std::unordered_map& GetId2VarName() const { - return id_2_var_name_; - } + const std::unordered_map& GetId2VarName() const; - const std::vector& GetVarList() const { return var_list_; } + const std::vector& GetVarList() const; - void ResetVarList(int id, Variable* var) { var_list_[id] = var; } + void ResetVarList(int id, Variable* var); - friend class CondInstruction; + bool HasVar(const std::string& var_name) const; + + bool HasValue(::pir::Value value) const; + + std::string GetVarName(::pir::Value value) const; + + std::string GetVarName(const Variable* var) const; + + int GetVarId(::pir::Value value) const; + + int GetVarId(const Variable* var) const; private: std::shared_ptr NewChild(Scope* scope); @@ -99,8 +100,7 @@ class ValueExecutionInfo { std::unordered_map<::pir::Value, std::string> value_2_var_name_; - std::unordered_map - var_2_var_name_; + std::unordered_map var_2_var_name_; std::map var_name_2_id_; @@ -109,11 +109,6 @@ class ValueExecutionInfo { std::vector var_list_; }; -} // namespace framework -} // namespace paddle - -namespace pir { - // NOTE(zhangbo): Some operators of Paddle support optional inputs or outputs, // representing whether the input or output exists. In the Pir, whether the // value itself is empty or the type it holds is empty is used to indicate @@ -125,27 +120,19 @@ inline bool IsInvalid(pir::Value value) { return true; } -void BuildScope( - const pir::Block& block, - const std::string& var_name_prefix, - std::map* sub_blocks, - paddle::framework::ValueExecutionInfo* value_exe_info = nullptr); +void BuildScope(const pir::Block& block, + const std::string& var_name_prefix, + ValueExecutionInfo* value_exe_info = nullptr); -void BuildRuntimeContext( - pir::Operation* op, - const std::unordered_map& name_map, - paddle::framework::Scope* scope, - paddle::framework::Scope* local_scope, - const paddle::dialect::OpYamlInfoParser& op_yaml_info, - paddle::framework::RuntimeContext* runtime_ctx); +void BuildRuntimeContext(pir::Operation* op, + const ValueExecutionInfo& value_exec_info, + const paddle::dialect::OpYamlInfoParser& op_yaml_info, + RuntimeContext* runtime_ctx); -std::shared_ptr BuildOperatorBase( +std::shared_ptr BuildOperatorBase( pir::Operation* op, - const std::unordered_map& name_map, - const paddle::dialect::OpYamlInfoParser& op_yaml_info, - const std::unordered_map& - variable_2_var_name, - const paddle::framework::Scope* scope); + const ValueExecutionInfo& value_exec_info, + const paddle::dialect::OpYamlInfoParser& op_yaml_info); template -void BuildPhiContext( - pir::Operation* op, - const std::unordered_map& name_map, - paddle::framework::Scope* scope, - paddle::framework::Scope* local_scope, - const paddle::dialect::OpYamlInfoParser& op_yaml_info, - Context* ctx) { - paddle::framework::Scope* inner_scope = - local_scope != nullptr ? 
local_scope : scope; - VLOG(6) << "Build " << get_type_name() << " in scope[" << scope - << "] inner_scope[" << inner_scope << "]"; +void BuildPhiContext(pir::Operation* op, + const ValueExecutionInfo& value_exec_info, + const paddle::dialect::OpYamlInfoParser& op_yaml_info, + Context* ctx) { + Scope* inner_scope = value_exec_info.GetScope(); + VLOG(6) << "Build " << pir::get_type_name() << " in inner_scope[" + << inner_scope << "]"; auto attr_map = op->attributes(); @@ -192,7 +175,7 @@ void BuildPhiContext( continue; } - auto in_var_name = name_map.at(ptr); + auto in_var_name = value_exec_info.GetVarName(ptr); VLOG(6) << "ctx->EmplaceBackInput: " << t << "\t" << in_var_name; PADDLE_ENFORCE_NOT_NULL(inner_scope->FindVar(in_var_name), @@ -202,9 +185,9 @@ void BuildPhiContext( if (var->IsType()) { const phi::TensorBase* tensor_in = &(var->Get()); ctx->EmplaceBackInput(InType(tensor_in)); - } else if (var->IsType()) { + } else if (var->IsType()) { InListType inputs; - auto& variable_array = var->Get(); + auto& variable_array = var->Get(); for (size_t i = 0; i < variable_array.size(); ++i) { if (variable_array[i]->IsType()) { inputs.emplace_back(InType(const_cast( @@ -233,7 +216,7 @@ void BuildPhiContext( // tensor attribute, get information from input pir::Value ptr = op->operand_source(name2id.at(t)); - auto in_var_name = name_map.at(ptr); + auto in_var_name = value_exec_info.GetVarName(ptr); auto& tensor_attr_type = op_yaml_info.TensorAttrTypeName(t); VLOG(6) << "ctx->EmplaceBack mutable attr: " << t << "\t" << in_var_name; @@ -243,8 +226,8 @@ void BuildPhiContext( &(inner_scope->FindVar(in_var_name)->Get())); ctx->EmplaceBackAttr(attr); } else if (ptr.type().isa()) { - auto& tensor_array = inner_scope->FindVar(in_var_name) - ->Get(); + auto& tensor_array = + inner_scope->FindVar(in_var_name)->Get(); if (tensor_array.size() == 1) { phi::Attribute attr = phi::TensorRef(&(tensor_array[0]->Get())); @@ -274,7 +257,12 @@ void BuildPhiContext( continue; } - + PADDLE_ENFORCE_NE( + attr_map.find(t), + attr_map.end(), + phi::errors::NotFound("%s not found in attr_map; it may need to be mapped " + "in OpTranslator.", + t)); auto& attr_type_name = op_yaml_info.AttrTypeName(t); if (attr_type_name == "paddle::dialect::IntArrayAttribute") { ctx->EmplaceBackAttr( @@ -288,6 +276,8 @@ void BuildPhiContext( ctx->EmplaceBackAttr(attr_map[t].dyn_cast().data()); } else if (attr_type_name == "pir::FloatAttribute") { ctx->EmplaceBackAttr(attr_map[t].dyn_cast().data()); + } else if (attr_type_name == "pir::DoubleAttribute") { + ctx->EmplaceBackAttr(attr_map[t].dyn_cast().data()); } else if (attr_type_name == "pir::BoolAttribute") { ctx->EmplaceBackAttr(attr_map[t].dyn_cast().data()); } else if (attr_type_name == "pir::StrAttribute") { @@ -407,17 +397,18 @@ void BuildPhiContext( if (out_ptr.type().isa()) { ctx->EmplaceBackOutput(OutType(const_cast( - &(inner_scope->FindVar(name_map.at(out_ptr)) + &(inner_scope->FindVar(value_exec_info.GetVarName(out_ptr)) ->Get())))); } else if (out_ptr.type() .isa()) { ctx->EmplaceBackOutput(OutType(const_cast( - &(inner_scope->FindVar(name_map.at(out_ptr)) + &(inner_scope->FindVar(value_exec_info.GetVarName(out_ptr)) ->Get())))); } else if (out_ptr.type().isa()) { OutListType outputs; - auto& variable_array = inner_scope->FindVar(name_map.at(out_ptr)) - ->Get(); + auto& variable_array = + inner_scope->FindVar(value_exec_info.GetVarName(out_ptr)) + ->Get(); for (size_t i = 0; i < variable_array.size(); ++i) { if (variable_array[i]->IsType()) { 
outputs.emplace_back(OutType(const_cast( @@ -442,4 +433,5 @@ void BuildPhiContext( VLOG(6) << "Done build phi context"; } -} // namespace pir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 2e466962c4d318..826e4e560a684e 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -25,6 +25,8 @@ #include "paddle/fluid/platform/profiler/supplement_tracing.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_context.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" #ifdef PADDLE_WITH_DNNL #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -740,7 +742,9 @@ void ProgramInterpreter::Convert( paddle::framework::Variable* var = inner_scope->FindVar( var_scope_.GetNameById(static_cast(var_id))); if (var->IsType() || var->IsType() || - var->IsType()) { + var->IsType() || + var->IsType() || + var->IsType()) { last_live_ops_[var_id].insert(op_idx); } else { VLOG(4) << "not clear " @@ -1012,13 +1016,13 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { if (!instr_node.IsArtificial()) { RunOperator(instr_node); CheckGC(instr_node); - interpreter::LogDeviceMemoryStats(place_); + memory::LogDeviceMemoryStats(place_, instr_node.OpBase()->Type()); } instr_node.RecordEvent(place_); } catch (platform::EnforceNotMet& ex) { framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex); - exception_holder_.Catch(std::make_exception_ptr(std::move(ex))); + exception_holder_.Catch(std::make_exception_ptr(ex)); } catch (platform::EOFException&) { exception_holder_.Catch(std::current_exception()); } catch (std::exception& ex) { @@ -1305,6 +1309,18 @@ void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { for (auto& tensor : *tensor_arr) { TensorRecordStream(tensor); } + } else if (var->IsType()) { + TensorRecordStream( + *(var->GetMutable()->mutable_indices())); + TensorRecordStream( + *(var->GetMutable()->mutable_values())); + } else if (var->IsType()) { + TensorRecordStream( + *(var->GetMutable()->mutable_cols())); + TensorRecordStream( + *(var->GetMutable()->mutable_crows())); + TensorRecordStream( + *(var->GetMutable()->mutable_values())); } else if (var->IsType>()) { // do nothing } else { @@ -1331,6 +1347,8 @@ void ProgramInterpreter::CheckGC(const Instruction& instr) { // ignore all persistable var while GC if (var_scope.VarDesc(static_cast(var_id)) && var_scope.VarDesc(static_cast(var_id))->Persistable()) { + VLOG(4) << "Skip persistable var: " + << var_scope_.GetNameById(static_cast(var_id)); continue; } if (is_ready) { @@ -1428,7 +1446,7 @@ bool ProgramInterpreter::HasLocalScope() const { // miss. When a model is all KQueueAsync type OPs, all OPs will be distributed // to the DeviceThread for execution, and the multithreading scheduling will not // have any benefits. Therefore, in the dynamic to static, when the number of -// KQueueAsync Ops is 0, we choose Trace mode. +// KQueueSync Ops is 0, we choose Trace mode. 
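The corrected comment above encodes a scheduling heuristic: when no instruction needs a synchronous device-thread queue, multi-thread scheduling only adds cache misses, so the interpreter traces serially. Expressed as code, the dispatch amounts to something like the following (every name except TraceInstructionList is illustrative, not taken from this diff):

// Hedged sketch of the mode choice described in the comment.
if (num_kqueue_sync_ops == 0) {
  TraceInstructionList(vec_instr);     // single-thread trace mode
} else {
  RunInstructionListAsync(vec_instr);  // distribute ops to device threads
}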
void ProgramInterpreter::TraceInstructionList( const std::vector& vec_instr) { unfinished_op_number_ = vec_instr.size(); diff --git a/paddle/fluid/framework/op_compatible_info.cc b/paddle/fluid/framework/op_compatible_info.cc index 37c8dd22c174a3..1a9beec01bb46d 100644 --- a/paddle/fluid/framework/op_compatible_info.cc +++ b/paddle/fluid/framework/op_compatible_info.cc @@ -68,7 +68,6 @@ void OpCompatibleMap::InitOpCompatibleMap() { op_compatible_map_["sequence_pad"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["sequence_unpad"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["center_loss"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["coalesce_tensor"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["crop_tensor"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; @@ -79,8 +78,6 @@ void OpCompatibleMap::InitOpCompatibleMap() { op_compatible_map_["dpsgd"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["eye"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["fill_any_like"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["filter_by_instag"] = {"1.6.0", - OpCompatibleType::DEFIN_NOT}; op_compatible_map_["hard_swish"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["gather_nd"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["instance_norm"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; @@ -91,12 +88,9 @@ void OpCompatibleMap::InitOpCompatibleMap() { op_compatible_map_["multiclass_nms2"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["one_hot_v2"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["prroi_pool"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["pull_box_sparse"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["scatter_nd_add"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["sequence_topk_avg_pooling"] = { - "1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["shard_index"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["size"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["strided_slice"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 8810d7f15ac4b0..1846b7c9f0f71b 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -1133,7 +1133,7 @@ void OpDesc::InferShape(const BlockDesc &block) { infer_shape(&ctx); } catch (platform::EnforceNotMet &exception) { framework::AppendErrorOpHint(Type(), &exception); - throw std::move(exception); + throw exception; } catch (...) { std::rethrow_exception(std::current_exception()); } diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index e1bc5be8c64f9e..632a8cbefc63c3 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" #include "paddle/utils/flat_hash_map.h" - +#include "paddle/utils/test_macros.h" namespace paddle { namespace framework { @@ -128,7 +128,7 @@ class OpInfo { } }; -class OpInfoMap { +class TEST_API OpInfoMap { public: static OpInfoMap& Instance(); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 7a3271a48debc8..17d5f6c4f356a1 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -797,7 +797,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { VLOG(3) << GetExecutionPlace(place) << " " << DebugStringEx(&scope); } catch (platform::EnforceNotMet& exception) { framework::InsertCallStackInfo(Type(), Attrs(), &exception); - throw std::move(exception); + throw exception; } catch (platform::EOFException&) { std::rethrow_exception(std::current_exception()); } catch (std::exception& ex) { @@ -1712,8 +1712,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, VLOG(6) << *kernel_signature_.get(); phi_kernel_name = kernel_signature_->name; - kernel_type_ = std::make_unique( - std::move(InnerGetExpectedKernelType(exe_ctx))); + kernel_type_ = + std::make_unique(InnerGetExpectedKernelType(exe_ctx)); dev_ctx = pool.Get(kernel_type_->place_); // NOTE(Liu-xiandong): The register kernel used KP have library_type[KP], // But the default library_type is Plain, so we need to modify the @@ -2220,8 +2220,8 @@ phi::KernelKey OperatorWithKernel::ChoosePhiKernel( } VLOG(6) << *kernel_signature_.get(); phi_kernel_name = kernel_signature_->name; - kernel_type_ = std::make_unique( - std::move(InnerGetExpectedKernelType(ctx))); + kernel_type_ = + std::make_unique(InnerGetExpectedKernelType(ctx)); auto phi_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); phi_kernel_ = @@ -3249,33 +3249,32 @@ void OperatorWithKernel::BuildPhiKernelContext( // scalar is in the attribute switch (AttrTypeID(attr_iter->second)) { case proto::AttrType::FLOAT: - phi_kernel_context->EmplaceBackAttr(std::move( - phi::Scalar(PADDLE_GET_CONST(float, attr_iter->second)))); + phi_kernel_context->EmplaceBackAttr( + phi::Scalar(PADDLE_GET_CONST(float, attr_iter->second))); break; case proto::AttrType::FLOAT64: - phi_kernel_context->EmplaceBackAttr(std::move( - phi::Scalar(PADDLE_GET_CONST(double, attr_iter->second)))); + phi_kernel_context->EmplaceBackAttr( + phi::Scalar(PADDLE_GET_CONST(double, attr_iter->second))); break; case proto::AttrType::INT: - phi_kernel_context->EmplaceBackAttr(std::move( - phi::Scalar(PADDLE_GET_CONST(int, attr_iter->second)))); + phi_kernel_context->EmplaceBackAttr( + phi::Scalar(PADDLE_GET_CONST(int, attr_iter->second))); break; case proto::AttrType::LONG: - phi_kernel_context->EmplaceBackAttr(std::move( - phi::Scalar(PADDLE_GET_CONST(int64_t, attr_iter->second)))); + phi_kernel_context->EmplaceBackAttr( + phi::Scalar(PADDLE_GET_CONST(int64_t, attr_iter->second))); break; case proto::AttrType::STRING: - phi_kernel_context->EmplaceBackAttr(std::move(phi::Scalar( - PADDLE_GET_CONST(std::string, attr_iter->second)))); + phi_kernel_context->EmplaceBackAttr(phi::Scalar( + PADDLE_GET_CONST(std::string, attr_iter->second))); break; case proto::AttrType::BOOLEAN: - phi_kernel_context->EmplaceBackAttr(std::move( - phi::Scalar(PADDLE_GET_CONST(bool, attr_iter->second)))); + phi_kernel_context->EmplaceBackAttr( + phi::Scalar(PADDLE_GET_CONST(bool, attr_iter->second))); break; case proto::AttrType::SCALAR: - 
phi_kernel_context->EmplaceBackAttr( - std::move(phi::Scalar(PADDLE_GET_CONST( - paddle::experimental::Scalar, attr_iter->second)))); + phi_kernel_context->EmplaceBackAttr(phi::Scalar(PADDLE_GET_CONST( + paddle::experimental::Scalar, attr_iter->second))); break; default: PADDLE_THROW(platform::errors::Unimplemented( @@ -3448,7 +3447,7 @@ void OperatorWithKernel::BuildPhiKernelContext( } break; case phi::AttributeType::STRING: phi_kernel_context->EmplaceBackAttr( - std::move(PADDLE_GET_CONST(std::string, attr_iter->second))); + PADDLE_GET_CONST(std::string, attr_iter->second)); break; case phi::AttributeType::INT64S: switch (AttrTypeID(attr_iter->second)) { diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index 67153a7001ece8..d1eb5558c54541 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/init_default_kernel_signature_map.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" @@ -60,8 +61,6 @@ class KernelArgsNameMaker { virtual const paddle::small_vector& GetAttrsArgsNames() = 0; }; -TEST_API void InitDefaultKernelSignatureMap(); - // TODO(Wilber): support others device context. template struct ConvertToPhiContext { diff --git a/paddle/fluid/framework/program_utils.cc b/paddle/fluid/framework/program_utils.cc index 2d8a35ca00a76f..e8e64b68d767e4 100644 --- a/paddle/fluid/framework/program_utils.cc +++ b/paddle/fluid/framework/program_utils.cc @@ -90,15 +90,15 @@ void ProgramProcessor::GetInputsOutputsInBlock( // NOTE: Here assumes that all variables are input or output of Ops, for (OpDesc *op : current_block.AllOps()) { - for (auto iname : op->InputNames()) { - for (auto in_var_name : op->Input(iname)) { + for (auto const &iname : op->InputNames()) { + for (auto const &in_var_name : op->Input(iname)) { VLOG(3) << "insert inner_inputs_name:" << in_var_name; inner_inputs->insert(in_var_name); } } - for (auto oname : op->OutputNames()) { - for (auto out_var_name : op->Output(oname)) { + for (auto const &oname : op->OutputNames()) { + for (auto const &out_var_name : op->Output(oname)) { VLOG(3) << "insert out_var_name:" << out_var_name; inner_outputs->insert(out_var_name); } @@ -150,7 +150,7 @@ void ProgramProcessor::AddDepToBlockOp(const BlockDesc &block) { VLOG(3) << "sub_outputs.size:" << sub_outputs.size(); auto *op_inputs = op->MutableInputs(); - std::vector *op_input_var_vec; + std::vector *op_input_var_vec = nullptr; VLOG(3) << "op_type:>>>>>>" << op_type; if (op_type.compare("while") == 0) { op_input_var_vec = &((*op_inputs)["kX"]); @@ -163,7 +163,7 @@ void ProgramProcessor::AddDepToBlockOp(const BlockDesc &block) { continue; } - for (auto sub_input : sub_inputs) { + for (auto const &sub_input : sub_inputs) { if (std::find(op_input_var_vec->begin(), op_input_var_vec->end(), sub_input) == op_input_var_vec->end()) @@ -175,7 +175,7 @@ void ProgramProcessor::AddDepToBlockOp(const BlockDesc &block) { auto *op_outputs = op->MutableOutputs(); auto *op_output_var_vec = &((*op_outputs)["kOutputs"]); - for (auto sub_output : sub_outputs) { + for (auto const &sub_output : sub_outputs) { if (std::find(op_output_var_vec->begin(), op_output_var_vec->end(), sub_output) == op_output_var_vec->end()) diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc index 
9702805e176c21..8da75413e9d6d9 100644 --- a/paddle/fluid/framework/prune_test.cc +++ b/paddle/fluid/framework/prune_test.cc @@ -198,80 +198,3 @@ TEST(Prune, multi_target) { f::Prune(*pdesc, feed_var_names, &pruned); EXPECT_EQ(pruned.blocks(0).ops_size(), 3); } - -TEST(Prune, recurrrent_op) { - f::ProgramDesc program; - f::BlockDesc *block = program.MutableBlock(0); - f::BlockDesc *sub_block = program.AppendBlock(*block); - AddOp("one_two", - {{"input", {"a"}}}, - {{"output", {"b", "c"}}}, - f::AttributeMap{}, - block); - - std::vector state_var_name(1, "y"); - AddOp("recurrent", - {{"input", {"b", "c"}}}, - {{"output", {"b1, c1"}}}, - {{"ex_states", state_var_name}, - {"states", state_var_name}, - {"sub_block", sub_block}}, - block); - - EXPECT_TRUE(sub_block != nullptr); - AddOp("rnn_memory_helper", - {{"input", {"x"}}}, - {{"output", {"y"}}}, - f::AttributeMap{}, - sub_block); - - f::proto::ProgramDesc *pdesc = program.Proto(); - pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true); - - f::proto::ProgramDesc pruned; - std::set feed_var_names = {"a"}; - - f::Prune(*pdesc, feed_var_names, &pruned); - EXPECT_EQ(pruned.blocks_size(), 2); - EXPECT_EQ(pruned.blocks(0).ops_size(), 2); - EXPECT_EQ(pruned.blocks(1).ops_size(), 1); -} - -// If the output of an op modifies feed vars, the op should not clip. -TEST(Prune, recurrrent_op_2) { - f::ProgramDesc program; - f::BlockDesc *block = program.MutableBlock(0); - f::BlockDesc *sub_block = program.AppendBlock(*block); - AddOp("one_two", - {{"input", {"a"}}}, - {{"output", {"b", "c"}}}, - f::AttributeMap{}, - block); - - std::vector state_var_name(1, "y"); - AddOp("recurrent", - {{"input", {"b", "c"}}}, - {{"output", {"b1, c1"}}}, - {{"ex_states", state_var_name}, - {"states", state_var_name}, - {"sub_block", sub_block}}, - block); - - EXPECT_TRUE(sub_block != nullptr); - AddOp("rnn_memory_helper", - {{"input", {"x"}}}, - {{"output", {"a"}}}, - f::AttributeMap{}, - sub_block); - - f::proto::ProgramDesc *pdesc = program.Proto(); - pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true); - - f::proto::ProgramDesc pruned; - std::set feed_var_names = {"x", "a"}; - - f::Prune(*pdesc, feed_var_names, &pruned); - EXPECT_EQ(pruned.blocks_size(), 2); - EXPECT_EQ(pruned.blocks(0).ops_size(), 2); - EXPECT_EQ(pruned.blocks(1).ops_size(), 1); -} diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index db8506e9ec5c92..f295fa7106dd43 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -45,7 +45,7 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { uint64_t tid = static_cast( dwp_param_.program_config(0).pull_dense_table_id(i)); TableParameter table; - for (auto i : param_.dense_table()) { + for (auto const& i : param_.dense_table()) { if (i.table_id() == tid) { table = i; break; diff --git a/paddle/fluid/framework/scope_pool.cc b/paddle/fluid/framework/scope_pool.cc index 1f7aba8e225bde..833848864a785e 100644 --- a/paddle/fluid/framework/scope_pool.cc +++ b/paddle/fluid/framework/scope_pool.cc @@ -29,7 +29,7 @@ void ScopePool::Insert(std::unique_ptr &&s) { } void ScopePool::Remove(Scope *s) { - size_t has_scope; + size_t has_scope = 0; { std::lock_guard guard(mtx_); has_scope = scopes_.erase(s); diff --git a/paddle/fluid/framework/selected_rows_utils.cc b/paddle/fluid/framework/selected_rows_utils.cc index d74e45449226f5..3f72ced811390c 100644 --- a/paddle/fluid/framework/selected_rows_utils.cc +++ 
b/paddle/fluid/framework/selected_rows_utils.cc @@ -45,7 +45,7 @@ void SerializeToStream(std::ostream& os, void SerializeToStream(std::ostream& os, const phi::SelectedRows& selected_rows) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - const platform::DeviceContext* dev_ctx; + const platform::DeviceContext* dev_ctx = nullptr; auto place = selected_rows.place(); dev_ctx = pool.Get(place); SerializeToStream(os, selected_rows, *dev_ctx); @@ -53,7 +53,7 @@ void SerializeToStream(std::ostream& os, void DeserializeFromStream(std::istream& is, phi::SelectedRows* selected_rows) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - const platform::DeviceContext* dev_ctx; + const platform::DeviceContext* dev_ctx = nullptr; dev_ctx = pool.Get(platform::CPUPlace()); DeserializeFromStream(is, selected_rows, *dev_ctx); } @@ -63,7 +63,7 @@ void DeserializeFromStream(std::istream& is, const platform::DeviceContext& dev_ctx) { { // the 1st field, unit32_t version for SelectedRows - uint32_t version; + uint32_t version = 0; is.read(reinterpret_cast(&version), sizeof(version)); PADDLE_ENFORCE_EQ(version, 0U, @@ -86,7 +86,7 @@ void DeserializeFromStream(std::istream& is, } { // the 3st field, the height of the SelectedRows - int64_t height; + int64_t height = 0; is.read(reinterpret_cast(&height), sizeof(int64_t)); selected_rows->set_height(height); } diff --git a/paddle/fluid/framework/string_array.cc b/paddle/fluid/framework/string_array.cc index 58c658a67c69eb..07e3f07294fae6 100644 --- a/paddle/fluid/framework/string_array.cc +++ b/paddle/fluid/framework/string_array.cc @@ -81,20 +81,20 @@ void StringMapToStream(std::ostream& os, void StringMapFromStream(std::istream& is, std::unordered_map* data) { // first read the map size - size_t map_size; + size_t map_size = 0; is.read(reinterpret_cast(&map_size), sizeof(map_size)); data->reserve(map_size); // then read the data for (size_t i = 0; i < map_size; ++i) { // read the token - size_t token_length; + size_t token_length = 0; is.read(reinterpret_cast(&token_length), sizeof(token_length)); char* tmp = new char[token_length]; is.read(tmp, token_length); // NOLINT std::string token(tmp, tmp + token_length); delete[] tmp; // read the token_id - int32_t token_id; + int32_t token_id = 0; is.read(reinterpret_cast(&token_id), sizeof(token_id)); data->emplace(token, token_id); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 16a9065c2eb875..d7cfb4738822af 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -274,7 +274,7 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, TENSOR* dst) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - const platform::DeviceContext* dev_ctx; + const platform::DeviceContext* dev_ctx = nullptr; if (platform::is_gpu_place(dst_place) || platform::is_custom_place(dst_place)) { dev_ctx = pool.Get(dst_place); @@ -585,7 +585,7 @@ void TensorFromStream(std::istream& is, const platform::DeviceContext& dev_ctx, const size_t& seek, const std::vector& shape) { - uint32_t version; + uint32_t version = 0; is.read(reinterpret_cast(&version), sizeof(version)); PADDLE_ENFORCE_EQ( @@ -598,7 +598,7 @@ void TensorFromStream(std::istream& is, proto::VarType::TensorDesc desc; { // int32_t size // proto buffer - int32_t size; + int32_t size = 0; is.read(reinterpret_cast(&size), sizeof(size)); std::unique_ptr buf(new char[size]); // NOLINT 
is.read(reinterpret_cast(buf.get()), size); @@ -612,7 +612,7 @@ void TensorFromStream(std::istream& is, size_t seekg = seek * framework::SizeOfType(desc.data_type()); is.seekg(seekg, is.cur); // NOLINT - void* buf; + void* buf = nullptr; phi::CPUContext ctx; size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || @@ -652,7 +652,7 @@ void TensorFromStream(std::istream& is, void TensorFromStream(std::istream& is, phi::DenseTensor* tensor, const platform::DeviceContext& dev_ctx) { - uint32_t version; + uint32_t version = 0; is.read(reinterpret_cast(&version), sizeof(version)); PADDLE_ENFORCE_EQ( version, @@ -685,7 +685,7 @@ void TensorFromStream(std::istream& is, dims.reserve(static_cast(desc.dims().size())); std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); tensor->Resize(phi::make_ddim(dims)); - void* buf; + void* buf = nullptr; phi::CPUContext ctx; size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc index ad21fdf45698bf..16e6109a606572 100644 --- a/paddle/fluid/framework/unused_var_check.cc +++ b/paddle/fluid/framework/unused_var_check.cc @@ -59,8 +59,6 @@ static const std::unordered_set &GetOpWithUnusedVarAllowSet() { "batch_norm_grad", // 0 "sync_batch_norm", // 0 "sync_batch_norm_grad", // 0 - "inplace_abn", // 0 - "inplace_abn_grad", // 0 "dgc_momentum", // 0 "fake_quantize_range_abs_max", // 0 "rmsprop", // 0 diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 6f2b8844f52de7..cb872462a1297e 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -605,7 +605,7 @@ void BasicEngine::Execute() { } } catch (platform::EnforceNotMet& exception) { Clear(); - throw std::move(exception); + throw exception; } catch (std::exception& ex) { Clear(); PADDLE_THROW(platform::errors::External("%s", ex.what())); @@ -620,7 +620,7 @@ void BasicEngine::Execute() { } for (auto& pair : inplace_output_grad_var_list_) { - *pair.first = std::move(*pair.second); + *pair.first = *pair.second; } // Step 2: Sum Gradient of This graph diff --git a/paddle/fluid/imperative/data_loader.cc b/paddle/fluid/imperative/data_loader.cc index 3e2e96f1432773..bf09ac38d6d113 100644 --- a/paddle/fluid/imperative/data_loader.cc +++ b/paddle/fluid/imperative/data_loader.cc @@ -128,9 +128,9 @@ void SetLoadProcessSignalHandler() { } void ThrowErrorIfLoadProcessFailed() { - int error; - std::set *pids_set; - pid_t process_pid; + int error = 0; + std::set *pids_set = nullptr; + pid_t process_pid = 0; siginfo_t infop; for (auto &p : load_process_pids) { diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.cc b/paddle/fluid/imperative/jit/program_desc_tracer.cc index 757668f12ddc70..deda1ff572a704 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.cc +++ b/paddle/fluid/imperative/jit/program_desc_tracer.cc @@ -200,7 +200,7 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc( } } - op_desc->SetInput(pair.first, std::move(names)); + op_desc->SetInput(pair.first, names); } for (auto &pair : op->Outputs()) { @@ -212,7 +212,7 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc( } } - op_desc->SetOutput(pair.first, std::move(names)); + op_desc->SetOutput(pair.first, names); } } diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc 
index 3528fc87b6ab1f..3f8c35b6f5e556 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -77,7 +77,7 @@ static framework::RuntimeContext PrepareRuntimeContext( out_ctx.emplace_back(out_var->MutableVar()); } } - return framework::RuntimeContext(std::move(inputs), std::move(outputs)); + return framework::RuntimeContext(inputs, outputs); } template diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 833a13546ccd77..22651eaa1d9e0d 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -907,7 +907,7 @@ void PartialGradTask::RunEachOp(OpBase *op) { } else { for (auto &grad_var : input_pair.second) { if (grad_var) { - bool is_last; + bool is_last = false; new_inputs.emplace_back( ready_grad_vars_.Get(grad_var.get(), op->place(), &is_last)); VLOG(10) << "Got ready grad var " << grad_var->Name() << " " @@ -1031,7 +1031,7 @@ void PartialGradTask::RunEachOp(OpBase *op) { assign_op->SetPlace(op->place()); if (auto grad_pending_node = grad_grad->GetGradNode()) { - assign_node->InsertGradPendingNode(std::move(grad_pending_node)); + assign_node->InsertGradPendingNode(grad_pending_node); } } VLOG(10) << "Pending ops of assign is " diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 502eeb59114d0e..b03aadd4dc6aa2 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -451,7 +451,7 @@ void Reducer::InitializeGroups( .inside_group_index = inside_group_index++, }; } - group.variable_indices_ = std::move(variable_indices_); + group.variable_indices_ = variable_indices_; groups_.emplace_back(std::move(group)); // Debug Message For Reducer VLOG(3) << "The Group[" << group_index << "]:" << groups_.back(); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 6e188b3d21c642..0f992c9b8be309 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -348,7 +348,7 @@ void Tracer::TraceOpImpl(const std::string& type, } } catch (platform::EnforceNotMet& exception) { framework::AppendErrorOpHint(type, &exception); - throw std::move(exception); + throw exception; } catch (std::exception& ex) { PADDLE_THROW( platform::errors::Fatal("Operator %s raises an %s exception.\n" diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 008be29fe94fb1..08bd2749ad3993 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -280,6 +280,9 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_optimization_level, TensorRtOptimizationLevel, int); + DECL_ARGUMENT_FIELD(tensorrt_ops_run_float, + TensorRtOpsRunFloat, + std::unordered_set); DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool); DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 9bc016fc62faf7..d3e4ce93ca01e5 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -226,6 +226,9 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("use_inspector", new bool(argument->tensorrt_use_inspector())); pass->Set("inspector_serialize", new bool(argument->tensorrt_inspector_serialize())); + pass->Set("trt_ops_run_float", + new std::unordered_set( + 
argument->tensorrt_ops_run_float())); pass->Set("use_explicit_quantization", new bool(argument->tensorrt_use_explicit_quantization())); diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc index 1f840999c07ef8..70a5b7b6b7d484 100644 --- a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc @@ -39,7 +39,6 @@ void analysis::DlnneSubgraphPass::InferShapeForDlnneMainGraph() const { "fetch", "recurrent", "go", - "rnn_memory_helper_grad", "conditional_block", "while", "send", diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 7d0d43b8c8d23e..cc64f249179dde 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -379,6 +379,23 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( std::vector origin_outputs_dtype; std::map map_origin_outputs_dtype; + // rename output names in trt_ops_run_float + auto trt_ops_run_float = + Get>("trt_ops_run_float"); + for (auto node : subgraph) { + if (node->NodeType() == Node::Type::kOperation) { + for (auto *x : node->outputs) { + if (std::count(parameters.begin(), parameters.end(), x->Name()) > 0) + continue; + if (trt_ops_run_float.count(x->Name()) > 0) { + trt_ops_run_float.erase(x->Name()); + trt_ops_run_float.insert( + RenameVarBeUnique(x->Name(), std::to_string(x->id()))); + } + } + } + } + // Mark TensorRT output nodes as trt outputs auto mark_output = Get("mark_output"); auto output_tensor_name = @@ -393,7 +410,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( continue; if ((std::count(output_tensor_name.begin(), output_tensor_name.end(), - x->Name()) > 0) || + x->Name()) > 0) && !x->outputs.empty()) { VLOG(3) << "output " << x->Name() << " has been marked"; output_names.insert(x->Name()); @@ -569,6 +586,13 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( auto inspector_serialize = Get("inspector_serialize"); auto disable_trt_plugin_fp16 = Get("disable_trt_plugin_fp16"); auto context_memory_sharing = Get("context_memory_sharing"); + if (context_memory_sharing && TRT_VERSION < 7200) { + // https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2 + // When the TRT version is less than 7.2, + // createExecutionContextWithoutDeviceMemory() has a bug, + // so we cannot enable engine context memory sharing.
+ context_memory_sharing = false; + } auto enable_low_precision_io = Get("enable_low_precision_io"); auto workspace_size = Get("workspace_size"); auto gpu_device_id = Get("gpu_device_id"); @@ -783,6 +807,9 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( inference::Singleton::Global() .Create(engine_key + std::to_string(predictor_id), params); + // support force ops to run in FP32 precision + trt_engine->SetRunFloat(trt_ops_run_float); + if (use_static_engine) { trt_engine_serialized_data = GetTrtEngineSerializedData( Get("model_opt_cache_dir"), engine_key); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index a0d66dc5092981..c3d4c3329016ad 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -482,6 +482,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(trt_engine_memory_sharing_); CP_MEMBER(trt_engine_memory_sharing_identifier_); CP_MEMBER(trt_optimization_level_); + CP_MEMBER(trt_ops_run_float_); // Dlnne related CP_MEMBER(use_dlnne_); CP_MEMBER(dlnne_min_subgraph_size_); @@ -606,7 +607,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // deleted_pass. pass_builder_->ClearPasses(); auto other_passes = other.pass_builder()->AllPasses(); - for (auto pass : other_passes) { + for (auto const &pass : other_passes) { pass_builder_->AppendPass(pass); } } @@ -623,7 +624,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { other_passes.begin(), other_passes.end(), std::inserter(deleted_passes, deleted_passes.begin())); - for (auto ps : deleted_passes) { + for (auto const &ps : deleted_passes) { pass_builder_->DeletePass(ps); } } @@ -1148,7 +1149,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << xpu_config_.quant_post_static_gelu_out_threshold; ss << xpu_config_.quant_post_dynamic_activation_method; ss << xpu_config_.quant_post_dynamic_weight_precision; - for (auto type : xpu_config_.quant_post_dynamic_op_types) ss << type; + for (auto const &type : xpu_config_.quant_post_dynamic_op_types) ss << type; ss << xpu_lite_l3_locked_; ss << xpu_lite_enable_multi_stream_; @@ -1164,11 +1165,11 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << ipu_available_memory_proportion_; ss << ipu_enable_half_partial_; ss << ipu_enable_model_runtime_executor_; - for (auto custom_op : ipu_custom_ops_info_) - for (auto attr : custom_op) ss << attr; + for (auto const &custom_op : ipu_custom_ops_info_) + for (auto const &attr : custom_op) ss << attr; ss << ";"; - for (auto pattern : ipu_custom_patterns_) - for (auto attr : pattern) ss << attr; + for (auto const &pattern : ipu_custom_patterns_) + for (auto const &attr : pattern) ss << attr; ss << ";"; for (auto &op : mixed_black_list_) ss << op.c_str(); for (auto &op : mixed_white_list_) ss << op.c_str(); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index f30e2c560b57ff..a098bc524f2555 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -217,7 +217,7 @@ bool PaddleTensorToDenseTensor(const PaddleTensor &pt, phi::DenseTensor *t, const platform::Place &place) { framework::DDim ddim = phi::make_ddim(pt.shape); - void *input_ptr; + void *input_ptr = nullptr; if (pt.dtype == PaddleDType::INT64) { input_ptr = t->mutable_data(ddim, place); } else if (pt.dtype == PaddleDType::FLOAT32) { @@ -1147,6 +1147,9 @@ bool AnalysisPredictor::Run(const 
std::vector &inputs, bool AnalysisPredictor::Run(const std::vector &inputs, std::vector *outputs) { inference::DisplayMemoryInfo(place_, "before run"); + if (private_context_) { + paddle::platform::DeviceContextPool::SetDeviceContexts(&device_contexts_); + } paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); #ifdef PADDLE_WITH_DNNL if (config_.use_mkldnn_) MkldnnPreSet(inputs); @@ -1187,19 +1190,16 @@ bool AnalysisPredictor::Run(const std::vector &inputs, return false; } - // All the containers in the scope will be hold in inference, but the - // operators assume that the container will be reset after each batch. - // Here is a bugfix, collect all the container variables, and reset then to a - // bool; the next time, the operator will call MutableData and construct a new - // container again, so that the container will be empty for each batch. - if (sub_scope_) { - tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_); - } - tensor_array_batch_cleaner_.ResetNoTensorVars(); + // Fix TensorArray reuse not cleaned bug. + tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_); + tensor_array_batch_cleaner_.ResetTensorArray(); // recover the cpu_math_library_num_threads to 1, in order to avoid thread // conflict when integrating it into deployment service. paddle::platform::SetNumThreads(1); + if (private_context_) { + paddle::platform::DeviceContextPool::SetDeviceContexts(nullptr); + } #ifdef PADDLE_WITH_DNNL if (config_.use_mkldnn_) MkldnnPostReset(); #endif @@ -1425,6 +1425,7 @@ void AnalysisPredictor::PrepareArgument() { config_.trt_use_explicit_quantization_); argument_->SetTrtEngineMemorySharing(config_.trt_engine_memory_sharing()); argument_->SetTensorRtOptimizationLevel(config_.trt_optimization_level_); + argument_->SetTensorRtOpsRunFloat(config_.trt_ops_run_float_); } if (config_.dlnne_enabled()) { @@ -1468,7 +1469,7 @@ void AnalysisPredictor::PrepareArgument() { config_.NNAdapter().nnadapter_subgraph_partition_config_path); std::vector buffer_keys; std::vector> buffer_vals; - for (auto it : config_.NNAdapter().nnadapter_model_cache_buffers) { + for (auto const &it : config_.NNAdapter().nnadapter_model_cache_buffers) { buffer_keys.emplace_back(it.first); buffer_vals.emplace_back(it.second); } @@ -1884,7 +1885,7 @@ std::map> AnalysisPredictor::GetInputTensorShape() { std::map> input_shapes; std::vector names = GetInputNames(); - for (std::string name : names) { + for (std::string const &name : names) { auto *var = inference_program_->Block(0).FindVar(name); PADDLE_ENFORCE_NOT_NULL( var, @@ -1943,7 +1944,7 @@ std::map> AnalysisPredictor::GetOutputTensorShape() { std::map> output_shapes; std::vector names = GetOutputNames(); - for (std::string name : names) { + for (std::string const &name : names) { auto *var = inference_program_->Block(0).FindVar(name); PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet( @@ -1988,7 +1989,7 @@ AnalysisPredictor::GetOutputTypes() { std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { - framework::Scope *scope; + framework::Scope *scope = nullptr; #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { scope = scope_.get(); @@ -2039,7 +2040,7 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { - framework::Scope *scope; + framework::Scope *scope; // NOLINT #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if 
(config_.dist_config().use_dist_model()) { scope = scope_.get(); @@ -2363,7 +2364,7 @@ void AnalysisPredictor::StatisticShapeRangeInfo() { decltype(min_data) max_data, decltype(min_data) opt_data, decltype(shape_info_) shape_data) { - for (auto it : shape_data) { + for (auto const &it : shape_data) { auto name = it.first; auto shapes = it.second; @@ -2954,6 +2955,7 @@ USE_TRT_CONVERTER(cumsum) USE_TRT_CONVERTER(assign) USE_TRT_CONVERTER(unbind) USE_TRT_CONVERTER(flip) +USE_TRT_CONVERTER(share_data) #if IS_TRT_VERSION_GE(8522) USE_TRT_CONVERTER(flash_multihead_matmul) USE_TRT_CONVERTER(cross_multihead_matmul) @@ -3221,6 +3223,13 @@ void InternalUtils::SetTransformerMaskid( #endif } +void InternalUtils::DisableTensorRtHalfOps( + paddle_infer::Config *c, const std::unordered_set &ops) { +#ifdef PADDLE_WITH_CUDA + c->trt_ops_run_float_ = ops; +#endif +} + void InternalUtils::SyncStream(paddle_infer::Predictor *p) { #ifdef PADDLE_WITH_CUDA auto *pred = dynamic_cast(p->predictor_.get()); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 76b0410cc8e8f4..c3f50fd6f6bb39 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -214,7 +214,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, for (size_t i = 0; i < inputs.size(); ++i) { auto &input = feed_tensors_[i]; framework::DDim ddim = phi::make_ddim(inputs[i].shape); - void *input_ptr; + void *input_ptr = nullptr; if (inputs[i].dtype == PaddleDType::INT64) { input_ptr = input.mutable_data(ddim, place_); } else if (inputs[i].dtype == PaddleDType::FLOAT32) { diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 0a0f786d9a04e5..1b604b544b9475 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -157,7 +157,7 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForOpOutputs( // output of ops with unsigned input must be unsigned is_unsigned = true; double min_scale = std::numeric_limits::max(); - for (auto input_var_name : op->Input("X")) { + for (auto const& input_var_name : op->Input("X")) { PADDLE_ENFORCE_NE( scales_.find(input_var_name), scales_.end(), @@ -577,7 +577,7 @@ AnalysisPredictor::MkldnnQuantizer::Histogram( ++hist[bin]; } - return std::make_pair(std::move(hist), std::move(bin_width)); + return std::make_pair(std::move(hist), bin_width); } void AnalysisPredictor::MkldnnQuantizer::ClearDeviceContext() const { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 5299fa4334ae83..ccefb05896d3f3 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -1291,9 +1291,11 @@ struct PD_INFER_DECL AnalysisConfig { // memory reuse related. 
bool enable_memory_optim_{false}; - bool trt_engine_memory_sharing_{false}; + bool trt_engine_memory_sharing_{true}; int trt_engine_memory_sharing_identifier_{0}; + std::unordered_set trt_ops_run_float_; + bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index af16aead74129e..3fefba9ef22be8 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "crypto/cipher.h" @@ -517,6 +518,9 @@ class PD_INFER_DECL InternalUtils { static void SetTransformerMaskid( paddle_infer::Config* c, const std::string& tensorrt_transformer_maskid); + static void DisableTensorRtHalfOps( + paddle_infer::Config* c, const std::unordered_set& ops); + static void SyncStream(paddle_infer::Predictor* pred); static void SyncStream(cudaStream_t stream); template diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 2471c365e29ed9..206b2f5a6a2fdb 100755 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -111,7 +111,8 @@ list( assign_op.cc flip_op.cc quantize_linear_op.cc - dequantize_linear_op.cc) + dequantize_linear_op.cc + share_data_op.cc) if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7) list(APPEND CONVERT_FILES emb_eltwise_layernorm.cc diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index b68a703c7edf98..70893a97815943 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -43,45 +43,16 @@ void ConvertConv2d(TensorRTEngine* engine, framework::OpDesc op_desc(op, nullptr); auto* X = engine->GetITensor(op_desc.Input("Input").front()); - bool enable_int8 = op_desc.HasAttr("enable_int8"); - - if (enable_int8) { -#if IS_TRT_VERSION_GE(5000) - float in_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Input_scale")); - engine->SetTensorDynamicRange(X, in_scale); -#endif - } - + std::string filter_var_name = op_desc.Input("Filter").front(); + auto* Y_v = scope.FindVar(filter_var_name); + phi::DenseTensor* Y_t = nullptr; + nvinfer1::ITensor* filter = nullptr; int n_output; int n_input; int filter_h; int filter_w; - std::string filter_var_name = op_desc.Input("Filter").front(); - TensorRTEngine::Weight weight; - if (engine->use_explicit_quantization()) { - auto* filter = engine->GetITensor(filter_var_name); - PADDLE_ENFORCE_NOT_NULL( - filter, - platform::errors::NotFound("Can not find %s ITensor in engine", - filter_var_name)); - auto filter_dims = filter->getDimensions(); - PADDLE_ENFORCE_EQ( - filter_dims.nbDims, - 4UL, - platform::errors::InvalidArgument( - "The conv2d filter's dims size should be 4, but got %d", - filter_dims.nbDims)); - n_output = filter_dims.d[0]; - n_input = filter_dims.d[1]; - filter_h = filter_dims.d[2]; - filter_w = filter_dims.d[3]; - } else { - auto* Y_v = scope.FindVar(filter_var_name); - PADDLE_ENFORCE_NOT_NULL( - Y_v, - platform::errors::NotFound("Can not find %s presistale var in scope.", - filter_var_name)); - auto* Y_t = Y_v->GetMutable(); + if (Y_v) { + Y_t = Y_v->GetMutable(); PADDLE_ENFORCE_EQ( Y_t->dims().size(), 4UL, @@ -92,7 +63,27 @@ void ConvertConv2d(TensorRTEngine* engine, n_input = Y_t->dims()[1]; filter_h = Y_t->dims()[2]; filter_w = Y_t->dims()[3]; - weight = 
engine->GetTrtWeight(op_desc.Input("Filter").front(), *Y_t); + } else { + filter = engine->GetITensor(op_desc.Input("Filter").front()); + PADDLE_ENFORCE_EQ( + filter->getDimensions().nbDims, + 4UL, + platform::errors::InvalidArgument( + "The conv2d filter's dims size should be 4, but got %d", + filter->getDimensions().nbDims)); + n_output = filter->getDimensions().d[0]; + n_input = filter->getDimensions().d[1]; + filter_h = filter->getDimensions().d[2]; + filter_w = filter->getDimensions().d[3]; + } + + bool enable_int8 = op_desc.HasAttr("enable_int8"); + + if (enable_int8) { +#if IS_TRT_VERSION_GE(5000) + float in_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Input_scale")); + engine->SetTensorDynamicRange(X, in_scale); +#endif } const int groups = PADDLE_GET_CONST(int, op_desc.GetAttr("groups")); const std::vector dilations = @@ -133,7 +124,10 @@ void ConvertConv2d(TensorRTEngine* engine, nv_post_paddings.d[0] = paddings[1]; nv_post_paddings.d[1] = paddings[3]; } - + TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, nullptr, 0); + if (Y_v) { + weight = engine->GetTrtWeight(op_desc.Input("Filter").front(), *Y_t); + } TensorRTEngine::Weight bias; bias.SetDataType(weight.get().type); bias.SetCount(0); @@ -167,7 +161,10 @@ void ConvertConv2d(TensorRTEngine* engine, layer->setStrideNd(nv_strides); layer->setPrePadding(nv_pre_paddings); - if (output_padding.size() > 0) { + + if (!Y_v) layer->setInput(1, *filter); + + if (!output_padding.empty()) { nv_post_paddings.d[0] -= output_padding[0]; nv_post_paddings.d[1] -= output_padding[1]; } @@ -186,11 +183,6 @@ void ConvertConv2d(TensorRTEngine* engine, // set dilations fset_dilation(layer, nv_dilations); - if (engine->use_explicit_quantization()) { - auto* filter_tensor = engine->GetITensor(op_desc.Input("Filter").front()); - layer->setInput(1, *filter_tensor); - } - auto output_name = op_desc.Output("Output").front(); layer->setName((name + " (Output: " + output_name + ")").c_str()); layer->getOutput(0)->setName(output_name.c_str()); @@ -206,6 +198,8 @@ class Conv2dOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + auto output_name = op_desc.Output("Output").front(); ConvertConv2d( engine_, op, @@ -223,6 +217,7 @@ class Conv2dOpConverter : public OpConverter { ksize, weight.get(), bias.get()); + SupportFP32MixPrecision(output_name, op_desc.Type(), layer); return layer; }, [](nvinfer1::IConvolutionLayer* layer, nvinfer1::DimsHW& dilations) { @@ -237,6 +232,8 @@ class Deconv2dOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + auto output_name = op_desc.Output("Output").front(); ConvertConv2d( engine_, op, @@ -254,6 +251,7 @@ class Deconv2dOpConverter : public OpConverter { ksize, weight.get(), bias.get()); + SupportFP32MixPrecision(output_name, op_desc.Type(), layer); return layer; }, [](nvinfer1::IDeconvolutionLayer* layer, nvinfer1::DimsHW& dilations) { diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 419383ff0a3342..198a164894c0b1 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -162,7 +162,6 @@ class ElementwiseTensorOpConverter : public OpConverter { *(less_layer->getOutput(0)), 
*(equal_layer->getOutput(0)), nvinfer1::ElementWiseOperation::kOR); - RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode); } else if (op_type_ == "greater_equal") { auto* greater_layer = @@ -182,7 +181,6 @@ class ElementwiseTensorOpConverter : public OpConverter { *(greater_layer->getOutput(0)), *(equal_layer->getOutput(0)), nvinfer1::ElementWiseOperation::kOR); - RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode); } else if (op_type_ == "mod") { auto* div_layer = @@ -191,17 +189,20 @@ class ElementwiseTensorOpConverter : public OpConverter { *X, *reshape_y_tensor, nvinfer1::ElementWiseOperation::kFLOOR_DIV); + SupportFP32MixPrecision(output_name, op_desc.Type(), div_layer); auto* mul_layer = TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *(div_layer->getOutput(0)), *reshape_y_tensor, nvinfer1::ElementWiseOperation::kPROD); + SupportFP32MixPrecision(output_name, op_desc.Type(), mul_layer); auto* layer = TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *X, *(mul_layer->getOutput(0)), nvinfer1::ElementWiseOperation::kSUB); + SupportFP32MixPrecision(output_name, op_desc.Type(), layer); RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode); } else { auto op_pair = ops.find(op_type_); @@ -215,6 +216,7 @@ class ElementwiseTensorOpConverter : public OpConverter { auto* layer = TRT_ENGINE_ADD_LAYER( engine_, ElementWise, *X, *reshape_y_tensor, op_pair->second); + SupportFP32MixPrecision(output_name, op_desc.Type(), layer); RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode); } } @@ -347,6 +349,7 @@ class PowOpConverter : public OpConverter { auto* layer = TRT_ENGINE_ADD_LAYER( engine_, ElementWise, *X, *Y, nvinfer1::ElementWiseOperation::kPOW); + SupportFP32MixPrecision(output_name, op_desc.Type(), layer); RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc b/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc index d985c6232c093e..ebe4c724180d13 100644 --- a/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc @@ -237,7 +237,7 @@ class MatrixMultiplyOpConverter : public OpConverter { matrix_operation_x, *input2, matrix_operation_y); - + SupportFP32MixPrecision(output_name, op_desc.Type(), layer); if (enable_int8) { if (op_desc.HasAttr("out_threshold") || op_desc.HasAttr("Out")) { engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); @@ -259,6 +259,7 @@ class MatrixMultiplyOpConverter : public OpConverter { *layer->getOutput(0), *reshape_alpha->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); + SupportFP32MixPrecision(output_name, op_desc.Type(), layer); } RreplenishLayerAndOutput( layer, "matrix_multiply_op", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 429bc89f0d90ea..3eb01c0951e275 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -371,6 +371,23 @@ class OpConverter { engine->ClearWeights(); } + void SupportFP32MixPrecision(const std::string& output_name, + const std::string& op_type, + nvinfer1::ILayer* layer) { + if (engine_->OpIsRunFloat(output_name) || engine_->OpIsRunFloat(op_type)) { +#if IS_TRT_VERSION_GE(8210) + VLOG(3) << op_type << "(output: " << output_name << ")" + << " is forced to run in FP32 precision."; + 
layer->resetPrecision(); + layer->setPrecision(nvinfer1::DataType::kFLOAT); +#else + VLOG(3) + << op_type << "(output: " << output_name << ")" + << ": Set layer precision needs TensorRT version 8.2.1 and after."; +#endif + } + } + nvinfer1::ITensor* Cast(nvinfer1::ITensor* input, nvinfer1::DataType dtype) { auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Identity, *input); layer->setOutputType(0, dtype); diff --git a/paddle/fluid/inference/tensorrt/convert/share_data_op.cc b/paddle/fluid/inference/tensorrt/convert/share_data_op.cc new file mode 100644 index 00000000000000..644eeda8d102f1 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/share_data_op.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class ShareDataOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + VLOG(3) << "convert a share_data op to tensorrt"; + framework::OpDesc op_desc(op, nullptr); + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Identity, *input); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "share_data", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(share_data, ShareDataOpConverter); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index ef9989c9fc9ba0..9fe7b51391153c 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -370,6 +370,13 @@ void TensorRTEngine::FreezeNetwork() { params_.optimization_level); #endif +#if IS_TRT_VERSION_GE(8210) + if (!trt_ops_run_float_.empty()) { + infer_builder_config_->setFlag( + nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + } +#endif + #if IS_TRT_VERSION_LT(8000) infer_engine_.reset(infer_builder_->buildEngineWithConfig( *network(), *infer_builder_config_)); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index d32666e8ccb5c5..ff35be1c607c7f 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -319,6 +319,14 @@ class TensorRTEngine { return quant_dynamic_range_.count(tensor); } + void SetRunFloat(const std::unordered_set& ops) { + trt_ops_run_float_ = ops; + } + + bool OpIsRunFloat(const std::string& op) const { + return trt_ops_run_float_.count(op) > 0; + } + // A pointer to CPU memory is needed of the TRT weight. // Before TRT runs, fluid loads weight into GPU storage. // so we need to copy the weights from GPU to CPU in our op converter. 
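These engine hooks are the consuming end of the force-FP32 feature added in this patch: DisableTensorRtHalfOps stores the set on the config, TensorRtSubgraphPass hands it to SetRunFloat, each converter consults OpIsRunFloat through SupportFP32MixPrecision, and FreezeNetwork raises kPREFER_PRECISION_CONSTRAINTS when the set is non-empty. A minimal usage sketch follows (the paddle::InternalUtils qualification, model paths, and op names are assumptions for illustration, not taken from this patch):

#include <memory>
#include <string>
#include <unordered_set>
#include "paddle_inference_api.h"

std::shared_ptr<paddle_infer::Predictor> BuildFp16PredictorWithFp32Ops() {
  paddle_infer::Config config;
  config.SetModel("model.pdmodel", "model.pdiparams");  // illustrative paths
  config.EnableUseGpu(256 /*MB*/, 0);
  config.EnableTensorRtEngine(1 << 30, 1, 3,
                              paddle_infer::PrecisionType::kHalf,
                              /*use_static=*/false, /*use_calib_mode=*/false);
  // Keys are matched by TensorRTEngine::OpIsRunFloat against both layer op
  // types and output tensor names, so either kind may be listed here.
  paddle::InternalUtils::DisableTensorRtHalfOps(
      &config, std::unordered_set<std::string>{"elementwise_add", "pow"});
  return paddle_infer::CreatePredictor(config);
}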
@@ -593,6 +601,9 @@ class TensorRTEngine { // Used for convert weight into Itensor const framework::Scope* scope_{nullptr}; + // ops in this set are forced to run in float to avoid overflow + std::unordered_set trt_ops_run_float_; + #if IS_TRT_VERSION_GE(6000) int binding_num_; infer_ptr infer_builder_config_; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index b44c58379ca732..b9c1ee5bdd8a69 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -324,12 +324,12 @@ struct SimpleOpTypeSetTeller : public Teller { auto* block = desc.Block(); if (block) { auto* filter_var_desc = block->FindVar(desc.Input("Filter")[0]); - if (!filter_var_desc->Persistable() && !use_explicit_quantization) { + if (!filter_var_desc->Persistable()) { #if IS_TRT_VERSION_GE(8600) #else LOG(INFO) << "Trt below 8.6 not support conv2d's filter is a intermedoate " - "tensor in conv2d op, please upgarde your TenroRT."; + "tensor in conv2d op, please upgrade your TensorRT."; return false; #endif } @@ -2918,7 +2918,8 @@ struct SimpleOpTypeSetTeller : public Teller { "assign", "flip", "quantize_linear", - "dequantize_linear"}; + "dequantize_linear", + "share_data"}; std::unordered_set teller_set{ "matrix_multiply", @@ -3086,7 +3087,8 @@ struct SimpleOpTypeSetTeller : public Teller { "assign", "flip", "quantize_linear", - "dequantize_linear"}; + "dequantize_linear", + "share_data"}; }; struct GenericPluginTeller : public Teller { diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index 41a6c8ac3379cf..5804a637574f11 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -2,7 +2,7 @@ cc_library( benchmark SRCS benchmark.cc DEPS enforce) -cc_test_old(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) +paddle_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) cc_library( infer_io_utils SRCS io_utils.cc @@ -29,6 +29,6 @@ if(WITH_ONNXRUNTIME AND WIN32) endif() cc_library(table_printer SRCS table_printer.cc) -cc_test_old(test_table_printer SRCS table_printer_tester.cc DEPS table_printer) +paddle_test(test_table_printer SRCS table_printer_tester.cc) proto_library(shape_range_info_proto SRCS shape_range_info.proto) diff --git a/paddle/fluid/inference/utils/io_utils.cc b/paddle/fluid/inference/utils/io_utils.cc index 0ee80e3700b5c9..27de396f597856 100644 --- a/paddle/fluid/inference/utils/io_utils.cc +++ b/paddle/fluid/inference/utils/io_utils.cc @@ -80,21 +80,21 @@ void SerializePDTensorToStream(std::ostream *os, const PaddleTensor &tensor) { void DeserializePDTensorToStream(std::istream &is, PaddleTensor *tensor) { // 1. Version - uint32_t version; + uint32_t version = 0; is.read(reinterpret_cast(&version), sizeof(version)); // 2. Name - uint64_t name_bytes; + uint64_t name_bytes = 0; is.read(reinterpret_cast(&name_bytes), sizeof(name_bytes)); std::vector bytes(name_bytes); is.read(bytes.data(), name_bytes); // NOLINT tensor->name = std::string(bytes.data(), name_bytes); // 3.
LoD - uint64_t lod_level; + uint64_t lod_level = 0; is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); auto *lod = &(tensor->lod); lod->resize(lod_level); for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size; + uint64_t size = 0; is.read(reinterpret_cast(&size), sizeof(size)); std::vector tmp(size / sizeof(size_t)); is.read(reinterpret_cast(tmp.data()), @@ -102,13 +102,13 @@ void DeserializePDTensorToStream(std::istream &is, PaddleTensor *tensor) { (*lod)[i] = tmp; } // 4. Shape - size_t dims; + size_t dims = 0; is.read(reinterpret_cast(&dims), sizeof(dims)); tensor->shape.resize(dims); is.read(reinterpret_cast(tensor->shape.data()), sizeof(int) * dims); // NOLINT // 5. Data - uint64_t length; + uint64_t length = 0; is.read(reinterpret_cast(&tensor->dtype), sizeof(tensor->dtype)); is.read(reinterpret_cast(&length), sizeof(length)); tensor->data.Resize(length); @@ -139,10 +139,10 @@ void SerializePDTensorsToStream(std::ostream *os, void DeserializePDTensorsToStream(std::istream &is, std::vector *tensors) { // 1. Version - uint32_t version; + uint32_t version = 0; is.read(reinterpret_cast(&version), sizeof(version)); // 2. Tensors - uint64_t num; + uint64_t num = 0; is.read(reinterpret_cast(&num), sizeof(num)); tensors->resize(num); for (auto &tensor : *tensors) { @@ -240,35 +240,41 @@ void DeserializeShapeRangeInfo( continue; } else { std::vector tmp(info.min_shape_size()); - for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.min_shape(k); + for (size_t k = 0; k < tmp.size(); ++k) + tmp[k] = info.min_shape(static_cast(k)); min_shape->insert(std::make_pair(name, tmp)); tmp.resize(info.max_shape_size()); - for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.max_shape(k); + for (size_t k = 0; k < tmp.size(); ++k) + tmp[k] = info.max_shape(static_cast(k)); max_shape->insert(std::make_pair(name, tmp)); tmp.resize(info.opt_shape_size()); - for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.opt_shape(k); + for (size_t k = 0; k < tmp.size(); ++k) + tmp[k] = info.opt_shape(static_cast(k)); opt_shape->insert(std::make_pair(name, tmp)); } } for (int i = 0; i < shape_range_infos.shape_range_info_size(); ++i) { - auto info = shape_range_infos.shape_range_info(i); + auto info = shape_range_infos.shape_range_info(static_cast(i)); auto name = info.name(); if (min_value->count(name) || max_value->count(name) || opt_value->count(name)) { continue; } else { std::vector tmp(info.min_value_size()); - for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.min_value(k); + for (size_t k = 0; k < tmp.size(); ++k) + tmp[k] = info.min_value(static_cast(k)); min_value->insert(std::make_pair(name, tmp)); tmp.resize(info.max_value_size()); - for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.max_value(k); + for (size_t k = 0; k < tmp.size(); ++k) + tmp[k] = info.max_value(static_cast(k)); max_value->insert(std::make_pair(name, tmp)); tmp.resize(info.opt_value_size()); - for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.opt_value(k); + for (size_t k = 0; k < tmp.size(); ++k) + tmp[k] = info.opt_value(static_cast(k)); opt_value->insert(std::make_pair(name, tmp)); } } diff --git a/paddle/fluid/inference/utils/table_printer.cc b/paddle/fluid/inference/utils/table_printer.cc index 7f192152e052f8..564757b88d69a8 100644 --- a/paddle/fluid/inference/utils/table_printer.cc +++ b/paddle/fluid/inference/utils/table_printer.cc @@ -101,7 +101,8 @@ void TablePrinter::InsertRow(const std::vector& row) { if (line.length() > max_width) max_width = line.length(); } - if (max_width > widths_[i]) widths_[i] = 
static_cast(max_width); + if (static_cast(max_width) > widths_[i]) + widths_[i] = static_cast(max_width); size_t num_lines = table_row[i].size(); if (num_lines > max_height) max_height = num_lines; @@ -159,13 +160,15 @@ void TablePrinter::CalcLayout() { // If the number of rows required for this record is larger than 1, we // will break that line and put it in multiple lines if (num_rows > 1) { - data_[i][j].erase(data_[i][j].begin() + line_index); + data_[i][j].erase(data_[i][j].begin() + line_index); // NOLINT for (size_t k = 0; k < num_rows; ++k) { size_t start = - std::min(static_cast(k * shares_[j]), line.length()); - size_t end = std::min(static_cast((k + 1) * shares_[j]), - line.length()); - data_[i][j].insert(data_[i][j].begin() + line_index + k, + std::min(static_cast(k * shares_[j]), // NOLINT + line.length()); + size_t end = + std::min(static_cast((k + 1) * shares_[j]), // NOLINT + line.length()); + data_[i][j].insert(data_[i][j].begin() + line_index + k, // NOLINT line.substr(start, end - start)); } @@ -173,8 +176,8 @@ void TablePrinter::CalcLayout() { line_index += num_rows - 1; } - if (heights_[i] < (num_rows - 1 + data_[i][j].size())) - heights_[i] += num_rows - 1; + if (heights_[i] < static_cast(num_rows - 1 + data_[i][j].size())) + heights_[i] += static_cast(num_rows - 1); } } } @@ -182,8 +185,8 @@ void TablePrinter::CalcLayout() { void TablePrinter::AddRowDivider(std::stringstream& ss) { ss << "+"; - for (auto share : shares_) { - for (size_t j = 0; j < share + 2; ++j) ss << "-"; + for (float share : shares_) { + for (float j = 0; j < share + 2; ++j) ss << "-"; ss << "+"; } ss << "\n"; @@ -191,15 +194,16 @@ void TablePrinter::AddRowDivider(std::stringstream& ss) { void TablePrinter::AddRow(std::stringstream& ss, size_t row_idx) { auto row = data_[row_idx]; - size_t max_height = heights_[row_idx]; + size_t max_height = static_cast(heights_[row_idx]); for (size_t h = 0; h < max_height; ++h) { ss << "|" << std::left; for (size_t i = 0; i < row.size(); ++i) { if (h < row[i].size()) { - ss << " " << std::setw(shares_[i]) << row[i][h] << " |"; + ss << " " << std::setw(static_cast(shares_[i])) << row[i][h] + << " |"; } else { - ss << " " << std::setw(shares_[i]) << " " + ss << " " << std::setw(static_cast(shares_[i])) << " " << " |"; } } diff --git a/paddle/fluid/inference/utils/table_printer.h b/paddle/fluid/inference/utils/table_printer.h index f0a01c8c1f8297..a8fde831263f0c 100644 --- a/paddle/fluid/inference/utils/table_printer.h +++ b/paddle/fluid/inference/utils/table_printer.h @@ -16,6 +16,7 @@ #include #include +#include "paddle/utils/test_macros.h" namespace paddle { namespace inference { @@ -23,7 +24,7 @@ namespace inference { // // A simple table printer. 
// -class TablePrinter { +class TEST_API TablePrinter { public: explicit TablePrinter(const std::vector& header); diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index 65ed57ebc9be15..a2910ed51b6751 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -259,6 +259,9 @@ pir::OpInfo OpTranscriber::LoopkUpOpInfo(pir::IrContext* ctx, continue; } VarDesc* var = op_desc.Block()->FindVarRecursive(legacy_input_vars[0]); + IR_ENFORCE(var != nullptr, + "Can't find var recursively from current block."); + if (var->GetType() == paddle::framework::proto::VarType::LOD_TENSOR) { need_inputs_sig.emplace_back("dense"); } else if (var->GetType() == @@ -280,7 +283,7 @@ pir::OpInfo OpTranscriber::LoopkUpOpInfo(pir::IrContext* ctx, if (need_inputs_sig.size() != sig.inputs.size()) { continue; } - size_t i; + size_t i = 0; for (i = 0; i < need_inputs_sig.size(); ++i) { if (need_inputs_sig[i] == "") { continue; @@ -677,10 +680,12 @@ void OpTranscriber::RecordOpResultMapping(pir::IrContext* ctx, pir::OpResult value = operation->result(idx_in_op); bool generated_by_vector = value.type().isa(); - (*param_map)[arg_name] = VariableDefiningInfo( - value, - generated_by_vector, - static_cast(generated_by_vector ? idx_in_vec : -1)); + param_map->PushValue( + arg_name, + VariableDefiningInfo( + value, + generated_by_vector, + static_cast(generated_by_vector ? idx_in_vec : -1))); } } @@ -816,7 +821,7 @@ struct AssignValueOpTranscriber : public OpTranscriber { std::tie(input_infos, attr_infos, output_infos, std::ignore, std::ignore) = op_info_concept->get_op_info_(); std::unordered_map attr_info_maps; - for (auto info : attr_infos) { + for (auto const& info : attr_infos) { attr_info_maps.insert({info.name, info}); } @@ -1171,7 +1176,7 @@ struct ShadowOutputOpTranscriber : public OpTranscriber { TranslationContext* param_map, const OpDesc& op_desc, pir::Block* block) override { - auto op_info = ctx->GetRegisteredOpInfo(pir::SetParameterOp::name()); + auto op_info = ctx->GetRegisteredOpInfo(pir::ShadowOutputOp::name()); std::vector op_inputs; auto legacy_input_vars = op_desc.Input("x", true); @@ -1186,7 +1191,7 @@ struct ShadowOutputOpTranscriber : public OpTranscriber { op_inputs.push_back(defining_info.value); pir::AttributeMap attribute_map = { - {"parameter_name", + {"output_name", pir::StrAttribute::get(ctx, op_desc.GetAttrIfExists("name"))}, }; @@ -1281,7 +1286,7 @@ struct FillConstant2FullTranscriber : public OpTranscriber { {"dtype", paddle::dialect::DataTypeAttribute::get( ctx, - paddle::dialect::VarTypeToDataType( + paddle::translator::VarTypeToDataType( static_cast(dtype)))}}; int place_type = PADDLE_GET_CONST(int, op_desc.GetAttr("place_type")); @@ -1388,7 +1393,7 @@ struct FillConstant2FullWithTensorTranscriber : public OpTranscriber { {"dtype", paddle::dialect::DataTypeAttribute::get( ctx, - paddle::dialect::VarTypeToDataType( + paddle::translator::VarTypeToDataType( static_cast(dtype)))}}; return attribute_map; } @@ -1433,11 +1438,11 @@ pir::OpResult TranslateNumClassesForOneHot( auto var_name = legacy_vars[0]; IR_ENFORCE(legacy_vars.size() == 1, "depth_tensor input of one hot MUST be a tensor"); - auto defining_info = param_map->find(legacy_vars[0]); - IR_ENFORCE(defining_info != param_map->end(), + IR_ENFORCE(param_map->count(legacy_vars[0]), "%s should be existed in one_hot_v2 as input depth_tensor.", legacy_vars[0]); - return defining_info->second.value; + auto 
defining_info = param_map->at(legacy_vars[0]); + return defining_info.value.dyn_cast(); } auto& attribute_translator = AttributeTranslator::instance(); @@ -1527,7 +1532,7 @@ struct ElementwiseTranscriber : public OpTranscriber { ctx, param_map, block, x_defining_info, x_name); x_defining_info = param_map->at(x_name); } - pir::OpResult x_value = x_defining_info.value; + pir::OpResult x_value = x_defining_info.value.dyn_cast(); IR_ENFORCE(x_value, "Expected op[%s]'s input %s is not null", op_desc.Type(), @@ -1558,7 +1563,7 @@ struct ElementwiseTranscriber : public OpTranscriber { ctx, param_map, block, y_defining_info, y_name); y_defining_info = param_map->at(y_name); } - pir::OpResult y_value = y_defining_info.value; + pir::OpResult y_value = y_defining_info.value.dyn_cast(); IR_ENFORCE(y_value, "Expected op[%s]'s input %s is not null", op_desc.Type(), @@ -1577,8 +1582,7 @@ struct ElementwiseTranscriber : public OpTranscriber { axis += static_cast(x_shape.size()); } - int append_size = - static_cast(x_shape.size() - axis - 1 - y_shape.size()); + int append_size = static_cast(x_shape.size() - axis - y_shape.size()); if (append_size < 0) { // which means x.rank <= y.rank, mostly // x.rank=y.rank return {x_value, y_value}; @@ -1593,7 +1597,7 @@ struct ElementwiseTranscriber : public OpTranscriber { pir::OpResult y_new; if (std::find(y_shape.begin(), y_shape.end(), -1) == y_shape.end()) { std::vector y_new_shape(y_shape); - for (int i = 0; i <= append_size; i++) { + for (int i = 0; i < append_size; i++) { y_new_shape.push_back(1); } dialect::ReshapeOp reshape_op = @@ -1605,7 +1609,7 @@ struct ElementwiseTranscriber : public OpTranscriber { auto shape_op = builder.Build(y_value); auto append_shape_op = builder.Build( std::vector(append_size, 1), - phi::DataType::INT64, + phi::DataType::INT32, phi::CPUPlace()); auto y_true_shape_op = builder.Build( std::vector{shape_op.out(), append_shape_op.out()}); @@ -1622,7 +1626,10 @@ struct ElementwiseTranscriber : public OpTranscriber { struct GradAddOpTranscriber : public ElementwiseTranscriber { pir::OpInfo LoopkUpOpInfo(pir::IrContext* ctx, const OpDesc& op_desc) override { - const std::string& target_op_name = "pd_op.add"; + std::string target_op_name = "pd_op.add"; + if (IsInplace(op_desc) && *target_op_name.rbegin() != '_') { + target_op_name += "_"; + } const auto& op_info = ctx->GetRegisteredOpInfo(target_op_name); if (!op_info) { IR_THROW( @@ -1675,7 +1682,7 @@ struct ElementwiseGradTranscriber : public OpTranscriber { op_desc.Type(), y_name); auto y_defining_info = param_map->at(y_name); - pir::OpResult y_value = y_defining_info.value; + pir::OpResult y_value = y_defining_info.value.dyn_cast(); IR_ENFORCE(y_value, "Expected op[%s]'s input %s is not null", op_desc.Type(), @@ -1693,8 +1700,8 @@ struct ElementwiseGradTranscriber : public OpTranscriber { pir::OpResult value = operation->result(idx_in_op); pir::Builder builder(ctx, operation->GetParent()); auto reshape_op = builder.Build(value, y_shape); - (*param_map)[y_grad_var_name] = - VariableDefiningInfo(reshape_op.out(), false, -1); + param_map->PushValue(y_grad_var_name, + VariableDefiningInfo(reshape_op.out(), false, -1)); } }; @@ -1766,7 +1773,7 @@ struct SetValueWithTensorOpTranscriber : public SetValueOpTranscriber { ctx, param_map, block, defining_info, var_name); defining_info = param_map->at(var_name).value; } - return defining_info.value; + return defining_info.value.dyn_cast(); }; } }; @@ -1861,9 +1868,24 @@ struct FusedFeedForwardOpTranscriber : public OpTranscriber { auto output_var 
= output_vars[0]; auto fused_feedforward_op = operation->dyn_cast(); - (*param_map)[output_var] = - VariableDefiningInfo{fused_feedforward_op.out()}; + param_map->PushValue(output_var, + VariableDefiningInfo{fused_feedforward_op.out()}); + } + } +}; + +struct ShareBufferOpTranscriber : public OpTranscriber { + pir::OpInfo LoopkUpOpInfo(pir::IrContext* ctx, + const OpDesc& op_desc) override { + std::string target_op_name = dialect::ShareDataOp::name(); + const auto& op_info = ctx->GetRegisteredOpInfo(target_op_name); + if (!op_info) { + IR_THROW( + "Op share_buffer should have corresponding OpInfo " + "pd_op.share_data"); } + + return op_info; } }; @@ -1890,6 +1912,7 @@ OpTranslator::OpTranslator() { special_handlers["reduce_any"] = ReduceOpTranscriber(); special_handlers["rnn"] = RnnOpTranscriber(); special_handlers["shadow_output"] = ShadowOutputOpTranscriber(); + special_handlers["share_buffer"] = ShareBufferOpTranscriber(); special_handlers["set_value"] = LegacySetValueDispatcher(); special_handlers["set_value_grad"] = SetValueGradOpTranscriber(); special_handlers["split"] = SplitOpTranscriber(); diff --git a/paddle/fluid/ir_adaptor/translator/program_translator.cc b/paddle/fluid/ir_adaptor/translator/program_translator.cc index 2ebece4fbfef7d..cf2cbdd902d734 100644 --- a/paddle/fluid/ir_adaptor/translator/program_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/program_translator.cc @@ -23,6 +23,7 @@ #include "paddle/fluid/ir_adaptor/translator/op_translator.h" #include "paddle/fluid/ir_adaptor/translator/type_translator.h" #include "paddle/fluid/ir_adaptor/translator/utils.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #include "paddle/phi/core/enforce.h" #include "paddle/pir/core/attribute.h" @@ -43,6 +44,10 @@ using ProgramDesc = ::paddle::framework::ProgramDesc; using BlockDesc = ::paddle::framework::BlockDesc; using VarDesc = ::paddle::framework::VarDesc; +using TCKey = TranslationContext::Key; +using TCValue = TranslationContext::Value; +using TCContainer = TranslationContext::Container; + const std::unordered_set ProgramTranslator::no_cast_var_names = { "feed", "fetch", @@ -50,28 +55,61 @@ const std::unordered_set ProgramTranslator::no_cast_var_names = { const std::unordered_set ProgramTranslator::unsupported_ops = { "conditional_block_grad", - "while", "while_grad", }; static std::vector GetCondOpIds(const BlockDesc& src_block, uint64_t first_id) { - std::vector op_list = {first_id}; - if (src_block.Op(first_id + 1)->Type() == "logical_not") { - op_list.emplace_back(first_id + 1); + uint64_t temp_id = first_id; + // add conditional_block + std::vector op_list = {temp_id}; + temp_id++; + // add logical_not + if ((temp_id < src_block.OpSize()) && + (src_block.Op(static_cast(temp_id))->Type() == "logical_not")) { + op_list.emplace_back(temp_id); + temp_id++; } - if (src_block.Op(first_id + 2)->Type() == "conditional_block") { - op_list.emplace_back(first_id + 2); + // add conditional_block + if ((temp_id < src_block.OpSize()) && + (src_block.Op(static_cast(temp_id))->Type() == + "conditional_block")) { + op_list.emplace_back(temp_id); + temp_id++; } - if (src_block.Op(first_id + 3)->Type() == "cast") { - op_list.emplace_back(first_id + 3); + // add cast + if ((temp_id < src_block.OpSize()) && + (src_block.Op(static_cast(temp_id))->Type() == "cast")) { + op_list.emplace_back(temp_id); + temp_id++; } - size_t output_size = src_block.Op(first_id)->Output("Out").size(); - for (size_t i = 0; i < 
output_size; i++) { - if (src_block.Op(first_id + 4 + i)->Type() == "select_input") { - op_list.emplace_back(first_id + 4 + i); + // Note(zhangbo): Some output variables are input, without select_input op. + std::vector init_op_list; + while (temp_id < src_block.OpSize()) { + if ((src_block.Op(static_cast(temp_id))->Type() == "fill_constant") || + (src_block.Op(static_cast(temp_id))->Type() == "assign_value")) { + init_op_list.emplace_back(temp_id); + temp_id++; + } else { + break; + } + } + std::vector select_input_op_list; + while (temp_id < src_block.OpSize()) { + if (src_block.Op(static_cast(temp_id))->Type() == "select_input") { + select_input_op_list.emplace_back(temp_id); + temp_id++; + } else { + break; } } + + if (select_input_op_list.size() > 0) { + op_list.insert(op_list.end(), init_op_list.begin(), init_op_list.end()); + } + op_list.insert( + op_list.end(), select_input_op_list.begin(), select_input_op_list.end()); + return op_list; } @@ -79,7 +117,7 @@ ConditionBlockCombination::ConditionBlockCombination( const ::paddle::framework::BlockDesc& src_block, const std::vector& op_ids) { for (auto op_id : op_ids) { - op_list_.emplace_back(src_block.Op(op_id)); + op_list_.emplace_back(src_block.Op(static_cast(op_id))); } PADDLE_ENFORCE(Verify(op_list_), platform::errors::NotFound( @@ -92,25 +130,53 @@ const std::string& ConditionBlockCombination::CondVarName() const { return op_list_[0]->Input("Cond")[0]; } -size_t ConditionBlockCombination::OutputSize() const { - return op_list_[0]->Output("Out").size(); +std::vector> +ConditionBlockCombination::OutputVars() const { + std::vector<::paddle::framework::VarDesc*> if_outputs; + std::vector<::paddle::framework::VarDesc*> true_block_outputs; + std::vector<::paddle::framework::VarDesc*> false_block_outputs; + for (::paddle::framework::OpDesc* op : op_list_) { + if (op->Type() == "select_input") { + if_outputs.emplace_back( + op->Block()->FindVarRecursive(op->Output("Out")[0])); + true_block_outputs.emplace_back( + op->Block()->FindVarRecursive(op->Input("X")[1])); + false_block_outputs.emplace_back( + op->Block()->FindVarRecursive(op->Input("X")[0])); + } + } + return {if_outputs, true_block_outputs, false_block_outputs}; } -std::vector<::paddle::framework::VarDesc*> -ConditionBlockCombination::OutputVars() const { - std::vector<::paddle::framework::VarDesc*> outputs; - if (this->OutputSize() > 0) { - for (size_t i = 4; i < op_list_.size(); i++) { - outputs.emplace_back(op_list_[i]->Block()->FindVarRecursive( - op_list_[i]->Output("Out")[0])); +size_t ConditionBlockCombination::MainOutputSize() const { + return OutputVars()[0].size(); +} + +std::vector ConditionBlockCombination::TrueBlockOutputVarNames() + const { + std::vector output_names; + for (::paddle::framework::OpDesc* op : op_list_) { + if (op->Type() == "select_input") { + output_names.emplace_back(op->Input("X")[1]); } } - return outputs; + return output_names; } -const std::vector& -ConditionBlockCombination::TrueBlockOutputVarNames() const { - return op_list_[0]->Output("Out"); +std::vector<::paddle::framework::OpDesc*> +ConditionBlockCombination::TrueBlockInitOps() const { + std::vector<::paddle::framework::OpDesc*> init_ops; + std::vector output_names = TrueBlockOutputVarNames(); + for (::paddle::framework::OpDesc* op : op_list_) { + if ((op->Type() == "fill_constant") || (op->Type() == "assign_value")) { + auto out_name = op->Output("Out")[0]; + if (std::find(output_names.begin(), output_names.end(), out_name) != + output_names.end()) { + init_ops.emplace_back(op); + } + } 
+ } + return init_ops; } int ConditionBlockCombination::TrueBlockId() const { @@ -119,10 +185,29 @@ std::vector ConditionBlockCombination::FalseBlockOutputVarNames() const { - if (op_list_.size() > 1) { - return op_list_[2]->Output("Out"); + std::vector output_names; + for (::paddle::framework::OpDesc* op : op_list_) { + if (op->Type() == "select_input") { + output_names.emplace_back(op->Input("X")[0]); + } } - return {""}; + return output_names; +} + +std::vector<::paddle::framework::OpDesc*> +ConditionBlockCombination::FalseBlockInitOps() const { + std::vector<::paddle::framework::OpDesc*> init_ops; + std::vector output_names = FalseBlockOutputVarNames(); + for (::paddle::framework::OpDesc* op : op_list_) { + if ((op->Type() == "fill_constant") || (op->Type() == "assign_value")) { + auto out_name = op->Output("Out")[0]; + if (std::find(output_names.begin(), output_names.end(), out_name) != + output_names.end()) { + init_ops.emplace_back(op); + } + } + } + return init_ops; } int ConditionBlockCombination::FalseBlockId() const { @@ -139,9 +224,6 @@ bool ConditionBlockCombination::Verify( if (op_list[id]->Type() != "conditional_block") { return false; } - if (op_list.size() == 1 && op_list[id]->Output("Out").size() != 0) { - return false; - } } else if (id == 1) { if (op_list[id]->Type() != "logical_not") { return false; @@ -164,10 +246,9 @@ return false; } } else { - if (op_list[id]->Type() != "select_input") { - return false; - } - if (op_list[id]->Input("Mask")[0] != op_list[3]->Output("Out")[0]) { + if ((op_list[id]->Type() != "select_input") && + (op_list[id]->Type() != "fill_constant") && + (op_list[id]->Type() != "assign_value")) { return false; } } @@ -175,6 +256,55 @@ return true; } +const TCValue& TranslationContext::operator[](const TCKey& key) const { + return at(key); +} + +const TCValue& TranslationContext::at(const TCKey& key) const { + auto it = container_.find(key); + if (it == container_.end() && parent_) { + return parent_->at(key); + } + PADDLE_ENFORCE_NE(it, + container_.end(), + platform::errors::InvalidArgument( + "param %s should exist in TranslationContext", key)); + const auto& values = it->second; + PADDLE_ENFORCE_NE( + values.size(), + 0, + platform::errors::InvalidArgument( + "param %s should have size > 0, but got: %d", key, values.size())); + return values.back(); +} + +size_t TranslationContext::count(const TCKey& key) const { + auto it = container_.find(key); + if (it == container_.end()) { + if (parent_) return parent_->count(key); + return 0u; + } + const auto& values = it->second; + PADDLE_ENFORCE_NE( + values.size(), + 0u, + platform::errors::InvalidArgument( + "param %s should have size > 0, but got: %d", key, values.size())); + return values.size(); +} + +void TranslationContext::PushValue(const Key& key, const Value& value) { + container_[key].push_back(value); +} +void TranslationContext::PopValue(const Key& key) { + container_[key].pop_back(); +} + +TranslationContext* TranslationContext::CreateInnerContext() { + sons_.emplace_back(std::make_unique(this)); + return sons_.back().get(); +} + ProgramTranslator::ProgramTranslator(const ProgramDesc* legacy_program, pir::Program* program) : legacy_program_(legacy_program), program_(program) { @@ -188,6 +318,7 @@ void ProgramTranslator::Translate() { TranslateBlock(legacy_program_->Block(0), 0, legacy_program_->Block(0).OpSize(), + &param_map_, program_->block());
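The new TranslationContext above replaces the old flat map with a scoped, stack-valued symbol table: each name maps to a stack of definitions, and lookups that miss in a sub-block's context fall through to the parent. The following is a minimal, self-contained sketch of that behavior, not Paddle code; `DefInfo` stands in for `VariableDefiningInfo` and all names here are illustrative:

```cpp
// Minimal sketch of the scoped lookup behavior of TranslationContext.
#include <cassert>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

struct DefInfo {
  int value = 0;  // the real struct wraps a pir::Value
};

class ScopedContext {
 public:
  ScopedContext() = default;
  explicit ScopedContext(ScopedContext* parent) : parent_(parent) {}

  // A later push shadows earlier definitions of the same name.
  void PushValue(const std::string& key, DefInfo v) {
    container_[key].push_back(v);
  }
  void PopValue(const std::string& key) { container_[key].pop_back(); }

  // A miss in this block falls through to the enclosing block's context.
  const DefInfo& at(const std::string& key) const {
    auto it = container_.find(key);
    if (it == container_.end() || it->second.empty()) {
      assert(parent_ != nullptr && "name not found in any enclosing context");
      return parent_->at(key);
    }
    return it->second.back();
  }

  // One inner context per sub-block keeps its names from leaking out.
  ScopedContext* CreateInnerContext() {
    sons_.emplace_back(std::make_unique<ScopedContext>(this));
    return sons_.back().get();
  }

 private:
  std::unordered_map<std::string, std::vector<DefInfo>> container_;
  ScopedContext* parent_ = nullptr;
  std::vector<std::unique_ptr<ScopedContext>> sons_;
};

int main() {
  ScopedContext outer;
  outer.PushValue("x", {1});
  ScopedContext* inner = outer.CreateInnerContext();  // e.g. an IfOp branch
  inner->PushValue("x", {2});  // shadows the outer "x" inside the branch
  assert(inner->at("x").value == 2);
  assert(outer.at("x").value == 1);  // the outer definition is untouched
  inner->PopValue("x");
  assert(inner->at("x").value == 1);  // falls through to the parent again
  return 0;
}
```

This is why TranslateWhileOperation below can temporarily rebind loop variables to block arguments and then restore the outer definitions with another PushValue.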
SetParameterFromSingleBlock(legacy_program_->Block(0)); @@ -203,11 +334,15 @@ } } -void ProgramTranslator::TranslateBlock(const BlockDesc& src_block, - uint64_t start_id, - uint64_t end_id, - pir::Block* dest_block, - bool for_cond_block) { +void ProgramTranslator::TranslateBlock( + const BlockDesc& src_block, + uint64_t start_id, + uint64_t end_id, + TranslationContext* translation_ctx, + pir::Block* dest_block, + bool for_cond_block, + std::vector cond_sub_block_outputs, + std::vector<::paddle::framework::OpDesc*> cond_init_ops) { VLOG(8) << "=============>start to translate a block"; PADDLE_ENFORCE( (src_block.OpSize() >= end_id) && (start_id <= end_id), @@ -219,11 +354,13 @@ src_block.OpSize())); std::unordered_map translate_completed; + std::map assign_output_2_input; for (uint64_t op_id = start_id; op_id < end_id; op_id++) { if (translate_completed.count(op_id) && translate_completed.at(op_id)) { continue; } - auto op = src_block.Op(op_id); + + auto op = src_block.Op(static_cast(op_id)); VLOG(8) << "=============>start to translate an op: " << op->Type(); PADDLE_ENFORCE_EQ(unsupported_ops.count(op->Type()), 0, platform::errors::PreconditionNotMet( "Not support translated %s op", op->Type())); if (op->Type() == "conditional_block") { - std::vector cond_op_list = {op}; std::vector cond_op_ids = GetCondOpIds(src_block, op_id); ConditionBlockCombination cond_op_combination(src_block, cond_op_ids); - pir::Operation* if_op = - TranslateCondIfOperation(cond_op_combination, dest_block); + pir::Operation* if_op = TranslateCondIfOperation( + cond_op_combination, translation_ctx, dest_block); for (auto cond_id : cond_op_ids) { translate_completed[cond_id] = true; } VLOG(10) << "[op translated][conditional_block]" << if_op; + } else if (op->Type() == "while") { + TranslateWhileOperation(op, translation_ctx, dest_block); } else { - TranslateGeneralOperation(op, dest_block); - translate_completed[op_id] = true; + if (for_cond_block && op->Type() == "assign" && + std::count(cond_sub_block_outputs.begin(), + cond_sub_block_outputs.end(), + op->Output("Out")[0])) { + assign_output_2_input[op->Output("Out")[0]] = op->Input("X")[0]; + translate_completed[op_id] = true; + } else { + TranslateGeneralOperation(op, translation_ctx, dest_block); + translate_completed[op_id] = true; + } } } + // NOTE(zhangbo): If conditional_block operator has output, the cf.yield // operator needs to be inserted if (for_cond_block) { + // insert init ops + for (::paddle::framework::OpDesc* init_op : cond_init_ops) { + TranslateGeneralOperation(init_op, translation_ctx, dest_block); + } + // insert yield op std::vector yeild_inputs; - for (size_t id = end_id; id < src_block.OpSize(); id++) { - PADDLE_ENFORCE( - src_block.Op(id)->Type() == "assign", - "The operator at the end of the sub block needs to be assign"); - yeild_inputs.emplace_back( - param_map_[src_block.Op(id)->Input("X")[0]].value); + for (auto output_name : cond_sub_block_outputs) { + if (assign_output_2_input.count(output_name) != 0) { + yeild_inputs.emplace_back( + (*translation_ctx)[assign_output_2_input[output_name]].value); + } else { + yeild_inputs.emplace_back((*translation_ctx)[output_name].value); + } } pir::AttributeMap attribute_map; auto yeild_info = ctx_->GetRegisteredOpInfo(pir::YieldOp::name()); @@ -266,18 +419,20 @@ }
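The GetCondOpIds change earlier in this file widens the op window recognized around each conditional_block: optional init ops (fill_constant/assign_value) may now precede the select_input ops, and the init ops are attributed to the combination only when a select_input actually follows. That scanning logic is easiest to read in isolation; the sketch below restates it over plain op-type strings, using toy types instead of BlockDesc/OpDesc (illustrative only, not Paddle code):

```cpp
// Collect the op-id window forming one "conditional_block combination":
// conditional_block [logical_not] [conditional_block] [cast]
// [fill_constant|assign_value ...] [select_input ...]
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

std::vector<uint64_t> GetCondOpIds(const std::vector<std::string>& ops,
                                   uint64_t first_id) {
  uint64_t id = first_id;
  std::vector<uint64_t> op_list = {id++};  // the leading conditional_block
  auto is = [&](const char* type) {
    return id < ops.size() && ops[id] == type;
  };
  if (is("logical_not")) op_list.push_back(id++);
  if (is("conditional_block")) op_list.push_back(id++);
  if (is("cast")) op_list.push_back(id++);
  // Init ops that may pre-assign outputs of the combination.
  std::vector<uint64_t> init_ops;
  while (is("fill_constant") || is("assign_value")) init_ops.push_back(id++);
  // The select_input ops that merge the two branches' outputs.
  std::vector<uint64_t> select_inputs;
  while (is("select_input")) select_inputs.push_back(id++);
  // Init ops belong to the combination only if a select_input follows.
  if (!select_inputs.empty())
    op_list.insert(op_list.end(), init_ops.begin(), init_ops.end());
  op_list.insert(op_list.end(), select_inputs.begin(), select_inputs.end());
  return op_list;
}

int main() {
  std::vector<std::string> block = {"conditional_block", "logical_not",
                                    "conditional_block", "cast",
                                    "fill_constant",     "select_input"};
  for (uint64_t i : GetCondOpIds(block, 0)) std::cout << i << ' ';  // 0..5
  std::cout << '\n';
  return 0;
}
```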
pir::Operation* ProgramTranslator::TranslateCondIfOperation( - const ConditionBlockCombination& cond_ops, pir::Block* dest_block) { + const ConditionBlockCombination& cond_ops, + TranslationContext* translation_ctx, + pir::Block* dest_block) { auto& type_translator = TypeTranslator::instance(); auto op_info = ctx_->GetRegisteredOpInfo(paddle::dialect::IfOp::name()); std::vector op_inputs = { - param_map_[cond_ops.CondVarName()].value}; + (*translation_ctx)[cond_ops.CondVarName()].value}; // NOTE(zhangbo): Now paddle::dialect::IfOp has 0 attribute pir::AttributeMap attribute_map; std::vector op_output_types; std::vector<::paddle::framework::VarDesc*> output_vardescs = - cond_ops.OutputVars(); + cond_ops.OutputVars()[0]; for (auto var_desc : output_vardescs) { IR_ENFORCE(var_desc != nullptr, "[control flow] Output should not be null"); pir::Type translated_var_type = @@ -290,8 +445,8 @@ op_inputs, attribute_map, op_output_types, op_info, 2); for (size_t i = 0; i < output_vardescs.size(); i++) { - param_map_[output_vardescs[i]->Name()] = - VariableDefiningInfo(operation->result(i)); + translation_ctx->PushValue(output_vardescs[i]->Name(), + VariableDefiningInfo(operation->result(i))); } dest_block->push_back(operation); @@ -302,11 +457,17 @@ legacy_program_->Block(cond_ops.TrueBlockId()); pir::Region& true_region = operation->region(0); if (true_region.empty()) true_region.emplace_back(); + + auto* true_block_context = translation_ctx->CreateInnerContext(); + TranslateBlock(true_sub_block, 0, - true_sub_block.OpSize() - cond_ops.OutputSize(), + true_sub_block.OpSize(), + true_block_context, true_region.front(), - true); + true, + cond_ops.TrueBlockOutputVarNames(), + cond_ops.TrueBlockInitOps()); } VLOG(4) << "[general op][conditional_block] IfOp true block translate end."; @@ -315,28 +476,106 @@ legacy_program_->Block(cond_ops.FalseBlockId()); pir::Region& false_region = operation->region(1); if (false_region.empty()) false_region.emplace_back(); + auto* false_block_context = translation_ctx->CreateInnerContext(); TranslateBlock(false_sub_block, 0, - false_sub_block.OpSize() - cond_ops.OutputSize(), + false_sub_block.OpSize(), + false_block_context, false_region.front(), - true); + true, + cond_ops.FalseBlockOutputVarNames(), + cond_ops.FalseBlockInitOps()); } VLOG(4) << "[general op][conditional_block] IfOp false block translate end."; + + operation->Verify(); VLOG(4) << "[general op][conditional_block] IfOp translate end."; return operation; } -void ProgramTranslator::TranslateGeneralOperation(const OpDesc* src_op, - pir::Block* dest_block) { +void ProgramTranslator::TranslateWhileOperation( + const OpDesc* op, + TranslationContext* translation_ctx, + pir::Block* dest_block) { + VLOG(8) << "=============>Start to translate while op:" << op; + auto& sub_block = legacy_program_->Block(op->GetBlockAttrId("sub_block")); + int index = static_cast(sub_block.OpSize()) - 1; + std::vector> loop_vars_reverse; + while (index >= 0) { + auto sub_op = sub_block.Op(index); + if (sub_op->Type() == "assign" && + translation_ctx->count(sub_op->Output("Out")[0]) > 0) { + loop_vars_reverse.emplace_back(sub_op->Output("Out")[0], + sub_op->Input("X")[0]); + --index; + } else { + break; + } + } + PADDLE_ENFORCE(!loop_vars_reverse.empty(), + platform::errors::PreconditionNotMet( + "While op must have condition value input")); +
PADDLE_ENFORCE(loop_vars_reverse.front().first == op->Input("Condition")[0], + platform::errors::PreconditionNotMet( + "The last op in sub_block of While op must be used to assign " + "condition var")); + auto op_info = ctx_->GetRegisteredOpInfo(paddle::dialect::WhileOp::name()); + std::vector op_inputs{ + translation_ctx->at(loop_vars_reverse[0].first).value}; + std::vector op_outputs_type; + auto body_block = new pir::Block(); + std::vector param_status; + for (size_t idx = loop_vars_reverse.size() - 1u; idx > 0; --idx) { + auto& name = loop_vars_reverse[idx].first; + auto& tc_value = translation_ctx->at(name); + auto val_type = tc_value.value.type(); + op_inputs.push_back(tc_value.value); + op_outputs_type.push_back(val_type); + param_status.emplace_back(tc_value); + translation_ctx->PushValue(name, body_block->AddArgument(val_type)); + } + pir::Operation* while_op = + pir::Operation::Create(op_inputs, {}, op_outputs_type, op_info, 1); + dest_block->push_back(while_op); + while_op->region(0).push_back(body_block); + TranslateBlock(sub_block, 0, index + 1, translation_ctx, body_block); + + auto yeild_info = ctx_->GetRegisteredOpInfo(pir::YieldOp::name()); + std::vector yeild_inputs{ + translation_ctx->at(loop_vars_reverse[0].second).value}; + for (size_t idx = loop_vars_reverse.size() - 1u; idx > 0; --idx) { + auto& name = loop_vars_reverse[idx].second; + yeild_inputs.push_back(translation_ctx->at(name).value); + } + body_block->push_back( + pir::Operation::Create(yeild_inputs, {}, {}, yeild_info)); + + index = 0; + for (size_t idx = loop_vars_reverse.size() - 1u; idx > 0; --idx) { + auto& name = loop_vars_reverse[idx].first; + translation_ctx->PushValue(name, param_status[index++]); + } + auto name_iter = loop_vars_reverse.rbegin(); + for (size_t idx = 0; idx < while_op->num_results(); ++idx) { + translation_ctx->PushValue(name_iter++->first, while_op->result(idx)); + } + while_op->Verify(); + VLOG(8) << "=============>end to translate while op:" << op; +} + +void ProgramTranslator::TranslateGeneralOperation( + const OpDesc* src_op, + TranslationContext* translation_ctx, + pir::Block* dest_block) { auto& op_translator = OpTranslator::instance(); OpTranslateFn& fn = op_translator[src_op->Type()]; if (src_op->Type() == "shadow_output") { - if (!param_map_.count(src_op->Input("x")[0])) { + if (!translation_ctx->count(src_op->Input("x")[0])) { return; } } - pir::Operation* operation = fn(ctx_, &param_map_, *src_op, dest_block); - VLOG(10) << "[op translated][special]" << operation << "end"; + pir::Operation* operation = fn(ctx_, translation_ctx, *src_op, dest_block); + VLOG(10) << "[op translated][general]" << operation << "end"; } inline pir::Operation* InsertGetParamaterOp(pir::IrContext* ctx, @@ -355,7 +594,7 @@ } inline pir::Operation* InsertSetParamaterOp(pir::IrContext* ctx, - pir::OpResult defining_op_result, + pir::Value defining_op_result, const VarDesc* var) { std::string set_parameter_op_name(pir::SetParameterOp::name()); pir::OpInfo op_info = ctx->GetRegisteredOpInfo(set_parameter_op_name); @@ -406,7 +645,7 @@ void ProgramTranslator::GetParameterForSingleBlock(const BlockDesc& block) { "VarDesc of [%s] can not be nullptr", var_name)); pir::Operation* op = InsertGetParamaterOp(ctx_, var_desc); program_->block()->push_back(op); - param_map_[var_name] = VariableDefiningInfo(op->result(0)); + param_map_.PushValue(var_name, VariableDefiningInfo(op->result(0))); VLOG(10) << "[op translated][get parameter]" << var_name;
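TranslateWhileOperation above recovers the loop-carried variables by walking the while sub_block backwards and peeling off the trailing assign ops; the first pair peeled is expected to name the condition variable, which the enforce above checks. A stripped-down sketch of just that backward scan, using a toy Op record and omitting the translation_ctx->count() guard the real code applies (illustrative only, not Paddle code):

```cpp
// Peel trailing assign ops off a while sub-block to recover (Out, X) pairs,
// in reverse order, so pairs[0] is the condition variable.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct Op {
  std::string type, out, in;  // toy record: one output, one input
};

std::vector<std::pair<std::string, std::string>> PeelLoopVars(
    const std::vector<Op>& sub_block) {
  std::vector<std::pair<std::string, std::string>> loop_vars_reverse;
  for (int i = static_cast<int>(sub_block.size()) - 1; i >= 0; --i) {
    if (sub_block[i].type != "assign") break;  // stop at the first non-assign
    loop_vars_reverse.emplace_back(sub_block[i].out, sub_block[i].in);
  }
  return loop_vars_reverse;
}

int main() {
  std::vector<Op> body = {{"less_than", "cond_new", "i"},
                          {"increment", "i_new", "i"},
                          {"assign", "i", "i_new"},
                          {"assign", "cond", "cond_new"}};
  // Prints "cond <- cond_new" first, then "i <- i_new".
  for (auto& [out, in] : PeelLoopVars(body))
    std::cout << out << " <- " << in << '\n';
  return 0;
}
```

Everything before the peeled assigns (ops 0..index) becomes the while body, and the assigns themselves are replaced by the yield at the end of the body block.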
program_->SetParameter(var_name, nullptr); @@ -424,20 +663,6 @@ } } -void ProgramTranslator::InsertOperationToSingleBlock(const BlockDesc& block) { - auto& op_translator = OpTranslator::instance(); - for (auto op : block.AllOps()) { - OpTranslateFn& fn = op_translator[op->Type()]; - if (op->Type() == "shadow_output") { - if (!param_map_.count(op->Input("x")[0])) { - continue; - } - } - pir::Operation* operation = fn(ctx_, &param_map_, *op, program_->block()); - VLOG(10) << "[op translated][special]" << operation; - } -} - void ProgramTranslator::SetParameterFromSingleBlock(const BlockDesc& block) { const auto& ops = block.AllOps(); for (auto op_desc = ops.rbegin(); op_desc != ops.rend(); op_desc++) { @@ -458,7 +683,8 @@ need_set_parameter_op &= (param_map_.count(var_name) != 0); need_set_parameter_op &= (!set_input_var_names.count(var_name)); if (need_set_parameter_op) { - pir::OpResult defining_op_result = param_map_[var_name].value; + pir::OpResult defining_op_result = + param_map_[var_name].value.dyn_cast(); if (!defining_op_result) { continue; } @@ -469,7 +695,8 @@ program_->block(), param_map_[var_name], var_name); - defining_op_result = param_map_.at(var_name).value; + defining_op_result = + param_map_.at(var_name).value.dyn_cast(); } pir::Operation* op = InsertSetParamaterOp( @@ -500,38 +727,37 @@ void ProgramTranslator::SetStopGradientAttributeForAllValue( const BlockDesc& block) { // Currently we set stop gradient for operation that generated a value // connected with VarDesc - for (const auto& [var_name, value_info] : param_map_) { + for (const auto& [var_name, value_list] : param_map_) { if (no_cast_var_names.count(var_name) != 0) continue; VLOG(10) << "[op translated][stop gradient]" << var_name; VarDesc* var = block.FindVarRecursive(var_name); if (var == nullptr) { continue; } - pir::OpResult value = value_info.value; - if (!value) { - PADDLE_THROW(phi::errors::PreconditionNotMet( - "Value of [%s] can not ber None", var_name)); - } - auto* defining_op = value.owner(); - PADDLE_ENFORCE_NOT_NULL( - defining_op, - phi::errors::PreconditionNotMet( - "Defining operator of [%s] can not be nullptr", var_name)); - VLOG(8) << "[op translated][stop gradient]" << var_name - << " from: " << defining_op->name(); - std::vector stop_gradients; - if (defining_op->HasAttribute(kAttrStopGradients)) { - stop_gradients = defining_op->attribute(kAttrStopGradients) - .dyn_cast() - .AsVector(); - } else { - stop_gradients = std::vector( - defining_op->num_results(), pir::BoolAttribute::get(ctx_, false)); + for (const auto& value_info : value_list) { + pir::OpResult value = value_info.value.dyn_cast(); + if (!value) continue; + auto* defining_op = value.owner(); + PADDLE_ENFORCE_NOT_NULL( + defining_op, + phi::errors::PreconditionNotMet( + "Defining operator of [%s] can not be nullptr", var_name)); + VLOG(8) << "[op translated][stop gradient]" << var_name + << " from: " << defining_op->name(); + std::vector stop_gradients; + if (defining_op->HasAttribute(kAttrStopGradients)) { + stop_gradients = defining_op->attribute(kAttrStopGradients) + .dyn_cast() + .AsVector(); + } else { + stop_gradients = std::vector( + defining_op->num_results(), pir::BoolAttribute::get(ctx_, false)); + } + stop_gradients[value.index()] = + pir::BoolAttribute::get(ctx_, var->StopGradient()); +
defining_op->set_attribute( + kAttrStopGradients, pir::ArrayAttribute::get(ctx_, stop_gradients)); } - stop_gradients[value.index()] = - pir::BoolAttribute::get(ctx_, var->StopGradient()); - defining_op->set_attribute(kAttrStopGradients, - pir::ArrayAttribute::get(ctx_, stop_gradients)); } } @@ -539,39 +765,49 @@ void ProgramTranslator::SetIsPersisableAttributeForAllValue( const BlockDesc& block) { // Currently we set is persisable for operation that generated a value // connected with VarDesc - for (const auto& [var_name, value_info] : param_map_) { + for (const auto& [var_name, value_list] : param_map_) { if (no_cast_var_names.count(var_name) != 0) continue; VLOG(10) << "[op translated][is persisable]" << var_name; VarDesc* var = block.FindVarRecursive(var_name); if (var == nullptr) { continue; } - pir::OpResult value = value_info.value; - if (!value) { - PADDLE_THROW(phi::errors::PreconditionNotMet( - "Value of [%s] can not ber None", var_name)); + for (const auto& value_info : value_list) { + pir::OpResult value = value_info.value.dyn_cast(); + if (!value) continue; + auto* defining_op = value.owner(); + PADDLE_ENFORCE_NOT_NULL( + defining_op, + phi::errors::PreconditionNotMet( + "Defining operator of [%s] can not be nullptr", var_name)); + VLOG(8) << "[op translated][is persisable]" << var_name + << " from: " << defining_op->name(); + std::vector is_persisable; + if (defining_op->HasAttribute(kAttrIsPersisable)) { + is_persisable = defining_op->attribute(kAttrIsPersisable) + .dyn_cast() + .AsVector(); + } else { + is_persisable = std::vector( + defining_op->num_results(), pir::BoolAttribute::get(ctx_, false)); + } + is_persisable[value.index()] = + pir::BoolAttribute::get(ctx_, var->Persistable()); + defining_op->set_attribute(kAttrIsPersisable, + pir::ArrayAttribute::get(ctx_, is_persisable)); } - auto* defining_op = value.owner(); - PADDLE_ENFORCE_NOT_NULL( - defining_op, - phi::errors::PreconditionNotMet( - "Defining operator of [%s] can not be nullptr", var_name)); - VLOG(8) << "[op translated][is persisable]" << var_name - << " from: " << defining_op->name(); - std::vector is_persisable; - if (defining_op->HasAttribute(kAttrIsPersisable)) { - is_persisable = defining_op->attribute(kAttrIsPersisable) - .dyn_cast() - .AsVector(); - } else { - is_persisable = std::vector( - defining_op->num_results(), pir::BoolAttribute::get(ctx_, false)); + } +} + +std::unordered_map> +ProgramTranslator::VarDesc2Value() { + std::unordered_map> var_desc_2_value; + for (const auto& [var_name, value_info_list] : param_map_) { + for (const auto& value_info : value_info_list) { + var_desc_2_value[var_name].push_back(value_info.value); } - is_persisable[value.index()] = - pir::BoolAttribute::get(ctx_, var->Persistable()); - defining_op->set_attribute(kAttrIsPersisable, - pir::ArrayAttribute::get(ctx_, is_persisable)); } + return var_desc_2_value; } } // namespace translator diff --git a/paddle/fluid/ir_adaptor/translator/program_translator.h b/paddle/fluid/ir_adaptor/translator/program_translator.h index a59f4b34a5adaa..ed734241ca26cb 100644 --- a/paddle/fluid/ir_adaptor/translator/program_translator.h +++ b/paddle/fluid/ir_adaptor/translator/program_translator.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/op_call_stack.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/program_desc.h" @@ -29,7 +30,7 @@ namespace paddle { namespace translator { struct VariableDefiningInfo { - VariableDefiningInfo(pir::OpResult value, + 
VariableDefiningInfo(pir::Value value, bool generated_by_vector = false, int idx_in_vector = -1) : value(value), @@ -37,7 +38,7 @@ idx_in_vector(idx_in_vector) {} VariableDefiningInfo() {} - pir::OpResult value; + pir::Value value; bool generated_by_vector = false; // true if target variable is generated by Vector @@ -49,12 +50,23 @@ class ConditionBlockCombination { public: ConditionBlockCombination(const ::paddle::framework::BlockDesc& src_block, const std::vector& op_ids); + const std::string& CondVarName() const; - size_t OutputSize() const; - std::vector<::paddle::framework::VarDesc*> OutputVars() const; - const std::vector& TrueBlockOutputVarNames() const; + + std::vector> OutputVars() const; + + size_t MainOutputSize() const; + + std::vector TrueBlockOutputVarNames() const; + + std::vector<::paddle::framework::OpDesc*> TrueBlockInitOps() const; + int TrueBlockId() const; + std::vector FalseBlockOutputVarNames() const; + + std::vector<::paddle::framework::OpDesc*> FalseBlockInitOps() const; + int FalseBlockId() const; private: @@ -63,8 +75,35 @@ std::vector<::paddle::framework::OpDesc*> op_list_; }; -using TranslationContext = - std::unordered_map; +class TranslationContext { + public: + using Key = std::string; + using Value = VariableDefiningInfo; + using ValueList = std::vector; + using Container = std::unordered_map; + + TranslationContext() {} + explicit TranslationContext(TranslationContext* parent) : parent_(parent) {} + ~TranslationContext() {} + + const Value& operator[](const Key& key) const; + const Value& at(const Key& key) const; + size_t count(const Key& key) + const; // Caution: not exactly the same as count in the STL + + void PushValue(const Key& key, const Value& value); + void PopValue(const Key& key); + TranslationContext* CreateInnerContext(); + + Container::const_iterator begin() const { return container_.begin(); } + Container::const_iterator end() const { return container_.end(); } + + private: + Container container_; + TranslationContext* parent_ = nullptr; + std::vector> + sons_; // used to separate different blocks }; class ProgramTranslator { using ProgramDesc = ::paddle::framework::ProgramDesc; @@ -78,6 +117,8 @@ void Translate(); + std::unordered_map> VarDesc2Value(); + private: const ProgramDesc* legacy_program_; // not owned pir::Program* program_; // not owned @@ -97,21 +138,31 @@ static const std::unordered_set unsupported_ops; - void TranslateBlock(const BlockDesc& src_block, - uint64_t start_id, - uint64_t end_id, - pir::Block* dest_block, - bool for_cond_block = false); - void TranslateGeneralOperation(const OpDesc* src_op, pir::Block* dest_block); + void TranslateBlock( + const BlockDesc& src_block, + uint64_t start_id, + uint64_t end_id, + TranslationContext* translation_ctx, + pir::Block* dest_block, + bool for_cond_block = false, + std::vector cond_sub_block_outputs = {}, + std::vector<::paddle::framework::OpDesc*> cond_init_ops = {}); + void TranslateGeneralOperation(const OpDesc* src_op, + TranslationContext* translation_ctx, + pir::Block* dest_block); void GetParameterForSingleBlock(const BlockDesc& block); - void InsertOperationToSingleBlock(const BlockDesc& block); void SetParameterFromSingleBlock(const BlockDesc& block); void SetStopGradientAttributeForAllValue(const BlockDesc& block); void SetIsPersisableAttributeForAllValue(const BlockDesc& block); /// Translate methods for control flow ops.
pir::Operation* TranslateCondIfOperation( - const ConditionBlockCombination& cond_ops, pir::Block* dest_block); + const ConditionBlockCombination& cond_ops, + TranslationContext* translation_ctx, + pir::Block* dest_block); + void TranslateWhileOperation(const OpDesc* op, + TranslationContext* translation_ctx, + pir::Block* dest_block); }; } // namespace translator diff --git a/paddle/fluid/ir_adaptor/translator/utils.cc b/paddle/fluid/ir_adaptor/translator/utils.cc index 5ee0c91b5bae5f..7f50115c5c578e 100644 --- a/paddle/fluid/ir_adaptor/translator/utils.cc +++ b/paddle/fluid/ir_adaptor/translator/utils.cc @@ -59,7 +59,7 @@ pir::Operation* InsertSliceOperationForTarget( op_info); block->push_back(operation); pir::OpResult target_op_result = operation->result(0); - (*param_map)[arg_name] = VariableDefiningInfo(target_op_result); + param_map->PushValue(arg_name, VariableDefiningInfo(target_op_result)); return operation; } diff --git a/paddle/fluid/ir_adaptor/translator/utils.h b/paddle/fluid/ir_adaptor/translator/utils.h index 8745ee2ac0d7bf..a4765940d0a78a 100644 --- a/paddle/fluid/ir_adaptor/translator/utils.h +++ b/paddle/fluid/ir_adaptor/translator/utils.h @@ -17,6 +17,7 @@ #include #include +#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/ir_adaptor/translator/program_translator.h" #include "paddle/pir/core/ir_context.h" @@ -61,5 +62,43 @@ std::ostream& operator<<(std::ostream& os, std::vector CheckUnregisteredOperation( pir::IrContext* ctx, const framework::ProgramDesc& legacy_program); +inline DataType VarTypeToDataType( + ::paddle::framework::proto::VarType_Type var_type) { + switch (var_type) { + case paddle::framework::proto::VarType_Type::VarType_Type_BOOL: + return DataType::BOOL; + case paddle::framework::proto::VarType_Type::VarType_Type_INT16: + return DataType::INT16; + case paddle::framework::proto::VarType_Type::VarType_Type_INT32: + return DataType::INT32; + case paddle::framework::proto::VarType_Type::VarType_Type_INT64: + return DataType::INT64; + case paddle::framework::proto::VarType_Type::VarType_Type_FP16: + return DataType::FLOAT16; + case paddle::framework::proto::VarType_Type::VarType_Type_FP32: + return DataType::FLOAT32; + case paddle::framework::proto::VarType_Type::VarType_Type_FP64: + return DataType::FLOAT64; + case paddle::framework::proto::VarType_Type::VarType_Type_SIZE_T: + return DataType::UINT64; + case paddle::framework::proto::VarType_Type::VarType_Type_UINT8: + return DataType::UINT8; + case paddle::framework::proto::VarType_Type::VarType_Type_INT8: + return DataType::INT8; + case paddle::framework::proto::VarType_Type::VarType_Type_BF16: + return DataType::BFLOAT16; + case paddle::framework::proto::VarType_Type::VarType_Type_COMPLEX64: + return DataType::COMPLEX64; + case paddle::framework::proto::VarType_Type::VarType_Type_COMPLEX128: + return DataType::COMPLEX128; + case paddle::framework::proto::VarType_Type::VarType_Type_PSTRING: + return DataType::PSTRING; + default: + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported proto::VarType_Type `%s` when casting it into DataType.", + var_type)); + } +} + } // namespace translator } // namespace paddle diff --git a/paddle/fluid/jit/property.cc b/paddle/fluid/jit/property.cc index 174b3b065f1fac..9b0c50a954624c 100644 --- a/paddle/fluid/jit/property.cc +++ b/paddle/fluid/jit/property.cc @@ -340,7 +340,7 @@ void Property::SetStrings(const std::vector &v) { auto type = proto::ValueProto::STRINGS; auto entry = 
property_.add_entrys(); entry->set_type(type); - for (auto i : v) { + for (auto const &i : v) { entry->add_strings(i); } VLOG(3) << "Property: set_strings " << v.size(); @@ -352,7 +352,7 @@ void Property::SetStrings(const std::string &name, auto entry = property_.add_entrys(); entry->set_name(name); entry->set_type(type); - for (auto i : v) { + for (auto const &i : v) { entry->add_strings(i); } VLOG(3) << "Property: set_strings " << v[0] << " name: " << name; diff --git a/paddle/fluid/jit/serializer_utils.cc b/paddle/fluid/jit/serializer_utils.cc index 5b58b9d4173129..4fdc07f55ac745 100644 --- a/paddle/fluid/jit/serializer_utils.cc +++ b/paddle/fluid/jit/serializer_utils.cc @@ -79,7 +79,7 @@ const std::vector> PdmodelFilePaths( std::string dir_path = format_path.substr(0, format_path.length() - layer_name.length()); DIR* dir = opendir(dir_path.c_str()); - struct dirent* ptr; + struct dirent* ptr = nullptr; while ((ptr = readdir(dir)) != nullptr) { std::string file_name = ptr->d_name; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 7add694a04f68f..6af73d8f48958d 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -212,10 +212,6 @@ class AllocatorFacadePrivate { platform::CustomPlace(dev_type, dev_id)); } } - if (FLAGS_use_stream_safe_cuda_allocator) { - WrapStreamSafeCustomDeviceAllocatorForDefault(); - is_stream_safe_cuda_allocator_used_ = true; - } #endif break; } @@ -576,13 +572,14 @@ class AllocatorFacadePrivate { #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE - bool HasCustomDevice(const platform::CustomPlace& place, - phi::stream::stream_t stream) { + bool HasCustomDeviceAllocator(const platform::CustomPlace& place, + phi::stream::stream_t stream) { auto it = custom_device_allocators_.find(place); if (it == custom_device_allocators_.end()) { return false; } - auto& allocator_map = it->second; + const std::map>& + allocator_map = it->second; return allocator_map.find(stream) != allocator_map.end(); } @@ -590,10 +587,15 @@ class AllocatorFacadePrivate { const platform::CustomPlace& place, phi::stream::stream_t stream, bool create_if_not_found = false) { + if (stream == GetDefaultStream(place)) { + VLOG(7) << "Get Allocator by passing in a default stream"; + return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); + } + /* shared_lock_guard */ { std::shared_lock lock_guard( custom_device_allocator_mutex_); - if (LIKELY(HasCustomDevice(place, stream))) { + if (LIKELY(HasCustomDeviceAllocator(place, stream))) { return custom_device_allocators_[place][stream]; } else { PADDLE_ENFORCE_NE(create_if_not_found, @@ -627,17 +629,11 @@ class AllocatorFacadePrivate { return iter->second; } - void RecordStream(std::shared_ptr allocation, - phi::stream::stream_t stream) { - std::shared_ptr - stream_safe_custom_device_allocation = - std::dynamic_pointer_cast( - allocation); - if (stream_safe_custom_device_allocation != nullptr) { - stream_safe_custom_device_allocation->RecordStream(stream); - } else { - VLOG(6) << "RecordStream for a non-StreamSafeCustomDeviceAllocation"; - } + phi::stream::stream_t GetDefaultStream( + const platform::CustomPlace& place) const { + const std::shared_ptr& allocator = + GetDefaultStreamSafeCustomDeviceAllocator(place); + return allocator->GetDefaultStream(); } void SetDefaultStream(const platform::CustomPlace& place, @@ -662,6 +658,34 @@ class AllocatorFacadePrivate { << ") in " << place; } + void 
RecordStream(std::shared_ptr allocation, + phi::stream::stream_t stream) { + std::shared_ptr + stream_safe_custom_device_allocation = + std::dynamic_pointer_cast( + allocation); + if (stream_safe_custom_device_allocation != nullptr) { + stream_safe_custom_device_allocation->RecordStream(stream); + } else { + VLOG(6) << "RecordStream for a non-StreamSafeCustomDeviceAllocation"; + } + } + + phi::stream::stream_t GetStream( + const std::shared_ptr& allocation) const { + const std::shared_ptr + stream_safe_custom_device_allocation = + std::dynamic_pointer_cast( + allocation); + if (stream_safe_custom_device_allocation != nullptr) { + return stream_safe_custom_device_allocation->GetOwningStream(); + } + + VLOG(6) << "GetStream for a non-StreamSafeCustomDeviceAllocation"; + return static_cast( + platform::DeviceContextPool::Instance().Get(allocation->place())) + ->stream(); + } #endif private: @@ -1108,10 +1132,41 @@ allocators_[p] = std::make_shared(p); } - void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p, - phi::stream::stream_t stream) { + std::shared_ptr CreateCustomDeviceAllocator( + platform::CustomPlace p) { + return std::make_shared(p); + } + + void InitStreamSafeCustomDeviceAllocator(platform::CustomPlace p, + phi::stream::stream_t stream) { + PADDLE_ENFORCE_EQ( + strategy_, + AllocatorStrategy::kAutoGrowth, + platform::errors::Unimplemented( + "Only support auto-growth strategy for " + "StreamSafeCustomDeviceAllocator, " + "the allocator strategy %d is unsupported for multi-stream", + static_cast(strategy_))); + if (LIKELY(!HasCustomDeviceAllocator(p, stream))) { + VLOG(8) << "Init StreamSafeCustomDeviceAllocator for stream " << stream + << " in place " << p; + InitAutoGrowthCustomDeviceAllocator(p, stream); + WrapStreamSafeCustomDeviceAllocator(p, stream); + } + } + + void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p, + phi::stream::stream_t stream) { + auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20; + VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is " + << FLAGS_auto_growth_chunk_size_in_mb; + + auto custom_allocator = + std::make_shared(p); + auto alignment = phi::DeviceManager::GetMinChunkSize(p); custom_device_allocators_[p][stream] = - std::make_shared(p); + std::make_shared( + custom_allocator, alignment, chunk_size, allow_free_idle_chunk_); } void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p, @@ -1146,20 +1201,6 @@ } } - void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p, - phi::stream::stream_t stream) { - auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20; - VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is " - << FLAGS_auto_growth_chunk_size_in_mb; - - auto custom_allocator = - std::make_shared(p); - auto alignment = phi::DeviceManager::GetMinChunkSize(p); - custom_device_allocators_[p][stream] = - std::make_shared( - custom_allocator, alignment, chunk_size, allow_free_idle_chunk_); - } - void WrapStreamSafeCustomDeviceAllocator(platform::CustomPlace p, phi::stream::stream_t stream) { std::shared_ptr& allocator = @@ -1167,18 +1208,6 @@ allocator = std::make_shared(allocator, p, stream); } - - void InitStreamSafeCustomDeviceAllocator(platform::CustomPlace p, - phi::stream::stream_t stream) { - VLOG(8) << "Init CustomDevice allocator for stream " << stream - << " in place " << p; - if (strategy_ == AllocatorStrategy::kAutoGrowth) { - InitAutoGrowthCustomDeviceAllocator(p, stream); - } else { -
InitNaiveBestFitCustomDeviceAllocator(p, stream); - } - WrapStreamSafeCustomDeviceAllocator(p, stream); - } #endif void InitSystemAllocators() { @@ -1419,12 +1448,20 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, const phi::Stream& stream) { #ifdef PADDLE_WITH_CUSTOM_DEVICE if (platform::is_custom_place(place)) { + if (!GetPrivate()->IsStreamSafeCUDAAllocatorUsed()) { + VLOG(6) << "Warning: StreamSafeCustomDeviceAllocator is not used!"; + return Alloc(place, size); + } platform::CustomPlace p(place); - phi::stream::stream_t s = - reinterpret_cast(stream.id()); - return GetPrivate() - ->GetAllocator(p, s, /* create_if_not_found = */ true) - ->Allocate(size); + if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { + phi::stream::stream_t s = + reinterpret_cast(stream.id()); + return GetPrivate() + ->GetAllocator(p, s, /* create_if_not_found = */ true) + ->Allocate(size); + } else { + return GetPrivate()->GetAllocator(p, size)->Allocate(size); + } } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -1552,10 +1589,32 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(int64_t id) { #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE +uint64_t AllocatorFacade::Release(const platform::CustomPlace& place, + phi::stream::stream_t stream) { + AllocatorFacadePrivate* m = GetPrivate(); + if (!m->IsStreamSafeCUDAAllocatorUsed()) { + VLOG(6) << "Warning: StreamSafeCustomDeviceAllocator is not used!"; + return Release(place); + } + + return m->GetAllocator(place, stream)->Release(place); +} + +void AllocatorFacade::RecordStream(std::shared_ptr allocation, + phi::stream::stream_t stream) { + GetPrivate()->RecordStream(allocation, stream); +} + const std::shared_ptr& AllocatorFacade::GetAllocator( const platform::Place& place, phi::stream::stream_t stream) { AllocatorFacadePrivate* m = GetPrivate(); + if (!m->IsStreamSafeCUDAAllocatorUsed()) { + VLOG(6) << "Warning: StreamSafeCustomDeviceAllocator is not used!"; + return GetAllocator(place); + } + + if (platform::is_custom_place(place) && FLAGS_use_system_allocator == false) { return m->GetAllocator(place, stream, /*create_if_not_found=*/true); @@ -1563,9 +1622,9 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( return m->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } -void AllocatorFacade::RecordStream(std::shared_ptr allocation, - phi::stream::stream_t stream) { - GetPrivate()->RecordStream(allocation, stream); +phi::stream::stream_t AllocatorFacade::GetStream( + const std::shared_ptr& allocation) const { + return GetPrivate()->GetStream(allocation); } void AllocatorFacade::SetDefaultStream(const platform::CustomPlace& place, diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 0131d56c6f6428..9d2c85eccf4555 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -97,11 +97,14 @@ class AllocatorFacade { #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE + uint64_t Release(const platform::CustomPlace& place, + phi::stream::stream_t stream); + void RecordStream(std::shared_ptr allocation, + phi::stream::stream_t stream); const std::shared_ptr& GetAllocator(const platform::Place& place, phi::stream::stream_t stream); - void RecordStream(std::shared_ptr allocation, - phi::stream::stream_t stream); - + phi::stream::stream_t GetStream( + const std::shared_ptr& allocation) const; void SetDefaultStream(const platform::CustomPlace& place, phi::stream::stream_t stream); #endif 
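The custom-device changes above route allocations by (place, stream): a request on the default stream takes the fast path to the globally wrapped allocator, while any other stream lazily gets (or creates) its own stream-safe allocator, mirroring the existing CUDA behavior. A toy model of just that routing decision, with hypothetical Router/Allocator names, no locking, and none of the Paddle API:

```cpp
// Toy model of per-(place, stream) allocator routing.
#include <iostream>
#include <map>
#include <memory>

struct Allocator { /* placeholder for a real allocator */ };
using StreamT = void*;  // stand-in for phi::stream::stream_t

class Router {
 public:
  explicit Router(StreamT default_stream) : default_stream_(default_stream) {}

  std::shared_ptr<Allocator> Get(StreamT stream, bool create_if_not_found) {
    // Default-stream fast path: reuse the globally wrapped allocator.
    if (stream == default_stream_) return default_allocator_;
    auto it = per_stream_.find(stream);
    if (it != per_stream_.end()) return it->second;
    if (!create_if_not_found) return nullptr;
    // Lazily create one stream-safe allocator per non-default stream.
    return per_stream_[stream] = std::make_shared<Allocator>();
  }

 private:
  StreamT default_stream_;
  std::shared_ptr<Allocator> default_allocator_ =
      std::make_shared<Allocator>();
  std::map<StreamT, std::shared_ptr<Allocator>> per_stream_;
};

int main() {
  int s0 = 0, s1 = 1;  // stand-ins for stream handles
  Router r(&s0);
  auto a = r.Get(&s0, false);     // default-stream fast path
  auto b = r.Get(&s1, true);      // lazily created per-stream allocator
  std::cout << (a != b) << '\n';  // 1: distinct allocators
  return 0;
}
```

The real implementation additionally guards the per-stream map with a shared mutex and wraps each created allocator in a StreamSafeCustomDeviceAllocator, as shown in the hunks above.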
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc index dde362ebed4ef7..398c015627860d 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.cc +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -38,7 +38,7 @@ void CPUAllocator::FreeImpl(phi::Allocation *allocation) { } phi::Allocation *CPUAllocator::AllocateImpl(size_t size) { - void *p; + void *p = nullptr; #ifdef _WIN32 p = _aligned_malloc(size, kAlignment); #else diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc index 6be6436b4db7b0..5e857f9acb7171 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.cc +++ b/paddle/fluid/memory/allocation/mmap_allocator.cc @@ -321,7 +321,7 @@ void MemoryMapFdSet::Clear() { VLOG(3) << "PID: " << getpid() << ", MemoryMapFdSet: set size - " << fd_set_.size(); std::lock_guard guard(mtx_); - for (auto fd : fd_set_) { + for (auto const &fd : fd_set_) { int rlt = shm_unlink(fd.c_str()); if (rlt == 0) { VLOG(3) << "PID: " << getpid() << ", MemoryMapFdSet: clear " << fd; @@ -375,7 +375,7 @@ void MemoryMapAllocationPool::SetMaxPoolSize(const int &size) { void MemoryMapAllocationPool::Clear() { std::lock_guard guard(mtx_); - for (auto mmap : memory_map_allocations_) { + for (auto const &mmap : memory_map_allocations_) { int rlt = shm_unlink(mmap.file_name_.c_str()); if (rlt == 0) { VLOG(4) << "MemoryMapAllocationPool: clear " << mmap.file_name_; diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 9f513448eea266..a296d254266ab2 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -35,7 +35,7 @@ StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( underlying_allocation->size(), underlying_allocation->place()), underlying_allocation_(std::move(underlying_allocation)), - owning_stream_(std::move(owning_stream)), + owning_stream_(owning_stream), allocator_(allocator->shared_from_this()) {} void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) { @@ -148,8 +148,8 @@ StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( gpuStream_t default_stream, bool in_cuda_graph_capturing) : underlying_allocator_(std::move(underlying_allocator)), - place_(std::move(place)), - default_stream_(std::move(default_stream)), + place_(place), + default_stream_(default_stream), in_cuda_graph_capturing_(in_cuda_graph_capturing) { if (LIKELY(!in_cuda_graph_capturing)) { std::lock_guard lock_guard(allocator_map_lock_); diff --git a/paddle/fluid/memory/allocation/system_allocator_test.cc b/paddle/fluid/memory/allocation/system_allocator_test.cc index e04d14f0adfde0..16b538599df258 100644 --- a/paddle/fluid/memory/allocation/system_allocator_test.cc +++ b/paddle/fluid/memory/allocation/system_allocator_test.cc @@ -26,7 +26,7 @@ PHI_DECLARE_bool(use_pinned_memory); void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) { bool freed = false; { - size_t index; + size_t index; // NOLINT void* p = a->Alloc(&index, size); if (size > 0) { EXPECT_NE(p, nullptr); diff --git a/paddle/fluid/memory/stats.cc b/paddle/fluid/memory/stats.cc index 0289859dff30eb..e18646f0e82bf9 100644 --- a/paddle/fluid/memory/stats.cc +++ b/paddle/fluid/memory/stats.cc @@ -15,8 +15,13 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/stats.h" #include "paddle/fluid/memory/allocation/spin_lock.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/phi/core/macros.h" +PADDLE_DEFINE_EXPORTED_bool( + log_memory_stats, + false, + "Log memory stats after each op runs, just used for debug."); namespace paddle { namespace memory { @@ -104,6 +109,28 @@ void HostMemoryStatUpdate(const std::string& stat_type, StatRegistry::GetInstance()->Update("Host" + stat_type, dev_id, increment); } +void LogDeviceMemoryStats(const platform::Place& place, + const std::string& op_name) { + if (FLAGS_log_memory_stats && platform::is_gpu_place(place)) { + VLOG(0) << "After launching op_name: " << op_name << ", " + << "memory_allocated: " + << static_cast(memory::DeviceMemoryStatCurrentValue( + "Allocated", place.device)) / + 1024 / 1024 + << " MB, " + << "max_memory_allocated: " + << static_cast(memory::DeviceMemoryStatPeakValue( + "Allocated", place.device)) / + 1024 / 1024 + << " MB, " + << "max_memory_reserved: " + << static_cast(memory::DeviceMemoryStatPeakValue( + "Reserved", place.device)) / + 1024 / 1024 + << " MB"; + } +} + #define DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, id) \ StatRegistry::GetInstance()->Register( \ "Device" #item, id, Stat::GetInstance()); diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h index bd4761f41116ef..d2c8b04bc70ab7 100644 --- a/paddle/fluid/memory/stats.h +++ b/paddle/fluid/memory/stats.h @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/place.h" #include "paddle/phi/common/thread_data_registry.h" #include "paddle/utils/string/string_helper.h" @@ -122,6 +123,9 @@ void HostMemoryStatUpdate(const std::string& stat_type, int dev_id, int64_t increment); +void LogDeviceMemoryStats(const platform::Place& place, + const std::string& op_name); + #define DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \ case id: \ stat = paddle::memory::Stat< \ diff --git a/paddle/fluid/operators/bilateral_slice_op.cc b/paddle/fluid/operators/bilateral_slice_op.cc index 53386c1551d0f5..1a6561fc383cc6 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cc +++ b/paddle/fluid/operators/bilateral_slice_op.cc @@ -51,7 +51,7 @@ class BilateralSliceOp : public framework::OperatorWithKernel { int64_t coeffs_chans = grid_dims[1]; int64_t input_chans = input_dims[1]; - int64_t output_chans; + int64_t output_chans = 0; if ((!ctx->IsRuntime()) && ((coeffs_chans < 0) || (input_chans < 0))) { output_chans = -1; } else { diff --git a/paddle/fluid/operators/center_loss_op.cc b/paddle/fluid/operators/center_loss_op.cc deleted file mode 100644 index 28226d5d94d5ac..00000000000000 --- a/paddle/fluid/operators/center_loss_op.cc +++ /dev/null @@ -1,166 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
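The new LogDeviceMemoryStats helper above is gated on the exported flag it defines. A hedged sketch of how a caller would use it; "matmul_v2" is a hypothetical op name, and enabling the flag through the environment (FLAGS_log_memory_stats=1) is an assumption based on how other exported Paddle flags are set.

#include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/platform/place.h"

// With FLAGS_log_memory_stats set, this logs current/peak allocated and peak
// reserved device memory in MB; otherwise (or on non-GPU places) it is a
// no-op, matching the definition above.
void AfterOpHook(const paddle::platform::Place& place) {
  paddle::memory::LogDeviceMemoryStats(place, /*op_name=*/"matmul_v2");
}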
*/ - -#include "paddle/fluid/operators/center_loss_op.h" - -#include -#include - -namespace paddle { -namespace operators { -class CenterLossOp : public framework::OperatorWithKernel { - public: - CenterLossOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "CenterLoss"); - auto x_dims = ctx->GetInputDim("X"); - - OP_INOUT_CHECK(ctx->HasInput("CenterUpdateRate"), - "Input", - "CenterUpdateRate", - "CenterLoss"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "CenterLoss"); - OP_INOUT_CHECK(ctx->HasInput("Centers"), "Input", "Centers", "CenterLoss"); - OP_INOUT_CHECK(ctx->HasOutput("SampleCenterDiff"), - "Output", - "SampleCenterDiff", - "CenterLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", "CenterLoss"); - OP_INOUT_CHECK( - ctx->HasOutput("CentersOut"), "Output", "CentersOut", "CenterLoss"); - - ctx->SetOutputDim("SampleCenterDiff", - {x_dims[0], product(x_dims) / x_dims[0]}); - ctx->SetOutputDim("CentersOut", ctx->GetInputDim("Centers")); - ctx->SetOutputDim("Loss", {x_dims[0], 1}); - ctx->ShareLoD("X", /*->*/ "Loss"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context().GetPlace()); - } -}; - -class CenterLossOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) Input tensor of center_loss operator."); - AddInput("Label", "(Tensor) Input tensor of center_loss operator."); - AddInput("Centers", "(Tensor) Input tensor of center_loss operator."); - AddInput("CenterUpdateRate", - "(Tensor) Input tensor of center_loss operator."); - - AddOutput("CentersOut", "(Tensor) Input tensor of center_loss operator."); - AddOutput("SampleCenterDiff", - "(Tensor) output tensor of center_loss operator."); - AddOutput("Loss", "(Tensor) Output tensor of center_loss operator."); - - AddAttr("cluster_num", - "The output cluster num of the center_loss operator."); - AddAttr("need_update", "whether need to update center info."); - AddComment(R"DOC( -**CenterLoss operator** -implemention of the center loss function in the papper<>, equations in this implement -is:loss = 1/2 * (x-y)^2 ,where x(X) means the deep feature(output of last hidden layer ) -and y(Label) the target label -)DOC"); - } -}; - -class CenterLossGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("SampleCenterDiff"), - "Input", - "SampleCenterDiff", - "CenterLossGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Loss")), - "Input", - framework::GradVarName("Loss"), - "CenterLossGrad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - framework::GradVarName("X"), - "CenterLossGrad"); - - auto x_dims = ctx->GetInputDim("X"); - auto x_grad_name = framework::GradVarName("X"); - - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey( - 
OperatorWithKernel::IndicateVarDataType(ctx, "SampleCenterDiff"), - ctx.device_context().GetPlace()); - } -}; - -template -class CenterLossOpGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr retv) const override { - retv->SetType("center_loss_grad"); - retv->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss")); - retv->SetInput("SampleCenterDiff", this->Output("SampleCenterDiff")); - retv->SetInput("X", this->Input("X")); - retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - - retv->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(CenterLossGradNoNeedBufVarsInferer, "X"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(center_loss, - ops::CenterLossOp, - ops::CenterLossOpMaker, - ops::CenterLossOpGradMaker, - ops::CenterLossOpGradMaker); - -REGISTER_OPERATOR(center_loss_grad, - ops::CenterLossGradOp, - ops::CenterLossGradNoNeedBufVarsInferer); - -PD_REGISTER_STRUCT_KERNEL( - center_loss, CPU, ALL_LAYOUT, ops::CenterLossKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(center_loss_grad, - CPU, - ALL_LAYOUT, - ops::CenterLossGradKernel, - float, - double) {} diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu deleted file mode 100644 index 73567c195d97fb..00000000000000 --- a/paddle/fluid/operators/center_loss_op.cu +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include "paddle/fluid/operators/center_loss_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -namespace paddle { -namespace operators { - -using phi::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void ComputeDifferent(T *centers_diff, - const T *X, - const T *centers, - const int64_t *ids, - const int64_t N, - const int64_t K, - const int64_t D) { - int idx = threadIdx.x; - int idy = blockIdx.x + threadIdx.y * GridDimX; - - while (idy < K) { - int64_t id = ids[idy]; - PADDLE_ENFORCE(id >= 0, "Id should larger than 0 but received id: %d.", id); - PADDLE_ENFORCE( - id < N, "Id should smaller than %d but received id: %d.", N, id); - - T *out = centers_diff + idy * D; - const T *x = X + idy * D; - const T *cent = centers + id * D; - for (int i = idx; i < D; i += BlockDimX) { - out[i] = x[i] - cent[i]; - } - idy += BlockDimY * GridDimX; - } -} - -template -__global__ void UpdateCenters(T *centers, - T *centers_diff, - const int64_t *ids, - const int64_t N, - const int64_t K, - const int64_t D, - const T *alpha) { - int idx = threadIdx.x; - int idy = blockIdx.x + threadIdx.y * GridDimX; - int count; - while (idy < K) { - int count = 1; - int64_t id = ids[idy]; - PADDLE_ENFORCE(id >= 0, "Id should larger than 0 but received id: %d.", id); - PADDLE_ENFORCE( - id < N, "Id should smaller than %d but received id: %d.", N, id); - - for (int i = 0; i < K; i++) { - if (ids[i] == id) { - count++; - } - } - const T *diff = centers_diff + idy * D; - T *cent = centers + id * D; - for (int i = idx; i < D; i += BlockDimX) { - phi::CudaAtomicAdd(¢[i], alpha[0] * diff[i] / count); - } - idy += BlockDimY * GridDimX; - } -} - -template -class CenterLossCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto &device_context = ctx.template device_context(); - auto stream = device_context.stream(); - auto *X = ctx.Input("X"); // deep feature - auto *labels = ctx.Input("Label"); - auto *centers = ctx.Input("Centers"); - auto *update_rate = ctx.Input("CenterUpdateRate"); - int cluster_num = ctx.Attr("cluster_num"); - auto *lr_center = update_rate->data(); - bool need_update = static_cast(ctx.Attr("need_update")); - - auto x_data = X->data(); - auto label_data = labels->data(); - - auto x_dims = X->dims(); - int batch_size = x_dims[0]; - const int deep_feat_dim = x_dims[1]; - - auto *centers_diff = ctx.Output("SampleCenterDiff"); - auto centers_diff_data = centers_diff->mutable_data(ctx.GetPlace()); - - auto centers_data = centers->data(); - auto centers_dim = centers->dims(); - auto *out_loss = ctx.Output("Loss"); - auto loss_data = out_loss->mutable_data(ctx.GetPlace()); - - auto *centers_out = ctx.Output("CentersOut"); - auto *centers_out_data = centers_out->mutable_data(ctx.GetPlace()); - - auto ctx_place = ctx.GetPlace(); - if (centers != centers_out) { - framework::TensorCopy( - *static_cast(centers), - ctx_place, - *platform::DeviceContextPool::Instance().Get(ctx_place), - static_cast(centers_out)); - } - - int64_t numel = X->numel(); - - size_t N = centers->dims()[0]; - size_t D = centers->dims()[1]; - size_t K = labels->numel(); - - dim3 threads(128, 8); - dim3 grids(8, 1); - - ComputeDifferent<<>>( - centers_diff_data, x_data, centers_data, label_data, N, K, D); - - auto &place = *ctx.template device_context().eigen_device(); - auto sub_result = EigenMatrix::From(*centers_diff); - - auto sub_res_pow2 = (sub_result * sub_result) / T(2.0); - auto z = 
EigenVector::Flatten(*out_loss); - z.device(place) = sub_res_pow2.sum(Eigen::array({{1}})); - if (need_update) { - UpdateCenters<<>>( - centers_out_data, centers_diff_data, label_data, N, K, D, lr_center); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL( - center_loss, GPU, ALL_LAYOUT, ops::CenterLossCUDAKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(center_loss_grad, - GPU, - ALL_LAYOUT, - ops::CenterLossGradKernel, - float, - double) {} diff --git a/paddle/fluid/operators/center_loss_op.h b/paddle/fluid/operators/center_loss_op.h deleted file mode 100644 index 5e5575c68cb0b5..00000000000000 --- a/paddle/fluid/operators/center_loss_op.h +++ /dev/null @@ -1,163 +0,0 @@ -/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/common/transform.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -template -using EigenVector = framework::EigenVector; -template -using EigenMatrix = framework::EigenMatrix; - -template -struct SubFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } -}; - -template -class CenterLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *X = ctx.Input("X"); // deep feature - auto *labels = ctx.Input("Label"); - auto *centers = ctx.Input("Centers"); - auto *update_rate = ctx.Input("CenterUpdateRate"); - int cluster_num = ctx.Attr("cluster_num"); - auto *lr_center = update_rate->data(); - T alpha = lr_center[0]; - bool need_update = static_cast(ctx.Attr("need_update")); - - auto x_data = X->data(); - auto label_data = labels->data(); - - auto centers_dim = centers->dims(); - auto centers_data = centers->data(); - - auto x_dims = X->dims(); - int batch_size = x_dims[0]; - int deep_feat_dim = x_dims[1]; - - auto centers_diff = ctx.Output("SampleCenterDiff"); - auto centers_diff_data = centers_diff->mutable_data(ctx.GetPlace()); - auto *out_loss = ctx.Output("Loss"); - - auto *centers_out = ctx.Output("CentersOut"); - auto *centers_out_data = centers_out->mutable_data(ctx.GetPlace()); - - if (centers_out_data != centers_data) { - int size = centers_out->numel() * sizeof(T); - memcpy(centers_out_data, centers_data, size); - } - - std::vector center_update_count(cluster_num, 1); - auto &dev_ctx = ctx.template device_context(); - - auto loss_data = out_loss->mutable_data(ctx.GetPlace()); - - phi::DenseTensor centers_diffacc; // used to accumulate all diff - auto centers_diffacc_data = - centers_diffacc.mutable_data(centers_dim, ctx.GetPlace()); - int numel = centers_diffacc.numel(); - std::memset(centers_diffacc_data, 0, sizeof(T) * numel); - - auto blas = phi::funcs::GetBlas(dev_ctx); - int tLabel; - - const T *x_index; - const T *center_index; 
- T *center_out_index; - T *center_loss_diff_index; - T *acc_index; - phi::Transform trans; - - for (int i = 0; i < batch_size; ++i) { - tLabel = label_data[i]; - center_update_count[tLabel]++; - x_index = x_data + i * deep_feat_dim; // xi index - center_index = centers_data + tLabel * deep_feat_dim; // center index - center_loss_diff_index = centers_diff_data + i * deep_feat_dim; - trans(dev_ctx, - x_index, - x_index + deep_feat_dim, - center_index, - center_loss_diff_index, - SubFunctor()); - - acc_index = centers_diffacc_data + tLabel * deep_feat_dim; - blas.VADD(deep_feat_dim, - center_loss_diff_index, - acc_index, - acc_index); // accumulate - loss_data[i] = - blas.DOT( - deep_feat_dim, center_loss_diff_index, center_loss_diff_index) / - T(2.0); - } - - // update centers data - if (need_update == true) { - for (int i = 0; i < cluster_num; i++) { - acc_index = centers_diffacc_data + i * deep_feat_dim; - center_out_index = centers_out_data + i * deep_feat_dim; - T scale = alpha / center_update_count[i]; - blas.SCAL(deep_feat_dim, scale, acc_index); - blas.VADD(deep_feat_dim, acc_index, center_out_index, center_out_index); - } - } - } -}; - -template -class CenterLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *in0 = context.Input("SampleCenterDiff"); - auto *in1 = context.Input(framework::GradVarName("Loss")); - auto *x_g = context.Output(framework::GradVarName("X")); - auto sub_result = EigenMatrix::From(*in0); - auto out_grad = EigenMatrix::From(*in1); - - auto x_dims = x_g->dims(); - int cols = x_g->numel() / x_dims[0]; - // calculate gradient - auto grad_mat = - (out_grad.broadcast(Eigen::array({{1, cols}}))) * sub_result; - - // propagate back to input - auto &eigen_place = - *context.template device_context().eigen_device(); - x_g->mutable_data(context.GetPlace()); - // eigen matrix - auto x_grad = EigenMatrix::From(*x_g, phi::make_ddim({x_dims[0], cols})); - x_grad.device(eigen_place) = grad_mat; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index 9f34211a6169b6..e6815115865aad 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -31,6 +31,7 @@ limitations under the License. 
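The center_loss files deleted above implemented the center loss of Wen et al. (ECCV 2016) — the citation the removed DOC string left as an empty "<>". Restating what the removed CPU and CUDA kernels computed, with x_i the deep feature of sample i, c_{y_i} its class center, alpha the CenterUpdateRate, and m the batch size:

\[
L = \frac{1}{2}\sum_{i=1}^{m}\bigl\lVert x_i - c_{y_i}\bigr\rVert_2^2,
\qquad
\frac{\partial L}{\partial x_i} = x_i - c_{y_i},
\]
\[
c_j \leftarrow c_j + \alpha\,\Delta c_j,
\qquad
\Delta c_j = \frac{\sum_{i=1}^{m}\mathbf{1}[y_i = j]\,(x_i - c_j)}{1 + \sum_{i=1}^{m}\mathbf{1}[y_i = j]},
\]

where the +1 in the denominator matches the kernels' count initialization, and the backward kernel scales x_i - c_{y_i} by the incoming loss gradient.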
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); #endif #include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h" @@ -63,8 +64,6 @@ class CCommInitOp : public framework::OperatorBase { PADDLE_ENFORCE_NOT_NULL( var, platform::errors::InvalidArgument("Input con not be empty.")); - phi::ccl::CCLRootId* comm_id = var->GetMutable(); - int nranks = Attr("nranks"); int rid = Attr("ring_id"); @@ -73,8 +72,17 @@ class CCommInitOp : public framework::OperatorBase { device_id = Attr("device_id"); } int rank_id = Attr("rank"); - platform::XCCLCommContext::Instance(place.GetDeviceType()) - .CreateComm(comm_id, nranks, rank_id, device_id, rid); + auto store = phi::distributed::CreateOrGetGlobalTCPStore(); + if (!phi::distributed::CommContextManager::GetInstance().Has( + std::to_string(rid))) { + phi::distributed::CommContextManager::CreateXCCLCommContext( + store, + std::to_string(rid), + phi::CustomPlace(place.GetDeviceType(), device_id), + rank_id, + nranks, + "c_comm_init_op"); + } #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with custom device.")); @@ -114,10 +122,7 @@ class CCommInitOp : public framework::OperatorBase { int rank_id = Attr("rank"); #endif #if defined(PADDLE_WITH_NCCL) - const char* dynamic_static_unified_comm = - getenv("FLAGS_dynamic_static_unified_comm"); - if (dynamic_static_unified_comm && - std::string(dynamic_static_unified_comm) == "1") { + if (FLAGS_dynamic_static_unified_comm) { VLOG(3) << "#### use new comm lab ####"; auto store = phi::distributed::CreateOrGetGlobalTCPStore(); phi::distributed::CommContextManager::SetDeviceId(device_id); diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 3f6d6348ba925c..4a07f7e98f793c 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -23,6 +23,7 @@ limitations under the License. 
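A condensed sketch of the two changes to this operator: the getenv-based check becomes a typed gflag read, and the custom-device communicator is created once through CommContextManager. The CreateXCCLCommContext call and its arguments are taken from the diff; the flags header path and the free-function wrapper are assumptions made to keep the snippet self-contained.

#include <string>
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h"  // CreateOrGetGlobalTCPStore, as included above
#include "paddle/phi/core/distributed/comm_context_manager.h"
#include "paddle/phi/core/flags.h"  // assumed location of PHI_DECLARE_bool

PHI_DECLARE_bool(dynamic_static_unified_comm);  // defined elsewhere in phi

void InitCustomDeviceComm(const std::string& dev_type,
                          int device_id, int ring_id, int rank, int nranks) {
  auto store = phi::distributed::CreateOrGetGlobalTCPStore();
  if (!phi::distributed::CommContextManager::GetInstance().Has(
          std::to_string(ring_id))) {  // create-once, keyed by ring id
    phi::distributed::CommContextManager::CreateXCCLCommContext(
        store, std::to_string(ring_id),
        phi::CustomPlace(dev_type, device_id), rank, nranks,
        "c_comm_init_op");
  }
}

// The NCCL branch makes the same swap: `if (FLAGS_dynamic_static_unified_comm)`
// replaces parsing getenv("FLAGS_dynamic_static_unified_comm") by hand.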
*/ #include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace operators { @@ -70,10 +71,7 @@ class CGenNCCLIdOp : public framework::OperatorBase { std::vector nccl_ids; nccl_ids.resize(1); - const char* dynamic_static_unified_comm = - getenv("FLAGS_dynamic_static_unified_comm"); - if (!dynamic_static_unified_comm || - std::string(dynamic_static_unified_comm) != "1") { + if (!FLAGS_dynamic_static_unified_comm) { int server_fd = platform::SocketServer::GetInstance(endpoint).socket(); if (rank == 0) { GenNCCLID(&nccl_ids); diff --git a/paddle/fluid/operators/collective/c_gen_xccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_xccl_id_op.cc index effe7021b0d7ef..5a3f27e72a82ac 100644 --- a/paddle/fluid/operators/collective/c_gen_xccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_xccl_id_op.cc @@ -52,34 +52,7 @@ class CGenXCCLIdOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - int rank = Attr("rank"); - int ring_id = Attr("ring_id"); - - std::function func = [&](size_t i) -> std::string { - return Output("Out"); - }; - - std::string endpoint = Attr("endpoint"); - int server_fd = platform::SocketServer::GetInstance(endpoint).socket(); - - std::vector xccl_ids; - xccl_ids.resize(1); - - if (rank == 0) { - for (size_t i = 0; i < xccl_ids.size(); ++i) { - phi::DeviceManager::CCLGetUniqueId(dev_place.GetDeviceType(), - &xccl_ids[i]); - } - std::vector endpoint_list = - Attr>("other_endpoints"); - platform::SendBroadCastCommID(endpoint_list, &xccl_ids, ring_id); - } else { - platform::RecvBroadCastCommID(server_fd, endpoint, &xccl_ids, ring_id); - } - - CopyXCCLIDToVar(xccl_ids, func, scope); - } + const platform::Place& dev_place) const override {} }; #else diff --git a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc index 0dd43e761da391..9caca06f53ad3a 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc @@ -50,7 +50,7 @@ class ConditionalBlockInferOp : public ConditionalOp { private: void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { - bool need_run; + bool need_run = false; if (Attr("is_scalar_condition")) { // When is_scalar_condition is True, the conditional variable is a scalar, // whether need to execute the operators in sub-block depends on the diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 501761d82d0343..d7166a5ad02672 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -51,7 +51,7 @@ class ConditionalBlockOp : public ConditionalOp { private: void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { - bool need_run; + bool need_run = false; if (Attr("is_scalar_condition")) { // When is_scalar_condition is True, the conditional variable is a scalar, // whether need to execute the operators in sub-block depends on the @@ -147,7 +147,7 @@ class ConditionalBlockGradOp : public ConditionalOp { private: void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { - bool 
need_run; + bool need_run = false; if (Attr("is_scalar_condition")) { auto xs = this->InputTensors(scope, ConditionalOp::kCondition); need_run = ScalarCondition(xs); diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index 9f67b1d4b6e183..9262ca59af970b 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -52,7 +52,7 @@ class GetPlacesOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - bool is_gpu; + bool is_gpu = false; if (Attr("device_type") == "AUTO") { is_gpu = platform::is_gpu_place(place); } else { diff --git a/paddle/fluid/operators/controlflow/pylayer_op_helper.cc b/paddle/fluid/operators/controlflow/pylayer_op_helper.cc index dabe561eea3e73..9dc53d428ef1d2 100644 --- a/paddle/fluid/operators/controlflow/pylayer_op_helper.cc +++ b/paddle/fluid/operators/controlflow/pylayer_op_helper.cc @@ -47,7 +47,7 @@ static void FindAllPyLayerOpAndPyLayerGradOp( for (size_t i = 1; i < program.Size(); ++i) { auto &block = program.Block(i); for (size_t j = 0; j < block.OpSize(); ++j) { - auto *op = block.Op(j); + auto *op = block.Op(static_cast(j)); if (op->Type() == "pylayer") { fwd_ops->emplace_back(op); } else if (op->Type() == "pylayer_grad") { diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc deleted file mode 100644 index d2d8f56587cfd6..00000000000000 --- a/paddle/fluid/operators/conv_shift_op.cc +++ /dev/null @@ -1,270 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/conv_shift_op.h" - -#include - -#include "paddle/fluid/framework/eigen.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -class ConvShiftOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ConvShiftOp"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "ConvShiftOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ConvShiftOp"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ( - x_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(X)'s dimensions of ConvShiftOp should be 2. " - "But received X's shape = [%s] and the dimension is %d.", - x_dims, - x_dims.size())); - PADDLE_ENFORCE_EQ( - y_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(Y)'s dimensions of ConvShiftOp should be 2. 
" - "But received Y's shape = [%s] and the dimension is %d.", - y_dims, - y_dims.size())); - if (ctx->IsRuntime() || (x_dims[0] > 0 && y_dims[0] > 0)) - PADDLE_ENFORCE_EQ( - x_dims[0], - y_dims[0], - platform::errors::InvalidArgument( - "The first dimension of Input(X) and Input(Y) of ConvShiftOp " - "should be equal. " - "But received X's shape = [%s], Y's shape = [%s], " - "and the first dimensions are %d and %d respectively.", - x_dims, - y_dims, - x_dims[0], - y_dims[0])); - if (ctx->IsRuntime() || y_dims[1] > 0) - PADDLE_ENFORCE_EQ( - y_dims[1] % 2, - 1, - platform::errors::InvalidArgument( - "The second dimension of Input(Y) of ConvShiftOp should be odd." - "But received Y's shape = [%s] and the second dimension is %d.", - y_dims, - y_dims[1])); - if (ctx->IsRuntime() || (x_dims[1] > 0 && y_dims[1] > 0)) - PADDLE_ENFORCE_LE( - y_dims[1], - x_dims[1], - platform::errors::InvalidArgument( - "The second dimension of Input(Y) of ConvShiftOp should be less " - "than or equal to the 2nd dimension of Input(X)." - "But received X's shape = [%s], Y's shape = [%s], " - "and the second dimensions are %d and %d respectively.", - x_dims, - y_dims, - x_dims[1], - y_dims[1])); - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; - -class ConvShiftGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ConvShiftGradOp"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "ConvShiftGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@GRAD", - "ConvShiftGradOp"); - - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim(x_grad_name, x_dims); - } - - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput(y_grad_name)) { - auto y_dims = ctx->GetInputDim("Y"); - ctx->SetOutputDim(y_grad_name, y_dims); - } - } -}; - -class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor, default Tensor), a 2-D tensor with shape B x M, " - "where B is the batch size and M is the data dimension."); - AddInput("Y", - "(Tensor, default Tensor), a 2-D tensor with shape B x N, " - "where B is the batch size and N is the data dimension. N must " - "be odd."); - AddOutput("Out", - "(Tensor, default Tensor), a 2-D tensor with shape B x M, " - "i.e., the same shape as X."); - AddComment(R"DOC( -ConvShift Operator. - -A layer for circular convolution of two vectors, -as used in the Neural Turing Machine: https://arxiv.org/abs/1410.5401 - -The equation is: - -$$Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}$$ - -where X's index is computed modulo M, and Y's index is computed modulo N. - -Both inputs X and Y can carry LoD (Level of Details) information. -However, the output only shares the LoD information with input X. 
- -)DOC"); - } -}; - -template -class ConvShiftKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *X = context.Input("X"); - auto *Y = context.Input("Y"); - auto *Out = context.Output("Out"); - Out->mutable_data(context.GetPlace()); - - auto x = EigenMatrix::From(*X); - auto y = EigenMatrix::From(*Y); - auto out = EigenMatrix::From(*Out); - out.setZero(); - - size_t batch_size = X->dims()[0]; - size_t x_width = X->dims()[1]; - size_t y_width = Y->dims()[1]; - size_t y_half_width = (y_width - 1) / 2; - - for (size_t k = 0; k < batch_size; ++k) { - for (size_t i = 0; i < x_width; ++i) { - for (size_t j = 0; j < y_width; ++j) { - int index = - static_cast((i + j - y_half_width + x_width) % x_width); - out(k, i) += x(k, index) * y(k, j); - } - } - } - } -}; - -template -class ConvShiftGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *X = context.Input("X"); - auto *Y = context.Input("Y"); - auto *dOut = context.Input(framework::GradVarName("Out")); - auto *dX = context.Output(framework::GradVarName("X")); - auto *dY = context.Output(framework::GradVarName("Y")); - - auto x = EigenMatrix::From(*X); - auto y = EigenMatrix::From(*Y); - auto dout = EigenMatrix::From(*dOut); - - auto x_dims = X->dims(); - auto y_dims = Y->dims(); - size_t batch_size = x_dims[0]; - size_t x_width = x_dims[1]; - size_t y_width = y_dims[1]; - size_t y_half_width = (y_width - 1) / 2; - - // The below trades code duplication for efficiency (keeping the if - // statement outside of the loop). - if (dX) { - dX->mutable_data(context.GetPlace()); - auto dx = EigenMatrix::From(*dX); - dx.setZero(); - for (size_t k = 0; k < batch_size; ++k) { - for (size_t i = 0; i < x_width; ++i) { - for (size_t j = 0; j < y_width; ++j) { - int index = - static_cast((i + j - y_half_width + x_width) % x_width); - dx(k, index) += dout(k, i) * y(k, j); - } - } - } - } - - if (dY) { - dY->mutable_data(context.GetPlace()); - auto dy = EigenMatrix::From(*dY); - dy.setZero(); - for (size_t k = 0; k < batch_size; ++k) { - for (size_t i = 0; i < x_width; ++i) { - for (size_t j = 0; j < y_width; ++j) { - int index = - static_cast((i + j - y_half_width + x_width) % x_width); - dy(k, j) += x(k, index) * dout(k, i); - } - } - } - } - } -}; - -template -class ConvShiftGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("conv_shift_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Y", this->Input("Y")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(conv_shift, - ops::ConvShiftOp, - ops::ConvShiftOpMaker, - ops::ConvShiftGradOpMaker, - ops::ConvShiftGradOpMaker); -REGISTER_OPERATOR(conv_shift_grad, ops::ConvShiftGradOp); -PD_REGISTER_STRUCT_KERNEL( - conv_shift, CPU, ALL_LAYOUT, ops::ConvShiftKernel, float) {} -PD_REGISTER_STRUCT_KERNEL( - conv_shift_grad, CPU, ALL_LAYOUT, ops::ConvShiftGradKernel, float) {} diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu deleted file mode 100644 index 
2ac37ac8d6f8f7..00000000000000 --- a/paddle/fluid/operators/conv_shift_op.cu +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/conv_shift_op.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -namespace { - -inline int DivUp(int x, int y) { return (x + y - 1) / y; } - -// Some notes on the design: -// -// Each thread is responsible for computing a single output out[k, i]. -// Thread blocks are based on tiles of x with height 1 in the batch dimension. -// -// This design is based on the typical use case where the filter -// y is fairly small. For large y, it would probably be more efficient -// to also tile across y. -template -__global__ void ConvShiftForward(const T *x, - const T *y, - int x_width, - int y_width, - int y_half_width, - int batch_size, - T *out) { - extern __shared__ T mem[]; - - int tx = threadIdx.x; - int i = blockIdx.x * blockDim.x + tx; // global x index - int k = blockIdx.y; // batch index - - // Check if we are in a boundary block with fewer x's to process than - // blockDim.x. - int num_x = - (blockIdx.x == gridDim.x - 1) ? (x_width % blockDim.x) : blockDim.x; - - T *sx = mem; - T *sx_pad = &mem[num_x]; - T *sy = &mem[blockDim.x + y_width]; - - // Collaboratively load y[k, :] and length-y padding of x into shared memory. - int pad_start = blockIdx.x * blockDim.x + num_x + x_width - y_half_width; - for (int j = tx; j < y_width; j += blockDim.x) { - sy[j] = y[k * y_width + j]; - sx_pad[j] = x[k * x_width + (pad_start + j) % x_width]; - } - - // Load a cyclically shifted slice of x into shared memory. - if (tx < num_x) { - int load_i = (i - y_half_width + x_width) % x_width; - sx[tx] = x[k * x_width + load_i]; - } - __syncthreads(); - - if (tx < num_x) { - // Compute dot product of sx[tx:tx + y_width] and sy. - T sum = 0; - for (int j = 0; j < y_width; ++j) { - sum += sx[tx + j] * sy[j]; - } - - // Save to out[k, i]. - out[k * x_width + i] = sum; - } -} - -// Compute x gradient - initial naive implementation with atomic add. -template -__global__ void ConvShiftGradX(const T *dout, - const T *y, - int x_width, - int y_width, - int y_half_width, - int batch_size, - T *dx) { - int i = blockIdx.x * blockDim.x + threadIdx.x; // x index - int j = blockIdx.y; // y index - int k = blockIdx.z; // batch index - - if (i < x_width) { - int index = (i + j - y_half_width + x_width) % x_width; - atomicAdd(&dx[k * x_width + index], - dout[k * x_width + i] * y[k * y_width + j]); - } -} - -// Compute y gradient - initial naive implementation with atomic add. 
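All of the deleted conv_shift kernels (the forward kernel above and the two gradient kernels around this point) share one cyclic index. Since this lands between two of those kernels, the index is restated here as host code with a worked case; illustration only, not part of the original file:

// out[k][i] = sum_j X[k][(i + j - (N-1)/2) mod M] * Y[k][j], with M = x_width,
// N = y_width (odd) and y_half_width = (N-1)/2. Adding x_width before the
// modulo keeps the left operand non-negative.
inline int CircularIndex(int i, int j, int y_half_width, int x_width) {
  return (i + j - y_half_width + x_width) % x_width;
}
// Worked case: M = 5, N = 3, so y_half_width = 1. For output column i = 0 the
// taps j = 0, 1, 2 read X columns (0-1+5)%5 = 4, (1-1+5)%5 = 0 and
// (2-1+5)%5 = 1: the filter wraps around the row edge, which is exactly the
// circular convolution the deleted DOC string describes.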
-template -__global__ void ConvShiftDy(const T *x, - const T *dout, - int x_width, - int y_width, - int y_half_width, - int batch_size, - T *dy) { - int i = blockIdx.x * blockDim.x + threadIdx.x; // x index - int j = blockIdx.y; // y index - int k = blockIdx.z; // batch index - - if (i < x_width) { - int index = (i + j - y_half_width + x_width) % x_width; - atomicAdd(&dy[k * y_width + j], - x[k * x_width + index] * dout[k * x_width + i]); - } -} -} // namespace - -template -class ConvShiftKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const phi::DenseTensor *X = context.Input("X"); - const phi::DenseTensor *Y = context.Input("Y"); - phi::DenseTensor *Out = context.Output("Out"); - const T *x_data = X->data(); - const T *y_data = Y->data(); - T *out_data = Out->mutable_data(context.GetPlace()); - - int batch_size = X->dims()[0]; - int x_width = X->dims()[1]; - int y_width = Y->dims()[1]; - int y_half_width = (y_width - 1) / 2; - - const int x_per_block = 256; - int num_x_blocks = DivUp(x_width, x_per_block); - int mem_per_block = (x_per_block + 2 * y_width) * sizeof(T); - - dim3 grid_dim(num_x_blocks, batch_size); - - auto stream = context.template device_context().stream(); - - ConvShiftForward<<>>( - x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data); - } -}; - -template -class ConvShiftGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const phi::DenseTensor *X = context.Input("X"); - const phi::DenseTensor *Y = context.Input("Y"); - const phi::DenseTensor *dOut = - context.Input(framework::GradVarName("Out")); - const T *x_data = X->data(); - const T *y_data = Y->data(); - const T *dout_data = dOut->data(); - - phi::DenseTensor *dX = - context.Output(framework::GradVarName("X")); - phi::DenseTensor *dY = - context.Output(framework::GradVarName("Y")); - - int batch_size = X->dims()[0]; - int x_width = X->dims()[1]; - int y_width = Y->dims()[1]; - int y_half_width = (y_width - 1) / 2; - - auto &device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; - - const int x_per_block = 256; - int num_x_blocks = DivUp(x_width, x_per_block); - dim3 grid_dim(num_x_blocks, y_width, batch_size); - - if (dX) { - T *dx_data = dX->mutable_data(context.GetPlace()); - zero(device_ctx, dX, static_cast(0.0)); - ConvShiftGradX - <<>>(dout_data, - y_data, - x_width, - y_width, - y_half_width, - batch_size, - dx_data); - } - if (dY) { - T *dy_data = dY->mutable_data(context.GetPlace()); - zero(device_ctx, dY, static_cast(0.0)); - ConvShiftDy - <<>>(x_data, - dout_data, - x_width, - y_width, - y_half_width, - batch_size, - dy_data); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL( - conv_shift, GPU, ALL_LAYOUT, ops::ConvShiftKernel, float) {} -PD_REGISTER_STRUCT_KERNEL( - conv_shift_grad, GPU, ALL_LAYOUT, ops::ConvShiftGradKernel, float) {} diff --git a/paddle/fluid/operators/conv_shift_op.h b/paddle/fluid/operators/conv_shift_op.h deleted file mode 100644 index 603d1e52221553..00000000000000 --- a/paddle/fluid/operators/conv_shift_op.h +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ConvShiftKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override; -}; - -template -class ConvShiftGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override; -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc deleted file mode 100644 index 6dd84d58ae9a59..00000000000000 --- a/paddle/fluid/operators/cos_sim_op.cc +++ /dev/null @@ -1,252 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/cos_sim_op.h" - -#include - -namespace paddle { -namespace operators { - -class CosSimOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - // notnull check - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "CosSim"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "CosSim"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CosSim"); - OP_INOUT_CHECK(ctx->HasOutput("XNorm"), "Output", "XNorm", "CosSim"); - OP_INOUT_CHECK(ctx->HasOutput("YNorm"), "Output", "YNorm", "CosSim"); - - // shape check - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(y_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ( - x_dims.size(), - y_dims.size(), - platform::errors::InvalidArgument( - "ShapeError: Ranks of Input(X) and Input(Y) must be equal." - "But received: Ranks of Input(X) is [%d], Ranks of Input(Y) is " - "[%d]", - x_dims.size(), - y_dims.size())); - PADDLE_ENFORCE_GE( - x_dims.size(), - 2, - platform::errors::InvalidArgument( - "ShapeError: Rank of Input(X) must not be less than 2." 
- "But received: Ranks of Input(X) is [%d]", - x_dims.size())); - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 1, x_dims.size()), - phi::slice_ddim(y_dims, 1, y_dims.size()), - platform::errors::InvalidArgument( - "All dimensions except the 1st of Input(X) and Input(Y) " - "must be equal.")); - PADDLE_ENFORCE_EQ( - x_dims[0] == y_dims[0] || y_dims[0] == 1, - true, - platform::errors::InvalidArgument( - "The 1st dimension of Input(Y) %d must be equal to Input(X) %d or" - " just 1 (which will be broadcasted to match Input(X)).", - y_dims[0], - x_dims[0])); - } - - // resize tensor - ctx->SetOutputDim("Out", {x_dims[0], 1}); - ctx->SetOutputDim("XNorm", {x_dims[0], 1}); - ctx->SetOutputDim("YNorm", {y_dims[0], 1}); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; - -class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "The 1st input of cos_sim op, Tensor with shape ``[N_1, N_2, " - "..., N_k]``, the data type is float32."); - AddInput("Y", - "The 2nd input of cos_sim op, Tensor with shape ``[N_1 or 1, N_2, " - "..., N_k]``, the data type is float32."); - AddOutput("Out", "The output of cos_sim op."); - AddOutput("XNorm", - "Norm of the first input, reduced along the 1st " - "dimension.") - .AsIntermediate(); - AddOutput("YNorm", - "Norm of the second input, reduced along the 1st " - "dimension.") - .AsIntermediate(); - AddAttr(framework::kAllKernelsMustComputeRuntimeShape, - "Skip calling InferShape() function in the runtime.") - .SetDefault(true); - - AddComment(R"DOC( -**Cosine Similarity Operator** - -$Out = \frac{X^T * Y}{(\sqrt{X^T * X} * \sqrt{Y^T * Y})}$ - -The input X and Y must have the same shape, except that the 1st dimension -of input Y could be just 1 (different from input X), which will be -broadcasted to match the shape of input X before computing their cosine -similarity. - -)DOC"); - } -}; - -class CosSimOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - // notnull check - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "CosSimGrad"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "CosSimGrad"); - OP_INOUT_CHECK(ctx->HasInput("XNorm"), "Input", "XNorm", "CosSimGrad"); - OP_INOUT_CHECK(ctx->HasInput("YNorm"), "Input", "YNorm", "CosSimGrad"); - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "CosSimGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "CosSimGrad"); - - // shape check - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - auto xnorm_dims = ctx->GetInputDim("XNorm"); - auto ynorm_dims = ctx->GetInputDim("YNorm"); - auto out_dims = ctx->GetInputDim("Out"); - auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - PADDLE_ENFORCE_GE( - x_dims.size(), - y_dims.size(), - platform::errors::InvalidArgument( - "ShapeError: Ranks of Input(X) and Input(Y) must be equal." - "But received: Ranks of Input(X) is [%d], Ranks of Input(Y) is " - "[%d]", - x_dims.size(), - y_dims.size())); - PADDLE_ENFORCE_GE( - x_dims.size(), - 2, - platform::errors::InvalidArgument( - "ShapeError: Rank of Input(X) must not be less than 2." 
- "But received: Ranks of Input(X) is [%d]", - x_dims.size())); - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 1, x_dims.size()), - phi::slice_ddim(y_dims, 1, y_dims.size()), - platform::errors::InvalidArgument( - "All dimensions except the 1st of Input(X) [%s] and Input(Y) [%s] " - "must be equal.", - x_dims, - y_dims)); - PADDLE_ENFORCE_EQ( - true, - x_dims[0] == y_dims[0] || y_dims[0] == 1, - platform::errors::InvalidArgument( - "The 1st dimension of Input(Y) %d must be equal to Input(X) %d or" - " just 1 (which will be broadcasted to match Input(X)).", - y_dims[0], - x_dims[0])); - auto target_xnorm_dims = phi::make_ddim({x_dims[0], 1}); - auto target_ynorm_dims = phi::make_ddim({y_dims[0], 1}); - PADDLE_ENFORCE_EQ( - xnorm_dims, - target_xnorm_dims, - platform::errors::InvalidArgument( - "Shape of Input(XNorm) [%s] must be (X.Dim(0), 1) - [%s]", - xnorm_dims, - target_xnorm_dims)); - PADDLE_ENFORCE_EQ( - ynorm_dims, - target_ynorm_dims, - platform::errors::InvalidArgument( - "Shape of Input(YNorm) [%s] must be (Y.Dim(0), 1) - [%s]", - ynorm_dims, - target_ynorm_dims)); - PADDLE_ENFORCE_EQ( - out_dims, - target_xnorm_dims, - platform::errors::InvalidArgument( - "Shape of Input(Out) [%s] must be (X.Dim(0), 1) - [%s]", - out_dims, - target_xnorm_dims)); - PADDLE_ENFORCE_EQ( - out_grad_dims, - target_xnorm_dims, - platform::errors::InvalidArgument( - "Shape of Input(Out@Grad) [%s] must be (X.Dim(0), 1) - [%s]", - out_grad_dims, - target_xnorm_dims)); - - // resize tensor - auto x_grad_name = framework::GradVarName("X"); - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - } - if (ctx->HasOutput(y_grad_name)) { - ctx->SetOutputDim(y_grad_name, y_dims); - } - } -}; - -template -class CosSimGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("cos_sim_grad"); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput("Y", this->Input("Y")); - grad_op->SetInput("XNorm", this->Output("XNorm")); - grad_op->SetInput("YNorm", this->Output("YNorm")); - grad_op->SetInput("Out", this->Output("Out")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(cos_sim, - ops::CosSimOp, - ops::CosSimOpMaker, - ops::CosSimGradOpMaker, - ops::CosSimGradOpMaker); -REGISTER_OPERATOR(cos_sim_grad, ops::CosSimOpGrad); -PD_REGISTER_STRUCT_KERNEL(cos_sim, CPU, ALL_LAYOUT, ops::CosSimKernel, float) {} -PD_REGISTER_STRUCT_KERNEL( - cos_sim_grad, CPU, ALL_LAYOUT, ops::CosSimGradKernel, float) {} diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h deleted file mode 100644 index 115bfa0a42e56e..00000000000000 --- a/paddle/fluid/operators/cos_sim_op.h +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/cos_sim_functor.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class CosSimKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // get phi::DenseTensor - auto* in_x = context.Input("X"); - auto* in_y = context.Input("Y"); - auto* out_z = context.Output("Out"); - auto* out_x_norm = context.Output("XNorm"); - auto* out_y_norm = context.Output("YNorm"); - - int rows_x = in_x->dims()[0]; - int rows_y = in_y->dims()[0]; - out_z->Resize({rows_x, 1}); - out_x_norm->Resize({rows_x, 1}); - out_y_norm->Resize({rows_y, 1}); - out_z->mutable_data(context.GetPlace()); - out_x_norm->mutable_data(context.GetPlace()); - out_y_norm->mutable_data(context.GetPlace()); - out_z->set_lod(in_x->lod()); - - int cols = phi::product(in_x->dims()) / rows_x; - - if (rows_x == rows_y) { - math::CosSimFunctor functor(in_x->data(), - in_y->data(), - out_x_norm->data(), - out_y_norm->data(), - out_z->data(), - cols); - platform::ForRange for_range( - static_cast(context.device_context()), rows_x); - for_range(functor); - } else { - math::CosSimFunctor functor(in_x->data(), - in_y->data(), - out_x_norm->data(), - out_y_norm->data(), - out_z->data(), - cols); - platform::ForRange for_range( - static_cast(context.device_context()), rows_x); - for_range(functor); - } - } -}; - -template -class CosSimGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // get phi::DenseTensor - auto* in_x = context.Input("X"); - auto* in_y = context.Input("Y"); - auto* in_z = context.Input("Out"); - auto* in_x_norm = context.Input("XNorm"); - auto* in_y_norm = context.Input("YNorm"); - auto* out_grad_x = - context.Output(framework::GradVarName("X")); - auto* out_grad_y = - context.Output(framework::GradVarName("Y")); - auto* in_grad_z = - context.Input(framework::GradVarName("Out")); - - // compute gradident - int rows_x = in_x->dims()[0]; - int rows_y = in_y->dims()[0]; - int cols = phi::product(in_x->dims()) / rows_x; - - if (rows_x == rows_y) { - if (out_grad_x) { - out_grad_x->Resize(in_x->dims()); - math::CosSimGradFunctor functor( - in_x_norm->data(), - in_y_norm->data(), - in_x->data(), - in_y->data(), - in_z->data(), - in_grad_z->data(), - out_grad_x->mutable_data(context.GetPlace()), - cols); - platform::ForRange for_range( - static_cast(context.device_context()), - rows_x); - for_range(functor); - } - if (out_grad_y) { - out_grad_y->Resize(in_y->dims()); - math::CosSimGradFunctor functor( - in_y_norm->data(), - in_x_norm->data(), - in_y->data(), - in_x->data(), - in_z->data(), - in_grad_z->data(), - out_grad_y->mutable_data(context.GetPlace()), - cols); - platform::ForRange for_range( - static_cast(context.device_context()), - rows_x); - for_range(functor); - } - } else { - if (out_grad_x) { - out_grad_x->Resize(in_x->dims()); - math::CosSimDxFunctor functor( - 
in_x_norm->data(), - in_y_norm->data(), - in_x->data(), - in_y->data(), - in_z->data(), - in_grad_z->data(), - out_grad_x->mutable_data(context.GetPlace()), - cols); - platform::ForRange for_range( - static_cast(context.device_context()), - rows_x); - for_range(functor); - } - if (out_grad_y) { - out_grad_y->Resize(in_y->dims()); - out_grad_y->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, out_grad_y, static_cast(0)); - - math::CosSimDyFunctor functor; - functor(dev_ctx, - in_x_norm->data(), - in_y_norm->data(), - in_x->data(), - in_y->data(), - in_z->data(), - in_grad_z->data(), - static_cast(rows_x), - static_cast(cols), - out_grad_y->data()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/custom_device_common_op_registry.cc b/paddle/fluid/operators/custom_device_common_op_registry.cc index a1106b1386757b..f01b0a92d89f82 100644 --- a/paddle/fluid/operators/custom_device_common_op_registry.cc +++ b/paddle/fluid/operators/custom_device_common_op_registry.cc @@ -22,6 +22,8 @@ limitations under the License. */ #include "paddle/phi/api/backward/backward_api.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/backends/device_manager.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" +#include "paddle/phi/core/distributed/xccl_comm_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/axis_utils.h" @@ -99,13 +101,14 @@ class CConcatOpCustomDeviceKernel : public framework::OpKernel { auto task = pg->AllGather(in_tensor, out_tensor); task->Wait(); } else { - auto comm = platform::XCCLCommContext::Instance(place.GetDeviceType()) - .Get(rid, place); + auto comm = reinterpret_cast( + phi::distributed::CommContextManager::GetInstance().Get( + std::to_string(rid))); PADDLE_ENFORCE_EQ( nranks, - comm->nranks(), + comm->GetSize(), platform::errors::InvalidArgument( - "nranks: %s should equal to %s", nranks, comm->nranks())); + "nranks: %s should equal to %s", nranks, comm->GetSize())); int64_t send_numel = x->numel(); const T* send_buff = x->data(); @@ -118,7 +121,7 @@ class CConcatOpCustomDeviceKernel : public framework::OpKernel { recv_buff, send_numel, phi::ccl::ToCCLDataType(x->dtype()), - comm->comm(), + comm->GetXcclComm(), stream); } std::vector inputs; @@ -600,9 +603,9 @@ class CAllReduceOpCustomDeviceKernel : public framework::OpKernel { return; } - auto comm = - paddle::platform::XCCLCommContext::Instance(place.GetDeviceType()) - .Get(rid, place); + auto comm = reinterpret_cast( + phi::distributed::CommContextManager::GetInstance().Get( + std::to_string(rid))); std::shared_ptr stream; if (ctx.Attr("use_calc_stream")) { @@ -610,7 +613,7 @@ class CAllReduceOpCustomDeviceKernel : public framework::OpKernel { stream = static_cast(dev_ctx) ->GetStream(); } else { - stream = comm->stream(); + stream = comm->GetStream(); } phi::DeviceManager::CCLAllReduce(place.GetDeviceType(), const_cast(sendbuff), @@ -618,7 +621,7 @@ class CAllReduceOpCustomDeviceKernel : public framework::OpKernel { numel, dtype, red_type, - comm->comm(), + comm->GetXcclComm(), *stream); } }; @@ -634,22 +637,30 @@ class CBroadcastOpCustomDeviceKernel : public framework::OpKernel { int root = ctx.Attr("root"); int rid = ctx.Attr("ring_id"); - auto stream = static_cast(ctx.device_context()) - .GetStream(); + auto comm = reinterpret_cast( + phi::distributed::CommContextManager::GetInstance().Get( + std::to_string(rid))); 
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index 1d990b4466a963..fe32cc32d02d4b 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -43,7 +43,6 @@
 detection_library(bipartite_match_op SRCS bipartite_match_op.cc)
 detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
 detection_library(anchor_generator_op SRCS anchor_generator_op.cc anchor_generator_op.cu)
-detection_library(target_assign_op SRCS target_assign_op.cc target_assign_op.cu)
 detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc polygon_box_transform_op.cu)
 detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
@@ -54,8 +53,6 @@
 detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS phi)
 detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
 detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
-detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc
-                  sigmoid_focal_loss_op.cu)
 detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)
@@ -79,9 +76,6 @@ else()
   detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc)
 endif()
-detection_library(
-  roi_perspective_transform_op SRCS roi_perspective_transform_op.cc
-  roi_perspective_transform_op.cu)
 #Export local libraries to parent
 # set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
diff --git a/paddle/fluid/operators/detection/mask_util.cc b/paddle/fluid/operators/detection/mask_util.cc
index 70fdf4b8999f4e..f3e5b166b43b8a 100644
--- a/paddle/fluid/operators/detection/mask_util.cc
+++ b/paddle/fluid/operators/detection/mask_util.cc
@@ -42,10 +42,10 @@ void Decode(const uint32_t* cnts, int m, uint8_t* mask) {
 typedef uint32_t uint;
 void Poly2Mask(const float* xy, int k, int h, int w, uint8_t* mask) {
-  int j, m = 0;
+  int j = 0, m = 0;
   double scale = 5;
-  int *x, *y, *u, *v;
-  uint *a, *b;
+  int *x = nullptr, *y = nullptr, *u = nullptr, *v = nullptr;
+  uint *a = nullptr, *b = nullptr;
   platform::CPUPlace cpu;
   auto xptr = memory::Alloc(cpu, sizeof(int) * (k + 1) * 2);
   x = reinterpret_cast<int*>(xptr->ptr());
@@ -65,9 +65,10 @@ void Poly2Mask(const float* xy, int k, int h, int w, uint8_t* mask) {
   v = u + m;
   m = 0;
   for (j = 0; j < k; j++) {
-    int xs = x[j], xe = x[j + 1], ys = y[j], ye = y[j + 1], dx, dy, t, d;
-    int flip;
-    double s;
+    int xs = x[j], xe = x[j + 1], ys = y[j], ye = y[j + 1], dx = 0, dy = 0,
+        t = 0, d = 0;
+    int flip = 0;
+    double s = NAN;
     dx = abs(xe - xs);
     dy = abs(ys - ye);
     flip = (dx >= dy && xs > xe) || (dx < dy && ys > ye);
@@ -100,7 +101,7 @@ void Poly2Mask(const float* xy, int k, int h, int w, uint8_t* mask) {
   /* get points along y-boundary and downsample */
   k = m;
   m = 0;
-  double xd, yd;
+  double xd = NAN, yd = NAN;
   auto xyptr = memory::Alloc(cpu, sizeof(int) * k * 2);
   x = reinterpret_cast<int*>(xyptr->ptr());
   y = x + k;
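Editor's note: the mask_util.cc hunks above, and the multiclass_nms_op.cc hunks that follow, apply one mechanical fix: locals that used to be declared without an initializer now get an explicit one (0 for integers, nullptr for pointers, NAN for floating point), the pattern clang-tidy's cppcoreguidelines-init-variables check enforces. A self-contained before/after sketch (variable names are illustrative, not from the diff):

```cpp
#include <cmath>  // NAN

void sketch() {
  // Before: indeterminate values; any read prior to assignment is undefined.
  //   int *x, *y;
  //   double s;
  // After: deterministic sentinels, so a premature read is at least detectable.
  int *x = nullptr, *y = nullptr;
  double s = NAN;  // NAN propagates, making "used before computed" visible
  (void)x; (void)y; (void)s;
}
```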
diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc
index 8519752bc10492..9f3f426d1ad853 100644
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
@@ -250,7 +250,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
     *num_nmsed_out = num_det;
     const T* scores_data = scores.data<T>();
     if (keep_top_k > -1 && num_det > keep_top_k) {
-      const T* sdata;
+      const T* sdata = nullptr;
       std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
       for (const auto& it : *indices) {
         int label = it.first;
@@ -310,7 +310,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
     auto* scores_data = scores.data<T>();
     auto* bboxes_data = bboxes.data<T>();
     auto* odata = outs->data<T>();
-    const T* sdata;
+    const T* sdata = nullptr;
     phi::DenseTensor bbox;
     bbox.Resize({scores.dims()[0], box_size});
     int count = 0;
@@ -325,7 +325,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
       for (auto idx : indices) {
         odata[count * out_dim] = label;  // label
-        const T* bdata;
+        const T* bdata = nullptr;
         if (scores_size == 3) {
           bdata = bboxes_data + idx * box_size;
           odata[count * out_dim + 1] = sdata[idx];  // score
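Editor's note: the next two file sections delete the roi_perspective_transform operator outright (CPU kernel here, CUDA kernel further down). For orientation while reading the removed code, this is the mapping its get_transform_matrix/get_source_coords helpers implement, restated in consistent notation from the doc comments inside the deleted files:

```latex
% Homography from output pixel (out_w, out_h) to source location (in_w, in_h):
(u,\; v,\; w)^{\top} = M \,(\mathrm{out\_w},\; \mathrm{out\_h},\; 1)^{\top},
\qquad
M = \begin{pmatrix} a_{11} & a_{12} & a_{13}\\
                    a_{21} & a_{22} & a_{23}\\
                    a_{31} & a_{32} & 1 \end{pmatrix},
\qquad
\mathrm{in\_w} = u/w, \quad \mathrm{in\_h} = v/w .
```

The entries a_{ij} correspond to matrix[0..8] as filled in by get_transform_matrix in the deleted code below.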
diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
deleted file mode 100644
index 51f058617edc62..00000000000000
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
+++ /dev/null
@@ -1,711 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -bool GT_E(T a, T b) { - return (a > b) || fabs(a - b) < 1e-4; -} - -template -bool LT_E(T a, T b) { - return (a < b) || fabs(a - b) < 1e-4; -} - -template -bool GT(T a, T b) { - return (a - b) > 1e-4; -} - -/* - *check if (x, y) is in the boundary of roi - */ -template -bool in_quad(T x, T y, T roi_x[], T roi_y[]) { // NOLINT - for (int i = 0; i < 4; i++) { - T xs = roi_x[i]; - T ys = roi_y[i]; - T xe = roi_x[(i + 1) % 4]; - T ye = roi_y[(i + 1) % 4]; - if (fabs(ys - ye) < 1e-4) { - if (fabs(y - ys) < 1e-4 && fabs(y - ye) < 1e-4 && - GT_E(x, std::min(xs, xe)) && LT_E(x, std::max(xs, xe))) { - return true; - } - } else { - T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs; - if (fabs(intersec_x - x) < 1e-4 && GT_E(y, std::min(ys, ye)) && - LT_E(y, std::max(ys, ye))) { - return true; - } - } - } - - int n_cross = 0; - for (int i = 0; i < 4; i++) { - T xs = roi_x[i]; - T ys = roi_y[i]; - T xe = roi_x[(i + 1) % 4]; - T ye = roi_y[(i + 1) % 4]; - if (fabs(ys - ye) < 1e-4) { - continue; - } - if (LT_E(y, std::min(ys, ye)) || GT(y, std::max(ys, ye))) { - continue; - } - T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs; - if (fabs(intersec_x - x) < 1e-4) { - return true; - } - if (GT(intersec_x, x)) { - n_cross++; - } - } - return (n_cross % 2 == 1); -} - -/** - * Get the matrix of perspective transform. - * - * dx1 = x1 - x2 - * dx2 = x3 - x2 - * dx3 = x0 - x1 + x2 - x3 - * dy1 = y1 - y2 - * dy2 = y3 - y2 - * dy3 = y0 - y1 + y2 - y3 - * - * a11 = (x1 - x0 + a31 * (w - 1) * x1) / (w - 1) - * a12 = (x3 - x0 + a32 * (h - 1) * x3) / (h - 1) - * a13 = x0 - * a21 = (y1 - y0 + a31 * (w - 1) * y1) / (w - 1) - * a22 = (y3 - y0 + a32 * (h - 1) * y3) / (h - 1) - * a23 = y0 - * a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1) - * a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1) - * a33 = 1 - */ -template -void get_transform_matrix(const int transformed_width, - const int transformed_height, - T roi_x[], // NOLINT - T roi_y[], // NOLINT - T matrix[]) { // NOLINT - T x0 = roi_x[0]; - T x1 = roi_x[1]; - T x2 = roi_x[2]; - T x3 = roi_x[3]; - T y0 = roi_y[0]; - T y1 = roi_y[1]; - T y2 = roi_y[2]; - T y3 = roi_y[3]; - - // Estimate the height and width of RoI - T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1)); - T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)); - T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3)); - T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0)); - T estimated_height = (len2 + len4) / 2.0; - T estimated_width = (len1 + len3) / 2.0; - - // Get the normalized height and normalized width - int normalized_height = std::max(2, transformed_height); - int normalized_width = - std::round(estimated_width * (normalized_height - 1) / estimated_height) + - 1; - normalized_width = std::max(2, std::min(normalized_width, transformed_width)); - - T dx1 = x1 - x2; - T dx2 = x3 - x2; - T dx3 = x0 - x1 + x2 - x3; - T dy1 = y1 - y2; - T dy2 = y3 - y2; - T dy3 = y0 - y1 + y2 - y3; - - matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / - (normalized_width - 1); - matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / - (normalized_height - 1); - matrix[8] = 1; - - matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) / - (normalized_width - 1); - matrix[4] = (y3 - y0 + matrix[7] * 
(normalized_height - 1) * y3) / - (normalized_height - 1); - matrix[5] = y0; - - matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) / - (normalized_width - 1); - matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) / - (normalized_height - 1); - matrix[2] = x0; -} - -/** - * Get the source coordinates in the input feature map. - * - * (u, v, w)^matrix = matrix * (out_w, out_h, 1)^matrix - * - * in_w = u / w - * in_h = v / w - * - */ -template -void get_source_coords( - T matrix[], int out_w, int out_h, T* in_w, T* in_h) { // NOLINT - T u = matrix[0] * out_w + matrix[1] * out_h + matrix[2]; - T v = matrix[3] * out_w + matrix[4] * out_h + matrix[5]; - T w = matrix[6] * out_w + matrix[7] * out_h + matrix[8]; - - in_w[0] = u / w; - in_h[0] = v / w; -} - -/** - * Perform bilinear interpolation in the input feature map. - */ -template -void bilinear_interpolate(const T* in_data, - const int channels, - const int width, - const int height, - int in_n, - int in_c, - T in_w, - T in_h, - T* val) { - // Deal with cases that source coords are out of feature map boundary - if (GT_E(-0.5, in_w) || GT_E(in_w, width - 0.5) || - GT_E(-0.5, in_h) || GT_E(in_h, height - 0.5)) { - // empty - val[0] = 0.0; - return; - } - - if (GT_E(0, in_w)) { - in_w = 0; - } - if (GT_E(0, in_h)) { - in_h = 0; - } - - int in_w_floor = floor(in_w); - int in_h_floor = floor(in_h); - int in_w_ceil; - int in_h_ceil; - - if (GT_E(in_w_floor, width - 1)) { - in_w_ceil = in_w_floor = width - 1; - in_w = static_cast(in_w_floor); - } else { - in_w_ceil = in_w_floor + 1; - } - - if (GT_E(in_h_floor, height - 1)) { - in_h_ceil = in_h_floor = height - 1; - in_h = static_cast(in_h_floor); - } else { - in_h_ceil = in_h_floor + 1; - } - T w_floor = in_w - in_w_floor; - T h_floor = in_h - in_h_floor; - T w_ceil = 1 - w_floor; - T h_ceil = 1 - h_floor; - const T* data = in_data + (in_n * channels + in_c) * height * width; - // Do bilinear interpolation - T v1 = data[in_h_floor * width + in_w_floor]; - T v2 = data[in_h_ceil * width + in_w_floor]; - T v3 = data[in_h_ceil * width + in_w_ceil]; - T v4 = data[in_h_floor * width + in_w_ceil]; - T w1 = w_ceil * h_ceil; - T w2 = w_ceil * h_floor; - T w3 = w_floor * h_floor; - T w4 = w_floor * h_ceil; - val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; -} - -template -class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* mask = ctx.Output("Mask"); - auto* out_transform_matrix = - ctx.Output("TransformMatrix"); - auto transformed_height = ctx.Attr("transformed_height"); - auto transformed_width = ctx.Attr("transformed_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = phi::vectorize(in->dims()); - int channels = static_cast(in_dims[1]); - int in_height = static_cast(in_dims[2]); - int in_width = static_cast(in_dims[3]); - int rois_num = static_cast(rois->dims()[0]); - - const T* input_data = in->data(); - int* mask_data = mask->mutable_data(ctx.GetPlace()); - - phi::DenseTensor roi2image; - roi2image.Resize({rois_num}); - int* roi2image_data = roi2image.mutable_data(ctx.GetPlace()); - auto lod = rois->lod().back(); - for (size_t i = 0; i < lod.size() - 1; ++i) { - for (size_t j = lod[i]; j < lod[i + 1]; ++j) { - roi2image_data[j] = static_cast(i); - } - } - - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* rois_data = rois->data(); - - T* 
transform_matrix = - out_transform_matrix->mutable_data({rois_num, 9}, ctx.GetPlace()); - - for (int n = 0; n < rois_num; ++n) { - const T* n_rois = rois_data + n * 8; - std::array roi_x; - std::array roi_y; - for (int k = 0; k < 4; ++k) { - roi_x[k] = n_rois[2 * k] * spatial_scale; - roi_y[k] = n_rois[2 * k + 1] * spatial_scale; - } - int image_id = roi2image_data[n]; - // Get transform matrix - std::array matrix; - get_transform_matrix(transformed_width, - transformed_height, - roi_x.data(), - roi_y.data(), - matrix.data()); - for (int i = 0; i < 9; i++) { - transform_matrix[n * 9 + i] = matrix[i]; - } - for (int c = 0; c < channels; ++c) { - for (int out_h = 0; out_h < transformed_height; ++out_h) { - for (int out_w = 0; out_w < transformed_width; ++out_w) { - int out_index = - n * channels * transformed_height * transformed_width + - c * transformed_height * transformed_width + - out_h * transformed_width + out_w; - T in_w, in_h; - get_source_coords(matrix.data(), out_w, out_h, &in_w, &in_h); - if (in_quad(in_w, in_h, roi_x.data(), roi_y.data())) { - if (GT_E(-0.5, in_w) || - GT_E(in_w, static_cast(in_width - 0.5)) || - GT_E(-0.5, in_h) || - GT_E(in_h, static_cast(in_height - 0.5))) { - output_data[out_index] = 0.0; - mask_data[(n * transformed_height + out_h) * transformed_width + - out_w] = 0; - } else { - bilinear_interpolate(input_data, - channels, - in_width, - in_height, - image_id, - c, - in_w, - in_h, - output_data + out_index); - mask_data[(n * transformed_height + out_h) * transformed_width + - out_w] = 1; - } - } else { - output_data[out_index] = 0.0; - mask_data[(n * transformed_height + out_h) * transformed_width + - out_w] = 0; - } - } - } - } - } - } -}; - -template -T get_feature_gradient( - T xs, T ys, int w, int h, const int width, const int height) { - if (GT_E(-0.5, xs) || GT_E(xs, width - 0.5) || GT_E(-0.5, ys) || - GT_E(ys, height - 0.5)) { - return 0; - } - - if (GT_E(0, xs)) { - xs = 0; - } - if (GT_E(0, ys)) { - ys = 0; - } - - int xs_floor = floor(xs); - int ys_floor = floor(ys); - int xs_ceil; - int ys_ceil; - - if (GT_E(xs_floor, width - 1)) { - xs_ceil = xs_floor = width - 1; - xs = static_cast(xs_floor); - } else { - xs_ceil = xs_floor + 1; - } - - if (GT_E(ys_floor, height - 1)) { - ys_ceil = ys_floor = height - 1; - ys = static_cast(ys_floor); - } else { - ys_ceil = ys_floor + 1; - } - - T weight = 0; - if (w == xs_floor) { - if (h == ys_floor) { - weight = (w + 1 - xs) * (h + 1 - ys); - } else if (h == ys_ceil) { - weight = (w + 1 - xs) * (ys + 1 - h); - } - } else if (w == xs_ceil) { - if (h == ys_floor) { - weight = (xs + 1 - w) * (h + 1 - ys); - } else if (h == ys_ceil) { - weight = (xs + 1 - w) * (ys + 1 - h); - } - } - return weight; -} - -template -class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto transformed_height = ctx.Attr("transformed_height"); - auto transformed_width = ctx.Attr("transformed_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = phi::vectorize(in->dims()); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int in_height = in_dims[2]; - int in_width = in_dims[3]; - int rois_num = static_cast(rois->dims()[0]); - - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - const T* out_grad_data = 
out_grad->data(); - const T* rois_data = rois->data(); - - phi::DenseTensor roi2image; - roi2image.Resize({rois_num}); - int* roi2image_data = roi2image.mutable_data(ctx.GetPlace()); - auto lod = rois->lod().back(); - for (size_t i = 0; i < lod.size() - 1; ++i) { - for (size_t j = lod[i]; j < lod[i + 1]; ++j) { - roi2image_data[j] = static_cast(i); - } - } - - for (int n = 0; n < batch_size; ++n) { - for (int c = 0; c < channels; ++c) { - for (int in_h = 0; in_h < in_height; ++in_h) { - for (int in_w = 0; in_w < in_width; ++in_w) { - T gradient = 0.0; - for (size_t roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) { - const T* rois = rois_data + roi_idx * 8; - std::array roi_x; - std::array roi_y; - for (int k = 0; k < 4; ++k) { - roi_x[k] = rois[2 * k] * spatial_scale; - roi_y[k] = rois[2 * k + 1] * spatial_scale; - } - - // Get transform matrix - std::array matrix; - get_transform_matrix(transformed_width, - transformed_height, - roi_x.data(), - roi_y.data(), - matrix.data()); - const T* out_grad_ptr = out_grad_data + (roi_idx * channels + c) * - transformed_height * - transformed_width; - for (int out_h = 0; out_h < transformed_height; ++out_h) { - for (int out_w = 0; out_w < transformed_width; ++out_w) { - T src_w; - T src_h; - get_source_coords( - matrix.data(), out_w, out_h, &src_w, &src_h); - if (in_quad(src_w, src_h, roi_x.data(), roi_y.data())) { - if (GT_E(-0.5, src_w) || - GT_E(src_w, static_cast(in_width - 0.5)) || - GT_E(-0.5, src_h) || - GT_E(src_h, static_cast(in_height - 0.5))) { - continue; - } - T weight = get_feature_gradient( - src_w, src_h, in_w, in_h, in_width, in_height); - gradient += - out_grad_ptr[out_h * transformed_width + out_w] * - weight; - } - } - } - } - int out_idx = (n * channels + c) * in_height * in_width + - in_h * in_width + in_w; - in_grad_data[out_idx] = gradient; - } - } - } - } - } -}; - -class ROIPerspectiveTransformOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("X"), "Input", "X", "roi_perspective_transform"); - OP_INOUT_CHECK( - ctx->HasInput("ROIs"), "Input", "ROIs", "roi_perspective_transform"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Ountput", "Out", "roi_perspective_transform"); - - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - PADDLE_ENFORCE_EQ(input_dims.size(), - 4, - platform::errors::InvalidArgument( - "The format of input tensor must be NCHW. But " - "received input dims is %d.", - input_dims.size())); - PADDLE_ENFORCE_EQ( - rois_dims.size(), - 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D phi::DenseTensor of shape (num_rois, 8)" - "given as [[x0, y0, x1, y1, x2, y2, x3, y3], ...]. But received " - "rois dims is %d", - rois_dims.size())); - PADDLE_ENFORCE_EQ( - rois_dims[1], - 8, - platform::errors::InvalidArgument( - "ROIs should be a 2-D phi::DenseTensor of shape (num_rois, 8)" - "given as [[x0, y0, x1, y1, x2, y2, x3, y3], ...]. But received %d", - rois_dims[1])); - - int transformed_height = ctx->Attrs().Get("transformed_height"); - int transformed_width = ctx->Attrs().Get("transformed_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT( - transformed_height, - 0, - platform::errors::InvalidArgument("The transformed output height must " - "greater than 0. 
But received %d.", - transformed_height)); - PADDLE_ENFORCE_GT( - transformed_width, - 0, - platform::errors::InvalidArgument("The transformed output width must " - "greater than 0. But received %d.", - transformed_width)); - PADDLE_ENFORCE_GT( - spatial_scale, - 0.0f, - platform::errors::InvalidArgument( - "The spatial scale must greater than 0. But received %f.", - spatial_scale)); - std::vector out_dims_v({rois_dims[0], // num_rois - input_dims[1], // channels - static_cast(transformed_height), - static_cast(transformed_width)}); - auto out_dims = phi::make_ddim(out_dims_v); - - std::vector mask_dims_v({rois_dims[0], // num_rois - 1, // channels - static_cast(transformed_height), - static_cast(transformed_width)}); - auto mask_dims = phi::make_ddim(mask_dims_v); - - std::vector matrix_dims_v({rois_dims[0], 9}); - auto matrix_dims = phi::make_ddim(matrix_dims_v); - - ctx->SetOutputDim("Out", out_dims); - ctx->SetOutputDim("Mask", mask_dims); - ctx->SetOutputDim("TransformMatrix", matrix_dims); - ctx->SetOutputDim("Out2InIdx", out_dims); - ctx->SetOutputDim("Out2InWeights", out_dims); - ctx->ShareLoD("ROIs", /*->*/ "Out"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class ROIPerspectiveTransformGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@Grad", - "roi_perspective_transform_grad"); - OP_INOUT_CHECK(ctx->HasOutputs(framework::GradVarName("X")), - "Output", - "X@Grad", - "roi_perspective_transform_grad"); - - ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class ROIPerspectiveTransformOpMaker - : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(phi::DenseTensor), " - "the input of ROIPerspectiveTransformOp. " - "The format of input tensor is NCHW. Where N is batch size, " - "C is the number of input channels, " - "H is the height of the feature, and " - "W is the width of the feature."); - AddInput("ROIs", - "(phi::DenseTensor), " - "ROIs (Regions of Interest) to be transformed. " - "should be a 2-D phi::DenseTensor of shape (num_rois, 8)" - "given as [[x1, y1, x2, y2, x3, y3, x4, y4], ...]." 
- "(x1, y1) is the top left coordinates, and " - "(x2, y2) is the top right coordinates, and" - "(x3, y3) is the bottom right coordinates, and" - "(x4, y4) is the bottom left coordinates."); - AddOutput( - "Out", - "(phi::DenseTensor), " - "The output of ROIPerspectiveTransformOp is a 4-D tensor with shape " - "(num_rois, channels, transformed_h, transformed_w)."); - AddOutput("Mask", - "(phi::DenseTensor), " - "The output mask of ROIPerspectiveTransformOp is a 4-D tensor " - "with shape " - "(num_rois, 1, transformed_h, transformed_w)."); - AddOutput("TransformMatrix", - "(phi::DenseTensor), " - "The output transform matrix of ROIPerspectiveTransformOp is a " - "1-D tensor with shape " - "(num_rois, 9)."); - AddOutput("Out2InIdx", - "(phi::DenseTensor), " - "An intermediate tensor used to map indexes of input feature map " - "and indexes of output feature map." - "The shape of the tensor is [out_size, 4] and out_size is the " - "number of elements in output feature map.") - .AsIntermediate(); - AddOutput("Out2InWeights", - "(phi::DenseTensor), " - "An intermediate tensor used to record the weights of bilinear " - "interpolatein for each element in output. The shape of the " - "tensor is [out_size, 4] and out_size is the number of elements " - "in output feature map.") - .AsIntermediate(); - AddAttr("spatial_scale", - "(float, default 1.0), " - "Spatial scale factor to scale ROI coords.") - .SetDefault(1.0); - AddAttr("transformed_height", - "(int, default 1), " - "The height of transformed output.") - .SetDefault(1); - AddAttr("transformed_width", - "(int, default 1), " - "The width of transformed output.") - .SetDefault(1); - AddComment(R"DOC( -**ROIPerspectiveTransform Operator** - - )DOC"); - } -}; - -template -class ROIPerspectiveTransformGradMaker - : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("roi_perspective_transform_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("ROIs", this->Input("ROIs")); - op->SetInput("Out2InIdx", this->Output("Out2InIdx")); - op->SetInput("Out2InWeights", this->Output("Out2InWeights")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - roi_perspective_transform, - ops::ROIPerspectiveTransformOp, - ops::ROIPerspectiveTransformOpMaker, - ops::ROIPerspectiveTransformGradMaker, - ops::ROIPerspectiveTransformGradMaker); -REGISTER_OPERATOR(roi_perspective_transform_grad, - ops::ROIPerspectiveTransformGradOp); -PD_REGISTER_STRUCT_KERNEL(roi_perspective_transform, - CPU, - ALL_LAYOUT, - ops::CPUROIPerspectiveTransformOpKernel, - float) {} -PD_REGISTER_STRUCT_KERNEL(roi_perspective_transform_grad, - CPU, - ALL_LAYOUT, - ops::CPUROIPerspectiveTransformGradOpKernel, - float) {} diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu deleted file mode 100644 index 3a94bcafd669f7..00000000000000 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ /dev/null @@ -1,551 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -using paddle::platform::float16; -using phi::PADDLE_CUDA_NUM_THREADS; - -namespace paddle { -namespace operators { - -// CUDA: index helpers -#define idx4_4(index, d1, d2, d3, d4) (index % d4) -#define idx4_3(index, d1, d2, d3, d4) ((index / d4) % d3) -#define idx4_2(index, d1, d2, d3, d4) ((index / d4 / d3) % d2) -#define idx4_1(index, d1, d2, d3, d4) ((index / d4 / d3 / d2) % d1) - -template -__device__ bool GT_E(T a, T b) { - return (a > b) || Eigen::numext::abs(a - b) < 1e-4; -} - -template -__device__ bool LT_E(T a, T b) { - return (a < b) || Eigen::numext::abs(a - b) < 1e-4; -} - -template -__device__ bool GT(T a, T b) { - return (a - b) > 1e-4; -} - -template -__device__ T max(T a, T b) { - return a > b ? a : b; -} - -template -__device__ T min(T a, T b) { - return a < b ? a : b; -} - -/* - * check if (x, y) is in the boundary of roi - */ -template -__device__ bool in_quad(T x, T y, T roi_x[], T roi_y[]) { - for (int i = 0; i < 4; i++) { - T start_w = roi_x[i]; - T start_h = roi_y[i]; - T end_w = roi_x[(i + 1) % 4]; - T end_h = roi_y[(i + 1) % 4]; - if (fabs(start_h - end_h) < 1e-4) { - if (fabs(y - start_h) < 1e-4 && fabs(y - end_h) < 1e-4 && - GT_E(x, min(start_w, end_w)) && - LT_E(x, max(start_w, end_w))) { - return true; - } - } else { - T intersec_x = - (y - start_h) * (end_w - start_w) / (end_h - start_h) + start_w; - if (fabs(intersec_x - x) < 1e-4 && GT_E(y, min(start_h, end_h)) && - LT_E(y, max(start_h, end_h))) { - return true; - } - } - } - - int n_cross = 0; - for (int i = 0; i < 4; i++) { - T start_w = roi_x[i]; - T start_h = roi_y[i]; - T end_w = roi_x[(i + 1) % 4]; - T end_h = roi_y[(i + 1) % 4]; - if (fabs(start_h - end_h) < 1e-4) { - continue; - } - if (LT_E(y, min(start_h, end_h)) || - GT(y, max(start_h, end_h))) { - continue; - } - T intersec_x = - (y - start_h) * (end_w - start_w) / (end_h - start_h) + start_w; - if (fabs(intersec_x - x) < 1e-4) { - return true; - } - if (GT(intersec_x, x)) { - n_cross++; - } - } - return (n_cross % 2 == 1); -} - -/** - * Perform bilinear interpolation in the input feature map. 
- */ -template -__device__ void bilinear_interpolate(const T* in_data, - const int channels, - const int width, - const int height, - int in_n, - int in_c, - T in_w, - T in_h, - T* val, - int out_idx, - int* out2in_idx, - T* out2in_w) { - // Deal with cases that source coords are out of feature map boundary - if (GT_E(-0.5, in_w) || GT_E(in_w, width - 0.5) || - GT_E(-0.5, in_h) || GT_E(in_h, height - 0.5)) { - val[0] = 0.0; - return; - } - - if (GT_E(0, in_w)) { - in_w = 0; - } - if (GT_E(0, in_h)) { - in_h = 0; - } - - int in_w_floor = floor(in_w); - int in_h_floor = floor(in_h); - int in_w_ceil; - int in_h_ceil; - - if (GT_E(in_w_floor, width - 1)) { - in_w_ceil = in_w_floor = width - 1; - in_w = static_cast(in_w_floor); - } else { - in_w_ceil = in_w_floor + 1; - } - - if (GT_E(in_h_floor, height - 1)) { - in_h_ceil = in_h_floor = height - 1; - in_h = static_cast(in_h_floor); - } else { - in_h_ceil = in_h_floor + 1; - } - - T w_floor = in_w - in_w_floor; - T h_floor = in_h - in_h_floor; - T w_ceil = 1 - w_floor; - T h_ceil = 1 - h_floor; - const T* data = in_data + (in_n * channels + in_c) * height * width; - // Do bilinear interpolation - T v1 = data[in_h_floor * width + in_w_floor]; - T v2 = data[in_h_ceil * width + in_w_floor]; - T v3 = data[in_h_ceil * width + in_w_ceil]; - T v4 = data[in_h_floor * width + in_w_ceil]; - T w1 = w_ceil * h_ceil; - T w2 = w_ceil * h_floor; - T w3 = w_floor * h_floor; - T w4 = w_floor * h_ceil; - val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; - - int base_idx = (in_n * channels + in_c) * height * width; - out2in_idx[out_idx * 4] = base_idx + in_h_floor * width + in_w_floor; - out2in_idx[out_idx * 4 + 1] = base_idx + in_h_ceil * width + in_w_floor; - out2in_idx[out_idx * 4 + 2] = base_idx + in_h_ceil * width + in_w_ceil; - out2in_idx[out_idx * 4 + 3] = base_idx + in_h_floor * width + in_w_ceil; - out2in_w[out_idx * 4] = w1; - out2in_w[out_idx * 4 + 1] = w2; - out2in_w[out_idx * 4 + 2] = w3; - out2in_w[out_idx * 4 + 3] = w4; -} - -/** - * Get the source coordinates in the input feature map. - * - * (u, v, w)^matrix = T * (out_w, out_h, 1)^matrix - * - * in_w = u / w - * in_h = v / w - * - */ -template -__device__ void get_source_coords( - T matrix[], int out_w, int out_h, T* in_w, T* in_h) { - T u = matrix[0] * out_w + matrix[1] * out_h + matrix[2]; - T v = matrix[3] * out_w + matrix[4] * out_h + matrix[5]; - T w = matrix[6] * out_w + matrix[7] * out_h + matrix[8]; - - in_w[0] = u / w; - in_h[0] = v / w; -} - -/** - * Get the matrix of perspective transform. 
- * - * dx1 = x1 - x2 - * dx2 = x3 - x2 - * dx3 = x0 - x1 + x2 - x3 - * dy1 = y1 - y2 - * dy2 = y3 - y2 - * dy3 = y0 - y1 + y2 - y3 - * - * a11 = (x1 - x0 + a31 * (w - 1) * x1) / (w - 1) - * a12 = (x3 - x0 + a32 * (h - 1) * x3) / (h - 1) - * a13 = x0 - * a21 = (y1 - y0 + a31 * (w - 1) * y1) / (w - 1) - * a22 = (y3 - y0 + a32 * (h - 1) * y3) / (h - 1) - * a23 = y0 - * a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1) - * a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1) - * a33 = 1 - * - */ -template -__device__ void get_transform_matrix(const int transformed_width, - const int transformed_height, - T roi_x[], - T roi_y[], - T matrix[]) { - T x0 = roi_x[0]; - T x1 = roi_x[1]; - T x2 = roi_x[2]; - T x3 = roi_x[3]; - T y0 = roi_y[0]; - T y1 = roi_y[1]; - T y2 = roi_y[2]; - T y3 = roi_y[3]; - - // Estimate the height and width of RoI - T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1)); - T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)); - T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3)); - T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0)); - T estimated_height = (len2 + len4) / 2.0; - T estimated_width = (len1 + len3) / 2.0; - - // Get the normalized height and normalized width - int normalized_height = max(2, transformed_height); - int normalized_width = - round(estimated_width * (normalized_height - 1) / estimated_height) + 1; - normalized_width = max(2, min(normalized_width, transformed_width)); - - T dx1 = x1 - x2; - T dx2 = x3 - x2; - T dx3 = x0 - x1 + x2 - x3; - T dy1 = y1 - y2; - T dy2 = y3 - y2; - T dy3 = y0 - y1 + y2 - y3; - - matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / - (normalized_width - 1); - matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / - (normalized_height - 1); - matrix[8] = 1; - - matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) / - (normalized_width - 1); - matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) / - (normalized_height - 1); - matrix[5] = y0; - - matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) / - (normalized_width - 1); - matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) / - (normalized_height - 1); - matrix[2] = x0; -} - -template -__global__ void RoiTransformKernel(const float* input_data, - const float* rois_data, - const int* roi2image_data, - int num_rois, - int in_height, - int in_width, - int channels, - int transformed_height, - int transformed_width, - float spatial_scale, - T* output_data, - int* out2in_idx, - T* out2in_w, - int* mask, - T* transform_matrix) { - int output_size = - num_rois * transformed_height * transformed_width * channels; - CUDA_KERNEL_LOOP(index, output_size) { - // (n, c, out_h, out_w) is an element in the transformed output - int out_w = idx4_4( - index, num_rois, channels, transformed_height, transformed_width); - int out_h = idx4_3( - index, num_rois, channels, transformed_height, transformed_width); - int c = idx4_2( - index, num_rois, channels, transformed_height, transformed_width); - int n = idx4_1( - index, num_rois, channels, transformed_height, transformed_width); - - auto bottom_rois = rois_data + n * 8; - int roi_batch_ind = bottom_rois[0]; - T roi_x[4]; - T roi_y[4]; - for (int k = 0; k < 4; ++k) { - roi_x[k] = bottom_rois[2 * k] * spatial_scale; - roi_y[k] = bottom_rois[2 * k + 1] * spatial_scale; - } - - // Get transform matrix - T matrix[9]; - get_transform_matrix( - transformed_width, transformed_height, roi_x, roi_y, matrix); 
- for (int i = 0; i < 9; i++) { - transform_matrix[n * 9 + i] = matrix[i]; - } - // Get source coords - T in_w; - T in_h; - get_source_coords(matrix, out_w, out_h, &in_w, &in_h); - - if (in_quad(in_w, in_h, roi_x, roi_y)) { - if (GT_E(-0.5, in_w) || - GT_E(in_w, static_cast(in_width - 0.5)) || - GT_E(-0.5, in_h) || - GT_E(in_h, static_cast(in_height - 0.5))) { - // Skip if source coords is not in input image - output_data[index] = 0.0; - mask[(n * transformed_height + out_h) * transformed_width + out_w] = 0; - } else { - // Perform bilinear interpolation - int in_n = roi2image_data[n]; - bilinear_interpolate(input_data, - channels, - in_width, - in_height, - in_n, - c, - in_w, - in_h, - output_data + index, - index, - out2in_idx, - out2in_w); - mask[(n * transformed_height + out_h) * transformed_width + out_w] = 1; - } - - } else { - // Skip if source coords is not in quad - output_data[index] = 0.0; - mask[(n * transformed_height + out_h) * transformed_width + out_w] = 0; - } - } -} - -template -class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* out2in_idx = ctx.Output("Out2InIdx"); - auto* out2in_w = ctx.Output("Out2InWeights"); - auto* mask = ctx.Output("Mask"); - auto* out_transform_matrix = - ctx.Output("TransformMatrix"); - - int* mask_data = mask->mutable_data(ctx.GetPlace()); - int* out2in_idx_data = - out2in_idx->mutable_data({out->numel(), 4}, ctx.GetPlace()); - T* out2in_w_data = - out2in_w->mutable_data({out->numel(), 4}, ctx.GetPlace()); - - phi::funcs::SetConstant init; - init(ctx.cuda_device_context(), out2in_idx, static_cast(-1)); - - auto transformed_height = ctx.Attr("transformed_height"); - auto transformed_width = ctx.Attr("transformed_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int in_height = in_dims[2]; - int in_width = in_dims[3]; - int rois_num = rois->dims()[0]; - - const T* input_data = in->data(); - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* rois_data = rois->data(); - - phi::DenseTensor roi2image; - phi::DenseTensor roi2image_dev; - roi2image.Resize({rois_num}); - int* roi2image_data = roi2image.mutable_data(platform::CPUPlace()); - auto lod = rois->lod().back(); - for (size_t i = 0; i < lod.size() - 1; ++i) { - for (size_t j = lod[i]; j < lod[i + 1]; ++j) { - roi2image_data[j] = i; - } - } - paddle::framework::TensorCopySync( - roi2image, ctx.GetPlace(), &roi2image_dev); - - int out_size = rois_num * transformed_height * transformed_width * channels; - auto stream = ctx.cuda_device_context().stream(); - int block = 512; - int grid = (out_size + block - 1) / block; - - // Get transform matrix - T* matrix = - out_transform_matrix->mutable_data({rois_num, 9}, ctx.GetPlace()); - - RoiTransformKernel<<>>(input_data, - rois_data, - roi2image_dev.data(), - rois_num, - in_height, - in_width, - channels, - transformed_height, - transformed_width, - spatial_scale, - output_data, - out2in_idx_data, - out2in_w_data, - mask_data, - matrix); - } -}; - -template -__device__ T get_feature_gradient( - T xs, T ys, int w, int h, const int width, const int height) { - if (GT_E(-0.5, xs) || GT_E(xs, width - 0.5) || GT_E(-0.5, ys) || - GT_E(ys, height - 0.5)) { - return 0; - } - - if (GT_E(0, xs)) { - xs = 0; - } - if (GT_E(0, ys)) { - ys = 0; - } - - 
int xs_floor = floor(xs); - int ys_floor = floor(ys); - int xs_ceil; - int ys_ceil; - - if (GT_E(xs_floor, width - 1)) { - xs_ceil = xs_floor = width - 1; - xs = static_cast(xs_floor); - } else { - xs_ceil = xs_floor + 1; - } - - if (GT_E(ys_floor, height - 1)) { - ys_ceil = ys_floor = height - 1; - ys = static_cast(ys_floor); - } else { - ys_ceil = ys_floor + 1; - } - - T weight = 0; - if (w == xs_floor) { - if (h == ys_floor) { - weight = (w + 1 - xs) * (h + 1 - ys); - } else if (h == ys_ceil) { - weight = (w + 1 - xs) * (ys + 1 - h); - } - } else if (w == xs_ceil) { - if (h == ys_floor) { - weight = (xs + 1 - w) * (h + 1 - ys); - } else if (h == ys_ceil) { - weight = (xs + 1 - w) * (ys + 1 - h); - } - } - return weight; -} - -template -__global__ void RoiTransformGradKernel(int out_size, - const int* out2in_idx_data, - const T* out2in_w_data, - const T* out_grad_data, - T* in_grad_data) { - CUDA_KERNEL_LOOP(index, out_size * 4) { - int in_idx = out2in_idx_data[index]; - if (in_idx >= 0) { - int out_idx = index / 4; - atomicAdd(in_grad_data + in_idx, - out_grad_data[out_idx] * out2in_w_data[index]); - } - } -} - -template -class CUDAROIPerspectiveTransformGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out2in_idx = ctx.Input("Out2InIdx"); - auto* out2in_w = ctx.Input("Out2InWeights"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - - phi::funcs::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), in_grad, static_cast(0)); - - const T* out_grad_data = out_grad->data(); - const int* out2in_idx_data = out2in_idx->data(); - const T* out2in_w_data = out2in_w->data(); - - int out_size = out_grad->numel(); - auto stream = ctx.cuda_device_context().stream(); - int block = 512; - int grid = (out_size * 4 + block - 1) / block; - - RoiTransformGradKernel<<>>( - out_size, out2in_idx_data, out2in_w_data, out_grad_data, in_grad_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL(roi_perspective_transform, - GPU, - ALL_LAYOUT, - ops::CUDAROIPerspectiveTransformOpKernel, - float) {} -PD_REGISTER_STRUCT_KERNEL(roi_perspective_transform_grad, - GPU, - ALL_LAYOUT, - ops::CUDAROIPerspectiveTransformGradOpKernel, - float) {} diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index a41b8a70a42833..81e8d0d3edf7e7 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -122,7 +122,7 @@ std::vector FilterStraddleAnchor( int anchor_num = static_cast(anchor->dims()[0]); auto* anchor_data = anchor->data(); if (rpn_straddle_thresh >= 0) { - int index; + int index = 0; for (int i = 0; i < anchor_num; ++i) { index = i * 4; if ((anchor_data[index + 0] >= -rpn_straddle_thresh) && diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc deleted file mode 100644 index fe716adb9f20ae..00000000000000 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc +++ /dev/null @@ -1,276 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" - -#include -#include -#include - -namespace paddle { -namespace operators { - -class SigmoidFocalLossOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "sigmoid_focal_loss"); - OP_INOUT_CHECK( - ctx->HasInput("Label"), "Input", "Label", "sigmoid_focal_loss"); - OP_INOUT_CHECK( - ctx->HasInput("FgNum"), "Input", "FgNum", "sigmoid_focal_loss"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "sigmoid_focal_loss"); - - auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Label"); - auto fg_dims = ctx->GetInputDim("FgNum"); - - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ( - rank, - labels_dims.size(), - platform::errors::InvalidArgument( - "The rank of Input(X) should be equal to the rank of Input(Label), " - "but received X rank is:%d, X shape is:[%s], " - "Label rank is:%d, Label shape is:[%s].", - rank, - x_dims, - labels_dims.size(), - labels_dims)); - PADDLE_ENFORCE_EQ( - fg_dims.size(), - 1, - platform::errors::InvalidArgument( - "The rank of Input(FgNum) must be 1, but received FgNum rank is " - ":%d, FgNum shape is:[%s].", - fg_dims.size(), - fg_dims)); - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank - 1), - phi::slice_ddim(labels_dims, 0, rank - 1), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) should have the same shape " - "except the last dimension, but received X shape is:[%s], " - "Label shape is:[%s].", - x_dims, - labels_dims)); - } - - PADDLE_ENFORCE_EQ( - labels_dims[rank - 1], - 1UL, - platform::errors::InvalidArgument( - "The last dimension of Input(Label) should be 1, but received " - "Label shape is:[%s].", - labels_dims)); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class SigmoidFocalLossGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "sigmoid_focal_loss"); - OP_INOUT_CHECK( - ctx->HasInput("Label"), "Input", "Label", "sigmoid_focal_loss"); - OP_INOUT_CHECK( - ctx->HasInput("FgNum"), "Input", "FgNum", "sigmoid_focal_loss"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@GRAD", - "sigmoid_focal_loss"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - "X@GRAD", - "sigmoid_focal_loss"); - - auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Label"); - auto fg_dims = 
ctx->GetInputDim("FgNum"); - auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ( - rank, - labels_dims.size(), - platform::errors::InvalidArgument( - "The rank of Input(X) should be equal to the rank of Input(Label), " - "but received X rank is:%d, X shape is:[%s], " - "Label rank is:%d, Label shape is:[%s].", - rank, - x_dims, - labels_dims.size(), - labels_dims)); - PADDLE_ENFORCE_EQ( - fg_dims.size(), - 1, - platform::errors::InvalidArgument( - "The rank of Input(FgNum) must be 1, but received FgNum rank is " - ":%d, FgNum shape is:[%s].", - fg_dims.size(), - fg_dims)); - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank - 1), - phi::slice_ddim(labels_dims, 0, rank - 1), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) should have the same shape " - "except the last dimension, but received X shape is:[%s], " - "Label shape is:[%s].", - x_dims, - labels_dims)); - - PADDLE_ENFORCE_EQ( - labels_dims[rank - 1], - 1UL, - platform::errors::InvalidArgument( - "The last dimension of Input(Label) should be 1, but received " - "Label shape is:[%s].", - labels_dims)); - - PADDLE_ENFORCE_EQ(phi::slice_ddim(x_dims, 0, rank), - phi::slice_ddim(dout_dims, 0, rank), - platform::errors::InvalidArgument( - "Input(X) and Input(Out@Grad) should have the same " - "shape, but received " - "X shape is:[%s], Out@Grad shape is:[%s].", - x_dims, - dout_dims)); - } - - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class SigmoidFocalLossOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor, default Tensor), a 2-D tensor with shape [N, D], " - "where N is the batch size and D is the number of classes " - "(excluding background). This input is a tensor of logits " - "computed by the previous operator."); - AddInput("Label", - "(Tensor, default Tensor), a 2-D tensor with shape [N, 1]. " - "This input is a tensor of probabilistic labels."); - AddInput("FgNum", - "(Tensor, default Tensor), a 1-D tensor with shape [1]. " - "This input is the number of foreground."); - AddOutput( - "Out", - "(Tensor, default Tensor), a 2-D tensor with shape [N, D]. " - "This output is the focal loss."); - AddAttr( - "gamma", - "Hyper-parameter of sigmoid focal loss op, which is to balance the " - "easy and hard examples. " - "A float scalar with default value 2.0.") - .SetDefault(2.0); - AddAttr( - "alpha", - "Hyper-parameter of sigmoid focal loss op, which is to balance the " - "positive and negative examples. " - "A float scalar with default value 0.5.") - .SetDefault(0.25); - AddComment(R"DOC( -Sigmoid Focal Loss Operator. - -Focal loss is used to address the foreground-background class imbalance existed -on the training phase of one-stage detectors. This operator computes the sigmoid -value for each element in the input tensor, after which focal loss is measured. 
- -The focal loss is given as follows: - -$$Loss_j = (-Label_j * alpha * \pow(1 - \sigma(X_j), gamma) * \log(\sigma(X_j)) - -(1 - Labels_j) * (1 - alpha) * \pow(\sigma(X_j), gamma) * \log(1 - \sigma(X_j))) -/ FgNum, j = 1,...,K$$ - -We know that $$\sigma(X_j) = \\frac{1}{1 + \exp(-X_j)}$$. - -)DOC"); - } -}; - -template -class SigmoidFocalLossGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("sigmoid_focal_loss_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Label", this->Input("Label")); - op->SetInput("FgNum", this->Input("FgNum")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(sigmoid_focal_loss, - ops::SigmoidFocalLossOp, - ops::SigmoidFocalLossOpMaker, - ops::SigmoidFocalLossGradOpMaker, - ops::SigmoidFocalLossGradOpMaker); -REGISTER_OPERATOR(sigmoid_focal_loss_grad, ops::SigmoidFocalLossGradOp); -PD_REGISTER_STRUCT_KERNEL(sigmoid_focal_loss, - CPU, - ALL_LAYOUT, - ops::SigmoidFocalLossKernel, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(sigmoid_focal_loss_grad, - CPU, - ALL_LAYOUT, - ops::SigmoidFocalLossGradKernel, - float, - double) {} diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu deleted file mode 100644 index 5d29d52669d4f1..00000000000000 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ /dev/null @@ -1,201 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math.h" - -namespace paddle { -namespace operators { - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__global__ void GPUSigmoidFocalLossForward(const T *x_data, - const int *label_data, - const int *fg_num_data, - const T gamma, - const T alpha, - const int num_classes, - const int limit, - T *out_data) { - CUDA_KERNEL_LOOP(i, limit) { - T x = x_data[i]; - int a = i / num_classes; // current sample - int d = i % num_classes; // current class - int g = label_data[a]; // target - - // check whether the input data is positive or negative - // the target classes are in range 1-81 - // and the d is in range 0-80 - T c_pos = static_cast(g == (d + 1)); - T c_neg = static_cast((g != -1) & (g != (d + 1))); - - T fg_num = static_cast((fg_num_data[0] > 1) ? 
fg_num_data[0] : 1); - T s_neg = (1.0 - alpha) / fg_num; - T s_pos = alpha / fg_num; - - // p = 1. / 1. + expf(-x) - T p = 1. / (1. + phi::funcs::real_exp(-x)); - - // (1 - p)**gamma * log(p) - T term_pos = std::pow(static_cast(1. - p), gamma) * - phi::funcs::real_log(p > FLT_MIN ? p : FLT_MIN); - // p**gamma * log(1 - p) - T term_neg = std::pow(p, gamma) * - (-1. * x * (x >= 0) - - phi::funcs::real_log( - 1. + phi::funcs::real_exp(x - 2. * x * (x >= 0)))); - - out_data[i] = 0.0; - out_data[i] += -c_pos * term_pos * s_pos; - out_data[i] += -c_neg * term_neg * s_neg; - } -} - -template -__global__ void GPUSigmoidFocalLossBackward(const T *x_data, - const int *label_data, - const int *fg_num_data, - const T gamma, - const T alpha, - const int num_classes, - const T *dout_data, - const int limit, - T *dx_data) { - CUDA_KERNEL_LOOP(i, limit) { - T x = x_data[i]; - T dout = dout_data[i]; - - int a = i / num_classes; // current sample - int d = i % num_classes; // current class - - T fg_num = static_cast((fg_num_data[0] > 1) ? fg_num_data[0] : 1); - T s_neg = (1.0 - alpha) / fg_num; - T s_pos = alpha / fg_num; - - int g = label_data[a]; - T c_pos = static_cast(g == (d + 1)); - T c_neg = static_cast((g != -1) & (g != (d + 1))); - - T p = 1. / (1. + phi::funcs::real_exp(-x)); - - // (1-p)**g * (1 - p - g*p*log(p)) - T term_pos = - std::pow(static_cast(1. - p), gamma) * - (1. - p - - (p * gamma * phi::funcs::real_log(p > FLT_MIN ? p : FLT_MIN))); - // (p**g) * (g*(1-p)*log(1-p) - p) - T term_neg = std::pow(p, gamma) * - ((-1. * x * (x >= 0) - - phi::funcs::real_log( - 1. + phi::funcs::real_exp(x - 2. * x * (x >= 0)))) * - (1. - p) * gamma - - p); - - dx_data[i] = 0.0; - dx_data[i] += -c_pos * s_pos * term_pos; - dx_data[i] += -c_neg * s_neg * term_neg; - dx_data[i] = dx_data[i] * dout; - } -} - -template -class GPUSigmoidFocalLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const phi::DenseTensor *X = context.Input("X"); - const phi::DenseTensor *Labels = context.Input("Label"); - const phi::DenseTensor *FgNum = context.Input("FgNum"); - phi::DenseTensor *Out = context.Output("Out"); - T gamma = static_cast(context.Attr("gamma")); - T alpha = static_cast(context.Attr("alpha")); - auto x_dims = X->dims(); - int num_classes = static_cast(x_dims[1]); - auto out_data = Out->mutable_data(context.GetPlace()); - - auto &dev_ctx = context.cuda_device_context(); - - int limit = Out->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; - GPUSigmoidFocalLossForward - <<>>(X->data(), - Labels->data(), - FgNum->data(), - gamma, - alpha, - num_classes, - limit, - out_data); - } -}; - -template -class GPUSigmoidFocalLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const phi::DenseTensor *X = context.Input("X"); - const phi::DenseTensor *Labels = context.Input("Label"); - const phi::DenseTensor *FgNum = context.Input("FgNum"); - const phi::DenseTensor *dOut = - context.Input(framework::GradVarName("Out")); - phi::DenseTensor *dX = - context.Output(framework::GradVarName("X")); - auto dx_data = dX->mutable_data(context.GetPlace()); - T gamma = static_cast(context.Attr("gamma")); - T alpha = static_cast(context.Attr("alpha")); - auto x_dims = X->dims(); - int num_classes = static_cast(x_dims[1]); - - auto &dev_ctx = context.cuda_device_context(); - - int limit = dX->numel(); - int blocks = NumBlocks(limit); - int threads = 
kNumCUDAThreads; - GPUSigmoidFocalLossBackward - <<>>(X->data(), - Labels->data(), - FgNum->data(), - gamma, - alpha, - num_classes, - dOut->data(), - limit, - dx_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL(sigmoid_focal_loss, - GPU, - ALL_LAYOUT, - ops::GPUSigmoidFocalLossKernel, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(sigmoid_focal_loss_grad, - GPU, - ALL_LAYOUT, - ops::GPUSigmoidFocalLossGradKernel, - float, - double) {} diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h deleted file mode 100644 index 28cac641d14526..00000000000000 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SigmoidFocalLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const phi::DenseTensor *X = context.Input("X"); - const phi::DenseTensor *Labels = context.Input("Label"); - const phi::DenseTensor *FgNum = context.Input("FgNum"); - phi::DenseTensor *Out = context.Output("Out"); - T gamma = static_cast(context.Attr("gamma")); - T alpha = static_cast(context.Attr("alpha")); - auto out_data = Out->mutable_data(context.GetPlace()); - int limit = Out->numel(); - auto x_data = X->data(); - auto label_data = Labels->data(); - auto fg_num_data = FgNum->data(); - auto x_dims = X->dims(); - int num_classes = static_cast(x_dims[1]); - - for (int idx = 0; idx < limit; ++idx) { - T x = x_data[idx]; - int a = idx / num_classes; // current sample - int d = idx % num_classes; // current class - int g = label_data[a]; // target - - // Check whether the input data is positive or negative - // The target classes are in range 1-81 - // and the d is in range 0-80 - T c_pos = static_cast(g == (d + 1)); - T c_neg = static_cast((g != -1) & (g != (d + 1))); - T fg_num = static_cast((fg_num_data[0] > 1) ? fg_num_data[0] : 1); - T s_neg = (1.0 - alpha) / fg_num; - T s_pos = alpha / fg_num; - - // p = 1. / 1. + expf(-x) - T p = 1. / (1. + std::exp(-x)); - - // (1 - p)**gamma * log(p) where - T term_pos = std::pow(static_cast(1. - p), gamma) * - std::log(p > FLT_MIN ? p : FLT_MIN); - // p**gamma * log(1 - p) - T term_neg = - std::pow(p, gamma) * - (-1. * x * (x >= 0) - std::log(1. + std::exp(x - 2. 
* x * (x >= 0)))); - - out_data[idx] = 0.0; - out_data[idx] += -c_pos * term_pos * s_pos; - out_data[idx] += -c_neg * term_neg * s_neg; - } - } -}; - -template -class SigmoidFocalLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const phi::DenseTensor *X = context.Input("X"); - const phi::DenseTensor *Labels = context.Input("Label"); - const phi::DenseTensor *FgNum = context.Input("FgNum"); - const phi::DenseTensor *dOut = - context.Input(framework::GradVarName("Out")); - phi::DenseTensor *dX = - context.Output(framework::GradVarName("X")); - auto dx_data = dX->mutable_data(context.GetPlace()); - T gamma = static_cast(context.Attr("gamma")); - T alpha = static_cast(context.Attr("alpha")); - auto x_dims = X->dims(); - int num_classes = static_cast(x_dims[1]); - - int limit = dX->numel(); - auto x_data = X->data(); - auto label_data = Labels->data(); - auto fg_num_data = FgNum->data(); - auto dout_data = dOut->data(); - for (int idx = 0; idx < limit; ++idx) { - T x = x_data[idx]; - int a = idx / num_classes; // current sample - int d = idx % num_classes; // current class - - T fg_num = static_cast((fg_num_data[0] > 1) ? fg_num_data[0] : 1); - T s_neg = static_cast((1.0 - alpha) / fg_num); - T s_pos = alpha / fg_num; - int g = label_data[a]; - - T c_pos = static_cast(g == (d + 1)); - T c_neg = static_cast((g != -1) & (g != (d + 1))); - T p = 1. / (1. + std::exp(-x)); - - // (1-p)**g * (1 - p - g*p*log(p)) - T term_pos = std::pow(static_cast(1. - p), gamma) * - (1. - p - (p * gamma * std::log(p > FLT_MIN ? p : FLT_MIN))); - // (p**g) * (g*(1-p)*log(1-p) - p) - T term_neg = std::pow(p, gamma) * - ((-1. * x * (x >= 0) - - std::log(1. + std::exp(x - 2. * x * (x >= 0)))) * - (1. - p) * gamma - - p); - dx_data[idx] = 0.0; - dx_data[idx] += -c_pos * s_pos * term_pos; - dx_data[idx] += -c_neg * s_neg * term_neg; - dx_data[idx] = dx_data[idx] * dout_data[idx]; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detection/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc deleted file mode 100644 index 437b46c459ff3e..00000000000000 --- a/paddle/fluid/operators/detection/target_assign_op.cc +++ /dev/null @@ -1,191 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include "paddle/fluid/operators/detection/target_assign_op.h"
-
-namespace paddle {
-namespace operators {
-
-class TargetAssignOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"),
-                      true,
-                      platform::errors::InvalidArgument(
-                          "Input(X) of TargetAssignOp should not be null"));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("MatchIndices"),
-        true,
-        platform::errors::InvalidArgument(
-            "Input(MatchIndices) of TargetAssignOp should not be null"));
-
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"),
-                      true,
-                      platform::errors::InvalidArgument(
-                          "Output(Out) of TargetAssignOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("OutWeight"),
-        true,
-        platform::errors::InvalidArgument(
-            "Output(OutWeight) of TargetAssignOp should not be null."));
-
-    auto in_dims = ctx->GetInputDim("X");
-    auto mi_dims = ctx->GetInputDim("MatchIndices");
-
-    PADDLE_ENFORCE_EQ(
-        in_dims.size(),
-        3,
-        platform::errors::InvalidArgument(
-            "Expected the rank of Input(X) is 3. But received %d.",
-            in_dims.size()));
-    PADDLE_ENFORCE_EQ(mi_dims.size(),
-                      2,
-                      platform::errors::InvalidArgument(
-                          "The rank of Input(MatchIndices) must be 2."));
-
-    if (ctx->HasInput("NegIndices")) {
-      auto neg_dims = ctx->GetInputDim("NegIndices");
-      PADDLE_ENFORCE_EQ(neg_dims.size(),
-                        2,
-                        platform::errors::InvalidArgument(
-                            "The rank of Input(NegIndices) must be 2."));
-      PADDLE_ENFORCE_EQ(
-          neg_dims[1],
-          1,
-          platform::errors::InvalidArgument(
-              "The last dimension of Input(NegIndices) must be 1."));
-    }
-
-    auto n = mi_dims[0];
-    auto m = mi_dims[1];
-    auto k = in_dims[in_dims.size() - 1];
-    ctx->SetOutputDim("Out", {n, m, k});
-    ctx->SetOutputDim("OutWeight", {n, m, 1});
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-                          ctx.GetPlace());
-  }
-};
-
-class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(phi::DenseTensor), This input is a 3D phi::DenseTensor with "
-             "shape [M, P, K]. "
-             "Some elements in X will be assigned to Out based on the "
-             "MatchIndices and NegIndices.");
-    AddInput("MatchIndices",
-             "(Tensor, default Tensor<int>), The input matched indices "
-             "with shape [N, P]. If MatchIndices[i][j] is -1, the j-th column "
-             "entity is not matched to any row entity in the i-th instance.");
-    AddInput("NegIndices",
-             "(phi::DenseTensor, default phi::DenseTensor<int>), The input "
-             "negative example "
-             "indices are an optional input with shape [Neg, 1], where Neg is "
-             "the total number of negative example indices.")
-        .AsDispensable();
-    AddAttr<int>("mismatch_value",
-                 "(int, default 0), Fill this value to the "
-                 "mismatched location.")
-        .SetDefault(0);
-    AddOutput("Out",
-              "(Tensor), The output is a 3D Tensor with shape [N, P, K], "
-              "where N and P are the same as they are in NegIndices, and K is "
-              "the same as it is in Input(X). If MatchIndices[i][j] "
-              "is -1, then Out[i][j][0 : K] is the mismatch_value.");
-    AddOutput("OutWeight",
-              "(Tensor), The weight for the output, with the shape of "
-              "[N, P, 1]");
-    AddComment(R"DOC(
-Given the target bounding boxes or labels, this operator assigns
-classification and regression targets to each prediction, as well as
-weights to each prediction. The weights are used to specify which predictions do
The weights is used to specify which prediction would -not contribute to training loss. - -For each instance, the output `Out` and`OutWeight` are assigned based on -`MatchIndices` and `NegIndices`. -Assumed that the row offset for each instance in `X` is called lod, -this operator assigns classification/regression targets by performing the -following steps: - -1. Assigning all outpts based on `MatchIndices`: - -If id = MatchIndices[i][j] > 0, - - Out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K] - OutWeight[i][j] = 1. - -Otherwise, - - Out[j][j][0 : K] = {mismatch_value, mismatch_value, ...} - OutWeight[i][j] = 0. - -2. Assigning OutWeight based on `NegIndices` if `NegIndices` is provided: - -Assumed that the row offset for each instance in `NegIndices` is called neg_lod, -for i-th instance and each `id` of NegIndices in this instance: - - Out[i][id][0 : K] = {mismatch_value, mismatch_value, ...} - OutWeight[i][id] = 1.0 - - )DOC"); - } -}; - -template -struct NegTargetAssignFunctor { - void operator()(const phi::CPUContext& ctx, - const int* neg_indices, - const size_t* lod, - const int N, - const int M, - const int K, - const int mismatch_value, - T* out, - WT* out_wt) { - for (int i = 0; i < N; ++i) { - for (size_t j = lod[i]; j < lod[i + 1]; ++j) { - int id = neg_indices[j]; - int off = (i * M + id) * K; - for (int k = 0; k < K; ++k) { - out[off + k] = mismatch_value; - out_wt[off + k] = static_cast(1.0); - } - } - } - } -}; - -template struct NegTargetAssignFunctor; -template struct NegTargetAssignFunctor; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - target_assign, - ops::TargetAssignOp, - ops::TargetAssignOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL( - target_assign, CPU, ALL_LAYOUT, ops::TargetAssignKernel, int, float) {} diff --git a/paddle/fluid/operators/detection/target_assign_op.cu b/paddle/fluid/operators/detection/target_assign_op.cu deleted file mode 100644 index 951fcdbafae8e1..00000000000000 --- a/paddle/fluid/operators/detection/target_assign_op.cu +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/detection/target_assign_op.h" - -namespace paddle { -namespace operators { - -template -__global__ void NegTargetAssignKernel(const int* neg_indices, - const size_t* lod, - const int N, - const int M, - const int K, - const int mismatch_value, - T* out, - WT* out_wt) { - int bidx = blockIdx.x; - int st = lod[bidx]; - int ed = lod[bidx + 1]; - - int row_start = bidx * M; - for (int i = st + threadIdx.x; i < ed; i += blockDim.x) { - int id = row_start + neg_indices[i]; - for (int k = 0; k < K; ++k) { - out[id * K + k] = T(mismatch_value); - out_wt[id * K + k] = WT(1.); - } - } -} - -template -struct NegTargetAssignFunctor { - void operator()(const phi::GPUContext& ctx, - const int* neg_indices, - const size_t* lod, - const int N, - const int M, - const int K, - const int mismatch_value, - T* out, - WT* out_wt) { - const int block_size = 256; - const int grid_size = N; - NegTargetAssignKernel<<>>( - neg_indices, lod, N, M, K, mismatch_value, out, out_wt); - } -}; - -template struct NegTargetAssignFunctor; -template struct NegTargetAssignFunctor; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL( - target_assign, GPU, ALL_LAYOUT, ops::TargetAssignKernel, int, float) {} diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h deleted file mode 100644 index 484bd8454bae9d..00000000000000 --- a/paddle/fluid/operators/detection/target_assign_op.h +++ /dev/null @@ -1,181 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { -template -struct TargetAssignFunctor { - const T* in_; - const int* match_indices_; - const size_t* lod_; - const int mismatch_value_; - const int64_t N_; - const int64_t M_; - const int64_t P_; - const int64_t K_; - - T* out_; - WT* out_wt_; - - TargetAssignFunctor(const T* input, - const int* match_indices, - const size_t* lod, - const int mismatch_value, - const int64_t N, - const int64_t M, - const int64_t P, - const int64_t K, - T* out, - WT* out_wt) - : in_(input), - match_indices_(match_indices), - lod_(lod), - mismatch_value_(mismatch_value), - N_(N), - M_(M), - P_(P), - K_(K), - out_(out), - out_wt_(out_wt) {} - - HOSTDEVICE void operator()(size_t i) const { - int h = i / M_; - int w = i - h * M_; - - size_t off = lod_[h]; - int id = match_indices_[i]; - - T* out = out_ + i * K_; - WT* out_wt = out_wt_ + i; - - if (id > -1) { - int w_off = w % P_; - const T* in = in_ + ((off + id) * P_ + w_off) * K_; - for (int64_t k = 0; k < K_; ++k) { - out[k] = in[k]; - } - out_wt[0] = static_cast(1.); - } else { - for (int64_t k = 0; k < K_; ++k) { - out[k] = static_cast(mismatch_value_); - } - out_wt[0] = static_cast(0.); - } - } -}; - -template -struct NegTargetAssignFunctor { - void operator()(const platform::DeviceContext& ctx, - const int* neg_indices, - const size_t* lod, - const int N, - const int M, - const int K, - const int mismatch_value, - T* out, - WT* out_wt) const; -}; - -template -class TargetAssignKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* match_indices = ctx.Input("MatchIndices"); - - auto* out = ctx.Output("Out"); - auto* out_wt = ctx.Output("OutWeight"); - - PADDLE_ENFORCE_EQ(x->lod().size(), - 1UL, - platform::errors::InvalidArgument( - "TargetAssignOp input(X) needs 1 level of LoD")); - int mismatch_value = ctx.Attr("mismatch_value"); - - const T* x_data = x->data(); - const int* match_idx_data = match_indices->data(); - - T* out_data = out->mutable_data(ctx.GetPlace()); - WT* out_wt_data = out_wt->mutable_data(ctx.GetPlace()); - - int64_t n = match_indices->dims()[0]; - int64_t m = match_indices->dims()[1]; - int64_t p = x->dims()[1]; - int64_t k = x->dims()[2]; - - auto x_lod = x->lod().back(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - phi::MixVector mixv_x_lod(&x_lod); - size_t* x_lod_data = mixv_x_lod.MutableData(ctx.GetPlace()); -#else - size_t* x_lod_data = x_lod.data(); -#endif - - TargetAssignFunctor functor(x_data, - match_idx_data, - x_lod_data, - mismatch_value, - n, - m, - p, - k, - out_data, - out_wt_data); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - mixv_x_lod.CopyToCPU(); -#endif - - auto& device_ctx = ctx.template device_context(); - platform::ForRange for_range(device_ctx, n * m); - for_range(functor); - - auto* neg_indices = ctx.Input("NegIndices"); - if (neg_indices) { - PADDLE_ENFORCE_EQ( - neg_indices->lod().size(), - 1UL, - platform::errors::InvalidArgument( - "TargetAssignOp input(NegIndices) needs 1 level of LoD")); - const int* neg_idx_data = neg_indices->data(); - auto neg_lod = neg_indices->lod().back(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - phi::MixVector mixv_neg_lod(&neg_lod); - size_t* neg_lod_data = mixv_neg_lod.MutableData(ctx.GetPlace()); -#else - size_t* neg_lod_data = neg_lod.data(); -#endif - 
NegTargetAssignFunctor neg_trg_functor; - neg_trg_functor(device_ctx, - neg_idx_data, - neg_lod_data, - n, - m, - k, - mismatch_value, - out_data, - out_wt_data); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - mixv_neg_lod.CopyToCPU(); -#endif - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/diag_op.cc b/paddle/fluid/operators/diag_op.cc deleted file mode 100644 index f7b2c4915662c6..00000000000000 --- a/paddle/fluid/operators/diag_op.cc +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/diag_op.h" - -namespace paddle { -namespace operators { - -class DiagOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Diagonal"), "Input", "Diagonal", "diag"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "diag"); - - auto s_dims = ctx->GetInputDim("Diagonal"); - - PADDLE_ENFORCE_EQ( - s_dims.size(), - 1UL, - platform::errors::InvalidArgument( - "The dimension of 'diagonal' must be 1, but now it is %d.", - s_dims.size())); - - ctx->SetOutputDim("Out", {s_dims[0], s_dims[0]}); - } -}; - -class DiagOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Diagonal", - "Diagonal values of square matrix. It is a tensor with rank 1."); - AddOutput("Out", "A square matrix."); - AddComment(R"DOC( - Return a square matrix with specified diagonal values. -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - diag, - ops::DiagOp, - ops::DiagOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(diag, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel); diff --git a/paddle/fluid/operators/diag_op.h b/paddle/fluid/operators/diag_op.h deleted file mode 100644 index e3514e59e806d2..00000000000000 --- a/paddle/fluid/operators/diag_op.h +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -struct DiagFunctor { - DiagFunctor(const T* diagonal, int64_t numel, T* output) - : diagonal_(diagonal), numel_(numel), output_(output) {} - - HOSTDEVICE void operator()(size_t idx) const { - output_[idx * numel_ + idx] = diagonal_[idx]; - } - - const T* diagonal_; - int64_t numel_; - T* output_; -}; - -template -class DiagKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* diagonal = context.Input("Diagonal"); - auto* diag_data = diagonal->data(); - auto numel = diagonal->numel(); - auto* out = context.Output("Out"); - T* out_data = out->mutable_data(context.GetPlace()); - - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, out, static_cast(0)); - - platform::ForRange for_range(dev_ctx, numel); - DiagFunctor functor(diag_data, numel, out_data); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc deleted file mode 100644 index 107fe9f6174b61..00000000000000 --- a/paddle/fluid/operators/expand_as_op.cc +++ /dev/null @@ -1,170 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/expand_as_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class ExpandAsOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAs"); - OP_INOUT_CHECK( - ctx->HasInput("target_tensor"), "Input", "target_tensor", "ExpandAs"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandAs"); - auto x_dims = ctx->GetInputDim("X"); - auto target_tensor_dims = ctx->GetInputDim("target_tensor"); - PADDLE_ENFORCE_EQ( - static_cast(x_dims.size()), - target_tensor_dims.size(), - platform::errors::InvalidArgument( - "The rank of Input(target_tensor) must be equal " - "to the rank of Input(X). But received Input(X): input " - "rank %u, input shape [%s]; received Input(target_tensor): " - "input rank %u, input shape [%s].", - x_dims.size(), - x_dims, - target_tensor_dims.size(), - target_tensor_dims)); - PADDLE_ENFORCE_LE( - x_dims.size(), - 6, - platform::errors::InvalidArgument( - "The rank of Input(X) must not be greater than 6. 
But " - "received: input rank %u, input shape [%s].", - x_dims.size(), - x_dims)); - std::vector out_shape(x_dims.size()); - ctx->SetOutputDim("Out", phi::make_ddim(out_shape)); - } -}; - -class ExpandAsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor, default Tensor). A tensor with rank in [1, 6]." - "X is the input to be expanded."); - AddOutput("Out", - "(Tensor, default Tensor). A tensor with rank in [1, 6]." - "The rank of Output(Out) have the same with Input(X). " - "After expanding, size of each dimension of Output(Out) is equal " - "to size of the corresponding dimension of Input(X) multiplying " - "the corresponding value given by Attr(expand_times)."); - AddInput("target_tensor", "Expand tensor's shape for each dimension."); - AddComment(R"DOC( -Expand as operator tiles the input by given times number. You should set times -number for each dimension by providing tensor 'expend_tensor'. The rank of X -should be in [1, 6]. Please note that size of 'expend_tensor' must be the same -with X's rank. Following is a using case: -Input(X) is a 3-D tensor with shape [2, 3, 1]: - [ - [[1], [2], [3]], - [[4], [5], [6]] - ] -target_tensors'shape: [2, 6, 2] -Output(Out) is a 3-D tensor with shape [2, 6, 2]: - [ - [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]], - [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]] - ] -)DOC"); - } -}; - -class ExpandAsGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAs"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "ExpandAs"); - - auto x_dims = ctx->GetInputDim("X"); - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - } - } - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context().GetPlace()); - } -}; - -template -class ExpandAsGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("expand_as_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("target_tensor", this->Input("target_tensor")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandAsGradNoNeedBufVarsInferer, "X"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(expand_as, - ops::ExpandAsOp, - ops::ExpandAsOpMaker, - ops::ExpandAsGradOpMaker, - ops::ExpandAsGradOpMaker); -REGISTER_OPERATOR(expand_as_grad, - ops::ExpandAsGradOp, - ops::ExpandAsGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL(expand_as, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel); -REGISTER_OP_CPU_KERNEL(expand_as_grad, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) 
-REGISTER_OP_CUDA_KERNEL(expand_as, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel); -REGISTER_OP_CUDA_KERNEL(expand_as_grad, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel); -#endif diff --git a/paddle/fluid/operators/expand_as_op.h b/paddle/fluid/operators/expand_as_op.h deleted file mode 100644 index a3462a00bcfb15..00000000000000 --- a/paddle/fluid/operators/expand_as_op.h +++ /dev/null @@ -1,219 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -#define MAX_RANK_SUPPORTED 6 - -namespace paddle { -namespace operators { - -template -using EigenVector = framework::EigenVector; -template -using EigenTensor = framework::EigenTensor; - -template -class ExpandAsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - switch (rank) { - case 1: - ExpandAs<1>(context); - break; - case 2: - ExpandAs<2>(context); - break; - case 3: - ExpandAs<3>(context); - break; - case 4: - ExpandAs<4>(context); - break; - case 5: - ExpandAs<5>(context); - break; - case 6: - ExpandAs<6>(context); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. But received " - "tensor X's rank = %d.", - rank)); - } - } - - protected: - template - void ExpandAs(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto* target_tensor = context.Input("target_tensor"); - auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; - int bcast_dims_remainder = 0; - auto x_dims = in0->dims(); - auto y_dims = target_tensor->dims(); - for (int i = 0; i < y_dims.size(); ++i) { - PADDLE_ENFORCE_NE( - x_dims[i], - 0UL, - platform::errors::InvalidArgument( - "X(input) should not have 0 dim. 
But received x_dims[%d] = 0.", - i)); - bcast_dims[i] = y_dims[i] / x_dims[i]; - bcast_dims_remainder += y_dims[i] % x_dims[i]; - } - PADDLE_ENFORCE_EQ( - bcast_dims_remainder, - 0UL, - platform::errors::InvalidArgument( - "X(input) could not be broadcast together with remapped " - "shape(expand tensor's shape)")); - framework::DDim out_dims(in_dims); - for (size_t i = 0; i < bcast_dims.size(); ++i) { - out_dims[i] *= bcast_dims[i]; - } - - out0->Resize(out_dims); - auto x = EigenTensor::From(*in0); - out0->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*out0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, Rank>::Eval( - place, y, x, bcast_dims); - } -}; - -template -class ExpandAsGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto* target_tensor = context.Input("target_tensor"); - auto x_dims = in0->dims(); - auto y_dims = target_tensor->dims(); - std::vector bcast_dims; - for (int i = 0; i < y_dims.size(); ++i) { - bcast_dims.push_back(y_dims[i] / x_dims[i]); - } - std::vector reshape_dims_vec; - std::vector reduce_dims_vec; - for (size_t i = 0; i < bcast_dims.size(); ++i) { - reduce_dims_vec.push_back(reshape_dims_vec.size()); - reshape_dims_vec.push_back(bcast_dims[i]); - reshape_dims_vec.push_back(x_dims[i]); - } - int dims = reduce_dims_vec.size(); - bool just_copy = true; - for (size_t i = 0; i < bcast_dims.size(); i++) { - if (bcast_dims[i] != 1) { - just_copy = false; - break; - } - } - // no need reduce, just copy - if (just_copy) { - auto* in0 = - context.Input(framework::GradVarName("Out")); - auto* out0 = - context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - framework::TensorCopy( - *in0, context.GetPlace(), context.device_context(), out0); - } else { - PADDLE_ENFORCE_GE(dims, - 1, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - "expand_as_grad op must be greater than or " - "equal to 1, but the value received is %d.", - dims)); - PADDLE_ENFORCE_LE(dims, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - "expand_as_grad op must be less than or equal " - "to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - dims)); - switch (dims) { - case 1: - ExpandAsBackward<1>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 2: - ExpandAsBackward<2>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 3: - ExpandAsBackward<3>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 4: - ExpandAsBackward<4>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 5: - ExpandAsBackward<5>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 6: - ExpandAsBackward<6>(context, reshape_dims_vec, reduce_dims_vec); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. 
But " - "received tensor's rank = %d.", - dims)); - } - } - } - - protected: - template - void ExpandAsBackward(const framework::ExecutionContext& context, - const std::vector& reshape_dims_vec, - const std::vector& reduce_dims_vec) const { - size_t reshape_size = reshape_dims_vec.size(); - size_t reduce_size = reduce_dims_vec.size(); - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; - for (size_t i = 0; i < reshape_size; ++i) { - reshape_dims[i] = reshape_dims_vec[i]; - } - Eigen::DSizes reduce_dims; - for (size_t i = 0; i < reduce_size; ++i) { - reduce_dims[i] = reduce_dims_vec[i]; - } - auto out_grad = EigenVector::Flatten(*in0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcastGrad, T, Dims>::Eval( - place, x_grad, out_grad, reduce_dims, reshape_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc index bcb7081847111c..aeefe07d348e93 100644 --- a/paddle/fluid/operators/fill_op.cc +++ b/paddle/fluid/operators/fill_op.cc @@ -78,9 +78,3 @@ REGISTER_OPERATOR( ops::FillOpVarTypeInference, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(fill, - ops::FillKernel, - ops::FillKernel, - ops::FillKernel, - ops::FillKernel, - ops::FillKernel); diff --git a/paddle/fluid/operators/fill_op.cu.cc b/paddle/fluid/operators/fill_op.cu.cc deleted file mode 100644 index 1a22f53898cf07..00000000000000 --- a/paddle/fluid/operators/fill_op.cu.cc +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/fill_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(fill, - ops::FillKernel, - ops::FillKernel, - ops::FillKernel, - ops::FillKernel, - ops::FillKernel, - ops::FillKernel); diff --git a/paddle/fluid/operators/fill_op.h b/paddle/fluid/operators/fill_op.h index 5f4f993bec20b5..086ad44e024b00 100644 --- a/paddle/fluid/operators/fill_op.h +++ b/paddle/fluid/operators/fill_op.h @@ -42,42 +42,5 @@ struct FillOpVisitor { const std::vector &value_; }; -template -class FillKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto &out = GET_DATA_SAFELY( - ctx.Output("Out"), "Output", "Out", "Fill"); - out.Resize(phi::make_ddim(ctx.Attr>("shape"))); - auto dtype = - static_cast(ctx.Attr("dtype")); - auto phi_dtype = framework::TransToPhiDataType(dtype); - platform::CPUPlace cpu; - auto force_cpu = ctx.Attr("force_cpu"); - out.mutable_data(force_cpu ? cpu : ctx.GetPlace(), phi_dtype); - - phi::DenseTensor tensor; - - if (force_cpu || platform::is_cpu_place(ctx.GetPlace())) { - tensor.ShareDataWith(out); - } else { - // Always make tensor in CPU memory. 
-      tensor.Resize(out.dims());
-      tensor.mutable_data(cpu, phi_dtype);
-    }
-
-    framework::VisitDataType(
-        dtype, FillOpVisitor(&tensor, ctx.Attr<std::vector<float>>("value")));
-
-    if (!force_cpu && platform::is_gpu_place(ctx.GetPlace())) {
-      // Copy tensor to out
-      framework::TensorCopy(
-          tensor,
-          ctx.GetPlace(),
-          ctx.template device_context<platform::DeviceContext>(),
-          &out);
-    }
-  }
-};
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/filter_by_instag_op.cc b/paddle/fluid/operators/filter_by_instag_op.cc
deleted file mode 100644
index 02735a1ee5be00..00000000000000
--- a/paddle/fluid/operators/filter_by_instag_op.cc
+++ /dev/null
@@ -1,181 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/filter_by_instag_op.h"
-
-#include
-
-#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
-#include "paddle/fluid/framework/var_type_inference.h"
-
-namespace paddle {
-namespace operators {
-class FilterByInstagOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Ins"),
-        true,
-        platform::errors::InvalidArgument("Input(Ins) should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Ins_tag"),
-                      true,
-                      platform::errors::InvalidArgument(
-                          "Input(Ins_tag) should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Filter_tag"),
-                      true,
-                      platform::errors::InvalidArgument(
-                          "Input(Filter_tag) should not be null."));
-
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("Out"),
-        true,
-        platform::errors::InvalidArgument("Output(Out) should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("LossWeight"),
-                      true,
-                      platform::errors::InvalidArgument(
-                          "Output(LossWeight) should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("IndexMap"),
-                      true,
-                      platform::errors::InvalidArgument(
-                          "Output(IndexMap) should not be null."));
-
-    auto x1_dims = ctx->GetInputDim("Ins");  // batch_size * vec
-
-    ctx->SetOutputDim("Out", phi::make_ddim({-1, x1_dims[1]}));
-    ctx->SetOutputDim("LossWeight", phi::make_ddim({-1, 1}));
-    ctx->SetOutputDim("IndexMap", phi::make_ddim({-1, 2}));
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Ins");
-    return phi::KernelKey(data_type, ctx.device_context().GetPlace());
-  }
-};
-
-class FilterByInstagOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Ins", "(phi::DenseTensor) embedded tensor");
-    AddInput("Ins_tag", "(phi::DenseTensor) ins tag list");
-    AddInput("Filter_tag", "(1D Tensor) filter tag list");
-    AddAttr<bool>("is_lod",
-                  "whether Ins is with LoD info or not, default True");
-    AddAttr<int64_t>("out_val_if_empty",
-                     "the value used to fill the output if the result after "
-                     "filtering is empty")
-        .SetDefault(0);
-    AddOutput("Out", "(phi::DenseTensor) embedded tensor
filtered by instag"); - AddOutput("LossWeight", "(Tensor) loss weight."); - AddOutput("IndexMap", - "(phi::DenseTensor) mapping from Out rows to X1 rows"); - AddComment(R"DOC( -Filter By Instag Op - -This operator is used to filter embeded ins. - -There are 3 inputs. First is embeded ins, Second is tags for ins, -Third is tags to filter. - -There are 3 outputs. First is filtered embeded ins, Second is Loss Weight, -Third is the IndexMap from Out line number to X1 line number. -)DOC"); - } -}; - -class FilterByInstagOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("IndexMap"), - true, - platform::errors::InvalidArgument( - "Input(IndexMap) should be not null")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), - true, - platform::errors::InvalidArgument( - "Grad Input(Out) should be not null")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Ins"), - true, - platform::errors::InvalidArgument("Input(Ins) should be not null")); - PADDLE_ENFORCE_EQ(ctx->HasInput("LossWeight"), - true, - platform::errors::InvalidArgument( - "Input(LossWeight) should be not null")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Ins")), - true, - platform::errors::InvalidArgument( - "Grad Output(Ins) should be not null")); - - auto grad_out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - auto x1_dims = ctx->GetInputDim("Ins"); - ctx->SetOutputDim(framework::GradVarName("Ins"), - phi::make_ddim({x1_dims[0], grad_out_dims[1]})); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - return phi::KernelKey(data_type, ctx.device_context().GetPlace()); - } -}; - -template -class FilterByInstagGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("filter_by_instag_grad"); - op->SetInput("IndexMap", this->Output("IndexMap")); - op->SetInput("Ins", this->Input("Ins")); - op->SetAttrMap(this->Attrs()); - op->SetInput("LossWeight", this->Output("LossWeight")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("Ins"), this->InputGrad("Ins")); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(filter_by_instag, - ops::FilterByInstagOp, - ops::FilterByInstagOpMaker, - ops::FilterByInstagGradOpMaker, - ops::FilterByInstagGradOpMaker); - -REGISTER_OPERATOR(filter_by_instag_grad, ops::FilterByInstagOpGrad); - -PD_REGISTER_STRUCT_KERNEL(filter_by_instag, - CPU, - ALL_LAYOUT, - ops::FilterByInstagKernel, - float, - double, - int32_t, - int64_t) {} - -PD_REGISTER_STRUCT_KERNEL(filter_by_instag_grad, - CPU, - ALL_LAYOUT, - ops::FilterByInstagGradKernel, - float, - double, - int32_t, - int64_t) {} diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu deleted file mode 100644 index 4449044acb89bf..00000000000000 --- a/paddle/fluid/operators/filter_by_instag_op.cu +++ /dev/null @@ -1,639 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000 - -#include "paddle/fluid/operators/filter_by_instag_op.h" - -#if defined(PADDLE_WITH_CUDA) -#include -#endif - -#include -#include - -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/core/mixed_vector.h" - -#if defined(PADDLE_WITH_CUDA) -namespace cg = cooperative_groups; -#endif - -namespace paddle { -namespace operators { - -using SelectedRows = phi::SelectedRows; - -template -using Vector = phi::Vector; - -#define WARP_SIZE 32 -#define MAX_WARP_NUM 32 - -#if defined(PADDLE_WITH_CUDA) - -template -__global__ void filter_copy_fuse_kernel(const size_t N, - const int ins_per_thread, - size_t* x1_lods_data, - size_t* x2_lods_data, - const int64_t* x2_data, - const int64_t* x3_data, - int64_t filter_tag_size, - T* out_data, - int64_t* map_data, - size_t* map_lods_data, - size_t* out_lods_data, - size_t* out_idx_data, - const T* x1_data, - int x1_embed_size, - float* loss_weight_data, - float fill_value) { - // N is instance num - // one threads for ins_per_thread instances - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int gid = idx / WARP_SIZE; - - // general use - int thread_num = - (N + (ins_per_thread - 1)) / ins_per_thread; // real thread num - int total_warp_num = thread_num / WARP_SIZE; // 30 - int remain_thread_num = thread_num % WARP_SIZE; // 16 - - int warp_thread_num = -1; - if (gid < total_warp_num) { - warp_thread_num = WARP_SIZE; - } else { - warp_thread_num = remain_thread_num; - } - - int group_num = total_warp_num; - if (remain_thread_num > 0) { - group_num = total_warp_num + 1; - } - - if (gid >= group_num) return; - - int ins_start = idx * ins_per_thread; - int ins_end = (idx + 1) * ins_per_thread; - - if (N < ins_end) ins_end = N; - - int flag_data[5]; - int prefix_sum_data[5]; - int prefix_sum_data2[5]; - - __shared__ int shr[MAX_WARP_NUM]; - __shared__ int shr2[MAX_WARP_NUM]; - __shared__ int shr3[MAX_WARP_NUM]; - - for (int p = ins_start; p < ins_end; p++) { - int ins_tag_start = x2_lods_data[p]; - int ins_tag_end = x2_lods_data[p + 1]; - flag_data[p - ins_start] = 0; - // filter logic - int i = ins_tag_start; - for (; i < ins_tag_end; i++) { - int64_t ins_tag = x2_data[i]; - int j = 0; - for (; j < filter_tag_size; j++) { - if (x3_data[j] == ins_tag) break; - } - // if ins_tag in filter tag - if (j < filter_tag_size) { - flag_data[p - ins_start] = 1; - break; - } - } - } - - int sum_addr = 0; - int sum_flag = 0; - int sum_out_lods = 0; - - int local_addr = 0; - int local_flag = 0; - int local_out_lods = 0; - - if (ins_start < ins_end) { - for (int p = 
ins_start; p < ins_end; p++) { - int previous = -1; - if (p == ins_start) { - previous = 0; - } else { - previous = prefix_sum_data[p - ins_start - 1]; - } - - prefix_sum_data[p - ins_start] = - previous + - flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); - } - - local_addr = prefix_sum_data[ins_end - 1 - ins_start]; - sum_addr = local_addr; - - for (int p = ins_start; p < ins_end; p++) { - local_flag += flag_data[p - ins_start]; - } - sum_flag = local_flag; - - for (int p = ins_start; p < ins_end; p++) { - local_out_lods += - flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); - } - - sum_out_lods = local_out_lods; - } - - for (int i = 1; i < warp_thread_num; i *= 2) { - int temp_addr = g.shfl_up(sum_addr, i); - int temp_flag = g.shfl_up(sum_flag, i); - int temp_out_lods = g.shfl_up(sum_out_lods, i); - - if (g.thread_rank() >= i) { - sum_addr += temp_addr; - sum_flag += temp_flag; - sum_out_lods += temp_out_lods; - } - } - - if (g.thread_rank() == warp_thread_num - 1) { - shr[gid] = sum_addr; - shr2[gid] = sum_flag; - shr3[gid] = sum_out_lods; - } - - b.sync(); - - int sum_addr2 = 0; - int sum_flag2 = 0; - int sum_out_lods2 = 0; - - // communicate between warp - if (g.thread_rank() < group_num) { - sum_addr2 = shr[g.thread_rank()]; - sum_flag2 = shr2[g.thread_rank()]; - sum_out_lods2 = shr3[g.thread_rank()]; - } - - for (int i = 1; i < group_num; i *= 2) { - int temp_addr2 = g.shfl_up(sum_addr2, i); - int temp_flag2 = g.shfl_up(sum_flag2, i); - int temp_out_lods2 = g.shfl_up(sum_out_lods2, i); - - if (g.thread_rank() >= i) { - sum_addr2 += temp_addr2; - sum_flag2 += temp_flag2; - sum_out_lods2 += temp_out_lods2; - } - } - - int sum_addr3 = g.shfl(sum_addr2, gid); - int sum_flag3 = g.shfl(sum_flag2, gid); - int sum_out_lods3 = g.shfl(sum_out_lods2, gid); - - int p_flag; - int p_addr; - int p_out_lods; - - if (ins_start < ins_end) { - p_addr = sum_addr3 - shr[gid] + sum_addr - local_addr; - p_flag = sum_flag3 - shr2[gid] + sum_flag - local_flag; - p_out_lods = sum_out_lods3 - shr3[gid] + sum_out_lods - local_out_lods; - - for (int p = ins_start; p < ins_end; p++) { - if (ins_start == p) { - prefix_sum_data2[p - ins_start] = p_addr; - } else { - prefix_sum_data2[p - ins_start] = - prefix_sum_data2[p - ins_start - 1] + - flag_data[p - ins_start - 1] * - (x1_lods_data[p] - x1_lods_data[p - 1]); - } - } - - if (gid == 0 && g.thread_rank() == group_num - 1) { - *out_idx_data = (sum_flag2 + 1); - map_lods_data[sum_flag2] = sum_flag2; - } - } - - int sum_out_lods4 = g.shfl(sum_out_lods2 + 1, group_num - 1); - - if (ins_start < ins_end) { - int out_lods_idx = p_flag + 1; - for (int p = ins_start; p < ins_end; p++) { - if (flag_data[p - ins_start] == 1) { - size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p]; - int t = out_lods_idx - 1; - int previous; - if (out_lods_idx == p_flag + 1) { - previous = p_out_lods; - } else { - previous = out_lods_data[t]; - } - map_data[t * 3] = (int64_t)previous; - map_data[t * 3 + 1] = x1_lods_data[p]; - map_lods_data[t] = t; - out_lods_data[out_lods_idx] = previous + batch_len; - map_data[t * 3 + 2] = batch_len; - out_lods_idx++; - } - } - - // fill loss_weight_data - if (sum_out_lods4 > 1) { - int out_data_num = sum_out_lods4 - 1; - int out_start = ins_start; - if (out_start < out_data_num) { - int out_end = ins_end >= out_data_num ? 
out_data_num : ins_end; - for (int p = out_start; p < out_end; p++) { - loss_weight_data[p] = fill_value; - } - } - } - - for (int p = ins_start; p < ins_end; p++) { - // copy logic - if (flag_data[p - ins_start] == 1) { - auto output_start_idx = prefix_sum_data2[p - ins_start]; - T* dst = out_data + output_start_idx * x1_embed_size; - const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; - const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; - for (const T* j = src_start; j != src_end; dst++, j++) { - *dst = *j; - } - } - } - } - - b.sync(); -} - -template -__global__ void copy_grad_kernel(const size_t N, - const int ins_per_thread, - const T* out_grad_data, - T* x1_grad_data, - const int64_t* map_data, - int x1_embed_size) { - // N is instance num - // one threads for one instance - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int ins_start = idx * ins_per_thread; - int ins_end = (idx + 1) * ins_per_thread; - if (ins_start >= N) { - return; - } - if (ins_end > N) ins_end = N; - for (int p = ins_start; p < ins_end; p++) { - T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size; - const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size; - const T* src_end = - out_grad_data + (map_data[p * 3] + map_data[p * 3 + 2]) * x1_embed_size; - - for (const T* j = src_start; j != src_end; dst++, j++) { - *dst = *j; - } - } -} - -#endif - -template -class FilterByInstagGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { -#if defined(PADDLE_WITH_CUDA) - - auto gpu_place = context.GetPlace(); - - gpuStream_t current_stream = context.cuda_device_context().stream(); - - int max_thread_num_per_block = 1024; - // context.cuda_device_context().GetMaxThreadsPerBlock(); - // X1 is global FC output - // Dim [batch size, embedding size] - const phi::DenseTensor* x1 = context.Input("Ins"); - bool is_lod = context.Attr("is_lod"); - - int is_x1_lod = -1; - if (is_lod) - is_x1_lod = 1; - else - is_x1_lod = 0; - - int64_t out_val_if_empty = context.Attr("out_val_if_empty"); - size_t x1_embed_size = x1->dims()[1]; - // X2 is ins tag list - // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... 
]] - const phi::DenseTensor* x2 = context.Input("Ins_tag"); - // expected auto = const int64_t - const int64_t* x2_data = x2->data(); - - // X3 is local fc tag list - // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] - const phi::DenseTensor* x3 = context.Input("Filter_tag"); - const int64_t* x3_data = x3->data(); - - Vector x2_lods; - if (x2->lod().size() != 0) { // lod_level = 1 - x2_lods = x2->lod()[0]; - } else { // lod_level = 0 - const size_t x2_lods_size = x2->dims()[0]; - const size_t instag_per_num = x2->dims()[1]; - // x2_lods.resize(x2->dims()[0] + 1); - // move to cuda - x2_lods.push_back(0); - for (size_t i = 0; i < x2_lods_size; i++) { - x2_lods.push_back(x2_lods.back() + instag_per_num); - } - } - - const size_t x2_lods_size = x2_lods.size() - 1; - phi::MixVector mixv_x2_lods(&x2_lods); - - size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place); - - Vector x1_lods; - if (!is_x1_lod) { - x1_lods.push_back(0); - for (int i = 0; i < x1->dims()[0]; i++) { - x1_lods.push_back(i + 1); - } - } else { - // x1_lods = context.Input("Ins")->lod()[0]; - // new: lod_level=0 => lod() return {} - if (x1->lod().size() != 0) { // lod_level = 1 - x1_lods = x1->lod()[0]; - } else { // lod_level = 0 - // x1_lods.resize(x1->dims()[0] + 1); - // move to cuda - x1_lods.push_back(0); - for (int i = 0; i < x1->dims()[0]; i++) { - x1_lods.push_back(i + 1); - } - } - } - - phi::MixVector mixv_x1_lods(&x1_lods); - - size_t* x1_lods_data = mixv_x1_lods.CUDAMutableData(gpu_place); - auto* x1_data = x1->data(); - - // set output value - // for those whose ins been dropout, set 0 for whole lines. - // otherwise, copy whole line - // Dim [local fc count, batch size, embedding size] - phi::DenseTensor* out = context.Output("Out"); - phi::DenseTensor* map = context.Output("IndexMap"); - phi::DenseTensor* loss_weight = - context.Output("LossWeight"); - - int out_first = x1_lods.back(); - - out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size})); - map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3})); - loss_weight->Resize(phi::make_ddim({(int64_t)x2_lods_size, 1})); - - T* out_data = out->mutable_data(gpu_place); - int64_t* map_data = map->mutable_data(gpu_place); - float* loss_weight_data = loss_weight->mutable_data(gpu_place); - - int block_size = max_thread_num_per_block; - int ins_per_thread = (x2_lods_size + block_size - 1) / block_size; - dim3 block_dim(block_size); - dim3 grid_dim(1); - - Vector out_lods(x2_lods_size + 1, 0); - Vector map_lods(x2_lods_size + 1, 0); - - phi::MixVector mixv_out_lods(&out_lods); - phi::MixVector mixv_map_lods(&map_lods); - - // thrust::device_vector out_idx(1); - Vector out_idx(1, 0); - phi::MixVector mixv_out_idx(&out_idx); - - size_t* out_idx_data = mixv_out_idx.CUDAMutableData(gpu_place); - size_t* out_lods_data = mixv_out_lods.CUDAMutableData(gpu_place); - size_t* map_lods_data = mixv_map_lods.CUDAMutableData(gpu_place); - - float fill_value = 1.0; - - filter_copy_fuse_kernel<<>>( - x2_lods_size, - ins_per_thread, - x1_lods_data, - x2_lods_data, - x2_data, - x3_data, - x3->numel(), - out_data, - map_data, - map_lods_data, - out_lods_data, - out_idx_data, - x1_data, - x1_embed_size, - loss_weight_data, - fill_value); - - platform::GpuStreamSync(current_stream); - - mixv_out_lods.resize(mixv_out_idx[0]); - - if (mixv_out_lods.size() - 1 > 0) { - out->Resize(phi::make_ddim( - {(int64_t)mixv_out_lods.back(), (int64_t)x1_embed_size})); - - map->Resize(phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 3})); - loss_weight->Resize( - 
phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 1})); - - } else { - out->Resize(phi::make_ddim({1, (int64_t)x1_embed_size})); - map->Resize(phi::make_ddim({1, 3})); - loss_weight->Resize(phi::make_ddim({1, 1})); - } - - if (mixv_out_lods.size() - 1 > 0) { - map_lods.resize(mixv_out_lods.size()); - - mixv_map_lods.CopyToCPU(); - - std::vector> map_lod_info; - map_lod_info.emplace_back(map_lods); - - map->set_lod(map_lod_info); - loss_weight->set_lod(map_lod_info); - - mixv_out_lods.CopyToCPU(); - std::vector> out_lod_info; - out_lod_info.emplace_back(out_lods); - out->set_lod(out_lod_info); - - } else { - Vector map_lods(2, 0); - phi::MixVector mixv_map_lods(&map_lods); - thrust::device_ptr map_data_ptr(map_data); - - map_data_ptr[0] = 0; - map_data_ptr[1] = 1; - map_data_ptr[2] = 1; - - mixv_map_lods[0] = 0; - mixv_map_lods[1] = 1; - mixv_out_lods.push_back(1); - - mixv_map_lods.CopyToCPU(); - mixv_out_lods.CopyToCPU(); - - std::vector> map_lod_info; - map_lod_info.emplace_back(map_lods); - map->set_lod(map_lod_info); - - loss_weight->set_lod(map_lod_info); - - std::vector> out_lod_info; - out_lod_info.emplace_back(out_lods); - out->set_lod(out_lod_info); - - thrust::device_ptr out_data_ptr(out_data); - - // gpu kernel - if (std::is_same::value) { - thrust::fill(out_data_ptr, - out_data_ptr + out->numel(), - static_cast(out_val_if_empty)); - } else if (std::is_same::value) { - thrust::fill(out_data_ptr, - out_data_ptr + out->numel(), - static_cast(out_val_if_empty)); - } else if (std::is_same::value) { - thrust::fill(out_data_ptr, - out_data_ptr + out->numel(), - static_cast(out_val_if_empty)); - } else { - thrust::fill(out_data_ptr, - out_data_ptr + out->numel(), - static_cast(out_val_if_empty)); - } - - thrust::device_ptr loss_weight_data_ptr(loss_weight_data); - loss_weight_data_ptr[0] = 0; - } - -#endif - } -}; - -template -class FilterByInstagGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { -#if defined(PADDLE_WITH_CUDA) - - auto gpu_place = context.GetPlace(); - gpuStream_t current_stream = context.cuda_device_context().stream(); - auto max_thread_num_per_block = 1024; - auto* output_grad = - context.Input(framework::GradVarName("Out")); - auto* x1_grad = - context.Output(framework::GradVarName("Ins")); - auto* loss_weight = context.Input("LossWeight"); - auto* mmap = context.Input("IndexMap"); - auto* x1 = context.Input("Ins"); - - x1_grad->set_lod(context.Input("Ins")->lod()); - x1_grad->Resize(x1->dims()); - - auto* mmap_data = mmap->data(); - // expected auto = T - auto* output_grad_data = output_grad->data(); - auto* loss_weight_data = loss_weight->data(); - - // expected auto = T - auto* x1_grad_data = x1_grad->mutable_data(gpu_place); - thrust::device_ptr x1_grad_data_ptr(x1_grad_data); - thrust::device_ptr loss_weight_data_ptr(loss_weight_data); - - thrust::fill( - x1_grad_data_ptr, x1_grad_data_ptr + x1->dims()[0] * x1->dims()[1], 0); - - if (loss_weight->numel() != 1 || loss_weight_data_ptr[0] != 0) { - auto output_dims = output_grad->dims(); - int x1_embed_size = output_dims[1]; - - // one thread for multi-instances - int block_size = max_thread_num_per_block; - - size_t N = mmap->dims()[0]; - dim3 block_dim(block_size); - - dim3 grid_dim((N + block_size - 1) / block_size); - - const int ins_per_thread = 1; - - copy_grad_kernel<<>>( - N, - ins_per_thread, - output_grad_data, - x1_grad_data, - mmap_data, - x1_embed_size); - - cudaStreamSynchronize(current_stream); - } - -#endif - } -}; - 
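For reference, the deleted filter_copy_fuse_kernel above computes every instance's output offset with a two-level inclusive scan: a shuffle scan inside each warp, per-warp totals staged in shared memory (shr, shr2, shr3), and a second shuffle scan across those totals. A minimal sketch of the warp-level step, assuming a full 32-lane warp and raw shuffle intrinsics in place of the kernel's cooperative-groups tile:

```cuda
// Sketch of the warp-level inclusive scan used above (the real kernel runs
// it three times in lock-step, for addresses, flags, and output lods).
// Assumes a full 32-lane warp; lane i ends up with the sum of lanes 0..i.
__device__ int WarpInclusiveScan(int value) {
  const unsigned kFullMask = 0xffffffffu;
  const int lane = threadIdx.x & 31;
  for (int offset = 1; offset < 32; offset *= 2) {
    int up = __shfl_up_sync(kFullMask, value, offset);  // value from lane-offset
    if (lane >= offset) value += up;  // lower lanes keep their partial sums
  }
  return value;
}
```

The cross-warp pass in the kernel is this same loop applied to the per-warp totals staged in shared memory, which is what the `shfl` over `group_num` entries implements.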
-} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(filter_by_instag, - GPU, - ALL_LAYOUT, - ops::FilterByInstagGPUKernel, - float, - double, - int32_t, - int64_t) {} - -PD_REGISTER_STRUCT_KERNEL(filter_by_instag_grad, - GPU, - ALL_LAYOUT, - ops::FilterByInstagGradGPUKernel, - float, - double, - int32_t, - int64_t) {} diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h deleted file mode 100644 index 04dc713a4dcc90..00000000000000 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/phi/core/mixed_vector.h" - -namespace paddle { -namespace operators { -using SelectedRows = phi::SelectedRows; - -template -using Vector = phi::Vector; - -template -class FilterByInstagKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // X1 is global FC output - // Dim [batch size, embedding size] - auto* x1 = context.Input("Ins"); - bool is_x1_lod = context.Attr("is_lod"); - int64_t out_val_if_empty = context.Attr("out_val_if_empty"); - // X2 is ins tag list - // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... ]] - auto* x2 = context.Input("Ins_tag"); - // X3 is local fc tag list - // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] - auto* x3 = context.Input("Filter_tag"); - - std::unordered_set filter_tag; - auto* x3_data = x3->data(); - size_t len = x3->dims()[0]; - for (size_t i = 0; i < len; i++) { - filter_tag.insert(x3_data[i]); - } - - // expected auto = const int64_t - auto* x2_data = x2->data(); - // e.g get [0, 1, 2, 3, ...] 
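When an input arrives without a LoD (lod_level = 0), both the GPU kernel above and the CPU kernel here synthesize uniform offsets from the dense shape: every instance contributes a fixed number of entries, so the offsets are just multiples of that count. A stand-alone sketch under that assumption; the function name is illustrative:

```cpp
// Sketch: synthesize LoD offsets for a dense [num_instances, tags_per_ins]
// tensor. Offsets start at 0 and grow by a fixed stride, so instance i owns
// the half-open range [lod[i], lod[i+1]).
#include <cstddef>
#include <vector>

std::vector<size_t> MakeUniformLod(size_t num_instances, size_t tags_per_ins) {
  std::vector<size_t> lod(1, 0);
  for (size_t i = 0; i < num_instances; ++i) {
    lod.push_back(lod.back() + tags_per_ins);
  }
  return lod;  // e.g. num_instances=3, tags_per_ins=2 -> {0, 2, 4, 6}
}
```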
- // size_t x2_lods_size = x2->dims()[0]; - // size_t instag_num_per_ins = x2->dims()[1]; - - Vector x2_lods(1, 0); - if (x2->lod().size() != 0) { // lod_level = 1 - x2_lods = x2->lod()[0]; - } else { // lod_level = 0 - const size_t x2_lods_size = x2->dims()[0]; - const size_t instag_num_per_ins = x2->dims()[1]; - for (size_t i = 0; i < x2_lods_size; i++) { - x2_lods.push_back(x2_lods.back() + instag_num_per_ins); - } - } - - Vector x1_lods(1, 0); - if (!is_x1_lod) { - for (int i = 0; i < x1->dims()[0]; i++) { - x1_lods.push_back(i + 1); - } - } else { - // new: lod_level=0 => lod() return {} - if (x1->lod().size() != 0) { - x1_lods = x1->lod()[0]; - } else { - for (int i = 0; i < x1->dims()[0]; i++) { - x1_lods.push_back(i + 1); - } - } - } - std::unordered_map mmap_aux; - Vector out_lods(1, 0); - for (size_t i = 0; i < x2_lods.size() - 1; i++) { - for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) { - if (filter_tag.find(x2_data[j]) != filter_tag.end()) { - size_t batch_len = x1_lods[i + 1] - x1_lods[i]; - mmap_aux[out_lods.back()] = x1_lods[i]; - out_lods.push_back(out_lods.back() + batch_len); - break; - } - } - } - // set output value - // for those whose ins been dropout, set 0 for whole lines. - // otherwise, copy whole line - // Dim [local fc count, batch size, embedding size] - phi::DenseTensor* out = context.Output("Out"); - phi::DenseTensor* map = context.Output("IndexMap"); - phi::DenseTensor* loss_weight = - context.Output("LossWeight"); - // expected auto = const T - auto* x1_data = x1->data(); - // expected auto = T - size_t x1_embed_size = x1->dims()[1]; - if (out_lods.size() - 1 > 0) { - out->Resize( - phi::make_ddim({(int64_t)out_lods.back(), (int64_t)x1_embed_size})); - map->Resize(phi::make_ddim({(int64_t)out_lods.size() - 1, 3})); - loss_weight->Resize(phi::make_ddim({(int64_t)out_lods.size() - 1, 1})); - } else { - out->Resize(phi::make_ddim({1, (int64_t)x1_embed_size})); - map->Resize(phi::make_ddim({1, 3})); - loss_weight->Resize(phi::make_ddim({1, 1})); - } - auto* out_data = out->mutable_data(context.GetPlace()); - auto* map_data = map->mutable_data(context.GetPlace()); - auto* loss_weight_data = - loss_weight->mutable_data(context.GetPlace()); - if (out_lods.size() - 1 > 0) { - Vector map_lods; - for (size_t i = 0; i < out_lods.size() - 1; i++) { - map_data[i * 3] = (int64_t)out_lods[i]; - map_data[i * 3 + 1] = mmap_aux[map_data[i * 3]]; - map_data[i * 3 + 2] = out_lods[i + 1] - out_lods[i]; - map_lods.push_back(i); - } - map_lods.push_back(out_lods.size() - 1); - std::vector> map_lod_info; - map_lod_info.push_back(map_lods); - - map->set_lod(map_lod_info); - loss_weight->set_lod(map_lod_info); - std::vector> out_lod_info; - out_lod_info.push_back(out_lods); - out->set_lod(out_lod_info); - memset(out_data, 0, out->numel() * sizeof(T)); - for (int i = 0; i < loss_weight->numel(); i++) { - loss_weight_data[i] = 1; - } - - for (size_t i = 0; i < out_lods.size() - 1; i++) { - size_t pos = out_lods[i]; - for (int k = map_data[i * 3 + 1]; - k < map_data[i * 3 + 1] + map_data[i * 3 + 2]; - k++) { - memcpy(out_data + pos * x1_embed_size, - x1_data + k * x1_embed_size, - x1_embed_size * sizeof(T)); - ++pos; - } - } - } else { - Vector map_lods; - map_data[0] = 0; - map_data[1] = 1; - map_data[2] = 1; - map_lods.push_back(0); - map_lods.push_back(1); - out_lods.push_back(1); - std::vector> map_lod_info; - map_lod_info.push_back(map_lods); - map->set_lod(map_lod_info); - loss_weight->set_lod(map_lod_info); - std::vector> out_lod_info; - out_lod_info.push_back(out_lods); - 
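The IndexMap built in this kernel is the contract between the forward and backward passes: one (out_offset, src_offset, row_count) triple per kept instance. A hedged sketch of the row copy it drives, with an assumed float element type; the grad kernels run the same loop with source and destination swapped:

```cpp
// Sketch of the copy driven by IndexMap: map[i*3+0] is the instance's row
// offset in the filtered output, map[i*3+1] its row offset in the original
// input, map[i*3+2] its row count. Illustrative helper, not the op's API.
#include <cstdint>
#include <cstring>

void CopyKeptRows(const float* x1, float* out, const int64_t* map,
                  int64_t kept_instances, int64_t embed_size) {
  for (int64_t i = 0; i < kept_instances; ++i) {
    const int64_t dst = map[i * 3 + 0];
    const int64_t src = map[i * 3 + 1];
    const int64_t rows = map[i * 3 + 2];
    std::memcpy(out + dst * embed_size, x1 + src * embed_size,
                sizeof(float) * static_cast<size_t>(rows * embed_size));
  }
}
```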
out->set_lod(out_lod_info); - for (int64_t oi = 0; oi < out->numel(); ++oi) { - if (std::is_same::value) { - out_data[oi] = (int32_t)out_val_if_empty; - } else if (std::is_same::value) { - out_data[oi] = (int64_t)out_val_if_empty; - } else if (std::is_same::value) { - out_data[oi] = static_cast(out_val_if_empty); - } else { - out_data[oi] = static_cast(out_val_if_empty); - } - } - loss_weight_data[0] = 0; - } - } -}; - -template -class FilterByInstagGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* output_grad = - context.Input(framework::GradVarName("Out")); - auto* x1_grad = - context.Output(framework::GradVarName("Ins")); - auto* loss_weight = context.Input("LossWeight"); - auto* mmap = context.Input("IndexMap"); - auto* x1 = context.Input("Ins"); - x1_grad->set_lod(context.Input("Ins")->lod()); - x1_grad->Resize(x1->dims()); - auto mmap_data = mmap->data(); - // expected auto = T - auto* output_grad_data = output_grad->data(); - - auto* loss_weight_data = loss_weight->data(); - // expected auto = T - auto* x1_grad_data = x1_grad->mutable_data(context.GetPlace()); - memset(x1_grad_data, 0, x1->dims()[0] * x1->dims()[1] * sizeof(T)); - if (loss_weight->numel() != 1 || loss_weight_data[0] != 0) { - auto output_dims = output_grad->dims(); - for (int i = 0; i < mmap->dims()[0]; i++) { - int src_ln = mmap_data[i * 3], dst_ln = mmap_data[i * 3 + 1]; - int line_cnt = mmap_data[i * 3 + 2]; - for (int l = 0; l < line_cnt; l++) { - for (int j = 0; j < output_dims[1]; j++) { - x1_grad_data[(dst_ln + l) * output_dims[1] + j] = - output_grad_data[(src_ln + l) * output_dims[1] + j]; - } - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index 7bacc5f9de3e02..ddb67eef4a3fa6 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -27,35 +27,53 @@ limitations under the License. */ namespace paddle { namespace operators { - -class FlattenOp : public framework::OperatorWithKernel { +// FIXME(zcd): flatten2 adds an intermediate output(XShape) based on flatten, +// the XShape is used to carry the shape and lod of X which will be used in +// flatten_grad, in this way, the framework can reuse the memory of X +// immediately the flatten2_op is finished. 
+// Considering compatibility issues, we could not fix flatten2_op +class Flatten2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Flatten"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Flatten"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Flatten2"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Flatten2"); const auto &axis = ctx->Attrs().Get("axis"); const auto &in_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument( "The axis should be greater than or equal to 0.")); - if (in_dims.size() > 0) { - PADDLE_ENFORCE_LE( - axis, - in_dims.size(), - platform::errors::InvalidArgument( - "The axis should be less than or equal to input tensor's rank.")); - } + PADDLE_ENFORCE_LE( + axis, + in_dims.size(), + platform::errors::InvalidArgument( + "The axis should be less than or equal to input tensor's rank")); - const auto &out_dims = GetOutputShape(axis, in_dims); + const auto &out_dims = Flatten2Op::GetOutputShape(axis, in_dims); ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); if (in_dims[0] == out_dims[0]) { // Only pass LoD when the first dimension of output and Input(X) // are the same. ctx->ShareLoD("X", "Out"); } + if (!ctx->HasOutput("XShape")) return; + // OP_INOUT_CHECK(ctx->HasOutput("XShape"), "Output", "XShape", "Flatten2"); + std::vector xshape_dims(in_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < in_dims.size(); ++i) { + xshape_dims[i + 1] = in_dims[i]; + } + ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims)); + ctx->ShareLoD("X", "XShape"); + } + + phi::KernelKey GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return phi::KernelKey(input_data_type, ctx.GetPlace()); } static std::vector GetOutputShape(const int axis, @@ -85,17 +103,9 @@ class FlattenOp : public framework::OperatorWithKernel { out_shape[1] = static_cast(inner); return out_shape; } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } }; -class FlattenOpMaker : public framework::OpProtoAndCheckerMaker { +class Flatten2OpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "(Tensor) A tensor of rank >= axis."); @@ -145,96 +155,6 @@ Case 2: We get: Out.shape = (1, 3 * 100 * 100 * 4) )DOC"); - } -}; - -class FlattenGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *context) const override { - context->SetOutputDim(framework::GradVarName("X"), - context->GetInputDim("X")); - context->ShareLoD("X", framework::GradVarName("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -template -class FlattenGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; 
- - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("flatten_grad"); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -// FIXME(zcd): flatten2 adds an intermediate output(XShape) based on flatten, -// the XShape is used to carry the shape and lod of X which will be used in -// flatten_grad, in this way, the framework can reuse the memory of X -// immediately the flatten2_op is finished. -// Considering compatibility issues, we could not fix flatten2_op -class Flatten2Op : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Flatten2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Flatten2"); - const auto &axis = ctx->Attrs().Get("axis"); - const auto &in_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(axis, - 0, - platform::errors::InvalidArgument( - "The axis should be greater than or equal to 0.")); - PADDLE_ENFORCE_LE( - axis, - in_dims.size(), - platform::errors::InvalidArgument( - "The axis should be less than or equal to input tensor's rank")); - - const auto &out_dims = FlattenOp::GetOutputShape(axis, in_dims); - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - if (in_dims[0] == out_dims[0]) { - // Only pass LoD when the first dimension of output and Input(X) - // are the same. - ctx->ShareLoD("X", "Out"); - } - if (!ctx->HasOutput("XShape")) return; - // OP_INOUT_CHECK(ctx->HasOutput("XShape"), "Output", "XShape", "Flatten2"); - std::vector xshape_dims(in_dims.size() + 1); - xshape_dims[0] = 0; - for (int i = 0; i < in_dims.size(); ++i) { - xshape_dims[i + 1] = in_dims[i]; - } - ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims)); - ctx->ShareLoD("X", "XShape"); - } - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class Flatten2OpMaker : public FlattenOpMaker { - public: - void Make() override { - FlattenOpMaker::Make(); AddOutput("XShape", "XShape is just used to store the shape and lod of X, which will " "be used in FlattenGradOp.") @@ -293,17 +213,6 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(FlattenGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(flatten, - ops::FlattenOp, - ops::FlattenOpMaker, - ops::FlattenGradOpMaker, - ops::FlattenGradOpMaker, - ops::FlattenOpInplaceInferer); -REGISTER_OPERATOR(flatten_grad, - ops::FlattenGradOp, - ops::FlattenGradInplaceInferer, - ops::FlattenGradNoNeedBufferVarsInferer); - REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker, @@ -314,20 +223,6 @@ REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp, ops::FlattenGradInplaceInferer); -REGISTER_OP_CPU_KERNEL(flatten, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel); -REGISTER_OP_CPU_KERNEL(flatten_grad, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel); REGISTER_OP_CPU_KERNEL(flatten2, ops::Flatten2Kernel, 
ops::Flatten2Kernel, diff --git a/paddle/fluid/operators/flatten_op.cu.cc b/paddle/fluid/operators/flatten_op.cu.cc index 6fe28c4a7b6896..59b9271f0af3c2 100644 --- a/paddle/fluid/operators/flatten_op.cu.cc +++ b/paddle/fluid/operators/flatten_op.cu.cc @@ -16,20 +16,6 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(flatten, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel); -REGISTER_OP_CUDA_KERNEL(flatten_grad, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel); REGISTER_OP_CUDA_KERNEL(flatten2, ops::Flatten2Kernel, ops::Flatten2Kernel, diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 513716047ed770..6942a0f7db2da4 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -28,14 +28,16 @@ namespace paddle { namespace operators { template -class FlattenKernel : public framework::OpKernel { +class Flatten2Kernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { + auto &axes = context.Attr("axis"); + auto *in = context.Input("X"); + auto x_dims = in->dims(); + auto *out = context.Output("Out"); - auto &axes = context.Attr("axis"); - auto x_dims = in->dims(); auto out_dims = phi::make_ddim(GetOutputShape(axes, x_dims)); out->mutable_data(context.GetPlace(), in->type()); @@ -68,48 +70,6 @@ class FlattenKernel : public framework::OpKernel { } }; -template -class FlattenGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_out = ctx.Input(framework::GradVarName("Out")); - auto in_dims = ctx.Input("X")->dims(); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopy( - *d_out, - ctx.GetPlace(), - ctx.template device_context(), - d_x); - d_x->Resize(in_dims); - } -}; - -template -class Flatten2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto &axes = context.Attr("axis"); - - auto *in = context.Input("X"); - auto x_dims = in->dims(); - - auto *out = context.Output("Out"); - - auto out_dims = phi::make_ddim( - FlattenKernel::GetOutputShape(axes, x_dims)); - - out->mutable_data(context.GetPlace(), in->type()); - framework::TensorCopy( - *in, - context.GetPlace(), - context.template device_context(), - out); - out->Resize(out_dims); - } -}; - template class Flatten2GradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/flatten_op_xpu.cc b/paddle/fluid/operators/flatten_op_xpu.cc index 0e79e7b7dda8cf..ec54a8f815ab42 100644 --- a/paddle/fluid/operators/flatten_op_xpu.cc +++ b/paddle/fluid/operators/flatten_op_xpu.cc @@ -19,18 +19,6 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL( - flatten, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel); -REGISTER_OP_XPU_KERNEL( - flatten_grad, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel); REGISTER_OP_XPU_KERNEL( flatten2, ops::Flatten2Kernel, diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 89ea5def6fa6bc..42c41effb80ed2 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -7,14 +7,11 @@ register_operators( EXCLUDES fused_bn_activation_op conv_fusion_op - fusion_transpose_flatten_concat_op fusion_conv_inception_op - fused_fc_elementwise_layernorm_op self_dp_attention_op skip_layernorm_op yolo_box_head_op yolo_box_post_op - fused_embedding_eltwise_layernorm_op fusion_group_op fusion_gru_op fusion_lstm_op @@ -61,22 +58,15 @@ if(WITH_GPU OR WITH_ROCM) if(NOT ${CUDNN_VERSION} VERSION_LESS 7100) op_library(conv_fusion_op) endif() - # fusion_transpose_flatten_concat_op # HIP not support cudnnTransformTensor - if(NOT WITH_ROCM) - op_library(fusion_transpose_flatten_concat_op) - endif() # fusion_conv_inception_op needs cudnn 7 above # HIP not support cudnnConvolutionBiasActivationForward if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100)) op_library(fusion_conv_inception_op) endif() - # fused_fc_elementwise_layernorm_op - op_library(fused_fc_elementwise_layernorm_op) op_library(skip_layernorm_op) op_library(yolo_box_head_op) op_library(yolo_box_post_op) - op_library(fused_embedding_eltwise_layernorm_op DEPS bert_encoder_functor) op_library(fused_gate_attention_op) # fusion_group if(NOT APPLE AND NOT WIN32) diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.h b/paddle/fluid/operators/fused/fused_bn_activation_op.h index c82c9aca413c37..3b04eb1bc59ed5 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.h +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.h @@ -88,17 +88,5 @@ class FusedBatchNormActOpInferVarType } }; -template -class FusedBatchNormActKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class FusedBatchNormActGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu deleted file mode 100644 index 1fa7ff1826b071..00000000000000 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ /dev/null @@ -1,387 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/flags.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/norm_utils.h" - -PHI_DECLARE_bool(cudnn_batchnorm_spatial_persistent); - -namespace paddle { -namespace operators { -template -using CudnnDataType = platform::CudnnDataType; -template -using BatchNormParamType = typename CudnnDataType::BatchNormParamType; - -template -class FusedBatchNormAddActKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { -#if CUDNN_VERSION < 7401 - PADDLE_THROW(phi::errors::Unimplemented( - "The fused_bn_add_activation operator is not supported on GPU " - "when CUDNN version < 7.4.1")); -#endif - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto &dev_ctx = ctx.template device_context(); - double epsilon = static_cast(ctx.Attr("epsilon")); - float momentum = ctx.Attr("momentum"); - std::string act_type = ctx.Attr("act_type"); - - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - - // Get the size for each dimension. - // NHWC [batch_size, in_height, in_width, in_channels] - const auto *x = ctx.Input("X"); - const auto *z = ctx.Input("Z"); - const auto &in_dims = x->dims(); - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - dev_ctx.Alloc>( - mean_out, mean_out->numel() * sizeof(BatchNormParamType)); - dev_ctx.Alloc>( - variance_out, variance_out->numel() * sizeof(BatchNormParamType)); - - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - dev_ctx.Alloc>( - saved_mean, saved_mean->numel() * sizeof(BatchNormParamType)); - dev_ctx.Alloc>( - saved_variance, - saved_variance->numel() * sizeof(BatchNormParamType)); - - auto *y = ctx.Output("Y"); - dev_ctx.Alloc(y, y->numel() * sizeof(T)); - - int N, C, H, W, D; - const DataLayout data_layout = DataLayout::kNHWC; - phi::funcs::ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); - - // ------------------- cudnn descriptors --------------------- - auto handle = dev_ctx.cudnn_handle(); - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); - - std::vector dims = {N, C, H, W, D}; - std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, - CudnnDataType::type, - in_dims.size() > 3 ? 
in_dims.size() : 4, - dims.data(), - strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, mode_)); - - double this_factor = 1. - momentum; - cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; - platform::ScopedActivationDescriptor scope_act_desc; - cudnnActivationDescriptor_t activation_desc_ = - scope_act_desc.descriptor(act_type); - size_t workspace_size = 0; - size_t reserve_space_size = 0; - void *reserve_space_ptr = nullptr; - void *workspace_ptr = nullptr; - phi::DenseTensor workspace_tensor; - // Create reserve space and workspace for batch norm. - // Create tensor for each batchnorm op, it will be used in the - // backward. Thus this tensor shouldn't be temp. - auto *reserve_space = ctx.Output("ReserveSpace"); - PADDLE_ENFORCE_NOT_NULL( - reserve_space, - platform::errors::NotFound( - "The argument ReserveSpace of batch_norm op is not found.")); - - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnOps=*/bnOps_, - /*xDesc=*/data_desc_, - /*zDesc=*/data_desc_, - /*yDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/activation_desc_, - /*sizeInBytes=*/&workspace_size)); - - // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnOps=*/bnOps_, - /*activationDesc=*/activation_desc_, - /*xDesc=*/data_desc_, - /*sizeInBytes=*/&reserve_space_size)); - - reserve_space->Resize({static_cast( - (reserve_space_size + phi::SizeOf(x->dtype()) - 1) / - phi::SizeOf(x->dtype()))}); - reserve_space_ptr = - dev_ctx.Alloc(reserve_space, reserve_space->numel() * sizeof(T)); - workspace_tensor.Resize( - {static_cast((workspace_size + phi::SizeOf(x->dtype()) - 1) / - phi::SizeOf(x->dtype()))}); - workspace_ptr = dev_ctx.Alloc(&workspace_tensor, - workspace_tensor.numel() * sizeof(T)); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTrainingEx( - handle, - mode_, - bnOps_, - CudnnDataType::kOne(), - CudnnDataType::kZero(), - data_desc_, - x->template data(), - data_desc_, - z->template data(), - data_desc_, - y->template data(), - bn_param_desc_, - scale->template data>(), - bias->template data>(), - this_factor, - dev_ctx.template Alloc>( - mean_out, mean_out->numel() * sizeof(BatchNormParamType)), - dev_ctx.template Alloc>( - variance_out, - variance_out->numel() * sizeof(BatchNormParamType)), - epsilon, - dev_ctx.template Alloc>( - saved_mean, - saved_mean->numel() * sizeof(BatchNormParamType)), - dev_ctx.template Alloc>( - saved_variance, - saved_variance->numel() * sizeof(BatchNormParamType)), - activation_desc_, - workspace_ptr, - workspace_size, - reserve_space_ptr, - reserve_space_size)); - - // clean when exit. 
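cudnnSetTensorNdDescriptor, as called above, takes the dims array in N, C, spatial order, so the kernel expresses its NHWC layout purely through the strides: channels get stride 1 and every other axis strides over the C values beneath it. A stand-alone sketch of that computation; the struct is illustrative:

```cpp
// Sketch of the NHWC(D) stride computation passed to
// cudnnSetTensorNdDescriptor above: dims are listed as {N, C, H, W, D} but
// channels are innermost in memory, matching the strides in the kernel.
#include <vector>

struct TensorNdDesc {
  std::vector<int> dims;
  std::vector<int> strides;
};

TensorNdDesc MakeNhwcDesc(int n, int c, int h, int w, int d) {
  return TensorNdDesc{
      {n, c, h, w, d},
      {h * w * d * c, 1, w * d * c, d * c, c}};
}
```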
- PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); - } -}; - -template -class FusedBatchNormAddActGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { -#if CUDNN_VERSION < 7401 - PADDLE_THROW(phi::errors::Unimplemented( - "The fused_bn_add_activation operator is not supported on GPU " - "when CUDNN version < 7.4.1")); -#endif - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - std::string act_type = ctx.Attr("act_type"); - - const auto *x = ctx.Input("X"); - const auto *y = ctx.Input("Y"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *reserve_space = ctx.Input("ReserveSpace"); - - auto &dev_ctx = ctx.template device_context(); - const auto &in_dims = x->dims(); - - int N, C, H, W, D; - const DataLayout data_layout = DataLayout::kNHWC; - phi::funcs::ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); - - // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_z = ctx.Output(framework::GradVarName("Z")); - auto *d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - d_x->mutable_data(ctx.GetPlace()); - d_z->mutable_data(ctx.GetPlace()); - PADDLE_ENFORCE_EQ( - d_scale && d_bias, - true, - platform::errors::PreconditionNotMet( - "Both the scale grad and the bias grad must not be null.")); - d_scale->mutable_data>(ctx.GetPlace()); - d_bias->mutable_data>(ctx.GetPlace()); - PADDLE_ENFORCE_EQ(scale->dims().size(), - 1UL, - platform::errors::PreconditionNotMet( - "The scale only has one dimension.")); - PADDLE_ENFORCE_EQ( - scale->dims()[0], - C, - platform::errors::PreconditionNotMet( - "The size of scale is equal to the channel of Input(X).")); - - std::vector dims = {N, C, H, W, D}; - std::vector strides = {H * W * C * D, 1, W * D * C, D * C, C}; - // ------------------- cudnn descriptors --------------------- - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, - CudnnDataType::type, - in_dims.size() > 3 ? 
in_dims.size() : 4, - dims.data(), - strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, mode_)); - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); - const auto *saved_mean_data = - saved_mean->template data>(); - const auto *saved_var_data = - saved_var->template data>(); - - size_t workspace_size = 0; - void *workspace_ptr = nullptr; - phi::DenseTensor workspace_tensor; - auto reserve_space_size = reserve_space->memory_size(); - cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; - platform::ScopedActivationDescriptor scope_act_desc; - cudnnActivationDescriptor_t activation_desc_ = - scope_act_desc.descriptor(act_type); - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( - /*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnOps=*/bnOps_, - /*xDesc=*/data_desc_, - /*yDesc=*/data_desc_, - /*dyDesc=*/data_desc_, - /*dzDesc=*/data_desc_, - /*dxDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/activation_desc_, - /*sizeInBytes=*/&workspace_size)); - - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), x->dtype(), workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackwardEx( - /*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnOps=*/bnOps_, - /*alphaDataDiff=*/CudnnDataType::kOne(), - /*betaDataDiff=*/CudnnDataType::kZero(), - /*alphaParamDiff=*/CudnnDataType::kOne(), - /*betaParamDiff=*/CudnnDataType::kZero(), - /*xDesc=*/data_desc_, - /*xData=*/x->template data(), - /*yDesc=*/data_desc_, - /*yData=*/y->template data(), - /*dyDesc=*/data_desc_, - /*dyData=*/d_y->template data(), - /*dzDesc=*/data_desc_, - /*dzData=*/d_z->template data(), - /*dxDesc=*/data_desc_, - /*dxData=*/d_x->template data(), - /*dBnScaleBiasDesc=*/bn_param_desc_, - /*bnScaleData=*/scale->template data>(), - /*bnBiasData=*/bias->template data>(), - /*dBnScaleData=*/d_scale->template data>(), - /*dBnBiasData=*/d_bias->template data>(), - /*epsilon=*/epsilon, - /*savedMean=*/saved_mean_data, - /*savedInvVariance=*/saved_var_data, - /*activationDesmc=*/activation_desc_, - /*workspace=*/workspace_ptr, - /*workSpaceSizeInBytes=*/workspace_size, - /*reserveSpace=*/const_cast(reserve_space->template data()), - /*reserveSpaceSizeInBytes=*/reserve_space_size)); - - // clean when exit. 
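Both the forward and backward kernels clamp epsilon before handing it to cuDNN rather than failing, since cuDNN rejects batch-norm epsilons below its floor. A stand-alone sketch, assuming CUDNN_BN_MIN_EPSILON's documented value of 1e-5:

```cpp
// Sketch of the epsilon clamping done in both kernels above: warn and fall
// back to the cuDNN minimum instead of erroring out. 1e-5 stands in for
// CUDNN_BN_MIN_EPSILON here.
#include <algorithm>
#include <cstdio>

double ClampBnEpsilon(double epsilon) {
  const double kCudnnBnMinEpsilon = 1e-5;  // assumed CUDNN_BN_MIN_EPSILON
  if (epsilon < kCudnnBnMinEpsilon) {
    std::fprintf(stderr,
                 "Provided epsilon %g is smaller than CUDNN_BN_MIN_EPSILON; "
                 "using CUDNN_BN_MIN_EPSILON instead.\n",
                 epsilon);
  }
  return std::max(epsilon, kCudnnBnMinEpsilon);
}
```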
- PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -PD_REGISTER_STRUCT_KERNEL(fused_bn_add_activation, - GPU, - ALL_LAYOUT, - ops::FusedBatchNormAddActKernel, - plat::float16) {} -PD_REGISTER_STRUCT_KERNEL(fused_bn_add_activation_grad, - GPU, - ALL_LAYOUT, - ops::FusedBatchNormAddActGradKernel, - plat::float16) {} diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h index 215ccfdde5e026..82967b043d89e8 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h @@ -89,17 +89,5 @@ class FusedBatchNormAddActOpInferVarType } }; -template -class FusedBatchNormAddActKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class FusedBatchNormAddActGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc index 778e6ed277fd7e..b11840866d46b3 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc @@ -405,7 +405,7 @@ class FusedElemwiseAddActivationOp : public FusedElemwiseActivationOp { std::vector functor_names = ctx->Attrs().Get>("functor_list"); bool elemntwise_add_detected = false; - for (auto names : functor_names) { + for (auto const &names : functor_names) { if (names == "elementwise_add") { elemntwise_add_detected = true; break; @@ -430,7 +430,7 @@ class FusedElemwiseAddActivationOpGrad : public FusedElemwiseActivationOpGrad { std::vector functor_names = ctx->Attrs().Get>("functor_list"); bool elemntwise_add_grad_detected = false; - for (auto names : functor_names) { + for (auto const &names : functor_names) { if (names == "elementwise_add_grad") { elemntwise_add_grad_detected = true; break; diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc deleted file mode 100644 index 6f2c61a5cf4701..00000000000000 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/errors.h" - -namespace paddle { -namespace operators { - -class EmbeddingEltWiseLayerNormOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* context) const override { - PADDLE_ENFORCE_EQ( - context->Inputs("Ids").size(), - context->Inputs("Embs").size(), - platform::errors::InvalidArgument( - "Two inputs of EmbeddingEltWiseLayerNormOp shoube be " - "the same size, but received the size of input Ids = %d," - " the size of input Embs = %d", - context->Inputs("Ids").size(), - context->Inputs("Embs").size())); - PADDLE_ENFORCE_GE(context->Inputs("Embs").size(), - 2UL, - platform::errors::InvalidArgument( - "Input Embs of EmbeddingEltWiseLayerNormOp should " - "have at least 2 tensors")); - PADDLE_ENFORCE_GE(context->Inputs("Ids").size(), - 2UL, - platform::errors::InvalidArgument( - "Input Ids of EmbeddingEltWiseLayerNormOp should " - "have at least 2 tensors")); - - PADDLE_ENFORCE_EQ( - context->HasInput("Bias"), - true, - platform::errors::InvalidArgument( - "Input(Bias) of EmbeddingEltWiseLayerNormOp should not be null.")); - - PADDLE_ENFORCE_EQ( - context->HasInput("Scale"), - true, - platform::errors::InvalidArgument( - "Input(Scale) of EmbeddingEltWiseLayerNormOp should not be null.")); - - PADDLE_ENFORCE_EQ( - context->HasOutput("Out"), - true, - platform::errors::InvalidArgument( - "Output(Out) of EmbeddingEltWiseLayerNormOp should not be null.")); - - // batch * seq_len * 1 - auto ids_dims = context->GetInputsDim("Ids"); - // word_num * hidden - auto embs_dims = context->GetInputsDim("Embs"); - // hidden - auto dims_bias = context->GetInputDim("Bias"); - int batch = ids_dims[0][0]; - int seq_len = ids_dims[0][1]; - int hidden = embs_dims[0][1]; - for (auto& embs_dim : embs_dims) { - PADDLE_ENFORCE_EQ(embs_dim.size(), - 2, - platform::errors::InvalidArgument( - "The Emb dim's size shoule be 2, but found %d.", - embs_dim.size())); - PADDLE_ENFORCE_EQ( - embs_dim[1], - dims_bias[0], - platform::errors::InvalidArgument( - "The second dims (%d) of the Embedding should be equal " - "to the Bias's size(%d).", - embs_dim[1], - dims_bias[0])); - PADDLE_ENFORCE_EQ( - embs_dim[1], - hidden, - platform::errors::InvalidArgument( - "The second dimension size(%d) of the Embedding should be " - "equal to the hidden's size(%d)", - embs_dim[1], - hidden)); - } - - auto dim_output = phi::make_ddim({batch, seq_len, hidden}); - context->SetOutputDim("Out", dim_output); - context->ShareLoD("Ids", /*->*/ "Out"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto inputs = ctx.MultiInput("Embs"); - auto input_data_type = framework::proto::VarType::Type(0); - bool flag = false; - for (auto* input : inputs) { - if (input->IsInitialized() && input->numel() > 0) { - input_data_type = framework::TransToProtoVarType(input->dtype()); - flag = true; - break; - } - } - if (flag == 0) { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "All Inputs of fused_embedding_eltwise_layernorm OP are Empty!")); - } - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class EmbeddingEltWiseLayerNormOpMaker - : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Ids", "Input id tensors of EmbeddingEltWiseLayerNorm op") - .AsDuplicable(); - AddInput("Embs", "Input emb tensors of 
EmbeddingEltWiseLayerNorm op") - .AsDuplicable(); - AddInput("Bias", "The LayerNorm Bias of EmbeddingEltWiseLayerNorm op"); - AddInput("Scale", "The LayerNorm Scale of EmbeddingEltWiseLayerNorm op"); - AddOutput("Out", "The output of EmbeddingEltWiseLayerNorm op"); - AddAttr("epsilon", - "Constant for numerical stability [default 1e-5].") - .SetDefault(1e-5) - .AddCustomChecker([](const float& epsilon) { - PADDLE_ENFORCE_GE( - epsilon, - 0.0f, - platform::errors::InvalidArgument( - "'epsilon' is %f, but it should be between 0.0 and 0.001", - epsilon)); - PADDLE_ENFORCE_LE( - epsilon, - 0.001f, - platform::errors::InvalidArgument( - "'epsilon' is %f, but it should be between 0.0 and 0.001.", - epsilon)); - }); - AddComment(R"DOC( -EmbeddingEltWiseLayerNorm Operator. - -This op is used for optimize the following structure in ernie model. -id1 -> lookup_table_op -> data1 -id2 -> lookup_table_op -> data2 - ... -idn -> lookup_table_op -> data_n -data1 + data2 + ... + data_n -> Y -Y -> layer_norm -> Out - -Not suggest to use in other case except has same structure as ernie. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(fused_embedding_eltwise_layernorm, - ops::EmbeddingEltWiseLayerNormOp, - ops::EmbeddingEltWiseLayerNormOpMaker); diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu deleted file mode 100644 index 35574331e17d7d..00000000000000 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ /dev/null @@ -1,162 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/math/bert_encoder_functor.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -template -class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto &device_ctx = context.template device_context(); - auto ids = context.MultiInput("Ids"); - auto embs = context.MultiInput("Embs"); - int input_num = static_cast(ids.size()); - - phi::DenseTensor in_ids_( - framework::TransToPhiDataType(framework::proto::VarType::INT64)), - in_embs_( - framework::TransToPhiDataType(framework::proto::VarType::INT64)); - framework::DDim in_dim{input_num}; - int device_id; -#ifdef PADDLE_WITH_HIP - hipGetDevice(&device_id); -#else - cudaGetDevice(&device_id); -#endif - - auto &dev_ctx = context.template device_context(); - - in_ids_.Resize(in_dim); - in_embs_.Resize(in_dim); - - int64_t *in_ids_d = dev_ctx.template Alloc( - &in_ids_, in_ids_.numel() * sizeof(int64_t)); - int64_t *in_embs_d = dev_ctx.template Alloc( - &in_embs_, in_embs_.numel() * sizeof(int64_t)); - - std::vector in1s, in2s; - for (int i = 0; i < input_num; ++i) { - in1s.push_back(reinterpret_cast(ids[i]->data())); - in2s.push_back(reinterpret_cast(embs[i]->data())); - } -#ifdef PADDLE_WITH_HIP - hipMemcpyAsync(in_ids_d, - in1s.data(), - sizeof(int64_t) * input_num, - hipMemcpyHostToDevice, - device_ctx.stream()); - hipMemcpyAsync(in_embs_d, - in2s.data(), - sizeof(int64_t) * input_num, - hipMemcpyHostToDevice, - device_ctx.stream()); -#else - cudaMemcpyAsync(in_ids_d, - in1s.data(), - sizeof(int64_t) * input_num, - cudaMemcpyHostToDevice, - device_ctx.stream()); - cudaMemcpyAsync(in_embs_d, - in2s.data(), - sizeof(int64_t) * input_num, - cudaMemcpyHostToDevice, - device_ctx.stream()); -#endif - - auto *bias = context.Input("Bias"); - auto *scale = context.Input("Scale"); - auto *out = context.Output("Out"); - - // should be (B * S * hidden) - auto id0_dims = ids[0]->dims(); - auto emb0_dims = embs[0]->dims(); - - int batch = id0_dims[0]; - int seq_len = id0_dims[1]; - int hidden = emb0_dims[1]; - - auto *bias_d = bias->data(); - auto *scale_d = scale->data(); - auto *output_d = dev_ctx.template Alloc(out, out->numel() * sizeof(T)); - - float eps = context.Attr("epsilon"); - - if (std::is_same::value) { - const half *scale_new = reinterpret_cast(scale_d); - const half *bias_new = reinterpret_cast(bias_d); - half *output_new = reinterpret_cast(output_d); - - math::EmbEltwiseLayerNormFunctor emb_eltwise_layernorm_func; - emb_eltwise_layernorm_func(batch, - seq_len, - hidden, - in_ids_d, - scale_new, - bias_new, - in_embs_d, - output_new, - eps, - input_num, - device_ctx.stream()); - } else { - math::EmbEltwiseLayerNormFunctor emb_eltwise_layernorm_func; - emb_eltwise_layernorm_func(batch, - seq_len, - hidden, - in_ids_d, - scale_d, - bias_d, - in_embs_d, - output_d, - eps, - input_num, - device_ctx.stream()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000 -PD_REGISTER_STRUCT_KERNEL(fused_embedding_eltwise_layernorm, - GPU, - ALL_LAYOUT, - 
ops::EmbeddingEltWiseLayerNormKernel, - float, - plat::float16) {} -#else -PD_REGISTER_STRUCT_KERNEL(fused_embedding_eltwise_layernorm, - GPU, - ALL_LAYOUT, - ops::EmbeddingEltWiseLayerNormKernel, - float) {} -#endif diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc deleted file mode 100644 index 6f00b160d98dfd..00000000000000 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc +++ /dev/null @@ -1,294 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("X"), "Input", "X", "FusedFcElementwiseLayernorm"); - OP_INOUT_CHECK( - ctx->HasInput("W"), "Input", "W", "FusedFcElementwiseLayernorm"); - OP_INOUT_CHECK( - ctx->HasInput("Y"), "Input", "Y", "FusedFcElementwiseLayernorm"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "FusedFcElementwiseLayernorm"); - - auto w_dims = ctx->GetInputDim("W"); - PADDLE_ENFORCE_EQ( - w_dims.size(), - 2, - platform::errors::InvalidArgument( - "The input Weight of fc is expected to be a 2-D tensor. " - "But received the number of Weight's dimensions is %d, ", - "Weight's shape is %s.", - w_dims.size(), - w_dims)); - - if (ctx->HasInput("Bias0")) { - auto bias0_dims = ctx->GetInputDim("Bias0"); - - PADDLE_ENFORCE_LE(bias0_dims.size(), - 2, - platform::errors::InvalidArgument( - "The input Bias of fc is expected to be an 1-D or " - "2-D tensor. But received the number of Bias's " - "dimensions is %d, Bias's shape is %s.", - bias0_dims.size(), - bias0_dims)); - - PADDLE_ENFORCE_EQ( - bias0_dims[bias0_dims.size() - 1], - w_dims[1], - platform::errors::InvalidArgument( - "The last dimension of input Bias is expected be equal " - "to the actual width of input Weight. But received the last " - "dimension of Bias is %d, Bias's shape is %s; " - "the actual width of Weight is %d, Weight's shape is %s.", - bias0_dims[bias0_dims.size() - 1], - bias0_dims, - w_dims[1], - w_dims)); - - if (bias0_dims.size() == 2) { - PADDLE_ENFORCE_EQ( - bias0_dims[0], - 1, - platform::errors::InvalidArgument( - "The first dimension of input Bias is expected to be 1, " - "but received %d, Bias's shape is %s.", - bias0_dims[0], - bias0_dims)); - } - } - - auto x_dims = ctx->GetInputDim("X"); - int x_num_col_dims = ctx->Attrs().Get("x_num_col_dims"); - PADDLE_ENFORCE_LT( - x_num_col_dims, - x_dims.size(), - platform::errors::InvalidArgument( - "The attribute x_num_col_dims used to flatten input X to " - "a 2-D tensor, is expected to be less than the number of " - "input X's dimensions. 
But received x_num_col_dims is %d, " - "the number of input X's dimensions is %d, input X's shape is %s.", - x_num_col_dims, - x_dims.size(), - x_dims)); - - auto x_mat_dims = phi::flatten_to_2d(x_dims, x_num_col_dims); - PADDLE_ENFORCE_EQ( - x_mat_dims[1], - w_dims[0], - platform::errors::InvalidArgument( - "The input's second dimension and weight's first dimension is " - "expected to be the same. But received input's second dimension is " - "%d, input's shape is %s; weight's first dimension is %d, weight's " - "shape is %s.", - x_mat_dims[1], - x_mat_dims, - w_dims[0], - w_dims)); - - std::vector fc_out_dims; - for (int i = 0; i < x_num_col_dims; ++i) { - fc_out_dims.push_back(x_dims[i]); - } - fc_out_dims.push_back(w_dims[1]); - - auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(phi::make_ddim(fc_out_dims), - y_dims, - platform::errors::InvalidArgument( - "The output's shape of fc is expected to be equal to " - "that of input Y. But received output's shape of fc " - "is %s, input Y's shape is %s.", - phi::make_ddim(fc_out_dims), - y_dims)); - - auto begin_norm_axis = ctx->Attrs().Get("begin_norm_axis"); - PADDLE_ENFORCE_LT( - begin_norm_axis, - y_dims.size(), - platform::errors::InvalidArgument( - "The attribute begin_norm_axis used to flatten input Y to a 2-D " - "tensor, is expected to be less than the number of input Y's " - "dimensions. But received begin_norm_axis is %d, the number of " - "input Y's dimensions is %d, input Y's shape is %s.", - begin_norm_axis, - y_dims.size(), - y_dims)); - - auto y_mat_dim = phi::flatten_to_2d(y_dims, begin_norm_axis); - int64_t dim_0 = y_mat_dim[0]; - int64_t dim_1 = y_mat_dim[1]; - if (ctx->HasInput("Scale")) { - auto scale_dims = ctx->GetInputDim("Scale"); - PADDLE_ENFORCE_EQ(scale_dims.size(), - 1, - platform::errors::InvalidArgument( - "The input Scale is expected to be an 1-D tensor. " - "But received the number of input Scale's " - "dimensions is %d, input Scale's shape is %s.", - scale_dims.size(), - scale_dims)); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - scale_dims[0], - dim_1, - platform::errors::InvalidArgument( - "The first dimension of input Scale is expected to be equal to " - "the second dimension of input Y after flattened. " - "But received the first dimension of input Scale is %d, input " - "Scale's shape is %s; the second dimension of flattened input " - "Y is %d, input Y's shape is %s, flattened axis is %d.", - scale_dims[0], - scale_dims, - dim_1, - y_dims, - begin_norm_axis)); - } - } - if (ctx->HasInput("Bias1")) { - auto bias1_dims = ctx->GetInputDim("Bias1"); - PADDLE_ENFORCE_EQ( - bias1_dims.size(), - 1, - platform::errors::InvalidArgument( - "The input Bias1 is expected to be an 1-D tensor. " - "But received the number of input Bias1's dimension is %d, " - "input Bias1's shape is %s.", - bias1_dims.size(), - bias1_dims)); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - bias1_dims[0], - dim_1, - platform::errors::InvalidArgument( - "The first dimension of input Bias1 is expected to be equal to " - "the second dimension of input Y after flattened. 
" - "But received the first dimension of input Bias1 is %d, input " - "Bias1's shape is %s; the second dimension of flatten input " - "Y is %d, input Y's shape is %s, flattened axis is %d.", - bias1_dims[0], - bias1_dims, - dim_1, - y_dims, - begin_norm_axis)); - } - } - - ctx->SetOutputDim("Out", y_dims); - if (ctx->HasOutput("Mean")) { - ctx->SetOutputDim("Mean", {dim_0}); - } - if (ctx->HasOutput("Variance")) { - ctx->SetOutputDim("Variance", {dim_0}); - } - ctx->ShareLoD("X", "Out"); - } -}; - -class FusedFCElementwiseLayerNormOpMaker - : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of fully connected operation"); - AddInput("W", - "(Tensor), The weight tensor of fully connected operation. It is " - "a 2-D Tensor with shape (I, O)"); - AddInput("Bias0", - "(Tensor, optional), The bias tensor of fully connecred " - "operation. It is a 1-D Tensor with shape (O), or a 2-D Tensor " - "with shape (1, O).") - .AsDispensable(); - AddInput("Y", - "(Tensor), The second input tensor of elementwise_add operation. " - "Note that the shape should be the same as fully connect's result " - "tensor."); - AddInput( - "Scale", - "(Tensor, optional), It is a 1-D input Tensor of layer_norm operation.") - .AsDispensable(); - AddInput( - "Bias1", - "(Tensor, optional), It is a 1-D input Tensor of layer_norm operation.") - .AsDispensable(); - AddOutput("Out", - "(Tensor), Output after normalization. The shape is the shame as " - "layer_norm's input."); - AddOutput("Mean", "(Tensor, optional), Mean of the current minibatch") - .AsDispensable(); - AddOutput("Variance", - "(Tensor, optional), Variance of the current minibatch") - .AsDispensable(); - AddAttr("x_num_col_dims", - "(int, default 1), This op can take tensors with more than " - "two dimensions as its inputs.") - .SetDefault(1) - .EqualGreaterThan(1); - AddAttr("activation_type", - "Activation type used in fully connected operator.") - .SetDefault(""); - AddAttr("epsilon", - "Constant for numerical stability [default 1e-5].") - .SetDefault(1e-5) - .AddCustomChecker([](const float &epsilon) { - PADDLE_ENFORCE_GE(epsilon, - 0.0f, - platform::errors::InvalidArgument( - "'epsilon' should be between 0.0 and 0.001.")); - PADDLE_ENFORCE_LE(epsilon, - 0.001f, - platform::errors::InvalidArgument( - "'epsilon' should be between 0.0 and 0.001.")); - }); - AddAttr("begin_norm_axis", - "the axis of `begin_norm_axis ... Rank(Y) - 1` will be " - "normalized. `begin_norm_axis` splits the tensor(`X`) to a " - "matrix [N,H]. 
[default 1].") - .SetDefault(1) - .AddCustomChecker([](const int &begin_norm_axis) { - PADDLE_ENFORCE_GT( - begin_norm_axis, - 0, - platform::errors::InvalidArgument( - "'begin_norm_axis' should be greater than zero.")); - }); - AddComment(R"DOC( -fc_out <= fc(X, W, Bias0) -add_out <= elementwise_add(fc_out, Y) -(out, mean, variance) <= layer_norm(add_out, Scale, Bias1) -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - fused_fc_elementwise_layernorm, - ops::FusedFCElementwiseLayerNormOp, - ops::FusedFCElementwiseLayerNormOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fused_matmul_op.cc b/paddle/fluid/operators/fused/fused_matmul_op.cc index ca3d02bf9bfa11..198fd61a150780 100644 --- a/paddle/fluid/operators/fused/fused_matmul_op.cc +++ b/paddle/fluid/operators/fused/fused_matmul_op.cc @@ -82,7 +82,7 @@ class FusedMatmulOp : public framework::OperatorWithKernel { y_broadcasted = true; } - size_t M, N; + size_t M = 0, N = 0; if (trans_x) { M = dims_x[ndims_x - 1]; } else { diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 0625d5c80c08eb..541233949b5d22 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -129,7 +129,7 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { framework::DDim out_dims({x_mat_dims[0], frame_size}); ctx->SetOutputDim("Hidden", out_dims); ctx->ShareLoD("X", "Hidden"); - int xx_width; + int xx_width = 0; if (ctx->Attrs().Get("use_seq")) { xx_width = static_cast(wx_dims[1]); } else { diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 400d8dcdaad2f6..d6e05a4ba3d480 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -141,7 +141,7 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ctx->SetOutputDim("Cell", out_dims); ctx->ShareLoD("X", "Hidden"); ctx->ShareLoD("X", "Cell"); - int xx_width; + int xx_width = 0; if (ctx->Attrs().Get("use_seq")) { xx_width = static_cast(wx_dims[1]); } else { diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc deleted file mode 100644 index e7bb037a3f3aaf..00000000000000 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc +++ /dev/null @@ -1,132 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h" - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -class TransposeFlattenConcatFusionOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_GE( - ctx->Inputs("X").size(), - 1UL, - platform::errors::InvalidArgument( - "Inputs(X) of TransposeFlattenConcat op should not be empty.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), - true, - platform::errors::InvalidArgument( - "Inputs(X) of TransposeFlattenConcat op should not be empty.")); - - auto ins = ctx->GetInputsDim("X"); - const size_t n = ins.size(); - PADDLE_ENFORCE_GT(n, - 0, - platform::errors::InvalidArgument( - "The size of Inputs(X)'s dimension should be greater " - " than 0, but received %d.", - n)); - - std::vector trans_axis = - ctx->Attrs().Get>("trans_axis"); - int flatten_axis = ctx->Attrs().Get("flatten_axis"); - int concat_axis = ctx->Attrs().Get("concat_axis"); - - size_t x_rank = ins[0].size(); - size_t trans_axis_size = trans_axis.size(); - PADDLE_ENFORCE_EQ(x_rank, - trans_axis_size, - platform::errors::InvalidArgument( - "The input tensor's rank(%d) " - "should be equal to the permutation axis's size(%d)", - x_rank, - trans_axis_size)); - - auto dims0 = - GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[0])); - std::vector out_dims(dims0); - for (size_t i = 1; i < n; i++) { - auto dimsi = - GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[i])); - for (int j = 0; j < static_cast(dims0.size()); j++) { - if (j == concat_axis) { - out_dims[concat_axis] += dimsi[j]; - } else { - PADDLE_ENFORCE_EQ(out_dims[j], - dimsi[j], - platform::errors::InvalidArgument( - "After flatting, the %d-th dim should be save " - "except the specify axis.", - j)); - } - } - } - if (out_dims[concat_axis] < 0) { - out_dims[concat_axis] = -1; - } - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - } -}; - -class TransposeFlattenConcatFusionOpMaker - : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "X", - "(Tensor) The input tensor, tensors with rank up to 6 are supported.") - .AsDuplicable(); - AddOutput("Out", "(Tensor)The output tensor."); - AddAttr>( - "trans_axis", - "(vector) A list of values, and the size of the list should be " - "the same with the input tensor rank. This operator permutes the input " - "tensor's axes according to the values given."); - AddAttr("flatten_axis", - "(int)" - "Indicate up to which input dimensions (exclusive) should be" - "flattened to the outer dimension of the output. The value" - "for axis must be in the range [0, R], where R is the rank of" - "the input tensor. When axis = 0, the shape of the output" - "tensor is (1, (d_0 X d_1 ... d_n), where the shape of the" - "input tensor is (d_0, d_1, ... d_n)."); - AddAttr("concat_axis", - "The axis along which the input tensors will be concatenated. 
" - "It should be 0 or 1, since the tensor is 2D after flatting."); - AddComment(R"DOC( - - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - fusion_transpose_flatten_concat, - ops::TransposeFlattenConcatFusionOp, - ops::TransposeFlattenConcatFusionOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc deleted file mode 100644 index 3d843ac6409ec5..00000000000000 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h" - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -template -using CudnnDataType = platform::CudnnDataType; - -template -class TransposeFlattenConcatFusionKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - auto* out = ctx.Output("Out"); - auto& dev_ctx = ctx.template device_context(); - dev_ctx.Alloc(out, out->numel() * sizeof(T)); - auto odims = out->dims(); - - std::vector trans_axis = ctx.Attr>("trans_axis"); - int flatten_axis = ctx.Attr("flatten_axis"); - int concat_axis = ctx.Attr("concat_axis"); - - int rank = ins[0]->dims().size(); - // use at least 4D in cudnnTransformTensor - int max_dim = rank < 4 ? 4 : rank; - std::vector stride_x(max_dim, 0); - std::vector stride_y(max_dim, 0); - std::vector dims_y(max_dim, 0); - - cudnnTensorDescriptor_t in_desc; - cudnnTensorDescriptor_t out_desc; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&in_desc)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&out_desc)); - cudnnDataType_t cudnn_dtype = CudnnDataType::type; - - auto handle = dev_ctx.cudnn_handle(); - - T* odata = out->data(); - for (auto& item : ins) { - auto perm_shape = GetPermuteShape(trans_axis, item->dims()); - int osize = 1; - auto idims = item->dims(); - for (int i = 0; i < rank; i++) { - stride_x[i] = 1; - for (int j = trans_axis[i] + 1; j < rank; j++) { - stride_x[i] *= idims[j]; - } - dims_y[i] = perm_shape[i]; - osize *= perm_shape[i]; - } - stride_y[rank - 1] = 1; - for (int i = rank - 2; i >= 0; i--) { - if (((i + 1) == flatten_axis) && (concat_axis == 1)) { - stride_y[i] = odims[1]; - } else { - stride_y[i] = stride_y[i + 1] * perm_shape[i + 1]; - } - } - - // Since concat is after flatten, the output is 2D tensor. - // If concat_axis is 0, each input's permutated tensor is continuous. 
- // If concat_axis is 1, the stride of 0-th dim of each input's - // permutated tensor is odims()[1]. - - for (int i = rank; i < max_dim; i++) { - stride_x[i] = 1; - stride_y[i] = 1; - dims_y[i] = 1; - } - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data())); - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnTransformTensor( - handle, - CudnnDataType::kOne(), - in_desc, - static_cast(item->data()), - CudnnDataType::kZero(), - out_desc, - static_cast(odata))); - if (concat_axis == 0) { - odata += osize; - } else { - auto flat_shape = GetFlattenShape(flatten_axis, perm_shape); - odata += flat_shape[1]; - } - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(in_desc)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(out_desc)); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(fusion_transpose_flatten_concat, - GPU, - ALL_LAYOUT, - ops::TransposeFlattenConcatFusionKernel, - float, - double) {} diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h deleted file mode 100644 index 52140c0ca46ee6..00000000000000 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -inline std::vector GetPermuteShape(const std::vector& axis, - const framework::DDim& in_dims) { - std::vector out_dims(in_dims.size()); - for (size_t i = 0; i < axis.size(); i++) { - out_dims[i] = in_dims[axis[i]]; - } - return out_dims; -} - -inline std::vector GetFlattenShape(const int axis, - const std::vector& in_dims) { - int64_t outer = 1, inner = 1; - for (int i = 0; i < static_cast(in_dims.size()); ++i) { - if (i < axis) { - outer *= in_dims[i]; - } else { - inner *= in_dims[i]; - } - } - std::vector out_shape(2); - out_shape[0] = outer; - out_shape[1] = inner; - return out_shape; -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc deleted file mode 100644 index c792532e58f792..00000000000000 --- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc +++ /dev/null @@ -1,108 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
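The GetPermuteShape/GetFlattenShape helpers in the deleted fusion_transpose_flatten_concat_op.h lost their template arguments to extraction. A plausible standalone reconstruction of what they compute (the int64_t element type and std::vector-based signatures are assumptions inferred from the call sites, not the original declarations):

#include <cstdint>
#include <iostream>
#include <vector>

// Shape after applying the permutation `axis` to `in_dims`.
std::vector<int64_t> GetPermuteShape(const std::vector<int>& axis,
                                     const std::vector<int64_t>& in_dims) {
  std::vector<int64_t> out_dims(in_dims.size());
  for (size_t i = 0; i < axis.size(); ++i) out_dims[i] = in_dims[axis[i]];
  return out_dims;
}

// Shape after flattening everything before `axis` into the outer dim
// and everything from `axis` on into the inner dim.
std::vector<int64_t> GetFlattenShape(int axis,
                                     const std::vector<int64_t>& in_dims) {
  int64_t outer = 1, inner = 1;
  for (int i = 0; i < static_cast<int>(in_dims.size()); ++i) {
    (i < axis ? outer : inner) *= in_dims[i];
  }
  return {outer, inner};
}

int main() {
  // NCHW -> NHWC permutation followed by flattening at axis 1:
  auto perm = GetPermuteShape({0, 2, 3, 1}, {8, 3, 32, 32});
  auto flat = GetFlattenShape(1, perm);
  std::cout << flat[0] << " x " << flat[1] << "\n";  // 8 x 3072
}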
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/batch_size_like.h" - -namespace paddle { -namespace operators { - -template -class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - T* data = tensor->mutable_data(context.GetPlace()); - - unsigned int seed = static_cast(context.Attr("seed")); - std::minstd_rand engine; - if (seed == 0) { - seed = std::random_device()(); - } - engine.seed(seed); - std::normal_distribution dist(mean, std); - int64_t size = tensor->numel(); - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(engine); - } - } -}; - -class GaussianRandomBatchSizeLikeOp : public BatchSizeLikeOp { - protected: - using BatchSizeLikeOp::BatchSizeLikeOp; - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - static_cast(ctx.Attr("dtype")), - ctx.GetPlace()); - } -}; - -class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker { - protected: - void Apply() override { - AddAttr("mean", - "(float, default 0.0) " - "The mean (or center) of the gaussian distribution.") - .SetDefault(.0f); - AddAttr("std", - "(float, default 1.0) " - "The standard deviation (std, or spread) of the " - "gaussian distribution.") - .SetDefault(1.0f); - AddAttr("seed", - "(int, default 0) " - "Random seed of generator." - "0 means don't specify random seed." - "Note that if seed is not 0, this operator will always " - "generate the same random numbers every time.") - .SetDefault(0); - AddAttr("dtype", - "(int, default 5(FP32)) " - "Output data type.") - .SetDefault(framework::proto::VarType::FP32); - - AddComment(R"DOC( - -Used to initialize tensors with gaussian random generator. -The default mean of the distribution is 0, and default standard -deviation (std) of the distribution is 1.0. Uers can set mean and std -via input arguments. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR( - gaussian_random_batch_size_like, - paddle::operators::GaussianRandomBatchSizeLikeOp, - paddle::operators::GaussianRandomBatchSizeLikeOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::operators::BatchSizeLikeNoNeedBufferVarsInferer); - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL(gaussian_random_batch_size_like, - CPU, - ALL_LAYOUT, - ops::CPUGaussianRandomBatchSizeLikeKernel, - float, - double) {} diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cu b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cu deleted file mode 100644 index 9c5244976fc9d0..00000000000000 --- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cu +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
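The deleted CPU kernel above seeds std::minstd_rand from the `seed` attribute and falls back to std::random_device when seed == 0, which is why a nonzero seed reproduces the same tensor on every run. A distilled, standalone sketch of that behavior (function name is illustrative):

#include <iostream>
#include <random>
#include <vector>

// Fill a buffer with N(mean, std_dev) samples; seed == 0 means "fresh seed".
std::vector<float> GaussianFill(size_t n, float mean, float std_dev,
                                unsigned int seed) {
  std::minstd_rand engine;
  if (seed == 0) seed = std::random_device()();
  engine.seed(seed);
  std::normal_distribution<float> dist(mean, std_dev);
  std::vector<float> out(n);
  for (auto& v : out) v = dist(engine);
  return out;
}

int main() {
  auto a = GaussianFill(4, 0.f, 1.f, 42);
  auto b = GaussianFill(4, 0.f, 1.f, 42);
  std::cout << std::boolalpha << (a == b) << "\n";  // true: same fixed seed
}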
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/generator.h" -#include "paddle/phi/kernels/funcs/index_impl.cu.h" - -namespace paddle { -namespace operators { - -template -struct GaussianGenerator { - T mean_, std_; - unsigned int seed_; - unsigned int offset_ = 0; - - __host__ __device__ GaussianGenerator(T mean, T std, int seed) - : mean_(mean), std_(std), seed_(seed) {} - - __host__ __device__ GaussianGenerator(T mean, T std, int seed, int offset) - : mean_(mean), std_(std), seed_(seed), offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - using MT = typename phi::dtype::MPTypeTrait::Type; - thrust::normal_distribution dist(static_cast(mean_), - static_cast(std_)); - unsigned int new_n = n + offset_; - rng.discard(new_n); - MT out = dist(rng); - return static_cast(out); - } -}; - -template -class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* tensor = context.Output("Out"); - T* data = tensor->mutable_data(context.GetPlace()); - unsigned int seed = static_cast(context.Attr("seed")); - T mean = static_cast(context.Attr("mean")); - T std = static_cast(context.Attr("std")); - int64_t size = tensor->numel(); - - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = phi::DefaultCUDAGenerator(device_id); - auto& dev_cxt = context.template device_context(); - - if (seed == 0) { - // use global Generator seed - auto seed_offset = gen_cuda->IncrementOffset(1); - uint64_t seed = seed_offset.first; - uint64_t offset = seed_offset.second; - auto func = GaussianGenerator(mean, std, seed, size * offset); - phi::IndexKernel>(dev_cxt, tensor, func); - } else { - auto func = GaussianGenerator(mean, std, seed); - phi::IndexKernel>(dev_cxt, tensor, func); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -PD_REGISTER_STRUCT_KERNEL(gaussian_random_batch_size_like, - GPU, - ALL_LAYOUT, - ops::GPUGaussianRandomBatchSizeLikeKernel, - float, - double, - plat::float16) {} diff --git a/paddle/fluid/operators/generator/type_mapping.py b/paddle/fluid/operators/generator/type_mapping.py index 8d3a4933c3bd0a..56e01a997e61b7 100644 --- a/paddle/fluid/operators/generator/type_mapping.py +++ b/paddle/fluid/operators/generator/type_mapping.py @@ -48,7 +48,7 @@ 'int64_t[]': 'const std::vector&', 'float[]': 'const std::vector&', 'double[]': 'const std::vector&', - 'str[]': 'const std::vector<&', + 'str[]': 'const std::vector&', } opmaker_attr_types_map = { @@ -86,8 +86,8 @@ } optional_output_type_map = { - 'Tensor': 'const paddle::optional&', - 'Tensor[]': 'const paddle::optional>&', + 'Tensor': 'const paddle::optional', + 'Tensor[]': 'const paddle::optional>', } # 
------------------------------ phi attr ------------------------------ diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index 315bd225809729..f199fa096d0df6 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -333,7 +333,8 @@ class GRUCPUKernel : public framework::OpKernel { auto input_dims = input->dims(); auto hidden_dims = hidden->dims(); - LodTensorPtr batch_gate, batch_reset_hidden_prev, batch_hidden; + LodTensorPtr batch_gate = nullptr, batch_reset_hidden_prev = nullptr, + batch_hidden = nullptr; phi::DenseTensor batch_gate_tmp, batch_reset_hidden_prev_tmp, batch_hidden_tmp; if (is_test) { diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc deleted file mode 100644 index a53a9867b9903e..00000000000000 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ /dev/null @@ -1,615 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/inplace_abn_op.h" - -#include -#include -#include -#include "paddle/phi/kernels/batch_norm_grad_kernel.h" -#include "paddle/phi/kernels/batch_norm_kernel.h" - -namespace paddle { -namespace operators { - -class InplaceABNOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchNorm"); - OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "BatchNorm"); - OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "BatchNorm"); - OP_INOUT_CHECK(ctx->HasInput("Mean"), "Input", "Mean", "BatchNorm"); - OP_INOUT_CHECK(ctx->HasInput("Variance"), "Input", "Variance", "BatchNorm"); - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "BatchNorm"); - - bool is_test = ctx->Attrs().Get("is_test"); - bool trainable_stats = ctx->Attrs().Get("trainable_statistics"); - bool test_mode = is_test && (!trainable_stats); - if (!test_mode) { - OP_INOUT_CHECK( - ctx->HasOutput("MeanOut"), "Output", "MeanOut", "BatchNorm"); - OP_INOUT_CHECK( - ctx->HasOutput("VarianceOut"), "Output", "VarianceOut", "BatchNorm"); - OP_INOUT_CHECK( - ctx->HasOutput("SavedMean"), "Output", "SavedMean", "BatchNorm"); - OP_INOUT_CHECK(ctx->HasOutput("SavedVariance"), - "Output", - "SavedVariance", - "BatchNorm"); - } - - // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python - PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], - ctx->Outputs("MeanOut")[0], - platform::errors::InvalidArgument( - "Mean and MeanOut should share the same memory")); - PADDLE_ENFORCE_EQ( - ctx->Inputs("Variance")[0], - ctx->Outputs("VarianceOut")[0], - platform::errors::InvalidArgument( - "Variance and VarianceOut should share the same memory")); - - const auto x_dims = ctx->GetInputDim("X"); - - for (int i = 0; i < x_dims.size(); i++) { - PADDLE_ENFORCE_EQ( - (x_dims[i] == -1) || (x_dims[i] > 0), - true, - 
platform::errors::InvalidArgument( - "Each dimension of input tensor is expected to be -1 or a " - "positive number, but received %d. Input's shape is [%s].", - x_dims[i], - x_dims)); - } - - const DataLayout data_layout = - phi::StringToDataLayout(ctx->Attrs().Get("data_layout")); - - if (ctx->IsRuntime() && ctx->HasInput("MomentumTensor")) { - auto mom = ctx->Inputs("MomentumTensor"); - PADDLE_ENFORCE_EQ(mom.size(), - 1, - platform::errors::InvalidArgument( - "The input tensor MomentumTensor's size must be 1" - "But received: MomentumTensor's size is [%d]", - mom.size())); - } - - PADDLE_ENFORCE_GE(x_dims.size(), - 2, - platform::errors::InvalidArgument( - "ShapeError: the dimension of input " - "X must greater than or equal to 2. But received: " - "the shape of input " - "X = [%s], the dimension of input X =[%d]", - x_dims, - x_dims.size())); - PADDLE_ENFORCE_LE(x_dims.size(), - 5, - platform::errors::InvalidArgument( - "ShapeError: the dimension of input X " - "must smaller than or equal to 5. But received: the " - "shape of input X " - "= [%s], the dimension of input X = [%d]", - x_dims, - x_dims.size())); - VLOG(4) << ctx->IsRunMKLDNNKernel(); - VLOG(4) << data_layout; - const int64_t C = ((ctx->IsRunMKLDNNKernel() == true) || - (data_layout == DataLayout::kNCHW) - ? x_dims[1] - : x_dims[x_dims.size() - 1]); - - auto scale_dim = ctx->GetInputDim("Scale"); - auto bias_dim = ctx->GetInputDim("Bias"); - - PADDLE_ENFORCE_EQ( - scale_dim.size(), - 1UL, - platform::errors::InvalidArgument( - "ShapeError: the dimension of scale must equal to 1." - "But received: the shape of scale is [%s], the dimension " - "of scale is [%d]", - scale_dim, - scale_dim.size())); - PADDLE_ENFORCE_EQ( - bias_dim.size(), - 1UL, - platform::errors::InvalidArgument( - "ShapeError: the dimension of bias must equal to 1." - "But received: the shape of bias is [%s],the dimension " - "of bias is [%d]", - bias_dim, - bias_dim.size())); - - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(scale_dim) <= 0 || phi::product(bias_dim) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ(scale_dim[0], - C, - platform::errors::InvalidArgument( - "ShapeError: the shape of scale must equal to [%d]" - "But received: the shape of scale is [%d]", - C, - scale_dim[0])); - PADDLE_ENFORCE_EQ(bias_dim[0], - C, - platform::errors::InvalidArgument( - "ShapeError: the shape of bias must equal to [%d]" - "But received: the shape of bias is [%d]", - C, - bias_dim[0])); - } - ctx->SetOutputDim("Y", x_dims); - ctx->ShareLoD("X", "Y"); - VLOG(4) << x_dims; - ctx->SetOutputDim("MeanOut", {C}); - ctx->SetOutputDim("VarianceOut", {C}); - if (!test_mode) { - ctx->SetOutputDim("SavedMean", {C}); - ctx->SetOutputDim("SavedVariance", {C}); - } - if (ctx->HasOutput("ReserveSpace")) { - ctx->SetOutputDim("ReserveSpace", {-1}); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - // By default, the type of the scale, bias, mean, - // and var tensors should both be float. (For float or float16 input tensor) - // or double (For double input tensor). 
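The deleted GetExpectedKernelType above encodes a simple rule: Scale/Bias/Mean/Variance stay FP32 for FP16 or FP32 inputs and switch to FP64 only when the input itself is FP64. Restated as a minimal sketch (the enum is illustrative, not Paddle's dtype set):

#include <iostream>

enum class DType { FP16, FP32, FP64 };

// Batch-norm parameter dtype follows the input only for double precision.
DType BatchNormParamType(DType input) {
  return input == DType::FP64 ? DType::FP64 : DType::FP32;
}

int main() {
  std::cout << (BatchNormParamType(DType::FP16) == DType::FP32) << "\n";  // 1
  std::cout << (BatchNormParamType(DType::FP64) == DType::FP64) << "\n";  // 1
}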
- auto bn_param_type = framework::proto::VarType::FP32; - if (input_data_type == framework::proto::VarType::FP64) { - bn_param_type = framework::proto::VarType::FP64; - } - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Scale")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Bias")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Mean")->dtype()), - platform::errors::InvalidArgument( - "Mean input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Variance")->dtype()), - platform::errors::InvalidArgument( - "Variance input should be of float type")); - - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class InplaceABNGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - // check input - OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "InplaceABNGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), - "Input", - "Y@GRAD", - "InplaceABNGrad"); - OP_INOUT_CHECK( - ctx->HasInput("SavedMean"), "Input", "SavedMean", "InplaceABNGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), - "Input", - "SavedVariance", - "InplaceABNGrad"); - - // check output - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - "X@GRAD", - "InplaceABNGrad"); - - const bool has_scale_grad = ctx->HasOutput(framework::GradVarName("Scale")); - const bool has_bias_grad = ctx->HasOutput(framework::GradVarName("Bias")); - - PADDLE_ENFORCE_EQ( - has_scale_grad, - has_bias_grad, - platform::errors::InvalidArgument( - "Output(Scale@GRAD) and Output(Bias@GRAD) must be null " - "or not be null at same time. But now, " - "has Scale@Grad=[%d], has Bias@GRAD=[%d]", - has_scale_grad, - has_bias_grad)); - - const bool use_global_stats = ctx->Attrs().Get("use_global_stats"); - if (use_global_stats) { - PADDLE_ENFORCE_EQ( - !ctx->Attrs().Get("use_mkldnn"), - true, - platform::errors::InvalidArgument( - "Using global stats during training is not supported " - "in oneDNN version of batch_norm_gradient kernel now.")); - } - - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "InplaceABNGrad"); - const auto y_dims = ctx->GetInputDim("Y"); - const DataLayout data_layout = - phi::StringToDataLayout(ctx->Attrs().Get("data_layout")); - - const int C = static_cast((ctx->IsRunMKLDNNKernel() == true) || - (data_layout == DataLayout::kNCHW) - ? 
y_dims[1] - : y_dims[y_dims.size() - 1]); - - ctx->SetOutputDim(framework::GradVarName("X"), y_dims); - // has_scale_grad == has_bias_grad, judge has_scale_grad is enough - if (has_scale_grad) { - ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); - ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto* var = ctx.InputVar(framework::GradVarName("Y")); - auto input_data_type = framework::TransToProtoVarType( - ctx.Input("Y")->dtype()); - if (var == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can't find gradient variable of Y")); - } - const phi::DenseTensor* t = nullptr; - if (var->IsType()) { - t = &var->Get(); - } - if (t == nullptr) { - PADDLE_THROW( - platform::errors::InvalidArgument("gradient variable of Y is empty")); - } - - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class InplaceABNOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddAttr("is_test", - "(bool, default false) Set to true for inference only, false " - "for training. Some layers may run faster when this is true.") - .SetDefault(false); - AddAttr("momentum", "").SetDefault(0.9); - AddAttr("epsilon", "") - .SetDefault(1e-5) - .AddCustomChecker([](const float& epsilon) { - PADDLE_ENFORCE_GE( - epsilon, - 0.0f, - platform::errors::InvalidArgument( - "'epsilon' should be greater or equal than 0.0.")); - PADDLE_ENFORCE_LE( - epsilon, - 0.001f, - platform::errors::InvalidArgument( - "'epsilon' should be less or equal than 0.001.")); - }); - AddAttr("data_layout", "").SetDefault("NCHW"); - AddInput("X", "The input tensor"); - AddInput("Scale", - "Scale is a 1-dimensional tensor of size C " - "that is applied to the output"); - AddInput("Bias", - "Bias is a 1-dimensional tensor of size C " - "that is applied to the output"); - AddInput("Mean", - "The global mean (for training) or " - "estimated mean (for testing)"); - AddInput("Variance", - "The global variance (for training) " - "or estimated Variance (for testing)"); - AddInput( - "MomentumTensor", - "(phi::DenseTensor, optional) If provided, batch_norm will " - "use this as momentum, this has a higher priority than " - "attr(momentum), the shape of this tensor MUST BE [1].") - .AsDispensable(); - AddOutput("Y", "result after normalization"); - AddOutput("MeanOut", - "Share memory with Mean. " - "Store the global mean when training"); - AddOutput("VarianceOut", - "Share memory with Variance. " - "Store the global Variance when training"); - AddOutput("SavedMean", - "Mean of the current mini batch, " - "will apply to output when training") - .AsIntermediate(); - AddOutput("SavedVariance", - "Variance of the current mini batch, " - "will apply to output when training") - .AsIntermediate(); - AddOutput("ReserveSpace", - "Reserve GPU space for triggering the new semi-persistent " - "NHWC kernel") - .AsDispensable() - .AsExtra(); - AddAttr("use_global_stats", - "(bool, default false) Whether to use global mean and " - "variance. In inference or test mode, set use_global_stats " - "to true or is_test true. the behavior is equivalent. " - "In train mode, when setting use_global_stats True, the " - "global mean and variance are also used during train time, " - "the BN acts as scaling and shiffting.") - .SetDefault(false); - AddAttr( - "trainable_statistics", - "(bool, default false) Whether to calculate mean and variance " - "in test mode. 
If setting true in test mode, mean and variace " - "will be calculated by current batch statistics.") - .SetDefault(false); - AddAttr( - "activation", - "(enum string, default identity, can be identity|elu|leaky-relu) " - "The activation type used for output candidate {h}_t.") - .SetDefault(""); - AddAttr("alpha", - "(float, default 1.0) Only used in inplace-abn kernel," - "the activation type(identity|elu|leakyrelu) would be fused " - "with batch_norm, " - "this is the alpha value for elu|leakyrelu.") - .SetDefault(0.1f); - AddAttr("use_sync_bn", - "(bool, default false) Whether use synchronize batch " - "normalization.") - .SetDefault(false); - AddComment(R"DOC( -Batch Normalization. - -Batch Norm has been implemented as discussed in the paper: -https://arxiv.org/pdf/1502.03167.pdf -Can be used as a normalizer function for conv2d and fully_connected operations. -The required data format for this layer is one of the following: -1. NHWC `[batch, in_height, in_width, in_channels]` -2. NCHW `[batch, in_channels, in_height, in_width]` - -)DOC"); - } -}; - -template -class InplaceABNOpGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType(this->ForwardOpType() + "_grad"); - op->SetInput("Y", this->Output("Y")); - op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); - - op->SetInput("Scale", this->Input("Scale")); - op->SetInput("Bias", this->Input("Bias")); - op->SetInput("SavedMean", this->Output("SavedMean")); - op->SetInput("SavedVariance", this->Output("SavedVariance")); - if (this->HasOutput("ReserveSpace")) { - op->SetInput("ReserveSpace", this->Output("ReserveSpace")); - } - - // used when setting use_global_stats True during training - if (PADDLE_GET_CONST(bool, this->GetAttr("use_global_stats"))) { - op->SetInput("Mean", this->Output("MeanOut")); - op->SetInput("Variance", this->Output("VarianceOut")); - } - - op->SetAttrMap(this->Attrs()); - - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale")); - op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias")); - } -}; - -template -class InplaceABNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Output("Y"); - PADDLE_ENFORCE_EQ(x, - y, - platform::errors::InvalidArgument( - "X and Y not inplaced in inplace mode")); - auto activation = - GetInplaceABNActivationType(ctx.Attr("activation")); - auto& place = *ctx.template device_context().eigen_device(); - - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* mean = ctx.Input("Mean"); - auto* variance = ctx.Input("Variance"); - - auto momentum = ctx.Attr("momentum"); - auto epsilon = ctx.Attr("epsilon"); - auto data_layout = ctx.Attr("data_layout"); - auto is_test = ctx.Attr("is_test"); - auto use_global_stats = ctx.Attr("use_global_stats"); - auto trainable_statistics = ctx.Attr("trainable_statistics"); - - auto* mean_out = ctx.Output("MeanOut"); - auto* variance_out = ctx.Output("VarianceOut"); - auto* saved_mean = ctx.Output("SavedMean"); - auto* saved_variance = ctx.Output("SavedVariance"); - auto* reserve_space = ctx.Output("ReserveSpace"); - - auto& dev_ctx = ctx.device_context(); - phi::BatchNormKernel( - static_cast::TYPE&>(dev_ctx), - *x, - *mean, - *variance, - *scale, - *bias, - is_test, - momentum, - 
epsilon, - data_layout, - use_global_stats, - trainable_statistics, - y, - mean_out, - variance_out, - saved_mean, - saved_variance, - reserve_space); - - auto cur_y = EigenVector::Flatten(*y); - InplaceABNActivation functor; - functor.Compute(ctx, activation, place, cur_y, cur_y); - } -}; - -template -class InplaceABNGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* y = ctx.Input("Y"); - auto* d_y = ctx.Input(framework::GradVarName("Y")); - auto* d_x = ctx.Output(framework::GradVarName("X")); - PADDLE_ENFORCE_EQ(d_x, - d_y, - platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplaced in inplace mode")); - auto& place = *ctx.template device_context().eigen_device(); - auto activation = - GetInplaceABNActivationType(ctx.Attr("activation")); - - auto py = *y; - auto pd_y = *d_y; - auto cur_y = EigenVector::Flatten(py); - auto cur_dy = EigenVector::Flatten(pd_y); - - InplaceABNActivation functor; - functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); - - // BatchNormGradKernel::Compute(ctx); - - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* saved_mean = ctx.Input("SavedMean"); - auto* saved_variance = ctx.Input("SavedVariance"); - - auto momentum = ctx.Attr("momentum"); - auto epsilon = ctx.Attr("epsilon"); - auto data_layout = ctx.Attr("data_layout"); - auto is_test = ctx.Attr("is_test"); - auto use_global_stats = ctx.Attr("use_global_stats"); - auto trainable_statistics = ctx.Attr("trainable_statistics"); - - auto* scale_grad = - ctx.Output(framework::GradVarName("Scale")); - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - - auto* reserve_space = ctx.Input("ReserveSpace"); - auto* mean = ctx.Input("ReserveSpace"); - auto* variance = ctx.Input("ReserveSpace"); - - paddle::optional space_opt; - paddle::optional mean_opt; - paddle::optional variance_opt; - - if (reserve_space != nullptr) { - space_opt = *reserve_space; - } - - if (mean != nullptr) { - mean_opt = *mean; - } - - if (variance != nullptr) { - variance_opt = *variance; - } - - auto& dev_ctx = ctx.device_context(); - phi::BatchNormGradFunctor( - static_cast::TYPE&>(dev_ctx), - *y, - *scale, - *bias, - mean_opt, - variance_opt, - *saved_mean, - *saved_variance, - space_opt, - *d_y, - momentum, - epsilon, - data_layout, - is_test, - use_global_stats, - trainable_statistics, - true, - d_x, - scale_grad, - bias_grad); - } -}; - -class InplaceABNOpInferVarType - : public framework::PassInDtypeAndVarTypeToOutput { - protected: - std::unordered_map& GetInputOutputWithSameType() - const override { - static std::unordered_map m{{"X", /*->*/ "Y"}}; - return m; - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INPLACE_OP_INFERER(InplaceAbnOpInplaceInferer, {"X", "Y"}); -REGISTER_OPERATOR(inplace_abn, - ops::InplaceABNOp, - ops::InplaceABNOpMaker, - ops::InplaceABNOpInferVarType, - ops::InplaceABNOpGradMaker, - ops::InplaceABNOpGradMaker, - InplaceAbnOpInplaceInferer) -REGISTER_OPERATOR(inplace_abn_grad, ops::InplaceABNGradOp) - -PD_REGISTER_STRUCT_KERNEL( - inplace_abn, CPU, ALL_LAYOUT, ops::InplaceABNKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(inplace_abn_grad, - CPU, - ALL_LAYOUT, - ops::InplaceABNGradKernel, - float, - double) {} diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu deleted file mode 100644 index b18a75073dd571..00000000000000 --- 
a/paddle/fluid/operators/inplace_abn_op.cu +++ /dev/null @@ -1,237 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/inplace_abn_op.h" -#include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/operators/sync_batch_norm_utils.h" -#include "paddle/phi/kernels/batch_norm_grad_kernel.h" -#include "paddle/phi/kernels/batch_norm_kernel.h" -#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h" -#include "paddle/phi/kernels/sync_batch_norm_kernel.h" - -namespace paddle { -namespace operators { - -template -class InplaceABNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* y = ctx.Output("Y"); - auto* x = ctx.Input("X"); - PADDLE_ENFORCE_EQ(x, - y, - platform::errors::InvalidArgument( - "X and Y not inplaced in inplace mode")); - auto activation = - GetInplaceABNActivationType(ctx.Attr("activation")); - auto& place = *ctx.template device_context().eigen_device(); - - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* mean = ctx.Input("Mean"); - auto* variance = ctx.Input("Variance"); - - auto momentum = ctx.Attr("momentum"); - auto epsilon = ctx.Attr("epsilon"); - auto data_layout = ctx.Attr("data_layout"); - auto is_test = ctx.Attr("is_test"); - auto use_global_stats = ctx.Attr("use_global_stats"); - auto trainable_statistics = ctx.Attr("trainable_statistics"); - - auto* mean_out = ctx.Output("MeanOut"); - auto* variance_out = ctx.Output("VarianceOut"); - auto* saved_mean = ctx.Output("SavedMean"); - auto* saved_variance = ctx.Output("SavedVariance"); - auto* reserve_space = ctx.Output("ReserveSpace"); - - if (ctx.Attr("use_sync_bn")) { - auto& dev_ctx = ctx.device_context(); - phi::SyncBatchNormKernel( - static_cast::TYPE&>(dev_ctx), - *x, - *mean, - *variance, - *scale, - *bias, - is_test, - momentum, - epsilon, - data_layout, - use_global_stats, - trainable_statistics, - y, - mean_out, - variance_out, - saved_mean, - saved_variance, - reserve_space); - } else { - auto& dev_ctx = ctx.device_context(); - phi::BatchNormKernel( - static_cast::TYPE&>(dev_ctx), - *x, - *mean, - *variance, - *scale, - *bias, - is_test, - momentum, - epsilon, - data_layout, - use_global_stats, - trainable_statistics, - y, - mean_out, - variance_out, - saved_mean, - saved_variance, - reserve_space); - } - - auto cur_y = EigenVector::Flatten(*y); - InplaceABNActivation functor; - functor.Compute(ctx, activation, place, cur_y, cur_y); - } -}; - -// Deriving the Gradient for the Backward Pass of Batch Normalization -// https://kevinzakka.github.io/2016/09/14/batch_normalization/ -template -class InplaceABNGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* y = ctx.Input("Y"); - auto* d_y = ctx.Input(framework::GradVarName("Y")); - auto* d_x = ctx.Output(framework::GradVarName("X")); - PADDLE_ENFORCE_EQ(d_x, - d_y, - 
platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplaced in inplace mode")); - auto& place = *ctx.template device_context().eigen_device(); - auto activation = - GetInplaceABNActivationType(ctx.Attr("activation")); - - auto py = *y; - auto pd_y = *d_y; - auto cur_y = EigenVector::Flatten(py); - auto cur_dy = EigenVector::Flatten(pd_y); - - InplaceABNActivation functor; - functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); - - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* saved_mean = ctx.Input("SavedMean"); - auto* saved_variance = ctx.Input("SavedVariance"); - - auto momentum = ctx.Attr("momentum"); - auto epsilon = ctx.Attr("epsilon"); - auto data_layout = ctx.Attr("data_layout"); - auto is_test = ctx.Attr("is_test"); - auto use_global_stats = ctx.Attr("use_global_stats"); - auto trainable_statistics = ctx.Attr("trainable_statistics"); - - auto* scale_grad = - ctx.Output(framework::GradVarName("Scale")); - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - - auto* reserve_space = ctx.Input("ReserveSpace"); - auto* mean = ctx.Input("ReserveSpace"); - auto* variance = ctx.Input("ReserveSpace"); - - if (ctx.Attr("use_sync_bn")) { - auto& dev_ctx = ctx.device_context(); - phi::SyncBatchNormGradFunctor( - static_cast::TYPE&>(dev_ctx), - nullptr, - y, - *scale, - *bias, - *saved_mean, - *saved_variance, - *d_y, - epsilon, - data_layout, - d_x, - scale_grad, - bias_grad); - } else { - paddle::optional space_opt; - paddle::optional mean_opt; - paddle::optional variance_opt; - - if (reserve_space != nullptr) { - space_opt = *reserve_space; - } - - if (mean != nullptr) { - mean_opt = *mean; - } - - if (variance != nullptr) { - variance_opt = *variance; - } - - auto& dev_ctx = ctx.device_context(); - phi::BatchNormGradFunctor( - static_cast::TYPE&>(dev_ctx), - *y, - *scale, - *bias, - mean_opt, - variance_opt, - *saved_mean, - *saved_variance, - space_opt, - *d_y, - momentum, - epsilon, - data_layout, - is_test, - use_global_stats, - trainable_statistics, - true, - d_x, - scale_grad, - bias_grad); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -PD_REGISTER_STRUCT_KERNEL( - inplace_abn, GPU, ALL_LAYOUT, ops::InplaceABNKernel, float) {} -PD_REGISTER_STRUCT_KERNEL( - inplace_abn_grad, GPU, ALL_LAYOUT, ops::InplaceABNGradKernel, float) {} -#else -PD_REGISTER_STRUCT_KERNEL( - inplace_abn, GPU, ALL_LAYOUT, ops::InplaceABNKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(inplace_abn_grad, - GPU, - ALL_LAYOUT, - ops::InplaceABNGradKernel, - float, - double) {} -#endif diff --git a/paddle/fluid/operators/inplace_abn_op.h b/paddle/fluid/operators/inplace_abn_op.h deleted file mode 100644 index abdb1e33aaae8c..00000000000000 --- a/paddle/fluid/operators/inplace_abn_op.h +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
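The inplace_abn_op.h helpers deleted below rely on the activation being invertible: the forward input can be recovered from the saved output y, so X never has to be kept alive, which is what makes the in-place trick work. A standalone sketch of the leaky-relu case handled by GradCompute (names are illustrative):

#include <iostream>
#include <vector>

// Invert leaky-relu in place and push the upstream gradient through it:
// forward was y = (x >= 0) ? x : alpha * x, so x = (y >= 0) ? y : y / alpha,
// and dL/dx = dL/dy * ((y >= 0) ? 1 : alpha).
void LeakyReluInvertAndGrad(std::vector<float>& y, std::vector<float>& dy,
                            float alpha) {
  for (size_t i = 0; i < y.size(); ++i) {
    const bool neg = y[i] < 0.f;
    const float x = neg ? y[i] / alpha : y[i];  // recover the BN output
    dy[i] *= neg ? alpha : 1.f;                 // chain rule through the act
    y[i] = x;  // overwrite in place, as the inplace_abn kernels do
  }
}

int main() {
  std::vector<float> y = {-0.2f, 3.f};  // activation outputs (alpha = 0.1)
  std::vector<float> dy = {1.f, 1.f};   // upstream gradients
  LeakyReluInvertAndGrad(y, dy, 0.1f);
  std::cout << y[0] << " " << dy[0] << "\n";  // -2 and 0.1
}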
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -USE_PHI_FUNCTOR(LeakyRelu) - -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -enum InplaceABNActivationType { identity = 0, leakyrelu = 1, elu = 2 }; - -inline InplaceABNActivationType GetInplaceABNActivationType( - const std::string& type) { - if (type == "leaky_relu") { - return InplaceABNActivationType::leakyrelu; - } else if (type == "elu") { - return InplaceABNActivationType::elu; - } else if (type == "identity" || type == "") { - return InplaceABNActivationType::identity; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unsupported activation type %s for Op(inplace_abn)", type)); - } -} - -template -class InplaceABNActivation { - private: - template - void setAttrs(const framework::ExecutionContext& ctx, Functor* functor) { - auto attrs = functor->GetAttrs(); - for (auto& attr : attrs) { - *attr.second = ctx.Attr(attr.first); - } - } - - template - void compute(const framework::ExecutionContext& ctx, - Functor* functor, - Args... args) { - setAttrs(ctx, functor); - (*functor)(args...); - } - - public: - template - void Compute(const framework::ExecutionContext& ctx, - const int act_type, - const Device& d, - X x, - Y y) { - if (act_type == InplaceABNActivationType::identity) { - y.device(d) = x; - } else if (act_type == InplaceABNActivationType::leakyrelu) { - LeakyReluFunctor functor; - compute(ctx, &functor, d, x, y); - } else if (act_type == InplaceABNActivationType::elu) { - ELUFunctor functor; - compute(ctx, &functor, d, x, y); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("unsupported activation type")); - } - } - - template - void GradCompute(const framework::ExecutionContext& ctx, - const int act_type, - const Device& d, - X x, - Y y, - DX dx, - DY dy) { - const float alpha = ctx.Attr("alpha"); - - if (act_type == InplaceABNActivationType::identity) { - x.device(d) = y; - dx.device(d) = dy; - } else if (act_type == InplaceABNActivationType::leakyrelu) { - auto temp1 = (y < static_cast(0)).template cast().eval() / - static_cast(alpha); - auto temp2 = (y >= static_cast(0)).template cast().eval(); - x.device(d) = y * (temp1 + temp2).template cast(); - - LeakyReluGradFunctor functor; - compute(ctx, &functor, d, x, y, dy, dx); - } else if (act_type == InplaceABNActivationType::elu) { - auto temp1 = (y >= static_cast(0)).template cast().eval(); - auto temp = (y < static_cast(0)).template cast().eval(); - auto temp2 = (y * temp / static_cast(alpha) + static_cast(1)).log(); - x.device(d) = (y * temp1 + temp2).template cast(); - - ELUGradNegativeAlphaFunctor functor; - compute(ctx, &functor, d, x, y, dy, dx); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("unsupported activation type")); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 2bb9bf633f0c24..1af8b247de4479 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -61,7 +61,7 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { return; } - int out_w; + int out_w = 0; if (ctx->HasInput("Scale")) { 
auto scale_tensor = ctx->GetInputDim("Scale"); PADDLE_ENFORCE_EQ( @@ -151,7 +151,7 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { return; } - int out_h, out_w; + int out_h = 0, out_w = 0; if (ctx->HasInput("Scale")) { auto scale_tensor = ctx->GetInputDim("Scale"); PADDLE_ENFORCE_EQ( @@ -247,7 +247,7 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { return; } - int out_d, out_h, out_w; + int out_d = 0, out_h = 0, out_w = 0; if (ctx->HasInput("Scale")) { auto scale_tensor = ctx->GetInputDim("Scale"); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 8f68ef13e4a4f5..0d80a1c36b0711 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -124,7 +124,6 @@ namespace ops = paddle::operators; REGISTER_OP_MAKER(isinf, "isinf(X)"); REGISTER_OP_MAKER(isnan, "isnan(X)"); -REGISTER_OP_MAKER(isfinite, "isfinite(X)"); REGISTER_OP_CPU_KERNEL( isinf, @@ -139,10 +138,3 @@ REGISTER_OP_CPU_KERNEL( ops::OverflowKernel, ops::OverflowKernel, ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isfinite, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); diff --git a/paddle/fluid/operators/isfinite_op.cu b/paddle/fluid/operators/isfinite_op.cu index 80a65cbda916b7..300229cbeca668 100755 --- a/paddle/fluid/operators/isfinite_op.cu +++ b/paddle/fluid/operators/isfinite_op.cu @@ -33,11 +33,3 @@ REGISTER_OP_CUDA_KERNEL( ops::OverflowKernel, ops::OverflowKernel, ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isfinite, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index aab7953d6d1030..5352ccc99df92e 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -156,12 +156,6 @@ struct NANFunctor { } }; -struct IsfiniteFunctor { - void operator()(const phi::DenseTensor& tensor, phi::DenseTensor* out) { - framework::TensorIsfinite(tensor, out); - } -}; - template class OverflowKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc deleted file mode 100644 index 25503ee32e9bf3..00000000000000 --- a/paddle/fluid/operators/lstm_unit_op.cc +++ /dev/null @@ -1,149 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/lstm_unit_op.h" - -#include - -namespace paddle { -namespace operators { - -class LstmUnitOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lstm_unit"); - OP_INOUT_CHECK(ctx->HasInput("C_prev"), "Input", "C_prev", "lstm_unit"); - OP_INOUT_CHECK(ctx->HasOutput("C"), "Output", "C", "lstm_unit"); - OP_INOUT_CHECK(ctx->HasOutput("H"), "Output", "H", "lstm_unit"); - - auto x_dims = ctx->GetInputDim("X"); - auto c_prev_dims = ctx->GetInputDim("C_prev"); - - PADDLE_ENFORCE_EQ( - x_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(X)'s rank must be 2. Received %d instead.", x_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(x_dims[0], - c_prev_dims[0], - platform::errors::InvalidArgument( - "Batch size of inputs and states must be equal, " - "but received %d (inputs)" - "vs %d (states).", - x_dims[0], - c_prev_dims[0])); - PADDLE_ENFORCE_EQ(x_dims[1], - c_prev_dims[1] * 4, - platform::errors::InvalidArgument( - "Dimension of FC should equal to prev state * 4, " - "but received %d (dimension of FC)" - "vs %d (prev state * 4).", - x_dims[1], - c_prev_dims[1] * 4)); - } - - int b_size = static_cast(c_prev_dims[0]); // batch size - int s_dim = static_cast(c_prev_dims[1]); // state dim - ctx->SetOutputDim("C", {b_size, s_dim}); - ctx->SetOutputDim("H", {b_size, s_dim}); - } -}; - -class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "Lstm unit only applies non-linear activations, please make sure" - "that linear tranformation has already been applied to `X`. 
" - "Linear tranformation can be applied by adding a `fc` layer"); - AddInput( - "C_prev", - "The cell state tensor of last time-step in the Lstm Unit operator."); - AddOutput("C", "The cell tensor of Lstm Unit operator."); - AddOutput("H", "The hidden state tensor of Lstm Unit operator."); - AddAttr("forget_bias", - "(float, default 0.0) " - "The forget bias of Lstm Unit.") - .SetDefault(0.0); - AddComment(R"DOC( -Lstm Unit Operator - -Equation: - -$$ -i, f, o, j = split(X) \\ -C = C_{prev} * sigm(f + forget\_bias) + sigm(i) * tanh(j) \\ -H = C * sigm(o) -$$ - -)DOC"); - } -}; - -class LstmUnitGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("C")), - "Input", - framework::GradVarName("C"), - "lstm_unit"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("H")), - "Input", - framework::GradVarName("H"), - "lstm_unit"); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->SetOutputDim(framework::GradVarName("C_prev"), - ctx->GetInputDim("C_prev")); - } -}; - -template -class LstmUnitGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("lstm_unit_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("C_prev", this->Input("C_prev")); - op->SetInput("C", this->Output("C")); - op->SetInput(framework::GradVarName("H"), this->OutputGrad("H")); - op->SetInput(framework::GradVarName("C"), this->OutputGrad("C")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("C_prev"), this->InputGrad("C_prev")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(lstm_unit, - ops::LstmUnitOp, - ops::LstmUnitOpMaker, - ops::LstmUnitGradOpMaker, - ops::LstmUnitGradOpMaker); -REGISTER_OPERATOR(lstm_unit_grad, ops::LstmUnitGradOp); - -PD_REGISTER_STRUCT_KERNEL( - lstm_unit, CPU, ALL_LAYOUT, ops::LstmUnitKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL( - lstm_unit_grad, CPU, ALL_LAYOUT, ops::LstmUnitGradKernel, float, double) {} diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu deleted file mode 100644 index b1c9d035a8cb5d..00000000000000 --- a/paddle/fluid/operators/lstm_unit_op.cu +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -/* Acknowledgement: the following code is strongly inspired by -https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.cu -*/ - -#include "paddle/fluid/operators/lstm_unit_op.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/cross_entropy_op.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -template -__device__ Dtype cuda_sigmoid(const Dtype x) { - return Dtype(1) / (Dtype(1) + exp(-x)); -} - -template -__device__ Dtype cuda_tanh(const Dtype x) { - return Dtype(1 - exp(-2. * x)) / (Dtype(1) + exp(-2. * x)); -} - -template -__global__ void LSTMUnitKernel(const int nthreads, - const int dim, - const T* C_prev, - const T* X, - T* C, - T* H, - const T forget_bias) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / dim; - const int d = index % dim; - - const T* X_offset = X + 4 * dim * n; - const T i = cuda_sigmoid(X_offset[d]); - const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias); - const T o = cuda_sigmoid(X_offset[2 * dim + d]); - const T g = cuda_tanh(X_offset[3 * dim + d]); - const T c_prev = C_prev[index]; - const T c = f * c_prev + i * g; - C[index] = c; - const T tanh_c = cuda_tanh(c); - H[index] = o * tanh_c; - } -} - -template -__global__ void LSTMUnitGradientKernel(const int nthreads, - const int dim, - const T* C_prev, - const T* X, - const T* C, - const T* C_diff, - const T* H_diff, - T* C_prev_diff, - T* X_diff, - const T forget_bias) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / dim; - const int d = index % dim; - const T* X_offset = X + 4 * dim * n; - T* c_prev_diff = C_prev_diff + index; - T* X_diff_offset = X_diff + 4 * dim * n; - T* i_diff = X_diff_offset + d; - T* f_diff = X_diff_offset + 1 * dim + d; - T* o_diff = X_diff_offset + 2 * dim + d; - T* g_diff = X_diff_offset + 3 * dim + d; - - const T i = cuda_sigmoid(X_offset[d]); - const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias); - const T o = cuda_sigmoid(X_offset[2 * dim + d]); - const T g = cuda_tanh(X_offset[3 * dim + d]); - const T c_prev = C_prev[index]; - const T c = C[index]; - const T tanh_c = cuda_tanh(c); - const T c_term_diff = - C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); - *c_prev_diff = c_term_diff * f; - *i_diff = c_term_diff * g * i * (1 - i); - *f_diff = c_term_diff * c_prev * f * (1 - f); - *o_diff = H_diff[index] * tanh_c * o * (1 - o); - *g_diff = c_term_diff * i * (1 - g * g); - } -} - -template -class LstmUnitOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - - auto* x_tensor = ctx.Input("X"); - auto* c_prev_tensor = ctx.Input("C_prev"); - auto* c_tensor = ctx.Output("C"); - auto* h_tensor = ctx.Output("H"); - - auto forget_bias = static_cast(ctx.Attr("forget_bias")); - - int b_size = c_tensor->dims()[0]; - int D = c_tensor->dims()[1]; - - const T* X = x_tensor->data(); - const T* C_prev = c_prev_tensor->data(); - - T* C = c_tensor->mutable_data(ctx.GetPlace()); - T* H = h_tensor->mutable_data(ctx.GetPlace()); - - int block = 512; - int n = b_size * D; - int grid = (n + block - 1) / block; - - LSTMUnitKernel<<>>(n, D, C_prev, X, C, H, forget_bias); - } -}; - -template -class LstmUnitGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const 
override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - - auto x_tensor = ctx.Input("X"); - auto c_prev_tensor = ctx.Input("C_prev"); - auto c_tensor = ctx.Input("C"); - auto h_tensor = ctx.Input("H"); - - auto hdiff_tensor = - ctx.Input(framework::GradVarName("H")); - auto cdiff_tensor = - ctx.Input(framework::GradVarName("C")); - - auto xdiff_tensor = - ctx.Output(framework::GradVarName("X")); - auto c_prev_diff_tensor = - ctx.Output(framework::GradVarName("C_prev")); - - auto* X = x_tensor->data(); - auto* C_prev = c_prev_tensor->data(); - auto* C = c_tensor->data(); - - auto* H_diff = hdiff_tensor->data(); - auto* C_diff = cdiff_tensor->data(); - - auto* C_prev_diff = c_prev_diff_tensor->mutable_data(ctx.GetPlace()); - auto* X_diff = xdiff_tensor->mutable_data(ctx.GetPlace()); - - int N = c_tensor->dims()[0]; - int D = c_tensor->dims()[1]; - - auto forget_bias = static_cast(ctx.Attr("forget_bias")); - - int block = 512; - int n = N * D; - int grid = (n + block - 1) / block; - - LSTMUnitGradientKernel<<>>( - n, D, C_prev, X, C, C_diff, H_diff, C_prev_diff, X_diff, forget_bias); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - lstm_unit, GPU, ALL_LAYOUT, ops::LstmUnitOpCUDAKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(lstm_unit_grad, - GPU, - ALL_LAYOUT, - ops::LstmUnitGradOpCUDAKernel, - float, - double) {} diff --git a/paddle/fluid/operators/lstm_unit_op.h b/paddle/fluid/operators/lstm_unit_op.h deleted file mode 100644 index 0621741b885fb7..00000000000000 --- a/paddle/fluid/operators/lstm_unit_op.h +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* Acknowledgement: the following code is strongly inspired by -https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op.h -*/ - -#pragma once -#include "glog/logging.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -inline T sigmoid(T x) { - return 1. / (1. + exp(-x)); -} - -template -inline T tanh(T x) { - return 2. * sigmoid(2. 
* x) - 1.; -} - -template -class LstmUnitKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), - true, - paddle::platform::errors::PreconditionNotMet("It must use CPUPlace.")); - - auto* x_tensor = ctx.Input("X"); - auto* c_prev_tensor = ctx.Input("C_prev"); - auto* c_tensor = ctx.Output("C"); - auto* h_tensor = ctx.Output("H"); - - auto forget_bias = static_cast(ctx.Attr("forget_bias")); - - int b_size = c_tensor->dims()[0]; - int D = c_tensor->dims()[1]; - - T* C = c_tensor->mutable_data(ctx.GetPlace()); - T* H = h_tensor->mutable_data(ctx.GetPlace()); - - const T* X = x_tensor->data(); - const T* C_prev = c_prev_tensor->data(); - - for (int n = 0; n < b_size; ++n) { - for (int d = 0; d < D; ++d) { - const T i = sigmoid(X[d]); - const T f = sigmoid(X[1 * D + d] + forget_bias); - const T o = sigmoid(X[2 * D + d]); - const T g = tanh(X[3 * D + d]); - const T c_prev = C_prev[d]; - const T c = f * c_prev + i * g; - C[d] = c; - const T tanh_c = tanh(c); - H[d] = o * tanh_c; - } - C_prev += D; - X += 4 * D; - C += D; - H += D; - } - } -}; - -template -class LstmUnitGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), - true, - paddle::platform::errors::PreconditionNotMet("It must use CPUPlace.")); - - auto x_tensor = ctx.Input("X"); - auto c_prev_tensor = ctx.Input("C_prev"); - auto c_tensor = ctx.Input("C"); - - auto hdiff_tensor = - ctx.Input(framework::GradVarName("H")); - auto cdiff_tensor = - ctx.Input(framework::GradVarName("C")); - - auto xdiff_tensor = - ctx.Output(framework::GradVarName("X")); - auto c_prev_diff_tensor = - ctx.Output(framework::GradVarName("C_prev")); - - auto* X = x_tensor->data(); - auto* C_prev = c_prev_tensor->data(); - auto* C = c_tensor->data(); - - auto* H_diff = hdiff_tensor->data(); - auto* C_diff = cdiff_tensor->data(); - - auto* C_prev_diff = c_prev_diff_tensor->mutable_data(ctx.GetPlace()); - auto* X_diff = xdiff_tensor->mutable_data(ctx.GetPlace()); - - int N = c_tensor->dims()[0]; - int D = c_tensor->dims()[1]; - - auto forget_bias = static_cast(ctx.Attr("forget_bias")); - - for (int n = 0; n < N; ++n) { - for (int d = 0; d < D; ++d) { - T* c_prev_diff = C_prev_diff + d; - T* i_diff = X_diff + d; - T* f_diff = X_diff + 1 * D + d; - T* o_diff = X_diff + 2 * D + d; - T* g_diff = X_diff + 3 * D + d; - - const T i = sigmoid(X[d]); - const T f = sigmoid(X[1 * D + d] + forget_bias); - const T o = sigmoid(X[2 * D + d]); - const T g = tanh(X[3 * D + d]); - const T c_prev = C_prev[d]; - const T c = C[d]; - const T tanh_c = tanh(c); - const T c_term_diff = C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c); - *c_prev_diff = c_term_diff * f; - *i_diff = c_term_diff * g * i * (1 - i); - *f_diff = c_term_diff * c_prev * f * (1 - f); - *o_diff = H_diff[d] * tanh_c * o * (1 - o); - *g_diff = c_term_diff * i * (1 - g * g); - } - C_prev += D; - X += 4 * D; - C += D; - C_diff += D; - H_diff += D; - X_diff += 4 * D; - C_prev_diff += D; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc deleted file mode 100644 index 7af04a237de4c3..00000000000000 --- a/paddle/fluid/operators/lstmp_op.cc +++ /dev/null @@ -1,411 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
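
The per-element gradients in LstmUnitGradKernel above all follow from two scalar identities, sigmoid'(z) = s(1 - s) and tanh'(z) = 1 - t*t, pushed through c = f * c_prev + i * g and h = o * tanh(c). One path as a standalone sketch (illustrative names):

// Input-gate path: dc/di = g, and di/d(i_preact) = i * (1 - i) for a
// sigmoid-activated gate, giving *i_diff = c_term_diff * g * i * (1 - i).
float InputGatePreactGrad(float c_term_diff, float g, float i) {
  return c_term_diff * g * i * (1.f - i);
}
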
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/lstmp_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class LSTMPOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "LSTMP"); - OP_INOUT_CHECK(ctx->HasInput("Weight"), "Input", "Weight", "LSTMP"); - OP_INOUT_CHECK(ctx->HasInput("ProjWeight"), "Input", "ProjWeight", "LSTMP"); - OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "LSTMP"); - - OP_INOUT_CHECK( - ctx->HasOutput("Projection"), "Output", "Projection", "LSTMP"); - OP_INOUT_CHECK(ctx->HasOutput("Cell"), "Output", "Cell", "LSTMP"); - OP_INOUT_CHECK(ctx->HasOutput("BatchGate"), "Output", "BatchGate", "LSTMP"); - OP_INOUT_CHECK(ctx->HasOutput("BatchCellPreAct"), - "Output", - "BatchCellPreAct", - "LSTMP"); - OP_INOUT_CHECK( - ctx->HasOutput("BatchHidden"), "Output", "BatchHidden", "LSTMP"); - - auto in_dims = ctx->GetInputDim("Input"); - - PADDLE_ENFORCE_EQ( - in_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(X)'s rank of LSTMP operator must be 2, but received %d.", - in_dims.size())); - - int frame_size = static_cast(in_dims[1] / 4); - auto w_dims = ctx->GetInputDim("Weight"); - auto proj_dims = ctx->GetInputDim("ProjWeight"); - PADDLE_ENFORCE_EQ( - w_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(Weight) should be 2, but received %d.", - w_dims.size())); - PADDLE_ENFORCE_EQ( - w_dims[0], - proj_dims[1], - platform::errors::InvalidArgument( - "The first dimension of Input(Weight) and the second dimension of " - "Input(ProjWeight) should be the same, but received %d vs %d.", - w_dims[0], - proj_dims[1])); - PADDLE_ENFORCE_EQ(w_dims[1], - 4 * frame_size, - platform::errors::InvalidArgument( - "The second dimension of Input(Weight) should be 4 * " - "%d, but received %d.", - frame_size, - w_dims[1])); - - PADDLE_ENFORCE_EQ( - proj_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(ProjWeight) should be 2, but received %d.", - proj_dims.size())); - PADDLE_ENFORCE_EQ(proj_dims[0], - frame_size, - platform::errors::InvalidArgument( - "The first dimension of Input(ProjWeight) should be " - "%d, but received %d.", - frame_size, - proj_dims[0])); - - if (ctx->HasInput("H0")) { - PADDLE_ENFORCE_EQ( - ctx->HasInput("C0"), - true, - platform::errors::NotFound("Input(C0) of LSTMP operator should not " - "be null after Input(H0) provided.")); - } - - auto b_dims = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ( - b_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(Bias) should be 2, but received %d.", - b_dims.size())); - PADDLE_ENFORCE_EQ( - b_dims[0], - 1, - platform::errors::InvalidArgument( - "The first dimension of Input(Bias) should be 1, but received %d.", - b_dims[0])); - - if (ctx->Attrs().Get("use_peepholes")) { - PADDLE_ENFORCE_EQ( - b_dims[1], - 7 * 
frame_size, - platform::errors::InvalidArgument( - "The second dimension of Input(Bias) should be 7 * %d if enable " - "peepholes connection, but received %d.", - frame_size, - b_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - b_dims[1], - 4 * frame_size, - platform::errors::InvalidArgument( - "The second dimension of Input(Bias) should be 4 * %d if disable " - "peepholes connection, but received %d.", - frame_size, - b_dims[1])); - } - - framework::DDim out_dims({in_dims[0], frame_size}); - framework::DDim proj_out_dims({in_dims[0], proj_dims[1]}); - ctx->SetOutputDim("Projection", proj_out_dims); - ctx->SetOutputDim("Cell", out_dims); - ctx->SetOutputDim("BatchGate", in_dims); - ctx->SetOutputDim("BatchCellPreAct", out_dims); - ctx->SetOutputDim("BatchHidden", out_dims); - ctx->ShareLoD("Input", "Projection"); - ctx->ShareLoD("Input", "Cell"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Input"), - ctx.device_context().GetPlace()); - } -}; - -class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "Input", - "(phi::DenseTensor) the input for sequence data, which supports " - "variable-time length input sequence. The underlying tensor in " - "this phi::DenseTensor is a matrix with shape (T X 4D), where T is the " - "total time steps in this mini-batch, D is the hidden size."); - AddInput("H0", - "(Tensor, optional) the initial hidden state is an optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size and D is the hidden size.") - .AsDispensable(); - AddInput("C0", - "(Tensor, optional) the initial cell state is an optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size. `C0` should not be null if `H0` provided.") - .AsDispensable(); - AddInput("Weight", - "(Tensor) the learnable hidden-hidden weights." - " - The shape is (P x 4D), where P is the projection layer size " - "and D is the hidden size." - " - Weight = {W_cr, W_ir, W_fr, W_or}"); - AddInput("ProjWeight", - "(Tensor) the learnable weight of the projection layer." - " - The shape is (D x P), where P is the recurrent projection " - "layer size and D is the hidden size." - " - ProjWeight = {W_rh}"); - AddInput("Bias", - "(Tensor) the learnable biases, which contains two parts: " - "input-hidden biases and peephole connections weights if " - "setting `use_peepholes` to `True`. " - "1. `use_peepholes = False` " - " - The shape is (1 x 4D). " - " - Bias = {b_c, b_i, b_f, b_o}." - "2. `use_peepholes = True` " - " - The shape is (1 x 7D). " - " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); - AddOutput("Projection", - "(phi::DenseTensor) the projection of the hidden state of LSTMP " - "operator. The shape is (T x P), and LoD is the same with the " - "`Input`."); - AddOutput("Cell", - "(phi::DenseTensor) the cell state of LSTMP operator. " - "The shape is (T x D), and lod is the same with the `Input`."); - AddOutput( - "BatchGate", - "(phi::DenseTensor) This phi::DenseTensor contains input gate, forget " - "gate " - "and output gate after the activations. This phi::DenseTensor has the " - "same shape as the reorganized input, which is also be called " - "batch input. The LoD size is 2. 
The first-level LoD is the " - "batch offsets and the second contains the indices, which " - "denotes the position of reorganized sequence in the raw input.") - .AsIntermediate(); - AddOutput( - "BatchCellPreAct", - "(phi::DenseTensor) the pre-activation cell state reorganized in " - "batch. " - "This phi::DenseTensor is obtained in the forward and used in the " - "backward.") - .AsIntermediate(); - AddOutput( - "BatchHidden", - "(phi::DenseTensor) the hidden state reorganized in batch. " - "This phi::DenseTensor is obtained in the forward and used in the " - "backward.") - .AsIntermediate(); - AddAttr("use_peepholes", - "(bool, default: True) " - "whether to enable diagonal/peephole connections.") - .SetDefault(true); - AddAttr("is_reverse", - "(bool, default: False) " - "whether to compute reversed LSTMP.") - .SetDefault(false); - AddAttr("cell_clip", - "(float, default: 0.0) " - "Clip for Tensor for cell state tensor when clip value is " - "greater than 0.0") - .SetDefault(0.0); - AddAttr("proj_clip", - "(float, default: 0.0) " - "Clip for Tensor for projection tensor when clip value is " - "greater than 0.0") - .SetDefault(0.0); - AddAttr( - "gate_activation", - "(string, default: sigmoid)" - "The activation for input gate, forget gate and output " - "gate, `sigmoid` by default.") - .SetDefault("sigmoid") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); - AddAttr("cell_activation", - "(string, default: tanh)" - "The activation for cell output, `tanh` by default.") - .SetDefault("tanh") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); - AddAttr("candidate_activation", - "(string, default: tanh)" - "The activation for candidate hidden state, " - "`tanh` by default.") - .SetDefault("tanh") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); - AddAttr("proj_activation", - "(string, default: tanh)" - "The activation for projection output, " - "`tanh` by default.") - .SetDefault("tanh") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); - AddComment(R"DOC( -Long-Short Term Memory with recurrent Projection layer (LSTMP) Operator. - -LSTMP has a separate projection layer after the LSTM layer, projecting the -original hidden state to a lower-dimensional one, which is proposed to reduce -the number of total parameters and furthermore computational complexity for -the LSTM, espeacially for the case that the size of output units is relative -large (https://research.google.com/pubs/archive/43905.pdf). - -The formula is as follows: - -$$ -i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i) \\ - -f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f) \\ - -\tilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c) \\ - -o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o) \\ - -c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ - -h_t = o_t \odot act_h(c_t) \\ - -r_t = \overline{act_h}(W_{rh}h_t) -$$ - -where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix -of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ -are diagonal weight matrices for peephole connections. In our implementation, -we use vectors to represent these diagonal weight matrices. The b terms -denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ -is the activation, such as logistic sigmoid function, and -$i, f, o$ and $c$ are the input gate, forget gate, output gate, -and cell activation vectors, respectively, all of which have the same size as -the cell output activation vector $h$. 
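
The peephole weights $W_{ic}, W_{fc}, W_{oc}$ described above are stored packed inside Bias, which is why the shape check earlier enforces 1 x 7D when use_peepholes is true. A sketch of how the kernels below recover them by offset (the float type and struct names are illustrative):

// Bias layout with peepholes, D = frame_size:
//   [ b_c | b_i | b_f | b_o | W_ic | W_fc | W_oc ]
struct PeepholeView {
  const float* check_ig;  // W_ic at bias + 4 * D
  const float* check_fg;  // W_fc at bias + 5 * D
  const float* check_og;  // W_oc at bias + 6 * D
};

PeepholeView SlicePeepholes(const float* bias, int D) {
  return {bias + 4 * D, bias + 5 * D, bias + 6 * D};
}
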
Here $h$ is usually called the hidden -state and $r$ denotes its recurrent projection. And $\tilde{c_t}$ is also -called the candidate hidden state, whose computation is based on the current -input and previous hidden state. - -The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ -are the cell input and cell output activation functions and `tanh` is usually -used for them. $\overline{act_h}$ is the activation function for the -projection output, usually using `identity` or same as $act_h$. - -Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ -operations on the input $x_{t}$ are NOT included in this operator. -Users can choose to use fully-connected operator before LSTMP operator. - -)DOC"); - } -}; - -template -class LSTMPGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("lstmp_grad"); - grad_op->SetInput("Weight", this->Input("Weight")); - grad_op->SetInput("ProjWeight", this->Input("ProjWeight")); - grad_op->SetInput("Bias", this->Input("Bias")); - - grad_op->SetInput("Projection", this->Output("Projection")); - grad_op->SetInput("Cell", this->Output("Cell")); - grad_op->SetInput("BatchGate", this->Output("BatchGate")); - grad_op->SetInput("BatchCellPreAct", this->Output("BatchCellPreAct")); - grad_op->SetInput("BatchHidden", this->Output("BatchHidden")); - grad_op->SetInput("H0", this->Input("H0")); - grad_op->SetInput("C0", this->Input("C0")); - - grad_op->SetInput(framework::GradVarName("Projection"), - this->OutputGrad("Projection")); - - grad_op->SetOutput(framework::GradVarName("Input"), - this->InputGrad("Input")); - grad_op->SetOutput(framework::GradVarName("Weight"), - this->InputGrad("Weight")); - grad_op->SetOutput(framework::GradVarName("ProjWeight"), - this->InputGrad("ProjWeight")); - grad_op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias")); - grad_op->SetOutput(framework::GradVarName("H0"), this->InputGrad("H0")); - grad_op->SetOutput(framework::GradVarName("C0"), this->InputGrad("C0")); - - grad_op->SetAttrMap(this->Attrs()); - } -}; - -class LSTMPGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Projection"), "Input", "Projection", "LSTMP@Grad"); - OP_INOUT_CHECK(ctx->HasInput("Cell"), "Input", "Cell", "LSTMP@Grad"); - OP_INOUT_CHECK(ctx->HasInput("Weight"), "Input", "Weight", "LSTMP@Grad"); - OP_INOUT_CHECK( - ctx->HasInput("ProjWeight"), "Input", "ProjWeight", "LSTMP@Grad"); - OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "LSTMP@Grad"); - - OP_INOUT_CHECK( - ctx->HasInput("BatchGate"), "Input", "BatchGate", "LSTMP@Grad"); - OP_INOUT_CHECK(ctx->HasInput("BatchCellPreAct"), - "Input", - "BatchCellPreAct", - "LSTMP@Grad"); - - auto SetOutGradDim = [&ctx](const std::string& name) { - auto g_name = framework::GradVarName(name); - if (ctx->HasOutput(g_name)) - ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); - }; - - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("BatchGate")); - SetOutGradDim("Weight"); - SetOutGradDim("ProjWeight"); - SetOutGradDim("Bias"); - SetOutGradDim("H0"); - SetOutGradDim("C0"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - 
OperatorWithKernel::IndicateVarDataType(ctx, "BatchGate"), - ctx.device_context().GetPlace()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(lstmp, - ops::LSTMPOp, - ops::LSTMPOpMaker, - ops::LSTMPGradMaker, - ops::LSTMPGradMaker); -REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp); -PD_REGISTER_STRUCT_KERNEL( - lstmp, CPU, ALL_LAYOUT, ops::LSTMPKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL( - lstmp_grad, CPU, ALL_LAYOUT, ops::LSTMPGradKernel, float, double) {} diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h deleted file mode 100644 index fd9032c730af84..00000000000000 --- a/paddle/fluid/operators/lstmp_op.h +++ /dev/null @@ -1,610 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/phi/common/transform.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/detail/activation_functions.h" -#include "paddle/phi/kernels/funcs/lstm_compute.h" -#include "paddle/phi/kernels/funcs/sequence2batch.h" - -namespace paddle { -namespace operators { - -using phi::Transform; - -template -using EigenMatrix = framework::EigenMatrix; - -template -class _ClipFunctor { - public: - explicit _ClipFunctor(const T min, const T max) : min_(min), max_(max) {} - HOSTDEVICE T operator()(const T& x) const { - if (x < min_) - return min_; - else if (x > max_) - return max_; - else - return x; - } - - private: - T min_; - T max_; -}; - -template -class _ClipGradFunctor { - public: - explicit _ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {} - HOSTDEVICE T operator()(const T& x, const T& y) const { - return (y > min_ && y < max_) ? 
x : 0; - } - - private: - T min_; - T max_; -}; - -template -inline void ReorderInitState(const DeviceContext& ctx, - const phi::DenseTensor& src, - phi::Vector index, - phi::DenseTensor* dst, - bool indexed_src) { - phi::funcs::CopyMatrixRowsFunctor row_shuffle; - dst->mutable_data(src.dims(), ctx.GetPlace()); - row_shuffle(ctx, src, index, dst, indexed_src); -} - -template -class LSTMPKernel : public framework::OpKernel { - public: - template - void ActCompute(const phi::funcs::detail::ActivationType act_type, - const Device& d, - X x, - Y y, - platform::Place place) const { - if (act_type == phi::funcs::detail::ActivationType::kIdentity) { - y.device(d) = x; - } else if (act_type == phi::funcs::detail::ActivationType::kSigmoid) { - SigmoidFunctor()(d, x, y); - } else if (act_type == phi::funcs::detail::ActivationType::kTanh) { - TanhFunctor()(d, x, y); - } else if (act_type == phi::funcs::detail::ActivationType::kReLU) { - if (place == platform::CPUPlace()) - ReluCPUFunctor()(d, x, y); - else - ReluCUDAFunctor()(d, x, y); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("unsupported activation type")); - } - } - - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* weight = ctx.Input("Weight"); - auto* proj_weight = ctx.Input("ProjWeight"); - auto* bias = ctx.Input("Bias"); - - auto* hidden_t0 = ctx.Input("H0"); - auto* cell_t0 = ctx.Input("C0"); - - auto proj_clip = static_cast(ctx.Attr("proj_clip")); - auto cell_clip = static_cast(ctx.Attr("cell_clip")); - - auto* batch_gate = ctx.Output("BatchGate"); - batch_gate->mutable_data(ctx.GetPlace()); - auto* proj_out = ctx.Output("Projection"); - proj_out->mutable_data(ctx.GetPlace()); - auto* cell_out = ctx.Output("Cell"); - cell_out->mutable_data(ctx.GetPlace()); - - bool is_reverse = ctx.Attr("is_reverse"); - phi::funcs::LoDTensor2BatchFunctor to_batch; - auto& device_ctx = ctx.template device_context(); - to_batch(device_ctx, *input, batch_gate, true, is_reverse); - - auto in_dims = input->dims(); - int frame_size = static_cast(in_dims[1] / 4); - framework::DDim dims({in_dims[0], frame_size}); - framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); - - if (bias) { - phi::DenseTensor b = *bias; - b.Resize({bias->numel(), 1}); - phi::DenseTensor gate_bias = b.Slice(0, 4 * frame_size); - phi::funcs::RowwiseAdd add_bias; - add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); - } - - phi::funcs::LstmMetaValue lstmp_value; - if (bias && ctx.Attr("use_peepholes")) { - T* bias_data = const_cast(bias->data()); - // the code style in LstmpMetaValue will be updated later. - - lstmp_value.check_ig = bias_data + 4 * frame_size; - lstmp_value.check_fg = lstmp_value.check_ig + frame_size; - lstmp_value.check_og = lstmp_value.check_fg + frame_size; - } else { - lstmp_value.check_ig = nullptr; - lstmp_value.check_fg = nullptr; - lstmp_value.check_og = nullptr; - } - lstmp_value.prev_state_value = nullptr; - phi::DenseTensor ordered_c0; - phi::DenseTensor ordered_h0; - - phi::Vector order(batch_gate->lod()[2]); - - if (cell_t0) { - // Since the batch computing for LSTMP reorders the input sequence - // according to their length. The initialized cell state also needs - // to reorder. - ReorderInitState( - device_ctx, *cell_t0, order, &ordered_c0, true); - lstmp_value.prev_state_value = ordered_c0.data(); - } - - // Use the local variable as here. 
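
Because LoDTensor2BatchFunctor sorts sequences by length, the ReorderInitState call above must permute the rows of the initial cell state to match the new sequence order. The row shuffle in miniature (types simplified to nested vectors for illustration):

#include <cstddef>
#include <vector>

// dst row i takes src row order[i]: the indexed_src == true case.
std::vector<std::vector<float>> ReorderRows(
    const std::vector<std::vector<float>>& src,
    const std::vector<size_t>& order) {
  std::vector<std::vector<float>> dst(src.size());
  for (size_t i = 0; i < order.size(); ++i) {
    dst[i] = src[order[i]];
  }
  return dst;
}
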
- phi::DenseTensor batch_proj, batch_cell; - auto* batch_cell_pre_act = ctx.Output("BatchCellPreAct"); - batch_cell_pre_act->mutable_data(dims, ctx.GetPlace()); - auto* batch_hidden = ctx.Output("BatchHidden"); - batch_hidden->mutable_data(dims, ctx.GetPlace()); // T x D - batch_proj.mutable_data(proj_dims, ctx.GetPlace()); // T x P - batch_cell.mutable_data(dims, ctx.GetPlace()); // T x D - - auto batch_starts = batch_gate->lod()[0]; - size_t num_batch = batch_starts.size() - 1; - auto gate_act = phi::funcs::detail::GetActivationType( - ctx.Attr("gate_activation")); - auto cell_act = phi::funcs::detail::GetActivationType( - ctx.Attr("cell_activation")); - auto cand_act = phi::funcs::detail::GetActivationType( - ctx.Attr("candidate_activation")); - auto proj_act = phi::funcs::detail::GetActivationType( - ctx.Attr("proj_activation")); - auto& place = *ctx.template device_context().eigen_device(); - auto blas = phi::funcs::GetBlas(device_ctx); - for (size_t n = 0; n < num_batch; n++) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - - phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); - phi::DenseTensor hidden_t = batch_hidden->Slice(bstart, bend); - phi::DenseTensor proj_t = batch_proj.Slice(bstart, bend); - phi::DenseTensor cell_t = batch_cell.Slice(bstart, bend); - phi::DenseTensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); - - int cur_batch_size = bend - bstart; - - if (n > 0) { - int pre_h_start = static_cast(batch_starts[n - 1]); - int pre_h_end = pre_h_start + cur_batch_size; - auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end); - blas.MatMul(pre_proj_t, - false, - *weight, - false, - static_cast(1.0), - &gate_t, - static_cast(1.0)); - } else if (hidden_t0) { - // If n == 0 and there is no initialized hidden state, that is to say - // the H0 is zeros, the calculation W_h * H0 will be skiped. - // If n == 0 and there is initialized hidden state, calculate W_h * H0. - - // Since the batch computing for LSTMP reorders the input sequence - // according to their length. The initialized hidden state also needs - // to reorder. 
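
The blas.MatMul calls above use GEMM's accumulate form C = alpha * A * B + beta * C: with both scalars set to 1.0, the projection of the previous step is added onto the input-to-gate pre-activations already stored in gate_t. A naive reference version (row-major, float, illustration only):

// c[m x n] += a[m x k] * b[k x n]   (alpha == beta == 1)
void GemmAccumulate(const float* a, const float* b, float* c,
                    int m, int k, int n) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += a[i * k + p] * b[p * n + j];
      c[i * n + j] += acc;  // beta == 1: accumulate into existing gate values
    }
  }
}
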
- ReorderInitState( - device_ctx, *hidden_t0, order, &ordered_h0, true); - blas.MatMul(ordered_h0, - false, - *weight, - false, - static_cast(1.0), - &gate_t, - static_cast(1.0)); - } - - lstmp_value.gate_value = gate_t.data(); - lstmp_value.output_value = hidden_t.data(); - lstmp_value.state_value = cell_t.data(); - lstmp_value.state_active_value = cell_pre_act_t.data(); - phi::funcs::LstmUnitFunctor::compute(device_ctx, - lstmp_value, - frame_size, - cur_batch_size, - cell_clip, - gate_act, - cell_act, - cand_act); - lstmp_value.prev_state_value = lstmp_value.state_value; - blas.MatMul(hidden_t, - false, - *proj_weight, - false, - static_cast(1.0), - &proj_t, - static_cast(0.0)); - if (proj_act != phi::funcs::detail::ActivationType::kIdentity) { - auto proj_t_dev = EigenMatrix::From(proj_t); - ActCompute(cell_act, place, proj_t_dev, proj_t_dev, ctx.GetPlace()); - } - if (proj_clip && proj_clip > 0.0) { - T* x_data = proj_t.data(); - int64_t numel = proj_t.numel(); - Transform trans; - trans(ctx.template device_context(), - x_data, - x_data + numel, - x_data, - _ClipFunctor(-1.0 * proj_clip, proj_clip)); - } - } - - phi::funcs::Batch2LoDTensorFunctor to_seq; - batch_proj.set_lod(batch_gate->lod()); - // restore the output hidden in phi::DenseTensor from the batch hidden - to_seq(device_ctx, batch_proj, proj_out); - - batch_cell.set_lod(batch_gate->lod()); - // restore the output cell state in phi::DenseTensor from the batch cell - to_seq(device_ctx, batch_cell, cell_out); - } -}; - -template -class LSTMPGradKernel : public framework::OpKernel { - public: - template - void ActGradCompute(const phi::funcs::detail::ActivationType act_type, - const Device& d, - X x, - Y y, - DX dx, - DY dy) const { - // x is dummy and won't be used even in Relu(use y instead) - if (act_type == phi::funcs::detail::ActivationType::kIdentity) - dx.device(d) = dy; - else if (act_type == phi::funcs::detail::ActivationType::kSigmoid) - SigmoidGradFunctor()(d, x, y, dy, dx); - else if (act_type == phi::funcs::detail::ActivationType::kTanh) - TanhGradFunctor()(d, x, y, dy, dx); - else if (act_type == phi::funcs::detail::ActivationType::kReLU) - ReluGradFunctor()(d, x, y, dy, dx); - else - PADDLE_THROW( - platform::errors::InvalidArgument("unsupported activation type")); - } - - void Compute(const framework::ExecutionContext& ctx) const override { - auto* weight = ctx.Input("Weight"); - auto* proj_weight = ctx.Input("ProjWeight"); - auto* bias = ctx.Input("Bias"); - - auto* proj_out = ctx.Input("Projection"); - auto* cell_out = ctx.Input("Cell"); - - auto proj_clip = static_cast(ctx.Attr("proj_clip")); - auto cell_clip = static_cast(ctx.Attr("cell_clip")); - - auto* batch_gate = ctx.Input("BatchGate"); - auto* batch_cell_pre_act = ctx.Input("BatchCellPreAct"); - auto* batch_hidden = ctx.Input("BatchHidden"); - - auto* projection_g = - ctx.Input(framework::GradVarName("Projection")); - - auto* in_g = ctx.Output(framework::GradVarName("Input")); - auto* weight_g = - ctx.Output(framework::GradVarName("Weight")); - auto* proj_weight_g = - ctx.Output(framework::GradVarName("ProjWeight")); - auto* bias_g = ctx.Output(framework::GradVarName("Bias")); - - auto* h0 = ctx.Input("H0"); - auto* c0 = ctx.Input("C0"); - - auto* h0_g = ctx.Output(framework::GradVarName("H0")); - auto* c0_g = ctx.Output(framework::GradVarName("C0")); - - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - if (weight_g) { - weight_g->mutable_data(ctx.GetPlace()); - zero(device_ctx, weight_g, static_cast(0.0)); - } - if 
(proj_weight_g) { - proj_weight_g->mutable_data(ctx.GetPlace()); - zero(device_ctx, proj_weight_g, static_cast(0.0)); - } - - // ordered_h0/c0 is the reordered hidden/cell initialization. - // ordered_h0_g/c0_g is the reordered gradient of hidden/cell - // initialization. - phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; - - phi::Vector order(batch_gate->lod()[2]); - - if (c0) { - ReorderInitState( - device_ctx, *c0, order, &ordered_c0, true); - } - if (c0 && c0_g) { - ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); - } - - // batch_gate dims equal to input dims - auto in_dims = batch_gate->dims(); - auto out_dims = cell_out->dims(); - framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); - int frame_size = static_cast(in_dims[1] / 4); - PADDLE_ENFORCE_EQ(frame_size, - out_dims[1], - platform::errors::InvalidArgument( - "The second dimension of Input(Cell) should be %d, " - "but received %d in LSTMP@Grad operator.", - frame_size, - out_dims[1])); - - phi::funcs::LstmMetaValue lstmp_value; - if (bias && ctx.Attr("use_peepholes")) { - T* bias_data = const_cast(bias->data()); - lstmp_value.check_ig = bias_data + 4 * frame_size; - lstmp_value.check_fg = lstmp_value.check_ig + frame_size; - lstmp_value.check_og = lstmp_value.check_fg + frame_size; - } else { - lstmp_value.check_ig = nullptr; - lstmp_value.check_fg = nullptr; - lstmp_value.check_og = nullptr; - } - - phi::funcs::LstmMetaGrad lstmp_grad; - - if (bias && bias_g) { - bias_g->mutable_data(ctx.GetPlace()); - zero(device_ctx, bias_g, static_cast(0.0)); - } - if (bias && bias_g && ctx.Attr("use_peepholes")) { - T* bias_g_data = bias_g->data(); - lstmp_grad.check_ig_grad = bias_g_data + 4 * frame_size; - lstmp_grad.check_fg_grad = lstmp_grad.check_ig_grad + frame_size; - lstmp_grad.check_og_grad = lstmp_grad.check_fg_grad + frame_size; - } else { - lstmp_grad.check_ig_grad = nullptr; - lstmp_grad.check_fg_grad = nullptr; - lstmp_grad.check_og_grad = nullptr; - } - - phi::funcs::LoDTensor2BatchFunctor to_batch; - - auto ToBatch = [&batch_gate, &to_batch](const DeviceContext& ctx, - const phi::DenseTensor& src, - const framework::DDim& dims, - phi::DenseTensor& dst) { - dst.mutable_data(dims, ctx.GetPlace()); - dst.set_lod(batch_gate->lod()); - to_batch(ctx, src, &dst, false); - }; - - phi::DenseTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell; - batch_hidden_g.mutable_data(out_dims, ctx.GetPlace()); - ToBatch(device_ctx, *proj_out, proj_dims, batch_proj); // T x P - ToBatch(device_ctx, *projection_g, proj_dims, batch_proj_g); // T x P - ToBatch(device_ctx, *cell_out, out_dims, batch_cell); // T x D - - phi::DenseTensor batch_cell_g, batch_gate_g; - batch_cell_g.mutable_data(out_dims, ctx.GetPlace()); - // TODO(qingqing) support the case output cell has gradient. 
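
The projection-clip backward a few lines below applies _ClipGradFunctor from earlier in this header: the gradient flows only where the forward value stayed strictly inside the clip interval. Scalar form (illustrative):

// d/dx clip(x, mn, mx) is 1 inside the open interval and 0 once clipped,
// so the saved forward value y masks the incoming gradient dy.
float ClipGrad(float dy, float y, float mn, float mx) {
  return (y > mn && y < mx) ? dy : 0.f;
}
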
- // to_batch(device_ctx, *cell_g, batch_cell_g, false); - zero(device_ctx, &batch_cell_g, static_cast(0.0)); - batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); - batch_gate_g.set_lod(batch_gate->lod()); - - auto gate_act = phi::funcs::detail::GetActivationType( - ctx.Attr("gate_activation")); - auto cell_act = phi::funcs::detail::GetActivationType( - ctx.Attr("cell_activation")); - auto cand_act = phi::funcs::detail::GetActivationType( - ctx.Attr("candidate_activation")); - auto proj_act = phi::funcs::detail::GetActivationType( - ctx.Attr("proj_activation")); - auto& place = *ctx.template device_context().eigen_device(); - - auto batch_starts = batch_gate->lod()[0]; - size_t num_batch = batch_starts.size() - 1; - auto blas = phi::funcs::GetBlas(device_ctx); - for (int n = static_cast(num_batch) - 1; n >= 0; n--) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - - phi::DenseTensor cur_proj = batch_proj.Slice(bstart, bend); - phi::DenseTensor proj_g = batch_proj_g.Slice(bstart, bend); - - if (proj_clip && proj_clip > 0.0) { - T* dx_data = proj_g.data(); - T* x_data = cur_proj.data(); - int64_t numel = proj_g.numel(); - Transform trans; - trans(ctx.template device_context(), - dx_data, - dx_data + numel, - x_data, - dx_data, - _ClipGradFunctor(-1.0 * proj_clip, proj_clip)); - } - - if (proj_act != phi::funcs::detail::ActivationType::kIdentity) { - auto cur_proj_dev = EigenMatrix::From(cur_proj); - auto proj_g_dev = EigenMatrix::From(proj_g); - ActGradCompute(cell_act, - place, - cur_proj_dev, - cur_proj_dev, - proj_g_dev, - proj_g_dev); - } - /* hidden state backwarad */ - phi::DenseTensor out_g = batch_hidden_g.Slice(bstart, bend); - blas.MatMul(proj_g, - false, - *proj_weight, - true, - static_cast(1.0), - &out_g, - static_cast(0.0)); - /* projection weight backward*/ - if (proj_weight_g) { - phi::DenseTensor hidden_t = batch_hidden->Slice(bstart, bend); - blas.MatMul(hidden_t, - true, - proj_g, - false, - static_cast(1.0), - proj_weight_g, - static_cast(1.0)); - } - - phi::DenseTensor gate = batch_gate->Slice(bstart, bend); - phi::DenseTensor cell = batch_cell.Slice(bstart, bend); - phi::DenseTensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); - lstmp_value.gate_value = gate.data(); - lstmp_value.state_value = cell.data(); - lstmp_value.state_active_value = cell_pre_act.data(); - - phi::DenseTensor gate_g = batch_gate_g.Slice(bstart, bend); - phi::DenseTensor cell_g = batch_cell_g.Slice(bstart, bend); - lstmp_grad.state_grad = cell_g.data(); - lstmp_grad.gate_grad = gate_g.data(); - lstmp_grad.output_grad = out_g.data(); - - if (n > 0) { - int bstart_pre = static_cast(batch_starts[n - 1]); - phi::DenseTensor cell_pre = batch_cell.Slice(bstart_pre, bstart); - phi::DenseTensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); - lstmp_value.prev_state_value = cell_pre.data(); - lstmp_grad.prev_state_grad = cell_pre_g.data(); - } else { - lstmp_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; - lstmp_grad.prev_state_grad = c0_g ? 
ordered_c0_g.data() : nullptr; - } - - int cur_batch_size = bend - bstart; - // lstmp_value.output_value not used in bp, set to null - // lstmp_grad.state_active_grad not used in bp, set to null - lstmp_value.output_value = nullptr; - lstmp_grad.state_active_grad = nullptr; - - phi::funcs::LstmUnitGradFunctor::compute(device_ctx, - lstmp_value, - lstmp_grad, - frame_size, - cur_batch_size, - cell_clip, - gate_act, - cell_act, - cand_act); - - if (n > 0) { - int pre_h_start = static_cast(batch_starts[n - 1]); - int pre_h_end = pre_h_start + cur_batch_size; - auto pre_proj_g = batch_proj_g.Slice(pre_h_start, pre_h_end); - blas.MatMul(gate_g, - false, - *weight, - true, - static_cast(1.0), - &pre_proj_g, - static_cast(1.0)); - if (weight_g) { - /* weight backward*/ - auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end); - blas.MatMul(pre_proj, - true, - gate_g, - false, - static_cast(1.0), - weight_g, - static_cast(1.0)); - } - } else { - if (h0 && weight_g) { - ReorderInitState( - device_ctx, *h0, order, &ordered_h0, true); - if (weight_g) { - blas.MatMul(ordered_h0, - true, - gate_g, - false, - static_cast(1.0), - weight_g, - static_cast(1.0)); - } - } - if (h0 && (h0_g || proj_weight_g)) { - ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); - blas.MatMul(gate_g, - false, - *weight, - true, - static_cast(1.0), - &ordered_h0_g, - static_cast(0.0)); - } - } - } - - phi::funcs::Batch2LoDTensorFunctor to_seq; - if (in_g) { - /* backward data */ - in_g->mutable_data(ctx.GetPlace()); - to_seq(device_ctx, batch_gate_g, in_g); - } - if (bias && bias_g) { - /* backward bias */ - phi::DenseTensor b_g = *bias_g; - b_g.Resize({bias_g->numel(), 1}); - phi::DenseTensor gate_bias_g = b_g.Slice(0, 4 * frame_size); - phi::funcs::ColwiseSum col_sum; - col_sum(device_ctx, batch_gate_g, &gate_bias_g); - } - - if (h0 && h0_g) { - ReorderInitState( - device_ctx, ordered_h0_g, order, h0_g, false); - } - if (c0 && c0_g) { - ReorderInitState( - device_ctx, ordered_c0_g, order, c0_g, false); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc deleted file mode 100644 index 2aaf8f74af359e..00000000000000 --- a/paddle/fluid/operators/margin_rank_loss_op.cc +++ /dev/null @@ -1,191 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/margin_rank_loss_op.h" - -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { - -class MarginRankLossOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - // input check - OP_INOUT_CHECK( - ctx->HasInput("Label"), "Input", "Label", "margin_rank_loss"); - OP_INOUT_CHECK(ctx->HasInput("X1"), "Input", "X1", "margin_rank_loss"); - OP_INOUT_CHECK(ctx->HasInput("X2"), "Input", "X2", "margin_rank_loss"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "margin_rank_loss"); - - auto label_dims = ctx->GetInputDim("Label"); - auto x1_dims = ctx->GetInputDim("X1"); - auto x2_dims = ctx->GetInputDim("X2"); - - PADDLE_ENFORCE_EQ( - label_dims, - x1_dims, - platform::errors::InvalidArgument( - "The shape of Input(Label) shape should equals the shape of " - "Input(X1). Received: Input(Label)'s shape: [%s], Input(X1)'s " - "shape: [%s].", - label_dims, - x1_dims)); - PADDLE_ENFORCE_EQ( - x1_dims, - x2_dims, - platform::errors::InvalidArgument( - "The shape of Input(X1) shape should equals the shape of " - "Input(X2). Received: Input(X1)'s shape: [%s], Input(X2)'s shape: " - "[%s].", - x1_dims, - x2_dims)); - PADDLE_ENFORCE_EQ( - label_dims.size(), - 2, - platform::errors::InvalidArgument( - "The dimensions of Input(Label) should be 2. Received: " - "the shape of Input(Label): [%s], the dimensions of Input(Label): " - "%d.", - label_dims, - label_dims.size())); - PADDLE_ENFORCE_EQ(label_dims[1], - 1, - platform::errors::InvalidArgument( - "The second dimension of Input(Lable) should be 1" - "Received: the shape of Input(Label): [%s].", - label_dims)); - ctx->SetOutputDim("Activated", label_dims); - ctx->SetOutputDim("Out", label_dims); - } -}; - -template -class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X1", - "(2-D tensor with shape [batch_size x 1]) The score for " - "one item X1 to be ranked, from pairwise ranking model."); - AddInput("X2", - "(2-D tensor with shape [batch_size x 1]) The score for " - "another item X2 to be ranked, from pairwise ranking model."); - AddInput("Label", - "(2-D tensor with shape [batch_size x 1]) " - "The label indicating X1 ranked higher than X2 or not, " - "can only be +1 or -1."); - AddOutput("Activated", - "(2-D tensor with shape [batch_size x 1]) Intermediate tensor " - "to indicate whether each element of Output(Out) is activated.") - .AsIntermediate(); - AddOutput("Out", - "(2-D tensor with shape [batch_size x 1]) " - "The output loss of MarginRankLoss operator."); - AddAttr("margin", "(scalar, default 0) Margin for MarginRankLossOp.") - .SetDefault(static_cast(0)); - AddComment(R"DOC( -MarginRankLoss Operator. - -This operator measures the loss given a pair of training sample -{`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1` -indicating X1 is ranked higher than `X2` and `Label = -1` otherwise. The loss -is calculated as: - -$loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$ - -The attribute `margin` here helps make the predictions more robust. -Denote the item ranked higher as the positive sample, otherwise the negative -sample. 
If the score of the two samples satisfies - -$positive sample - negative sample < margin$ - -the pair of samples will contribute to the final loss, which will backpropagate -and train the ranking model to enlarge the difference between the two scores. - -For batch input with size `batch_size`, `X1`, `X2` and `Label` -all have the same shape [batch_size x 1]. - -)DOC"); - } -}; - -class MarginRankLossGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Label"), "Input", "Label", "margin_rank_loss_grad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "margin_rank_loss_grad"); - OP_INOUT_CHECK(ctx->HasInput("Activated"), - "Input", - "Activated", - "margin_rank_loss_grad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X1")), - "Output", - framework::GradVarName("X1"), - "margin_rank_loss_grad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X2")), - "Output", - framework::GradVarName("X2"), - "margin_rank_loss_grad"); - - auto dims = ctx->GetInputDim("Label"); - ctx->SetOutputDim(framework::GradVarName("X1"), dims); - ctx->SetOutputDim(framework::GradVarName("X2"), dims); - } -}; - -template -class MarginRankLossGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("margin_rank_loss_grad"); - op->SetInput("Activated", this->Output("Activated")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("Label", this->Input("Label")); - op->SetOutput(framework::GradVarName("X1"), this->InputGrad("X1")); - op->SetOutput(framework::GradVarName("X2"), this->InputGrad("X2")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -REGISTER_OPERATOR(margin_rank_loss, - ops::MarginRankLossOp, - ops::MarginRankLossOpMaker, - ops::MarginRankLossGradMaker, - ops::MarginRankLossGradMaker); -REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp); - -PD_REGISTER_STRUCT_KERNEL( - margin_rank_loss, CPU, ALL_LAYOUT, ops::MarginRankLossKernel, float) {} -PD_REGISTER_STRUCT_KERNEL(margin_rank_loss_grad, - CPU, - ALL_LAYOUT, - ops::MarginRankLossGradKernel, - float) {} diff --git a/paddle/fluid/operators/margin_rank_loss_op.cu b/paddle/fluid/operators/margin_rank_loss_op.cu deleted file mode 100644 index 8c6c2ee055f9c2..00000000000000 --- a/paddle/fluid/operators/margin_rank_loss_op.cu +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/margin_rank_loss_op.h" - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL( - margin_rank_loss, GPU, ALL_LAYOUT, ops::MarginRankLossKernel, float) {} -PD_REGISTER_STRUCT_KERNEL(margin_rank_loss_grad, - GPU, - ALL_LAYOUT, - ops::MarginRankLossGradKernel, - float) {} diff --git a/paddle/fluid/operators/margin_rank_loss_op.h b/paddle/fluid/operators/margin_rank_loss_op.h deleted file mode 100644 index 49cbb1168f1b50..00000000000000 --- a/paddle/fluid/operators/margin_rank_loss_op.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -struct ReLU { - HOSTDEVICE T operator()(const T& val) const { - return val > 0 ? val : static_cast(0); - } -}; - -template -struct Heaviside { - HOSTDEVICE T operator()(const T& val) const { - return static_cast(val > 0 ? 1 : 0); - } -}; - -template -class MarginRankLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* out_t = ctx.Output("Out"); - auto* act_t = ctx.Output("Activated"); - - auto* label_t = ctx.Input("Label"); - auto* x1_t = ctx.Input("X1"); - auto* x2_t = ctx.Input("X2"); - - out_t->mutable_data(ctx.GetPlace()); - act_t->mutable_data(ctx.GetPlace()); - - auto margin = static_cast(ctx.Attr("margin")); - auto out = framework::EigenVector::Flatten(*out_t); - auto act = framework::EigenVector::Flatten(*act_t); - - auto label = framework::EigenVector::Flatten(*label_t); - auto x1 = framework::EigenVector::Flatten(*x1_t); - auto x2 = framework::EigenVector::Flatten(*x2_t); - - auto& dev = *ctx.template device_context().eigen_device(); - out.device(dev) = (-label * (x1 - x2) + margin).unaryExpr(ReLU()); - act.device(dev) = out.unaryExpr(Heaviside()); - } -}; - -template -class MarginRankLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_x1_t = ctx.Output(framework::GradVarName("X1")); - auto* d_x2_t = ctx.Output(framework::GradVarName("X2")); - - auto* act_t = ctx.Input("Activated"); - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* label_t = ctx.Input("Label"); - - auto d_out = framework::EigenVector::Flatten(*d_out_t); - auto act = framework::EigenVector::Flatten(*act_t); - auto label = framework::EigenVector::Flatten(*label_t); - auto& dev = *ctx.template device_context().eigen_device(); - - // compute d_x1 - if (d_x1_t) { - d_x1_t->mutable_data(ctx.GetPlace()); - auto d_x1 = framework::EigenVector::Flatten(*d_x1_t); - d_x1.device(dev) = -d_out * act * label; - } - // compute d_x2 - if (d_x2_t) { - d_x2_t->mutable_data(ctx.GetPlace()); - auto d_x2 = framework::EigenVector::Flatten(*d_x2_t); - d_x2.device(dev) = d_out * act * label; - } - } -}; -} // namespace operators -} // namespace paddle diff --git 
a/paddle/fluid/operators/marker_op.cc b/paddle/fluid/operators/marker_op.cc deleted file mode 100644 index 0735e63c229b75..00000000000000 --- a/paddle/fluid/operators/marker_op.cc +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -namespace paddle { -namespace operators { - -class MarkerOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - std::string marker_role = ctx->Attrs().Get("marker_role"); - std::string marker_pos = ctx->Attrs().Get("marker_pos"); - - VLOG(3) << "The role is:" << marker_role << ";" - << "The position is:" << marker_pos << "."; - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(framework::proto::VarType::FP32, ctx.GetPlace()); - } -}; - -class MarkerOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddAttr("marker_role", - "(string, default forward)forward or backward," - " mark different stages of porcess.") - .SetDefault("forward"); - AddAttr( - "marker_pos", - "(string, default B)the posititon where the marker is placed, " - "B stands for begin of duration," - " E stands for end of duration.") - .SetDefault("B"); - AddComment( - R"DOC(Marker Operator - Add marker at the beginning/end of a forward/backward process.)DOC"); - } -}; - -template -class MarkerOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto marker_role = ctx.Attr("marker_role"); - auto marker_pos = ctx.Attr("marker_pos"); - - platform::RecordEvent record_event( - "MarkerCPU", - "marker_" + marker_role + "_" + marker_pos, - platform::TracerEventType::OperatorInner, - 1, - platform::EventRole::kInnerOp); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(marker, ops::MarkerOp, ops::MarkerOpMaker); -PD_REGISTER_STRUCT_KERNEL( - marker, CPU, ALL_LAYOUT, ops::MarkerOpCPUKernel, float) {} diff --git a/paddle/fluid/operators/marker_op.cu b/paddle/fluid/operators/marker_op.cu deleted file mode 100644 index 1feb6a2b2616f8..00000000000000 --- a/paddle/fluid/operators/marker_op.cu +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
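A side note for reviewers: the marker op removed above does no tensor computation; its only effect is the RAII RecordEvent that drops a named span into the profiler timeline. A minimal sketch of that idea, with the RecordEvent call mirrored from MarkerOpCPUKernel above (the free function and its name are illustrative, not from the tree):

#include <string>
#include "paddle/fluid/platform/profiler/event_tracing.h"

// Emits a profiler span named the way the deleted kernel names it; the
// span lasts until record_event goes out of scope.
void EmitMarker(const std::string& role, const std::string& pos) {
  paddle::platform::RecordEvent record_event(
      "MarkerCPU",
      "marker_" + role + "_" + pos,
      paddle::platform::TracerEventType::OperatorInner,
      1,
      paddle::platform::EventRole::kInnerOp);
}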
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -namespace paddle { -namespace operators { - -template -__global__ void SimpleMarkerKernel(T* in, T* out, int ndim) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - for (; idx < ndim; idx += blockDim.x * gridDim.x) { - out[idx] = in[idx]; - } -} - -template -class MarkerOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - - auto marker_role = ctx.Attr("marker_role"); - auto marker_pos = ctx.Attr("marker_pos"); - VLOG(3) << "marker role: " << marker_role - << " marker position: " << marker_pos; - - phi::DenseTensor A; - phi::DenseTensor B; - auto* in_temp = A.mutable_data({32, 1}, ctx.GetPlace()); - auto* out_temp = B.mutable_data({32, 1}, ctx.GetPlace()); - platform::RecordEvent record_event( - "MarkerCUDA", - "marker_" + marker_role + "_" + marker_pos, - platform::TracerEventType::OperatorInner, - 1, - platform::EventRole::kInnerOp); - SimpleMarkerKernel - <<<1, 32, 0, dev_ctx.stream()>>>(in_temp, out_temp, 32); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -PD_REGISTER_STRUCT_KERNEL( - marker, GPU, ALL_LAYOUT, ops::MarkerOpCUDAKernel, float) {} diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index 657b0b976ef621..9424ab8fa99247 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -129,132 +129,6 @@ __device__ inline void LayerNorm2(const phi::funcs::kvp &thread_data, } } -template -__global__ void EmbEltwiseLayernormKernel(int hidden, - const int64_t *ids, - const T *scale, - const T *bias, - const int64_t *embs, - T *output, - T eps, - int input_num) { - cub::Sum pair_sum; - // blockIdx.x: position in the sequence - // blockIdx.y: batch - // gridDim.x: Seq - // gridDim.y: Batch - - extern __shared__ int64_t array_id[]; - - const T rhidden = T(1.f) / T(hidden); - const int64_t seq_pos = blockIdx.y + blockIdx.x * gridDim.y; - if (threadIdx.x == 0) { - for (int i = 0; i < input_num; ++i) { - const int64_t *ids_p = reinterpret_cast(ids[i]); - array_id[i] = ids_p[seq_pos]; - } - } - __syncthreads(); - - const int64_t out_offset = seq_pos * hidden; - - phi::funcs::kvp thread_data(0, 0); - -#pragma unroll - for (int it = threadIdx.x; it < hidden; it += TPB) { - T val = 0; - for (int i = 0; i < input_num; ++i) { - val += reinterpret_cast(embs[i])[array_id[i] * hidden + it]; - } - - output[out_offset + it] = val; - const T rhiddenval = rhidden * val; - thread_data = - pair_sum(thread_data, phi::funcs::kvp(rhiddenval, rhiddenval * val)); - } - LayerNorm(thread_data, hidden, out_offset, bias, scale, output, eps); -} - -// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake -#ifndef __HIPCC__ // @{ Half kernel: EmbEltwiseLayernormKernel -template <> -__global__ void EmbEltwiseLayernormKernel(int hidden, - const int64_t *ids, - const half *scale, - const half *bias, - const int64_t *embs, - half *output, - half eps, - int input_num) { -#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) - cub::Sum 
pair_sum; - // blockIdx.x: position in the sequence - // blockIdx.y: batch - // gridDim.x: Seq - // gridDim.y: Batch - - extern __shared__ int64_t array_id[]; - - const half rhidden = half(1.f) / half(hidden); - const int64_t seq_pos = blockIdx.y + blockIdx.x * gridDim.y; - if (threadIdx.x == 0) { - for (int i = 0; i < input_num; ++i) { - const int64_t *ids_p = reinterpret_cast(ids[i]); - array_id[i] = ids_p[seq_pos]; - } - } - __syncthreads(); - - const int64_t out_offset = seq_pos * hidden; - - phi::funcs::kvp thread_data(0, 0); - -#pragma unroll - for (int it = threadIdx.x; it < hidden; it += 256) { - half val = 0; - for (int i = 0; i < input_num; ++i) { - val += reinterpret_cast(embs[i])[array_id[i] * hidden + it]; - } - - output[out_offset + it] = val; - const half rhiddenval = rhidden * val; - thread_data = pair_sum(thread_data, - phi::funcs::kvp(rhiddenval, rhiddenval * val)); - } - LayerNorm( - thread_data, hidden, out_offset, bias, scale, output, eps); -#endif -} -#endif // @} End Half kernel: EmbEltwiseLayernormKernel - -template -void EmbEltwiseLayerNormFunctor::operator()(int batch, - int seq_len, - int hidden, - const int64_t *ids, - const T *scale, - const T *bias, - const int64_t *embs, - T *output, - float eps, - int input_num, - gpuStream_t stream) { - const unsigned tpb = 256; - const dim3 grid(seq_len, batch, 1); - const dim3 block(tpb, 1, 1); - int shared_bytes = input_num * sizeof(int64_t); - EmbEltwiseLayernormKernel<<>>( - hidden, ids, scale, bias, embs, output, eps, input_num); -} - -template class EmbEltwiseLayerNormFunctor; - -// device function 'operator()' is not supportted until cuda 10.0 -// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000 -template class EmbEltwiseLayerNormFunctor; -#endif - template __global__ void SkipLayerNormSmallKernel(int num, int hidden, diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index 6d31098686608a..76e27380b90e21 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -48,35 +48,6 @@ struct CUDATypeTraits { }; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -// This functor involves a fusion calculation in Ernie or Bert. -// The fusion mode is as follows: -// -// in_var emb in_var emb -// | | | | -// lookup_table lookup_table -// | | -// lkt_var lkt_var -// \ / -// elementwise_add -// | -// elt_out_var -// -template -class EmbEltwiseLayerNormFunctor { - public: - void operator()(int batch, - int seq_len, - int hidden, - const int64_t *ids, - const T *scale, - const T *bias, - const int64_t *embs, - T *output, - float eps, - int input_num, - gpuStream_t stream); -}; - // This functor involves a fusion calculation in Ernie or Bert. 
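// (Reviewer note, not part of the source: the EmbEltwiseLayerNormFunctor
// deleted above fused the per-token embedding lookups, their element-wise
// sum, and the layer normalization into one kernel. Per sequence position
// it computed v = sum_i Emb_i[ids_i], then
//   out = ((v - mean(v)) / sqrt(var(v) + eps)) * scale + bias,
// accumulating mean(v) and mean(v^2) in a single cub block reduction --
// the kvp pair_sum visible in the kernel body above.)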
// The fusion mode is as follows: // diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index 21eeb52fd311a3..41c131de0f392a 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -98,7 +98,7 @@ class Tree2ColFunctor { phi::funcs::SetConstant constant; int64_t feature_size = feature_dims[1]; size_t patch_elem_size = 3 * static_cast(feature_size); - size_t node_count = 0, patch_count = 0, patch_size; + size_t node_count = 0, patch_count = 0, patch_size = 0; Tree2ColUtil::construct_tree(EdgeSet, &tr, &node_count); std::vector> processing_list; for (size_t u = 1; u <= node_count; u++) { diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 69c64de7056459..df66ab400f40bf 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -181,7 +181,7 @@ static phi::DenseTensor FoldHeadAndLastDims(const DeviceContext &context, */ static void ReshapeTensorIntoMatrixSequence( phi::DenseTensor *x, const phi::funcs::MatDescriptor &descriptor) { - int64_t h, w; + int64_t h = 0, w = 0; h = descriptor.height_; w = descriptor.width_; if (descriptor.trans_) { diff --git a/paddle/fluid/operators/mean_iou_op.cc b/paddle/fluid/operators/mean_iou_op.cc deleted file mode 100644 index d87c49187c2fb0..00000000000000 --- a/paddle/fluid/operators/mean_iou_op.cc +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/mean_iou_op.h" - -namespace paddle { -namespace operators { - -class MeanIoUOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Predictions"), "Input", "Predictions", "MeanIoU"); - OP_INOUT_CHECK(ctx->HasInput("Labels"), "Input", "Labels", "MeanIoU"); - OP_INOUT_CHECK( - ctx->HasOutput("OutMeanIou"), "Output", "OutMeanIou", "MeanIoU"); - OP_INOUT_CHECK(ctx->HasOutput("OutWrong"), "Output", "OutWrong", "MeanIoU"); - OP_INOUT_CHECK( - ctx->HasOutput("OutCorrect"), "Output", "OutCorrect", "MeanIoU"); - - int64_t num_classes = - static_cast(ctx->Attrs().Get("num_classes")); - - ctx->SetOutputDim("OutMeanIou", phi::make_ddim({})); - ctx->SetOutputDim("OutWrong", {num_classes}); - ctx->SetOutputDim("OutCorrect", {num_classes}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "Predictions"), - ctx.GetPlace()); - } -}; - -class MeanIoUOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Predictions", - "(Tensor), A Tensor of prediction results for semantic labels" - " with type int32 or int64. 
The rank should be greater than 1."); - AddInput( - "Labels", - "(Tensor), A Tensor of ground truth labels with type int32 or int64." - "Its shape should be the same as Input(Predictions)."); - AddInput("InWrongs", - "(vector), A list of Tensor with shape " - "[num_classes]. They are used to collect wrong number among " - "batches. Empty list is also valid here.") - .AsDuplicable() - .AsDispensable(); - AddInput( - "InCorrects", - "(vector), A list of Tensor with shape " - "[num_classes]. They are used to collect correct number among batches. " - "Empty list is also valid here.") - .AsDuplicable() - .AsDispensable(); - AddInput("InMeanIou", - "(vector), A list of Tensor that Output(mean_iou) should " - "be added to. Empty list is also valid here.") - .AsDuplicable() - .AsDispensable(); - AddOutput("OutMeanIou", - "(vector), A Tensor representing the" - " mean intersection-over-union with shape []."); - AddOutput("OutWrong", "(Tensor), A Tensor with shape [num_classes]. "); - AddOutput("OutCorrect", "(Tensor), A Tensor with shape [num_classes]. "); - AddAttr("num_classes", "(int), The possible number of labels."); - - AddComment(R"DOC( -mean-IOU Operator. -Mean Intersection-Over-Union is a common evaluation metric for -semantic image segmentation, which first computes the IOU for each -semantic class and then computes the average over classes. -IOU is defined as follows: - IOU = true_positive / (true_positive + false_positive + false_negative). -It is based on pixel level area while "IOU Similarity Operator" -is based on area of rectangle. - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - mean_iou, - ops::MeanIoUOp, - ops::MeanIoUOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL( - mean_iou, CPU, ALL_LAYOUT, ops::MeanIoUKernel, int, int64_t) {} diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu deleted file mode 100644 index 46abb4b72910ad..00000000000000 --- a/paddle/fluid/operators/mean_iou_op.cu +++ /dev/null @@ -1,170 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/mean_iou_op.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using phi::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void CountCUDAKernel(const int num_classes, - const int count, - const T* predictions, - const T* labels, - int* wrong, - int* correct) { - extern __shared__ int blcok_cache[]; - int* wrong_c = blcok_cache; - int* correct_c = blcok_cache + num_classes; - // init cache - for (int i = threadIdx.x; i < num_classes * 2; i += blockDim.x) { - blcok_cache[i] = 0; - } - __syncthreads(); - - T pred; - T label; - CUDA_KERNEL_LOOP(i, count) { - pred = predictions[i]; - label = labels[i]; - if (pred == label) { - atomicAdd(correct_c + pred, 1); - } else { - atomicAdd(wrong_c + pred, 1); - atomicAdd(wrong_c + label, 1); - } - } - - __syncthreads(); - - for (int i = threadIdx.x; i < num_classes; i += blockDim.x) { - atomicAdd(wrong + i, wrong_c[i]); - atomicAdd(correct + i, correct_c[i]); - } -} - -__global__ void ComputeIoUCUDAKernel( - const int num_classes, int* wrong, int* correct, float* ious, float* iou) { - __shared__ int valid_count_c; - if (threadIdx.x == 0) { - valid_count_c = 0; - } - __syncthreads(); - CUDA_KERNEL_LOOP(i, num_classes) { - int wrong_n = wrong[i]; - int correct_n = correct[i]; - int denominator = wrong_n + correct_n; - if (denominator > 0) { - atomicAdd(&valid_count_c, 1); - ious[i] = static_cast(correct_n) / denominator; - } else { - ious[i] = 0; - } - } - __syncthreads(); - if (threadIdx.x == 0) { - float iou_sum = 0; - for (int i = 0; i < num_classes; ++i) { - iou_sum += ious[i]; - } - iou[0] += iou_sum / valid_count_c; - } -} - -template -class MeanIoUCUDAOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - auto& place = *dev_ctx.eigen_device(); - // get input and output tensor - auto* predictions = ctx.Input("Predictions"); - auto* labels = ctx.Input("Labels"); - auto* out_mean_iou = ctx.Output("OutMeanIou"); - auto* out_wrong = ctx.Output("OutWrong"); - auto* out_correct = ctx.Output("OutCorrect"); - int num_classes = static_cast(ctx.Attr("num_classes")); - - // Get data ptr - const T* predictions_data = predictions->data(); - const T* labels_data = labels->data(); - int* out_wrong_data = out_wrong->mutable_data(ctx.GetPlace()); - int* out_correct_data = out_correct->mutable_data(ctx.GetPlace()); - float* out_mean_iou_data = - out_mean_iou->mutable_data(ctx.GetPlace()); - - // Get Eigen tensor - auto out_mean_iou_t = EigenScalar::From(*out_mean_iou); - auto out_wrong_t = EigenTensor::From(*out_wrong); - auto out_correct_t = EigenTensor::From(*out_correct); - - // Temporary memory - auto tmp_ious_data = memory::Alloc( - dev_ctx.GetPlace(), - num_classes * sizeof(float), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - float* ious_data = static_cast(tmp_ious_data->ptr()); - - // Init out_wrong, out_correct and out_mean_iou - out_wrong_t.device(place) = out_wrong_t.constant(0); - out_correct_t.device(place) = out_correct_t.constant(0); - out_mean_iou_t.device(place) = out_mean_iou_t.constant(0.0f); - - // collect pre wrong, correct and mean_iou - auto in_mean_ious = ctx.MultiInput("InMeanIou"); - for (int i = 0; i < in_mean_ious.size(); ++i) { - out_mean_iou_t.device(place) += - 
EigenScalar::From(*in_mean_ious[i]); - } - auto in_wrongs = ctx.MultiInput("InWrongs"); - for (int i = 0; i < in_wrongs.size(); ++i) { - out_wrong_t.device(place) += EigenTensor::From(*in_wrongs[i]); - } - auto in_corrects = ctx.MultiInput("InCorrects"); - for (int i = 0; i < in_corrects.size(); ++i) { - out_correct_t.device(place) += EigenTensor::From(*in_corrects[i]); - } - // compute - auto stream = ctx.cuda_device_context().stream(); - int block = PADDLE_CUDA_NUM_THREADS; - int grid = (predictions->numel() + block - 1) / block; - int cache_size = (num_classes * 2 + 1) * sizeof(int); - CountCUDAKernel - <<>>(num_classes, - predictions->numel(), - predictions_data, - labels_data, - out_wrong_data, - out_correct_data); - - ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, - out_wrong_data, - out_correct_data, - ious_data, - out_mean_iou_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - mean_iou, GPU, ALL_LAYOUT, ops::MeanIoUCUDAOpKernel, int, int64_t) {} diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h deleted file mode 100644 index 8569d567c8f088..00000000000000 --- a/paddle/fluid/operators/mean_iou_op.h +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include <algorithm> - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template <typename T, int MajorType = Eigen::RowMajor, typename IndexType = Eigen::DenseIndex> -using EigenTensor = framework::EigenTensor<T, 1, MajorType, IndexType>; - -template <typename T, int MajorType = Eigen::RowMajor, typename IndexType = Eigen::DenseIndex> -using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>; - -template <typename T, typename DeviceContext> -class MeanIoUKernel : public framework::OpKernel<T> { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = - *ctx.template device_context<DeviceContext>().eigen_device(); - // get input and output tensor - auto* predictions = ctx.Input<phi::DenseTensor>("Predictions"); - auto* labels = ctx.Input<phi::DenseTensor>("Labels"); - auto* out_mean_iou = ctx.Output<phi::DenseTensor>("OutMeanIou"); - auto* out_wrong = ctx.Output<phi::DenseTensor>("OutWrong"); - auto* out_correct = ctx.Output<phi::DenseTensor>("OutCorrect"); - int num_classes = static_cast<int>(ctx.Attr<int>("num_classes")); - - // get data ptr - const T* predictions_data = predictions->data<T>(); - const T* labels_data = labels->data<T>(); - float* out_mean_iou_data = - out_mean_iou->mutable_data<float>(ctx.GetPlace()); - int* out_wrong_data = out_wrong->mutable_data<int>(ctx.GetPlace()); - int* out_correct_data = out_correct->mutable_data<int>(ctx.GetPlace()); - - // get eigen tensor - auto out_mean_iou_t = EigenScalar<float>::From(*out_mean_iou); - auto out_wrong_t = EigenTensor<int>::From(*out_wrong); - auto out_correct_t = EigenTensor<int>::From(*out_correct); - - // Tmp tensor - phi::DenseTensor denominator; - phi::DenseTensor valid_count; - phi::DenseTensor iou_sum; - - // get data ptr of tmp tensor - int* denominator_data = denominator.mutable_data<int>( - {static_cast<int64_t>(num_classes)}, ctx.GetPlace()); - int* valid_count_data = valid_count.mutable_data<int>({1}, ctx.GetPlace()); - float* iou_sum_data = iou_sum.mutable_data<float>({1}, ctx.GetPlace()); - - // get eigen tensor of tmp tensor - auto denominator_t = EigenTensor<int>::From(denominator); - auto valid_count_t = EigenTensor<int>::From(valid_count); - auto iou_sum_t = EigenTensor<float>::From(iou_sum); - - // init out_wrong, out_correct and out_mean_iou - out_wrong_t = out_wrong_t.constant(0); - out_correct_t = out_correct_t.constant(0); - out_mean_iou_t = out_mean_iou_t.constant(0); - - // collect pre wrong, correct and mean_iou - auto in_mean_ious = ctx.MultiInput<phi::DenseTensor>("InMeanIou"); - for (size_t i = 0; i < in_mean_ious.size(); ++i) { - out_mean_iou_t.device(place) += - EigenScalar<float>::From(*in_mean_ious[i]); - } - - auto in_wrongs = ctx.MultiInput<phi::DenseTensor>("InWrongs"); - for (size_t i = 0; i < in_wrongs.size(); ++i) { - out_wrong_t.device(place) += EigenTensor<int>::From(*in_wrongs[i]); - } - auto in_corrects = ctx.MultiInput<phi::DenseTensor>("InCorrects"); - for (size_t i = 0; i < in_corrects.size(); ++i) { - out_correct_t.device(place) += EigenTensor<int>::From(*in_corrects[i]); - } - - // compute - for (int64_t i = 0; i < predictions->numel(); ++i) { - if (predictions_data[i] == labels_data[i]) { - out_correct_data[predictions_data[i]] += 1; - } else { - out_wrong_data[labels_data[i]] += 1; - out_wrong_data[predictions_data[i]] += 1; - } - } - - denominator_t = out_wrong_t + out_correct_t; - valid_count_t = - (denominator_t > denominator_t.constant(0.0f)).cast<int>().sum(); - - for (int i = 0; i < num_classes; ++i) { - if (denominator_data[i] == 0) { - denominator_data[i] = 1; - } - } - - iou_sum_t = - (out_correct_t.cast<float>() / denominator_t.cast<float>()).sum(); - out_mean_iou_data[0] += (iou_sum_data[0] / valid_count_data[0]); - } -}; - -} // namespace operators -} // namespace paddle
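To make the metric concrete: with num_classes = 2, predictions [0, 0, 1, 1] and labels [0, 1, 1, 1], the counting loop above yields correct = [1, 2] and wrong = [1, 1], so

$$
\text{IoU}_0 = \frac{1}{1+1} = 0.5, \qquad
\text{IoU}_1 = \frac{2}{2+1} \approx 0.667, \qquad
\text{mean IoU} = \frac{0.5 + 0.667}{2} \approx 0.583,
$$

which agrees with IOU = true_positive / (true_positive + false_positive + false_negative) from the op documentation.

diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index ae00939d07844f..3f0fd7bfef2dcc 100644 ---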
a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -82,7 +82,7 @@ class MergeLoDTensorOp : public framework::OperatorBase { platform::Place place = dev_place; int64_t batch_size = in_true.dims()[0] + in_false.dims()[0]; auto data_type = in_true.IsInitialized() ? in_true.type() : in_false.type(); - int rank; + int rank = 0; framework::DDim in_dims; if (in_true.IsInitialized()) { rank = in_true.dims().size(); diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index 3c53b05152b7e4..d1bbfe42293724 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -161,7 +161,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { x_dims = x->dims(); auto axes = ctx.Attr("axis"); out_dims = phi::make_ddim( - FlattenKernel::GetOutputShape(axes, x_dims)); + Flatten2Kernel::GetOutputShape(axes, x_dims)); } protected: diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc deleted file mode 100644 index ffb3081ca0ba90..00000000000000 --- a/paddle/fluid/operators/one_hot_op.cc +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
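A recurring pattern in this PR (the merge_lod_tensor_op.cc hunk just above, and the tree2col.cc and matmul_op.cc hunks earlier): locals that were declared uninitialized and assigned on every branch are now zero-initialized at the declaration, which satisfies clang-tidy's cppcoreguidelines-init-variables and leaves no path on which an indeterminate value could be read. A minimal before/after illustration (hypothetical function, not from the tree):

#include <cstdint>

int64_t MatrixArea(bool transposed, int64_t height, int64_t width) {
  // Before: int64_t h, w;  -- indeterminate until the branch assigns them.
  int64_t h = 0, w = 0;  // After: zero-initialized, as in the hunks above.
  if (transposed) {
    h = width;
    w = height;
  } else {
    h = height;
    w = width;
  }
  return h * w;
}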
- -#include "paddle/fluid/operators/one_hot_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class OneHotOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "OneHot"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "OneHot"); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(input) rank should be at least 2, " - "but received input rank (%d) less than 2", - x_dims.size())); - - if (ctx->IsRuntime() || x_dims[x_dims.size() - 1] > 0) { - PADDLE_ENFORCE_GE(x_dims[x_dims.size() - 1], - 1U, - platform::errors::InvalidArgument( - "Last dimension of Input(input) should be 1, " - "but received input Last dimension(%d) != 1", - x_dims[x_dims.size() - 1])); - } - - framework::DDim out_dims(x_dims); - int depth = ctx->Attrs().Get("depth"); - if (ctx->HasInput("depth_tensor")) { - depth = -1; - } - - out_dims[out_dims.size() - 1] = depth; - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /* --> */ "Out"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } - - phi::KernelKey GetKernelTypeForVar( - const std::string& var_name, - const phi::DenseTensor& tensor, - const phi::KernelKey& expected_kernel_type) const override { - if (var_name == "depth_tensor") { - return phi::KernelKey(phi::Backend::ALL_BACKEND, - expected_kernel_type.layout(), - expected_kernel_type.dtype()); - } - return phi::KernelKey( - tensor.place(), tensor.layout(), expected_kernel_type.dtype()); - } -}; - -class OneHotOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(phi::DenseTensor, phi::DenseTensor) Input variable with " - "rank at least 2. " - "The last dimension of X should be 1. Each value of X is an index " - "to indicate the position."); - AddInput("depth_tensor", "(Tensor, Tensor), Length of one-hot vector") - .AsDispensable(); - AddOutput("Out", - "(Tensor, Tensor) Output tensor with same rank as X. " - "The tensor consists of one-hot representations of values in X."); - - AddAttr("depth", - "A positive integer to specify the length of one-hot vector.") - .SetDefault(-1); - AddAttr("dtype", - "An integer to specify the data type of one-hot " - "vector. The default value is FP32.") - .SetDefault(paddle::framework::proto::VarType::FP32); - AddAttr("allow_out_of_range", - "If it is set true and the input data is out of range, " - "the output tensor will be filled zeros. The default value " - "is false.") - .SetDefault(false); - AddComment(R"DOC( -One Hot Operator. This operator creates the one-hot representations for input -index values. 
The following example will help to explain the function of this -operator: - -X is a LoDTensor: - X.lod = [[0, 1, 4]] - X.shape = [4, 1] - X.data = [[1], [1], [3], [0]] - -set depth = 4 - -Out is a LoDTensor: - Out.lod = [[0, 1, 4]] - Out.shape = [4, 4] - Out.data = [[0., 1., 0., 0.], - [0., 1., 0., 0.], - [0., 0., 0., 1.], - [1., 0., 0., 0.]] -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - one_hot, - ops::OneHotOp, - ops::OneHotOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(one_hot, - ops::OneHotKernel, - ops::OneHotKernel); diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu deleted file mode 100644 index 917fa857e07782..00000000000000 --- a/paddle/fluid/operators/one_hot_op.cu +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/one_hot_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { -using phi::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void FillOutputKernel(const InT* p_in_data, - OutT* p_out_data, - const int64_t numel, - const int depth) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) { - *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0; - } -} - -template -struct OneHotOpCUDAFunctor { - const phi::DenseTensor* in_; - phi::DenseTensor* out_; - const DeviceContext& ctx_; - int depth_; - - OneHotOpCUDAFunctor(const phi::DenseTensor* in, - phi::DenseTensor* out, - int depth, - const DeviceContext& ctx) - : in_(in), out_(out), depth_(depth), ctx_(ctx) {} - - template - void apply() const { - auto* p_in_data = in_->data(); - auto numel = in_->numel(); - auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - auto stream = ctx_.stream(); - phi::funcs::set_constant(ctx_, out_, 0.0); - - FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, - 0, - stream>>>(p_in_data, p_out_data, numel, depth_); - } -}; - -template -class OneHotCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - - int depth = -1; - if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); - if (platform::is_gpu_place(depth_tensor->place())) { - phi::DenseTensor temp; - paddle::framework::TensorCopySync( - *depth_tensor, platform::CPUPlace(), &temp); - depth = *temp.data(); - } else { - depth = *depth_tensor->data(); - } - - auto in_dims = in->dims(); - framework::DDim out_dims(in_dims); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } else { - depth = 
context.Attr("depth"); - } - framework::VisitDataType( - static_cast( - context.Attr("dtype")), - OneHotOpCUDAFunctor( - in, out, depth, context.template device_context())); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(one_hot, - ops::OneHotCUDAKernel, - ops::OneHotCUDAKernel); diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h deleted file mode 100644 index 41ec3eb9a135fc..00000000000000 --- a/paddle/fluid/operators/one_hot_op.h +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -struct OneHotOpFunctor { - const phi::DenseTensor* in_; - phi::DenseTensor* out_; - int depth_; - const DeviceContext& ctx_; - bool allow_out_of_range_; - - OneHotOpFunctor(const phi::DenseTensor* in, - phi::DenseTensor* out, - int depth, - const DeviceContext& ctx, - bool allow_out_of_range = false) - : in_(in), - out_(out), - depth_(depth), - ctx_(ctx), - allow_out_of_range_(allow_out_of_range) {} - - template - void apply() const { - auto* p_in_data = in_->data(); - auto numel = in_->numel(); - auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - phi::funcs::set_constant(ctx_, out_, 0.0); - - if (allow_out_of_range_) { - for (int i = 0; i < numel; ++i) { - if (p_in_data[i] >= 0 && p_in_data[i] < depth_) { - *(p_out_data + i * depth_ + p_in_data[i]) = 1.0; - } - } - } else { - for (int i = 0; i < numel; ++i) { - PADDLE_ENFORCE_GE( - p_in_data[i], - 0, - platform::errors::InvalidArgument( - "Illegal index value, Input(input) value should be at least 0, " - "but received input (%d) less than 0", - p_in_data[i])); - PADDLE_ENFORCE_LT( - p_in_data[i], - depth_, - platform::errors::InvalidArgument( - "Illegal index value, Input(input) value should be less than " - "Input(depth), " - "but received input (%d) not less than depth (%d)", - p_in_data[i], - depth_)); - - *(p_out_data + i * depth_ + p_in_data[i]) = 1.0; - } - } - } -}; - -template -class OneHotKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int depth = context.Attr("depth"); - bool allow_out_of_range = context.Attr("allow_out_of_range"); - if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); - auto* depth_data = depth_tensor->data(); - depth = depth_data[0]; - auto in_dims = in->dims(); - framework::DDim out_dims(in_dims); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } - - framework::VisitDataType( - static_cast( - context.Attr("dtype")), - OneHotOpFunctor( - in, - out, - depth, - context.template device_context(), - allow_out_of_range)); - } -}; - -} // 
namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc deleted file mode 100644 index 3261e96cbbeca4..00000000000000 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc +++ /dev/null @@ -1,137 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h" - -namespace paddle { -namespace operators { - -class ProximalAdagradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Param"), "Input", "Param", "ProximalAdagradOp"); - OP_INOUT_CHECK( - ctx->HasInput("Moment"), "Input", "Moment", "ProximalAdagradOp"); - OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "ProximalAdagradOp"); - OP_INOUT_CHECK(ctx->HasInput("LearningRate"), - "Input", - "LearningRate", - "ProximalAdagradOp"); - - OP_INOUT_CHECK( - ctx->HasOutput("ParamOut"), "Output", "ParamOut", "ProximalAdagradOp"); - OP_INOUT_CHECK(ctx->HasOutput("MomentOut"), - "Output", - "MomentOut", - "ProximalAdagradOp"); - - auto param_dim = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ(param_dim, - ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "The shape of Intput(Param) should be equal to the " - "Input(Grad) of ProximalAdagrad Op. But received " - "Input(Param).dimensions=[%s], " - "Input(Grad).dimensions=[%s]", - param_dim, - ctx->GetInputDim("Grad"))); - - PADDLE_ENFORCE_EQ(param_dim, - ctx->GetInputDim("Moment"), - platform::errors::InvalidArgument( - "The shape of Intput(Param) should be equal to the " - "Input(Moment) of ProximalAdagrad Op. But received " - "Input(Param).dimensions=[%s], " - "Input(Moment).dimensions=[%s]", - param_dim, - ctx->GetInputDim("Moment"))); - - auto lr_dim = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_EQ( - phi::product(lr_dim), - 1, - platform::errors::InvalidArgument( - "Learning Rate should be a scalar. 
But received dimension[%s]", - lr_dim)); - - ctx->SetOutputDim("ParamOut", param_dim); - ctx->SetOutputDim("MomentOut", param_dim); - } - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Param"), - ctx.GetPlace()); - } -}; - -class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Param", - "(Tensor, default Tensor) " - "Input parameter that has to be updated."); - AddInput("Moment", - "(Tensor, default Tensor) " - "Moment parameter that has to be updated."); - AddInput("Grad", - "(Tensor, default Tensor) " - "Input gradient of the parameter."); - AddInput("LearningRate", - "(Tensor, default Tensor) " - "The learning rate should be a tensor of size 1."); - - AddOutput("ParamOut", "(Tensor) Output updated parameter value."); - AddOutput("MomentOut", "(Tensor) Output updated moment value."); - - AddAttr("l1", - "(float, default 0.0) " - "L1 regularization strength.") - .SetDefault(0.0f); - AddAttr("l2", - "(float, default 0.0) " - "L2 regularization strength.") - .SetDefault(0.0f); - AddComment(R"DOC( -Proximal Adagrad Optimizer. - -Optimizer that implements the proximal adagrad algorithm: - -$$ -moment = moment + grad * grad \\ -prox\_param = param - learning\_rate * grad * (1 / \sqrt{moment}) \\ -param = sign(prox\_param) / (1 + learning\_rate * l2) * - \max(|prox\_param| - learning\_rate * l1 , 0) -$$ - -The paper that proposed Proximal GD: -(http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf) -Here, we use the adagrad learning rate as specified here: -(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) - -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, - ops::ProximalAdagradOp, - ops::ProximalAdagradOpMaker); -PD_REGISTER_STRUCT_KERNEL( - proximal_adagrad, CPU, ALL_LAYOUT, ops::ProximalAdagradOpKernel, float) {} diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu deleted file mode 100644 index 0a79dcd425f128..00000000000000 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu +++ /dev/null @@ -1,17 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -You may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software distributed -under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -CONDITIONS OF ANY KIND, either express or implied. See the License for the -specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h" - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - proximal_adagrad, GPU, ALL_LAYOUT, ops::ProximalAdagradOpKernel, float) {} diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h deleted file mode 100644 index 973d870d14f31b..00000000000000 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
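The update spelled out in the DOC block above, reduced to a single scalar parameter so it can be checked against the Eigen implementation that follows; this is a sketch of the math only (function name illustrative):

#include <algorithm>
#include <cmath>

// One proximal-adagrad step: accumulate the squared gradient, take an
// adagrad-scaled step, then apply the l1 soft-threshold and l2 shrink.
void ProximalAdagradStep(double* param, double* moment, double grad,
                         double lr, double l1, double l2) {
  *moment += grad * grad;
  const double prox = *param - lr * grad / std::sqrt(*moment);
  const double sign = (prox > 0.0) - (prox < 0.0);
  *param = sign * std::max(std::fabs(prox) - lr * l1, 0.0) / (1.0 + lr * l2);
}

When l1 == 0 the soft-threshold leaves |prox| unchanged, so the expression collapses to prox / (1 + lr * l2), matching the else branch of the kernel below.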
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ProximalAdagradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* param_out = ctx.Output("ParamOut"); - auto* moment_out = ctx.Output("MomentOut"); - - param_out->mutable_data(ctx.GetPlace()); - moment_out->mutable_data(ctx.GetPlace()); - - auto l1 = static_cast(ctx.Attr("l1")); - auto l2 = static_cast(ctx.Attr("l2")); - - auto grad = ctx.Input("Grad"); - auto p = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto m = framework::EigenVector::Flatten( - *ctx.Input("Moment")); - auto g = framework::EigenVector::Flatten(*grad); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); - - auto p_out = framework::EigenVector::Flatten(*param_out); - auto m_out = framework::EigenVector::Flatten(*moment_out); - auto* place = ctx.template device_context().eigen_device(); - - Eigen::DSizes grad_dsize(grad->numel()); - - m_out.device(*place) = m + g * g; - auto prox_param = p - lr.broadcast(grad_dsize) * g / m_out.sqrt(); - if (l1 > static_cast(0)) { - p_out.device(*place) = - prox_param.sign() * - (((prox_param.abs() - (lr * l1).broadcast(grad_dsize)) - .cwiseMax(static_cast(0.0))) / - (static_cast(1.0) + (lr * l2).broadcast(grad_dsize))); - } else { - p_out.device(*place) = - prox_param / (static_cast(1.0) + (lr * l2).broadcast(grad_dsize)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/unity_build_rule.cmake b/paddle/fluid/operators/optimizers/unity_build_rule.cmake index 4485465ddf9eb5..dae61e7cfcf264 100644 --- a/paddle/fluid/operators/optimizers/unity_build_rule.cmake +++ b/paddle/fluid/operators/optimizers/unity_build_rule.cmake @@ -8,7 +8,6 @@ register_unity_group( cc ftrl_op.cc lars_momentum_op.cc - proximal_adagrad_op.cc proximal_gd_op.cc decayed_adagrad_op.cc adadelta_op.cc @@ -19,7 +18,6 @@ register_unity_group( lars_momentum_op.cu momentum_op.cu sgd_op.cu - proximal_adagrad_op.cu adagrad_op.cu decayed_adagrad_op.cu adadelta_op.cu diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 337a4e3178956d..29d2807b239709 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -402,231 +402,6 @@ static inline void GetPaddings(int* paddings, std::copy(pads.begin(), pads.end(), paddings); } } - -template -class Pad2dCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::array pads; - GetPaddings(pads.data(), context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - T value = static_cast(context.Attr("pad_value")); - - auto* x = context.Input("X"); - auto in_dims = x->dims(); - const T* in_data = x->data(); - - auto* out = 
context.Output("Out"); - if (data_format == "NCHW") { - out->Resize({in_dims[0], - in_dims[1], - in_dims[2] + pads[0] + pads[1], - in_dims[3] + pads[2] + pads[3]}); - } else { - out->Resize({in_dims[0], - in_dims[1] + pads[0] + pads[1], - in_dims[2] + pads[2] + pads[3], - in_dims[3]}); - } - auto out_dims = out->dims(); - T* out_data = out->mutable_data(context.GetPlace()); - - const int pad_top = pads[0]; - const int pad_left = pads[2]; - const int num = static_cast(in_dims[0]); - if (data_format == "NCHW") { - const int channels = static_cast(in_dims[1]); - const int in_height = static_cast(in_dims[2]); - const int in_width = static_cast(in_dims[3]); - const int out_height = static_cast(out_dims[2]); - const int out_width = static_cast(out_dims[3]); - if (mode == "reflect") { - Pad2DReflectNCHW(in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - out_data); - } else if (mode == "edge") { - Pad2DEdgeNCHW(in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - out_data); - } else { - Pad2DConstNCHW(in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - value, - out_data); - } - } else { - const int channels = static_cast(in_dims[3]); - const int in_height = static_cast(in_dims[1]); - const int in_width = static_cast(in_dims[2]); - const int out_height = static_cast(out_dims[1]); - const int out_width = static_cast(out_dims[2]); - if (mode == "reflect") { - Pad2DReflectNHWC(in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - out_data); - } else if (mode == "edge") { - Pad2DEdgeNHWC(in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - out_data); - } else { - Pad2DConstNHWC(in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - value, - out_data); - } - } - } -}; - -template -class Pad2dGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::array pads; - GetPaddings(pads.data(), context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto d_in_dims = d_in->dims(); - auto d_out_dims = d_out->dims(); - const T* d_out_data = d_out->data(); - T* d_in_data = d_in->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(context.template device_context(), - d_in, - static_cast(0)); - const int pad_top = pads[0]; - const int pad_left = pads[2]; - const int num = static_cast(d_in_dims[0]); - if (data_format == "NCHW") { - const int channels = static_cast(d_in_dims[1]); - const int in_height = static_cast(d_in_dims[2]); - const int in_width = static_cast(d_in_dims[3]); - const int out_height = static_cast(d_out_dims[2]); - const int out_width = static_cast(d_out_dims[3]); - if (mode == "reflect") { - Pad2DGradReflectNCHW(d_in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - d_out_data); - } else if (mode == "edge") { - Pad2DGradEdgeNCHW(d_in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - d_out_data); - } else { - Pad2DGradConstNCHW(d_in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - 
pad_left, - d_out_data); - } - } else { - const int channels = static_cast(d_in_dims[3]); - const int in_height = static_cast(d_in_dims[1]); - const int in_width = static_cast(d_in_dims[2]); - const int out_height = static_cast(d_out_dims[1]); - const int out_width = static_cast(d_out_dims[2]); - if (mode == "reflect") { - Pad2DGradReflectNHWC(d_in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - d_out_data); - } else if (mode == "edge") { - Pad2DGradEdgeNHWC(d_in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - d_out_data); - } else { - Pad2DGradConstNHWC(d_in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - d_out_data); - } - } - } -}; - class Pad2dOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -872,8 +647,3 @@ REGISTER_OPERATOR(pad2d, REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad, ops::Pad2dOpGradNoNeedBufferVarsInferer); - -PD_REGISTER_STRUCT_KERNEL( - pad2d, CPU, ALL_LAYOUT, ops::Pad2dCPUKernel, float, double, int, int64_t) {} -PD_REGISTER_STRUCT_KERNEL( - pad2d_grad, CPU, ALL_LAYOUT, ops::Pad2dGradCPUKernel, float, double) {} diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu deleted file mode 100644 index b8263ea6bb1692..00000000000000 --- a/paddle/fluid/operators/pad2d_op.cu +++ /dev/null @@ -1,636 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using phi::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void Pad2DConstNCHW(const int nthreads, - const T* in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - T value, - T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - out_data[index] = - (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) - ? 
value - : in_data[(nc * in_height + in_h) * in_width + in_w]; - } -} - -template -__global__ void Pad2DConstNHWC(const int nthreads, - const T* in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - T value, - T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int in_h = out_h - pad_top; - const int in_w = out_w - pad_left; - out_data[index] = - (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) - ? value - : in_data[((n * in_height + in_h) * in_width + in_w) * channels + - c]; - } -} - -template -__global__ void Pad2DReflectNCHW(const int nthreads, - const T* in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - in_h = max(in_h, -in_h); // reflect by 0 - in_h = min(in_h, 2 * in_height - in_h - 2); // reflect by in_height - in_w = max(in_w, -in_w); // reflect by 0 - in_w = min(in_w, 2 * in_width - in_w - 2); // reflect by in_width - out_data[index] = in_data[(nc * in_height + in_h) * in_width + in_w]; - } -} - -template -__global__ void Pad2DReflectNHWC(const int nthreads, - const T* in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - in_h = max(in_h, -in_h); - in_h = min(in_h, 2 * in_height - in_h - 2); - in_w = max(in_w, -in_w); - in_w = min(in_w, 2 * in_width - in_w - 2); - out_data[index] = - in_data[((n * in_height + in_h) * in_width + in_w) * channels + c]; - } -} - -template -__global__ void Pad2DEdgeNCHW(const int nthreads, - const T* in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - out_data[index] = in_data[(nc * in_height + in_h) * in_width + in_w]; - } -} - -template -__global__ void Pad2DEdgeNHWC(const int nthreads, - const T* in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - int in_h = min(in_height - 1, 
max(out_h - pad_top, 0)); - int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - out_data[index] = - in_data[((n * in_height + in_h) * in_width + in_w) * channels + c]; - } -} - -template -__global__ void Pad2DGradConstNCHW(const int in_size, - T* d_in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(in_index, in_size) { - int nc = in_index / in_width; - const int out_w = in_index % in_width + pad_left; - const int out_h = nc % in_height + pad_top; - nc /= in_height; - d_in_data[in_index] = - d_out_data[(nc * out_height + out_h) * out_width + out_w]; - } -} - -template -__global__ void Pad2DGradConstNHWC(const int in_size, - T* d_in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(in_index, in_size) { - int n = in_index / channels; - const int c = in_index % channels; - const int out_w = n % in_width + pad_left; - n /= in_width; - const int out_h = n % in_height + pad_top; - n /= in_height; - d_in_data[in_index] = - d_out_data[((n * out_height + out_h) * out_width + out_w) * channels + - c]; - } -} - -template -__global__ void Pad2DGradReflectNCHW(const int out_size, - T* d_in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - int nc = out_index / out_width; - const int out_w = out_index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - in_h = max(in_h, -in_h); - in_w = max(in_w, -in_w); - in_h = min(in_h, 2 * in_height - in_h - 2); - in_w = min(in_w, 2 * in_width - in_w - 2); - phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w], - d_out_data[out_index]); - } -} - -template -__global__ void Pad2DGradReflectNHWC(const int out_size, - T* d_in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - const int c = out_index % channels; - int n = out_index / channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - in_h = max(in_h, -in_h); - in_w = max(in_w, -in_w); - in_h = min(in_h, in_height * 2 - in_h - 2); - in_w = min(in_w, in_width * 2 - in_w - 2); - phi::CudaAtomicAdd( - &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c], - d_out_data[out_index]); - } -} - -template -__global__ void Pad2DGradEdgeNCHW(const int out_size, - T* d_in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - int nc = out_index / out_width; - const int out_w = out_index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - const int in_w = min(in_width - 1, max(out_w - 
pad_left, 0)); - phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w], - d_out_data[out_index]); - } -} - -template -__global__ void Pad2DGradEdgeNHWC(const int out_size, - T* d_in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - const int c = out_index % channels; - int n = out_index / channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - phi::CudaAtomicAdd( - &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c], - d_out_data[out_index]); - } -} - -static inline void GetPaddings(int* paddings, - const framework::ExecutionContext& context) { - auto* paddings_t = context.Input("Paddings"); - if (paddings_t) { - phi::DenseTensor pads; - framework::TensorCopySync(*paddings_t, platform::CPUPlace(), &pads); - auto pads_data = pads.data(); - paddings[0] = pads_data[0]; - paddings[1] = pads_data[1]; - paddings[2] = pads_data[2]; - paddings[3] = pads_data[3]; - } else { - auto pads = context.Attr>("paddings"); - std::copy(pads.begin(), pads.end(), paddings); - } -} - -template -class Pad2dCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - int pads[4]; - GetPaddings(pads, context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - T value = static_cast(context.Attr("pad_value")); - - auto* x = context.Input("X"); - auto in_dims = x->dims(); - const T* in_data = x->data(); - auto* out = context.Output("Out"); - auto out_dims = out->dims(); - if (data_format == "NCHW") { - out_dims[0] = in_dims[0]; - out_dims[1] = in_dims[1]; - out_dims[2] = in_dims[2] + pads[0] + pads[1]; - out_dims[3] = in_dims[3] + pads[2] + pads[3]; - } else { - out_dims[0] = in_dims[0]; - out_dims[1] = in_dims[1] + pads[0] + pads[1]; - out_dims[2] = in_dims[2] + pads[2] + pads[3]; - out_dims[3] = in_dims[3]; - } - T* out_data = out->mutable_data(out_dims, context.GetPlace()); - const int pad_top = pads[0]; - const int pad_left = pads[2]; - const int num = in_dims[0]; - - auto stream = context.cuda_device_context().stream(); - int block = PADDLE_CUDA_NUM_THREADS; - const int out_size = out->numel(); - int grid = (out_size + block - 1) / block; - - if (data_format == "NCHW") { - const int channels = in_dims[1]; - const int in_height = in_dims[2]; - const int in_width = in_dims[3]; - const int out_height = out_dims[2]; - const int out_width = out_dims[3]; - if (mode == "reflect") { - Pad2DReflectNCHW<<>>(out_size, - in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - out_data); - } else if (mode == "edge") { - Pad2DEdgeNCHW<<>>(out_size, - in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - out_data); - } else { - Pad2DConstNCHW<<>>(out_size, - in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - value, - out_data); - } - } else { - const int channels = in_dims[3]; - const int in_height = in_dims[1]; - const int in_width = in_dims[2]; - const int out_height = out_dims[1]; - const int out_width = out_dims[2]; - if (mode == "reflect") { - 
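// The kernel launched below maps each output element back to one source
// pixel. For "reflect" mode the index math (see the kernels above) mirrors
// out-of-range coordinates around the borders, e.g. along the height axis:
//   in_h = out_h - pad_top;
//   in_h = max(in_h, -in_h);                     // mirror indices below 0
//   in_h = min(in_h, 2 * in_height - in_h - 2);  // mirror indices past the end
// so a valid input index is always produced without branching on the pad region.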
Pad2DReflectNHWC<<>>(out_size, - in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - out_data); - } else if (mode == "edge") { - Pad2DEdgeNHWC<<>>(out_size, - in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - out_data); - } else { - Pad2DConstNHWC<<>>(out_size, - in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - value, - out_data); - } - } - } -}; - -template -class Pad2dGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - int pads[4]; - GetPaddings(pads, context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto d_in_dims = d_in->dims(); - auto d_out_dims = d_out->dims(); - const T* d_out_data = d_out->data(); - T* d_in_data = d_in->mutable_data(context.GetPlace()); - - phi::funcs::SetConstant set_zero; - set_zero(context.template device_context(), - d_in, - static_cast(0)); - - const int pad_top = pads[0]; - const int pad_left = pads[2]; - const int num = d_in_dims[0]; - - auto stream = context.cuda_device_context().stream(); - int block = PADDLE_CUDA_NUM_THREADS; - const int out_size = d_out->numel(); - const int in_size = d_in->numel(); - int grid = (out_size + block - 1) / block; - - if (data_format == "NCHW") { - const int channels = d_in_dims[1]; - const int in_height = d_in_dims[2]; - const int in_width = d_in_dims[3]; - const int out_height = d_out_dims[2]; - const int out_width = d_out_dims[3]; - if (mode == "reflect") { - Pad2DGradReflectNCHW<<>>(out_size, - d_in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - d_out_data); - } else if (mode == "edge") { - Pad2DGradEdgeNCHW<<>>(out_size, - d_in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - d_out_data); - } else { - grid = (in_size + block - 1) / block; - Pad2DGradConstNCHW<<>>(in_size, - d_in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - d_out_data); - } - } else { - const int channels = d_in_dims[3]; - const int in_height = d_in_dims[1]; - const int in_width = d_in_dims[2]; - const int out_height = d_out_dims[1]; - const int out_width = d_out_dims[2]; - if (mode == "reflect") { - Pad2DGradReflectNHWC<<>>(out_size, - d_in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - d_out_data); - } else if (mode == "edge") { - Pad2DGradEdgeNHWC<<>>(out_size, - d_in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - d_out_data); - } else { - grid = (in_size + block - 1) / block; - Pad2DGradConstNHWC<<>>(in_size, - d_in_data, - num, - channels, - in_height, - in_width, - out_height, - out_width, - pad_top, - pad_left, - d_out_data); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -PD_REGISTER_STRUCT_KERNEL(pad2d, - GPU, - ALL_LAYOUT, - ops::Pad2dCUDAKernel, - float, - double, - int, - int64_t, - plat::float16) {} -PD_REGISTER_STRUCT_KERNEL(pad2d_grad, - GPU, - ALL_LAYOUT, - ops::Pad2dGradCUDAKernel, - float, - double, - plat::float16) {} diff --git 
a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc deleted file mode 100644 index d00cefab450454..00000000000000 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ /dev/null @@ -1,280 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/pad_constant_like_op.h" - -#include - -namespace paddle { -namespace operators { - -class PadConstantLikeOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "PadConstantLike"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "PadConstantLike"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "PadConstantLike"); - - auto x_dim = ctx->GetInputDim("X"); - auto y_dim = ctx->GetInputDim("Y"); - - PADDLE_ENFORCE_EQ(x_dim.size(), - y_dim.size(), - platform::errors::InvalidArgument( - "The size of Input(X)'s dimension and the size of " - "Input(Y)'s dimension should be the same, but " - "received %d for Input(X) vs %d for Input(Y).", - x_dim.size(), - y_dim.size())); - - for (int i = 0; i < x_dim.size(); ++i) { - if ((!ctx->IsRuntime()) && ((x_dim[i] == -1) || (y_dim[i] == -1))) { - continue; - } else { - PADDLE_ENFORCE_GE( - x_dim[i], - y_dim[i], - platform::errors::InvalidArgument( - "The size of each dimension of Input(X) expected to be greater " - "than or equal to size of corresponding dimension of Input(Y) " - "(X_dim[i] >= Y_dim[i]), but received %d < %d for dimension %d", - x_dim[i], - y_dim[i], - i)); - } - } - - ctx->SetOutputDim("Out", x_dim); - ctx->ShareLoD("X", /*->*/ "Out"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Y"), - ctx.device_context().GetPlace()); - } -}; - -class PadConstantLikeOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "The input of pad_constant_like op. " - "The input should be a k-D tensor(k > 0 and k < 7)"); - AddInput("Y", - "The input of pad_constant_like op. " - "The input should be a k-D tensor(k > 0 and k < 7)"); - AddOutput("Out", - "The output of pad_constant_like op. " - "A tensor with the same shape as X."); - AddAttr("pad_value", - "(float, default 0.0) " - "The value to fill the padded areas.") - .SetDefault(0.0f); - AddComment(R"DOC( -PadConstantLikeOp Operator. - -Pad input(Y) with a pad_value, the number of values padded to the edges of each -axis is specified by the difference of the shape of X and Y. -((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n)) unique pad widths for -each axis. -The input should be a k-D tensor(k > 0 and k < 7). 
As an example: - -case1: - Given: - X = [[1, 2], - [3, 4], - [1, 2], - [3, 4]]], - X.shape = (4, 2) - - Y = [[5, 6], - [7, 8]], - Y.shape = (2, 2) - - And - pad_value = 0, - - Return: - Out = [[5, 6], - [7, 8], - [0, 0], - [0, 0]] - Out.shape = (4, 2) - -case2: - Given: - X = [[[[ 0, 1, 2], - [ 3, 4, 5]], - [[ 6, 7, 8], - [ 9, 10, 11]], - [[12, 13, 14], - [15, 16, 17]]], - [[[18, 19, 20], - [21, 22, 23]], - [[24, 25, 26], - [27, 28, 29]], - [[30, 31, 32], - [33, 34, 35]]]] - X.shape = (2, 3, 2, 3) - - Y = [[[[35, 36, 37]], - [[38, 39, 40]], - [[41, 42, 43]]]] - Y.shape = (1, 3, 1, 3) - - And - pad_value = -1, - - Return: - - Out = [[[[35, 36, 37], - [-1, -1, -1]], - [[38, 39, 40], - [-1, -1, -1]], - [[41, 42, 43], - [-1, -1, -1]]], - [[[-1, -1, -1], - [-1, -1, -1]], - [[-1, -1, -1], - [-1, -1, -1]], - [[-1, -1, -1], - [-1, -1, -1]]]] - Out.shape = (2, 3, 2, 3) -)DOC"); - } -}; - -class PadConstantLikeOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "PadConstantLike@Grad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "PadConstantLike@Grad"); - - auto y_dim = ctx->GetInputDim("Y"); - auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ( - dout_dim.size(), - y_dim.size(), - platform::errors::InvalidArgument( - "Op(PadConstantLike@Grad) the size of Input(Out@Grad)'s dimension " - "and the size of Input(Y)'s dimension should be the same, but " - "received %d for Input(Out@Grad) vs %d for Input(Y).", - dout_dim.size(), - y_dim.size())); - - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput(y_grad_name)) { - ctx->SetOutputDim(y_grad_name, y_dim); - ctx->ShareLoD("Y", /*->*/ y_grad_name); - - for (int i = 0; i < y_dim.size(); ++i) { - if ((!ctx->IsRuntime()) && ((dout_dim[i] == -1) || (y_dim[i] == -1))) { - continue; - } else { - PADDLE_ENFORCE_GE( - dout_dim[i], - y_dim[i], - platform::errors::InvalidArgument( - "The size of each dimension of Input(Out@Grad) expected to " - "be greater than or equal to size of corresponding dimension " - "of Input(Y) (Out_dim[i] >= Y_dim[i]), but received %d < %d " - "for dimension %d", - dout_dim[i], - y_dim[i], - i)); - } - } - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Y"), - ctx.device_context().GetPlace()); - } -}; - -template -class PadConstantLikeOpGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr bind) const override { - bind->SetType("pad_constant_like_grad"); - bind->SetInput("Y", this->Input("Y")); - bind->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - bind->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - bind->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(pad_constant_like, - ops::PadConstantLikeOp, - ops::PadConstantLikeOpMaker, - ops::PadConstantLikeOpGradMaker, - ops::PadConstantLikeOpGradMaker); -REGISTER_OPERATOR(pad_constant_like_grad, ops::PadConstantLikeOpGrad); - -PD_REGISTER_STRUCT_KERNEL(pad_constant_like, - CPU, - ALL_LAYOUT, - ops::PadConstantLikeKernel, 
- float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad, - CPU, - ALL_LAYOUT, - ops::PadConstantLikeGradKernel, - float, - double, - int, - int64_t) {} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_STRUCT_KERNEL(pad_constant_like, - GPU, - ALL_LAYOUT, - ops::PadConstantLikeKernel, - float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad, - GPU, - ALL_LAYOUT, - ops::PadConstantLikeGradKernel, - float, - double, - int, - int64_t) {} -#endif diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h deleted file mode 100644 index f6162037fbd56f..00000000000000 --- a/paddle/fluid/operators/pad_constant_like_op.h +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/phi/kernels/funcs/padding.h" - -namespace paddle { -namespace operators { - -template -class PadConstantLikeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto in_x = context.Input("X"); - auto in_y = context.Input("Y"); - auto* out = context.Output("Out"); - - if (in_x->dims() == in_y->dims()) { - framework::TensorCopy(*in_y, context.GetPlace(), out); - return; - } - - T pad_value = static_cast(context.Attr("pad_value")); - out->mutable_data(context.GetPlace()); - - int rank = context.Input("X")->dims().size(); - - std::vector pads(rank * 2, 0); - - for (int j = 0; j < rank; ++j) { - pads[j * 2] = 0; - pads[j * 2 + 1] = static_cast(in_x->dims()[j] - in_y->dims()[j]); - } - - phi::funcs::PaddingFunctor( - rank, - context.template device_context(), - pads, - pad_value, - *in_y, - out); - } -}; - -template -class PadConstantLikeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto in_y = context.Input("Y"); - auto in_dout = - context.Input(framework::GradVarName("Out")); - auto* d_y = context.Output(framework::GradVarName("Y")); - - if (d_y == nullptr) { - return; - } - - if (in_dout->dims() == in_y->dims()) { - framework::TensorCopy(*in_dout, context.GetPlace(), d_y); - return; - } - - d_y->mutable_data(context.GetPlace()); - int rank = in_dout->dims().size(); - - std::vector pads(static_cast(rank) * 2, 0); - for (int j = 0; j < rank; ++j) { - pads[j * 2] = 0; - pads[j * 2 + 1] = static_cast(in_dout->dims()[j] - in_y->dims()[j]); - } - - phi::funcs::PaddingGradFunctor( - rank, - context.template device_context(), - pads, - *in_dout, - d_y); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/prim_ops/split_p_op.cc b/paddle/fluid/operators/prim_ops/split_p_op.cc index 
0584de504e7706..dea336bcd263fe 100644 --- a/paddle/fluid/operators/prim_ops/split_p_op.cc +++ b/paddle/fluid/operators/prim_ops/split_p_op.cc @@ -110,7 +110,7 @@ class SplitPrimOpVarTypeInference void operator()(framework::InferVarTypeContext *ctx) const override { auto x_name = Input(ctx, "X")[0]; auto y_names = Output(ctx, "YS"); - for (auto y_name : y_names) { + for (auto const &y_name : y_names) { SetType(ctx, y_name, GetType(ctx, x_name)); SetDataType(ctx, y_name, GetDataType(ctx, x_name)); } diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc deleted file mode 100644 index 0f0dbf3c6888a8..00000000000000 --- a/paddle/fluid/operators/prroi_pool_op.cc +++ /dev/null @@ -1,214 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/prroi_pool_op.h" - -#include - -namespace paddle { -namespace operators { - -class PRROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor), " - "the input of PRROIPoolOp. " - "The format of input tensor is NCHW. Where N is the batch size, " - "C is the number of input channels, " - "H is the height of the input feature map, and " - "W is the width."); - AddInput("ROIs", - "(phi::DenseTensor), " - "ROIs (Regions of Interest) to pool over. " - "should be a 2-D phi::DenseTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]. " - "where (x1, y1) is the top left coordinates, and " - "(x2, y2) is the bottom right coordinates. " - "The roi batch index can be calculated from LoD."); - AddInput("BatchRoINums", - "(Tensor), " - "1-D tensor with shape [N], the number of" - " rois for each image in batch, where N is the batch size") - .AsDispensable(); - AddOutput("Out", - "(Tensor), " - "the output of PRROIPoolOp is a 4-D Tensor with shape " - "(num_rois, output_channels, pooled_h, pooled_w)."); - AddAttr("spatial_scale", - "(float, default 1.0), " - "Multiplicative spatial scale factor " - "to translate ROI coords from their input scale " - "to the scale used when pooling.") - .SetDefault(1.0); - AddAttr("pooled_height", - "(int, default 1), " - "the pooled output height.") - .SetDefault(1); - AddAttr("pooled_width", - "(int, default 1), " - "the pooled output width.") - .SetDefault(1); - AddComment(R"Doc( -**PRROIPool Operator** - -Precise region of interest pooling (also known as PRROIPooling) is to perform - bilinear interpolation average pooling method for RoI Pooling. - -Please refer to https://arxiv.org/abs/1807.11590 for more details. 
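For reference, the pooled value computed by the PRROIPool kernels is the
continuous integral of the bilinearly interpolated feature map f over each
pooling bin, divided by the bin area (the quantity sum_out / win_size in the
kernel code), following the referenced paper:

    PrPool(bin, f) = ( integral over [x1, x2] x [y1, y2] of f(x, y) dx dy )
                     / ( (x2 - x1) * (y2 - y1) )

Because the bin is integrated rather than point-sampled, the result is also
differentiable with respect to the bin coordinates (x1, y1, x2, y2), which is
what the gradient path for ROIs computes.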
- - )Doc"); - } -}; - -class PRROIPoolOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "prroi_pool"); - OP_INOUT_CHECK(ctx->HasInput("ROIs"), "Input", "ROIs", "prroi_pool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Input", "Out", "prroi_pool"); - - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - PADDLE_ENFORCE_EQ(input_dims.size(), - 4, - platform::errors::InvalidArgument( - "The format of input tensor is NCHW")); - PADDLE_ENFORCE_EQ( - rois_dims.size(), - 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D phi::DenseTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - PADDLE_ENFORCE_EQ( - rois_dims[1], - 4, - platform::errors::InvalidArgument( - "ROIs should be a 2-D phi::DenseTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT(pooled_height, - 0, - platform::errors::InvalidArgument( - "The pooled output height must be greater than 0")); - PADDLE_ENFORCE_GT(pooled_width, - 0, - platform::errors::InvalidArgument( - "The pooled output width must be greater than 0")); - PADDLE_ENFORCE_GT(spatial_scale, - 0.0f, - platform::errors::InvalidArgument( - "The spatial scale must greater than 0.")); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = input_dims[1]; - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - - if (ctx->HasInput("BatchRoINums")) { - auto rois_batch_index = ctx->GetInputDim("BatchRoINums"); - PADDLE_ENFORCE_EQ(rois_batch_index[0], - input_dims[0], - platform::errors::InvalidArgument( - "The length of BatchRoINums should equal to " - "first dim of inputs(X)")); - } - ctx->SetOutputDim("Out", out_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class PRROIPoolGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "prroi_pool"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - framework::GradVarName("X"), - "prroi_pool"); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->SetOutputDim(framework::GradVarName("ROIs"), ctx->GetInputDim("ROIs")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -template -class PRROIPoolGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("prroi_pool_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Out", this->Output("Out")); - op->SetInput("ROIs", this->Input("ROIs")); - op->SetInput("BatchRoINums", this->Input("BatchRoINums")); - 
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("ROIs"), this->InputGrad("ROIs")); - op->SetAttrMap(this->Attrs()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(prroi_pool, - ops::PRROIPoolOp, - ops::PRROIPoolOpMaker, - ops::PRROIPoolGradMaker, - ops::PRROIPoolGradMaker); -REGISTER_OPERATOR(prroi_pool_grad, ops::PRROIPoolGradOp); - -PD_REGISTER_STRUCT_KERNEL(prroi_pool, - CPU, - ALL_LAYOUT, - ops::CPUPRROIPoolOpKernel, - float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(prroi_pool_grad, - CPU, - ALL_LAYOUT, - ops::CPUPRROIPoolGradOpKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu deleted file mode 100644 index 5d1243964279b0..00000000000000 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ /dev/null @@ -1,439 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/prroi_pool_op.h" - -namespace paddle { -namespace operators { - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaximumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaximumNumBlocks); -} - -template -__global__ void GPUPRROIPoolForward(const int nthreads, - const T* input_data, - const T* input_rois, - const float spatial_scale, - const int input_channels, - const int height, - const int width, - const int output_channels, - const int pooled_height, - const int pooled_width, - const int* rois_batch_id_data, - T* output_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(offset_input_rois[0]) * spatial_scale; - T roi_start_h = static_cast(offset_input_rois[1]) * spatial_scale; - T roi_end_w = static_cast(offset_input_rois[2]) * spatial_scale; - T roi_end_h = static_cast(offset_input_rois[3]) * spatial_scale; - - T roi_width = max(roi_end_w - roi_start_w, static_cast(0.0)); - T roi_height = max(roi_end_h - roi_start_h, static_cast(0.0)); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - T win_start_w = roi_start_w + bin_size_w * pw; - T win_start_h = roi_start_h + bin_size_h * ph; - T win_end_w = 
win_start_w + bin_size_w; - T win_end_h = win_start_h + bin_size_h; - - T win_size = max(static_cast(0.0), bin_size_w * bin_size_h); - int input_channel = c; - const T* offset_input_data = - input_data + - (roi_batch_id * input_channels + input_channel) * height * width; - - if (win_size > static_cast(0.0)) { - int s_w = floor(win_start_w); - int e_w = ceil(win_end_w); - int s_h = floor(win_start_h); - int e_h = ceil(win_end_h); - T sum_out = 0; - - for (int w_iter = s_w; w_iter < e_w; ++w_iter) { - for (int h_iter = s_h; h_iter < e_h; ++h_iter) { - sum_out += PrRoIPoolingMatCalculation( - offset_input_data, - h_iter, - w_iter, - h_iter + 1, - w_iter + 1, - max(win_start_h, static_cast(h_iter)), - max(win_start_w, static_cast(w_iter)), - min(win_end_h, static_cast(h_iter) + static_cast(1.0)), - min(win_end_w, static_cast(w_iter) + static_cast(1.0)), - height, - width); - } - } - output_data[i] = sum_out / win_size; - } else { - output_data[i] = 0.; - } - } -} - -template -__global__ void GPUPRROIPoolBackward(const int nthreads, - const T* in_data, - const T* input_rois, - const T* output_grad_data, - const float spatial_scale, - const int input_channels, - const int height, - const int width, - const int output_channels, - const int pooled_height, - const int pooled_width, - const int* rois_batch_id_data, - T* input_grad_data, - const T* out_data, - T* input_roi_grad_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = c; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - const T* offset_output_grad_data = output_grad_data + i; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(offset_input_rois[0]) * spatial_scale; - T roi_start_h = static_cast(offset_input_rois[1]) * spatial_scale; - T roi_end_w = static_cast(offset_input_rois[2]) * spatial_scale; - T roi_end_h = static_cast(offset_input_rois[3]) * spatial_scale; - T* offset_input_roi_grad_data = input_roi_grad_data + n * 4; - - T roi_width = max(roi_end_w - roi_start_w, static_cast(0.0)); - T roi_height = max(roi_end_h - roi_start_h, static_cast(0.0)); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - T win_start_w = roi_start_w + bin_size_w * pw; - T win_start_h = roi_start_h + bin_size_h * ph; - T win_end_w = win_start_w + bin_size_w; - T win_end_h = win_start_h + bin_size_h; - - T win_size = max(static_cast(0.0), bin_size_w * bin_size_h); - int s_w = floor(win_start_w); - int e_w = ceil(win_end_w); - int s_h = floor(win_start_h); - int e_h = ceil(win_end_h); - - T sum_out = win_size == static_cast(0.) - ? static_cast(0.) 
- : *offset_output_grad_data / win_size; - - for (int w_iter = s_w; w_iter < e_w; ++w_iter) { - for (int h_iter = s_h; h_iter < e_h; ++h_iter) { - PrRoIPoolingMatDistributeDiff( - offset_input_grad_data, - sum_out, - h_iter, - w_iter, - h_iter + 1, - w_iter + 1, - max(win_start_h, static_cast(h_iter)), - max(win_start_w, static_cast(w_iter)), - min(win_end_h, static_cast(h_iter) + static_cast(1.0)), - min(win_end_w, static_cast(w_iter) + static_cast(1.0)), - height, - width); - } - } - - const T* offset_out_data = out_data + i; - const T* offset_in_data = in_data + input_offset; - PrRoIPoolingCoorBackward(s_w, - e_w, - s_h, - e_h, - width, - height, - win_start_w, - win_start_h, - win_end_w, - win_end_h, - pw, - ph, - pooled_width, - pooled_height, - win_size, - spatial_scale, - offset_in_data, - offset_out_data, - offset_input_roi_grad_data, - offset_output_grad_data); - } -} - -template -class GPUPRROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - auto output_channels = input_channels; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - if (rois_num == 0) return; - - // set rois batch id - phi::DenseTensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - - if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) { - auto* batchroinum = ctx.Input("BatchRoINums"); - phi::DenseTensor batch_index_cpu; - framework::TensorCopySync( - *batchroinum, platform::CPUPlace(), &batch_index_cpu); - - int rois_batch_size = batchroinum->dims()[0]; - auto* batch_index = batch_index_cpu.data(); - size_t c = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int64_t k = 0; k < batch_index[n]; ++k) { - rois_batch_id_data[c] = n; - c = c + 1; - } - } - - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, - batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and input(X) batch_size must be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, - rois_num_with_lod, - platform::errors::InvalidArgument( - "The rois_num from input and lod must be the same.")); - - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - auto cplace = platform::CPUPlace(); - auto& dev_ctx = ctx.cuda_device_context(); - int bytes = rois_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc( - dev_ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - const auto gplace = ctx.GetPlace(); - memory::Copy(gplace, - roi_id_data, - cplace, - rois_batch_id_data, - bytes, - dev_ctx.stream()); - - // call cuda kernel function - GPUPRROIPoolForward<<>>( - output_size, - in->data(), - rois->data(), - spatial_scale, - input_channels, - height, - width, - 
output_channels, - pooled_height, - pooled_width, - roi_id_data, - out->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUPRROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Input("Out"); - - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - auto* input_roi_grad = - ctx.Output(framework::GradVarName("ROIs")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int input_channels = in->dims()[1]; - auto output_channels = input_channels; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (input_grad || input_roi_grad) { - // set roi batch id - phi::DenseTensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - - if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) { - auto* batchroinum = ctx.Input("BatchRoINums"); - phi::DenseTensor batch_index_cpu; - framework::TensorCopySync( - *batchroinum, platform::CPUPlace(), &batch_index_cpu); - - int rois_batch_size = batchroinum->dims()[0]; - auto* batch_index = batch_index_cpu.data(); - size_t c = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int64_t k = 0; k < batch_index[n]; ++k) { - rois_batch_id_data[c] = n; - c = c + 1; - } - } - } else { - PADDLE_ENFORCE_EQ(rois->lod().empty(), - false, - platform::errors::InvalidArgument( - "the lod of Input ROIs should not be empty when " - "BatchRoINums is None!")); - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - - auto cplace = platform::CPUPlace(); - auto& dev_ctx = ctx.cuda_device_context(); - int bytes = rois_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc( - dev_ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - const auto gplace = ctx.GetPlace(); - memory::Copy(gplace, - roi_id_data, - cplace, - rois_batch_id_data, - bytes, - dev_ctx.stream()); - - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); - input_roi_grad->mutable_data(ctx.GetPlace()); - set_zero(ctx.cuda_device_context(), input_roi_grad, static_cast(0)); - - int output_grad_size = output_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUPRROIPoolBackward<<>>( - output_grad_size, - in->data(), - rois->data(), - output_grad->data(), - spatial_scale, - input_channels, - height, - width, - output_channels, - pooled_height, - pooled_width, - roi_id_data, - input_grad->mutable_data(ctx.GetPlace()), - out->data(), - input_roi_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL( - prroi_pool, GPU, ALL_LAYOUT, ops::GPUPRROIPoolOpKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(prroi_pool_grad, - GPU, - ALL_LAYOUT, - ops::GPUPRROIPoolGradOpKernel, - float, - double) {} 
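Both deleted GPU kernels above use the same launch idiom: a block count capped
at kNumMaximumNumBlocks combined with a grid-stride loop, so the kernel stays
correct for any nthreads even when the cap truncates the grid. Below is a
minimal self-contained sketch of that idiom; ScaleKernel and the buffer names
in the launch comment are illustrative, not Paddle APIs.

#include <algorithm>

static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;

static inline int NumBlocks(const int N) {
  // Cap the grid so a very large N cannot exceed launch limits; the
  // grid-stride loop inside the kernel picks up the remaining elements.
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaximumNumBlocks);
}

template <typename T>
__global__ void ScaleKernel(const int nthreads, const T* in, T alpha, T* out) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  // Grid-stride loop: each thread handles index, index + offset, ...
  for (int i = index; i < nthreads; i += offset) {
    out[i] = alpha * in[i];
  }
}

// Usage (hypothetical device buffers d_in, d_out of length n):
//   ScaleKernel<float><<<NumBlocks(n), kNumCUDAThreads, 0, stream>>>(
//       n, d_in, 2.f, d_out);

Capping the grid keeps huge launches within hardware limits while the stride
loop preserves full element coverage, which is why both the forward and
backward PRROIPool kernels are written this way.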
diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h deleted file mode 100644 index e2417a071ce886..00000000000000 --- a/paddle/fluid/operators/prroi_pool_op.h +++ /dev/null @@ -1,653 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#endif - -namespace paddle { -namespace operators { - -template -inline HOSTDEVICE T PrRoIPoolingGetData(const T* data, - const int h, - const int w, - const int height, - const int width) { - bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); - T retVal = overflow ? 0.0f : data[h * width + w]; - return retVal; -} - -template -inline HOSTDEVICE T PrRoIPoolingMatCalculation(const T* this_data, - const int s_h, - const int s_w, - const int e_h, - const int e_w, - const T y0, - const T x0, - const T y1, - const T x1, - const int h0, - const int w0) { - T alpha, beta, lim_alpha, lim_beta, tmp; - T sum_out = 0; - - alpha = x0 - static_cast(s_w); - beta = y0 - static_cast(s_h); - lim_alpha = x1 - static_cast(s_w); - lim_beta = y1 - static_cast(s_h); - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp; - - alpha = static_cast(e_w) - x1; - lim_alpha = static_cast(e_w) - x0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp; - - alpha = x0 - static_cast(s_w); - beta = static_cast(e_h) - y1; - lim_alpha = x1 - static_cast(s_w); - lim_beta = static_cast(e_h) - y0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp; - - alpha = static_cast(e_w) - x1; - lim_alpha = static_cast(e_w) - x0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp; - - return sum_out; -} - -#if defined(__NVCC__) || defined(__HIPCC__) -template -DEVICE void PrRoIPoolingDistributeDiff(T* diff, - const T top_diff, - const int h, - const int w, - const int height, - const int width, - const T coeff) { - bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); - if (!overflow) { - phi::CudaAtomicAdd(diff + h * width + w, top_diff * coeff); - } -} -#else -template -inline HOSTDEVICE void PrRoIPoolingDistributeDiff(T* diff, - const T top_diff, - const int h, - const int w, - const 
int height,
-                                                  const int width,
-                                                  const T coeff) {
-  bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
-  if (!overflow) {
-    *(diff + h * width + w) += top_diff * coeff;
-  }
-}
-#endif
-
-template <typename T>
-HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff,
-                                              const T top_diff,
-                                              const int s_h,
-                                              const int s_w,
-                                              const int e_h,
-                                              const int e_w,
-                                              const T y0,
-                                              const T x0,
-                                              const T y1,
-                                              const T x1,
-                                              const int h0,
-                                              const int w0) {
-  T alpha, beta, lim_alpha, lim_beta, tmp;
-
-  alpha = x0 - static_cast<T>(s_w);
-  beta = y0 - static_cast<T>(s_h);
-  lim_alpha = x1 - static_cast<T>(s_w);
-  lim_beta = y1 - static_cast<T>(s_h);
-  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
-         0.5f * alpha * alpha) *
-        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp);
-
-  alpha = static_cast<T>(e_w) - x1;
-  lim_alpha = static_cast<T>(e_w) - x0;
-  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
-         0.5f * alpha * alpha) *
-        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp);
-
-  alpha = x0 - static_cast<T>(s_w);
-  beta = static_cast<T>(e_h) - y1;
-  lim_alpha = x1 - static_cast<T>(s_w);
-  lim_beta = static_cast<T>(e_h) - y0;
-  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
-         0.5f * alpha * alpha) *
-        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp);
-
-  alpha = static_cast<T>(e_w) - x1;
-  lim_alpha = static_cast<T>(e_w) - x0;
-  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
-         0.5f * alpha * alpha) *
-        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp);
-}
-
-#if defined(__NVCC__) || defined(__HIPCC__)
-template <typename T>
-DEVICE void AccumulateRois(T* offset, T data) {
-  phi::CudaAtomicAdd(offset, data);
-}
-#else
-template <typename T>
-inline HOSTDEVICE void AccumulateRois(T* offset, T data) {
-  *offset += data;
-}
-#endif
-
-#if defined(__NVCC__) || defined(__HIPCC__)
-template <typename T>
-DEVICE T MaxFunctor(const T x, const T y) {
-  return max(x, y);
-}
-template <typename T>
-DEVICE T MinFunctor(const T x, const T y) {
-  return min(x, y);
-}
-#else
-template <typename T>
-inline HOSTDEVICE T MaxFunctor(const T x, const T y) {
-  return std::max(x, y);
-}
-template <typename T>
-inline HOSTDEVICE T MinFunctor(const T x, const T y) {
-  return std::min(x, y);
-}
-#endif
-
-template <typename T>
-inline HOSTDEVICE static T PrRoIPoolingGetCoeff(T dh, T dw) {
-  dw = dw > 0 ? dw : -dw;
-  dh = dh > 0 ?
dh : -dh; - return (1.0f - dh) * (1.0f - dw); -} - -template -inline HOSTDEVICE static T PrRoIPoolingInterpolation( - const T* data, const H h, const W w, const int height, const int width) { - T retVal = 0.0f; - int h1 = floorf(h); - int w1 = floorf(w); - retVal += - PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - static_cast(h1), w - static_cast(w1)); - h1 = floorf(h) + 1; - w1 = floorf(w); - retVal += - PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - static_cast(h1), w - static_cast(w1)); - h1 = floorf(h); - w1 = floorf(w) + 1; - retVal += - PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - static_cast(h1), w - static_cast(w1)); - h1 = floorf(h) + 1; - w1 = floorf(w) + 1; - retVal += - PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - static_cast(h1), w - static_cast(w1)); - return retVal; -} - -template -inline HOSTDEVICE T PrRoIPoolingSingleCoorIntegral(T s, T t, T c1, T c2) { - return 0.5f * (t * t - s * s) * c2 + - (t - 0.5f * t * t - s + 0.5f * s * s) * c1; -} - -template -inline HOSTDEVICE void PrRoIPoolingCoorBackward(int s_w, - int e_w, - int s_h, - int e_h, - int width, - int height, - T win_start_w, - T win_start_h, - T win_end_w, - T win_end_h, - int pw, - int ph, - const int pooled_width, - const int pooled_height, - T win_size, - const float spatial_scale, - const T* this_bottom_data, - const T* this_top_data, - T* this_data_grad, - const T* this_out_grad) { - T g_x1_y = 0.f; - T g_x2_y = 0.f; - T g_x_y1 = 0.f; - T g_x_y2 = 0.f; - - for (int h_iter = s_h; h_iter < e_h; ++h_iter) { - g_x1_y += PrRoIPoolingSingleCoorIntegral( - MaxFunctor(win_start_h, static_cast(h_iter)) - h_iter, - MinFunctor(win_end_h, static_cast(h_iter + 1)) - h_iter, - PrRoIPoolingInterpolation( - this_bottom_data, h_iter, win_start_w, height, width), - PrRoIPoolingInterpolation( - this_bottom_data, h_iter + 1, win_start_w, height, width)); - - g_x2_y += PrRoIPoolingSingleCoorIntegral( - MaxFunctor(win_start_h, static_cast(h_iter)) - h_iter, - MinFunctor(win_end_h, static_cast(h_iter + 1)) - h_iter, - PrRoIPoolingInterpolation( - this_bottom_data, h_iter, win_end_w, height, width), - PrRoIPoolingInterpolation( - this_bottom_data, h_iter + 1, win_end_w, height, width)); - } - - for (int w_iter = s_w; w_iter < e_w; ++w_iter) { - g_x_y1 += PrRoIPoolingSingleCoorIntegral( - MaxFunctor(win_start_w, static_cast(w_iter)) - w_iter, - MinFunctor(win_end_w, static_cast(w_iter + 1)) - w_iter, - PrRoIPoolingInterpolation( - this_bottom_data, win_start_h, w_iter, height, width), - PrRoIPoolingInterpolation( - this_bottom_data, win_start_h, w_iter + 1, height, width)); - - g_x_y2 += PrRoIPoolingSingleCoorIntegral( - MaxFunctor(win_start_w, static_cast(w_iter)) - w_iter, - MinFunctor(win_end_w, static_cast(w_iter + 1)) - w_iter, - PrRoIPoolingInterpolation( - this_bottom_data, win_end_h, w_iter, height, width), - PrRoIPoolingInterpolation( - this_bottom_data, win_end_h, w_iter + 1, height, width)); - } - - float partial_x1 = -g_x1_y + (win_end_h - win_start_h) * (*this_top_data); - float partial_y1 = -g_x_y1 + (win_end_w - win_start_w) * (*this_top_data); - float partial_x2 = g_x2_y - (win_end_h - win_start_h) * (*this_top_data); - float partial_y2 = g_x_y2 - (win_end_w - win_start_w) * (*this_top_data); - - partial_x1 = partial_x1 / win_size * spatial_scale; - partial_x2 = partial_x2 / win_size * spatial_scale; - partial_y1 = partial_y1 / win_size * spatial_scale; - partial_y2 = partial_y2 / win_size * 
spatial_scale; - - AccumulateRois( - this_data_grad + 0, - (partial_x1 * (1.0 - static_cast(pw) / pooled_width) + - partial_x2 * (1.0 - static_cast(pw + 1) / pooled_width)) * - (*this_out_grad)); - AccumulateRois( - this_data_grad + 1, - (partial_y1 * (1.0 - static_cast(ph) / pooled_height) + - partial_y2 * (1.0 - static_cast(ph + 1) / pooled_height)) * - (*this_out_grad)); - AccumulateRois(this_data_grad + 2, - (partial_x2 * static_cast(pw + 1) / pooled_width + - partial_x1 * static_cast(pw) / pooled_width) * - (*this_out_grad)); - AccumulateRois(this_data_grad + 3, - (partial_y2 * static_cast(ph + 1) / pooled_height + - partial_y1 * static_cast(ph) / pooled_height) * - (*this_out_grad)); -} - -template -class CPUPRROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - auto output_channels = input_channels; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - if (rois_num == 0) return; - - auto in_stride = phi::stride(in_dims); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - phi::DenseTensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) { - auto* batchroinum = ctx.Input("BatchRoINums"); - auto* batch_index = batchroinum->data(); - int rois_batch_size = batchroinum->dims()[0]; - size_t c = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int64_t k = 0; k < batch_index[n]; ++k) { - rois_batch_id_data[c] = n; - c = c + 1; - } - } - } else { - PADDLE_ENFORCE_EQ(rois->lod().empty(), - false, - platform::errors::InvalidArgument( - "The lod of Input ROIs should not be empty when " - "BatchRoINums is None!")); - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ(rois_batch_size, - batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and input(X)'s " - "batch_size should be the same but received" - "rois_batch_size: %d and batch_size: %d", - rois_batch_size, - batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num_with_lod, - rois_num, - platform::errors::InvalidArgument("The rois_num from input should be " - "equal to the rois_num from lod, " - "but received rois_num from input: " - "%d and the rois_num from lod: %d.", - rois_num_with_lod, - rois_num)); - - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* input_rois = rois->data(); - // calculate prroipooling, parallel processing can be implemented per ROI - for (int n = 0; n < rois_num; ++n) { - // set roi batch id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(offset_input_rois[0]) * spatial_scale; - T roi_start_h = static_cast(offset_input_rois[1]) 
* spatial_scale; - T roi_end_w = static_cast(offset_input_rois[2]) * spatial_scale; - T roi_end_h = static_cast(offset_input_rois[3]) * spatial_scale; - - T roi_width = std::max(roi_end_w - roi_start_w, static_cast(0.0)); - T roi_height = std::max(roi_end_h - roi_start_h, static_cast(0.0)); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - T win_size = std::max(static_cast(0.0), bin_size_w * bin_size_h); - - // calculate each pixel of the output feature map. - int out_roi_offset = n * out_stride[0]; - for (int c = 0; c < output_channels; ++c) { - // per category - int out_plane_offset = out_roi_offset + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - int out_row_offset = out_plane_offset + ph * out_stride[2]; - for (int pw = 0; pw < pooled_width; ++pw) { - // calculate w and h at input feature map - T win_start_h = static_cast(ph) * bin_size_h + roi_start_h; - T win_start_w = static_cast(pw) * bin_size_w + roi_start_w; - T win_end_h = win_start_h + bin_size_h; - T win_end_w = win_start_w + bin_size_w; - // Add roi offsets and clip to input boundaries - int s_w = std::floor(win_start_w); - int e_w = std::ceil(win_end_w); - int s_h = std::floor(win_start_h); - int e_h = std::ceil(win_end_h); - - int output_index = out_row_offset + pw; - int input_channel = c; - int input_plane_offset = - roi_batch_id * in_stride[0] + input_channel * in_stride[1]; - const T* offset_input_data = input_data + input_plane_offset; - T sum_out = 0.; - - if (win_size > static_cast(0.0)) { - for (int w_iter = s_w; w_iter < e_w; ++w_iter) { - for (int h_iter = s_h; h_iter < e_h; ++h_iter) { - sum_out += PrRoIPoolingMatCalculation( - offset_input_data, - h_iter, - w_iter, - h_iter + 1, - w_iter + 1, - std::max(win_start_h, static_cast(h_iter)), - std::max(win_start_w, static_cast(w_iter)), - std::min(win_end_h, - static_cast(h_iter) + static_cast(1.0)), - std::min(win_end_w, - static_cast(w_iter) + static_cast(1.0)), - height, - width); - } - } - - output_data[output_index] = sum_out / win_size; - } else { - output_data[output_index] = 0.; - } - } - } - } - } - } -}; - -template -class CPUPRROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* out = ctx.Input("Out"); - auto* rois = ctx.Input("ROIs"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - auto* input_roi_grad = - ctx.Output(framework::GradVarName("ROIs")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - if (input_grad || input_roi_grad) { - auto in_dims = in->dims(); - auto* in_data = in->data(); - auto* out_data = out->data(); - - int input_channels = in_dims[1]; - auto output_channels = input_channels; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - // set roi batch id - phi::DenseTensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) { - auto* batchroinum = ctx.Input("BatchRoINums"); - auto* batch_index = batchroinum->data(); - int rois_batch_size = batchroinum->dims()[0]; - size_t c = 0; - for (int n = 0; n < rois_batch_size; ++n) { - 
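// Expand BatchRoINums into a flat per-ROI batch index: image n contributes
// batch_index[n] consecutive ROIs, so each of those ROIs maps back to n.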
for (int64_t k = 0; k < batch_index[n]; ++k) { - rois_batch_id_data[c] = n; - c = c + 1; - } - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - - const T* input_rois = rois->data(); - const T* output_grad_data = output_grad->data(); - - input_grad->mutable_data(ctx.GetPlace()); - input_roi_grad->mutable_data(ctx.GetPlace()); - // set gradient of X to be 0. before backpropagate. - phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), - input_grad, - static_cast(0)); - set_zero(ctx.template device_context(), - input_roi_grad, - static_cast(0)); - - T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - T* input_roi_grad_data = input_roi_grad->mutable_data(ctx.GetPlace()); - - // backpropagate gradient per output pixel - int output_grad_size = output_grad->numel(); - for (int i = 0; i < output_grad_size; ++i) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = c; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - const T* offset_output_grad_data = output_grad_data + i; - const T* offset_out_data = out_data + i; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(offset_input_rois[0]) * spatial_scale; - T roi_start_h = static_cast(offset_input_rois[1]) * spatial_scale; - T roi_end_w = static_cast(offset_input_rois[2]) * spatial_scale; - T roi_end_h = static_cast(offset_input_rois[3]) * spatial_scale; - T* offset_input_roi_grad_data = input_roi_grad_data + n * 4; - - T roi_width = std::max(roi_end_w - roi_start_w, static_cast(0.0)); - T roi_height = std::max(roi_end_h - roi_start_h, static_cast(0.0)); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - T win_start_w = roi_start_w + bin_size_w * pw; - T win_start_h = roi_start_h + bin_size_h * ph; - T win_end_w = win_start_w + bin_size_w; - T win_end_h = win_start_h + bin_size_h; - - T win_size = std::max(static_cast(0.0), bin_size_w * bin_size_h); - - T sum_out = win_size == static_cast(0.) - ? static_cast(0.) 
- : *offset_output_grad_data / win_size; - - int s_w = std::floor(win_start_w); - int e_w = std::ceil(win_end_w); - int s_h = std::floor(win_start_h); - int e_h = std::ceil(win_end_h); - - for (int w_iter = s_w; w_iter < e_w; ++w_iter) { - for (int h_iter = s_h; h_iter < e_h; ++h_iter) { - PrRoIPoolingMatDistributeDiff( - offset_input_grad_data, - sum_out, - h_iter, - w_iter, - h_iter + 1, - w_iter + 1, - std::max(win_start_h, static_cast(h_iter)), - std::max(win_start_w, static_cast(w_iter)), - std::min(win_end_h, - static_cast(h_iter) + static_cast(1.0)), - std::min(win_end_w, - static_cast(w_iter) + static_cast(1.0)), - height, - width); - } - } - - const T* offset_in_data = in_data + input_offset; - PrRoIPoolingCoorBackward(s_w, - e_w, - s_h, - e_h, - width, - height, - win_start_w, - win_start_h, - win_end_w, - win_end_h, - pw, - ph, - pooled_width, - pooled_height, - win_size, - spatial_scale, - offset_in_data, - offset_out_data, - offset_input_roi_grad_data, - offset_output_grad_data); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/quantize_linear_op.cc b/paddle/fluid/operators/quantize_linear_op.cc index e2c2eb7768e1bd..c0ef288b5134bf 100644 --- a/paddle/fluid/operators/quantize_linear_op.cc +++ b/paddle/fluid/operators/quantize_linear_op.cc @@ -239,11 +239,3 @@ REGISTER_OPERATOR( ops::QuantizeLinearOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL(dequantize_linear, - CPU, - ALL_LAYOUT, - ops::DeQuantizeLinearKernel, - float, - int8_t, - double) {} diff --git a/paddle/fluid/operators/quantize_linear_op.cu b/paddle/fluid/operators/quantize_linear_op.cu index f0d6523d054c29..8bcbc1107e9d13 100644 --- a/paddle/fluid/operators/quantize_linear_op.cu +++ b/paddle/fluid/operators/quantize_linear_op.cu @@ -123,15 +123,6 @@ template struct ChannelDequantizeFunctorV2; namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL(dequantize_linear, - GPU, - ALL_LAYOUT, - ops::DeQuantizeLinearKernel, - float, - float16, - int8_t, - double) {} - PD_REGISTER_STRUCT_KERNEL(quantize_linear, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/quantize_linear_op.h b/paddle/fluid/operators/quantize_linear_op.h index 276d1507a4aef8..d6c3b3d2e50ae8 100644 --- a/paddle/fluid/operators/quantize_linear_op.h +++ b/paddle/fluid/operators/quantize_linear_op.h @@ -130,74 +130,5 @@ class QuantizeLinearKernel : public framework::OpKernel { } }; -template -class DeQuantizeLinearKernel : public framework::OpKernel { - public: - template - void ComputeImpl(const framework::ExecutionContext& context) const { - auto& dev_ctx = context.template device_context(); - auto* in = context.Input("X"); - - auto in_tmp = phi::Cast( - static_cast::TYPE&>(dev_ctx), - *in, - phi::CppTypeToDataType::Type()); - - auto* scale = context.Input("Scale"); - auto* out = context.Output("Y"); - int bit_length = context.Attr("bit_length"); - auto quant_axis = context.Attr("quant_axis"); - dev_ctx.template Alloc(out, out->numel() * sizeof(D)); - bool only_observer = context.Attr("only_observer"); - - if (only_observer) { - framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out); - return; - } - - if (quant_axis < 0) { - float max_range = (std::pow(2, bit_length - 1) - 1); - DequantizeFunctor()( - dev_ctx, &in_tmp, scale, static_cast(max_range), out); - } else { - PADDLE_ENFORCE_EQ( - scale->numel(), - in_tmp.dims()[quant_axis], - platform::errors::PreconditionNotMet( - "The number of first scale values must be the 
same with " - "quant_axis dimension value of Input(X) when the `scale` has " - "only one element, but %ld != %ld here.", - scale->numel(), - in_tmp.dims()[quant_axis])); - int max_range = (std::pow(2, bit_length - 1) - 1); - - ChannelDequantizeFunctorV2()( - dev_ctx, &in_tmp, scale, static_cast(max_range), quant_axis, out); - } - } - - void Compute(const framework::ExecutionContext& context) const override { - auto* scale = context.Input("Scale"); - switch (scale->dtype()) { - case phi::DataType::FLOAT64: - ComputeImpl(context); - break; - case phi::DataType::FLOAT32: - ComputeImpl(context); - break; - case phi::DataType::FLOAT16: - ComputeImpl(context); - break; - default: - PADDLE_THROW(platform::errors::Unimplemented( - "In DeQuantizeLinearKernel, " - "data type %d for scale/output is not supported ", - scale->dtype())); - break; - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc deleted file mode 100644 index 62e805e323f845..00000000000000 --- a/paddle/fluid/operators/random_crop_op.cc +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "paddle/fluid/operators/random_crop_op.h" - -namespace paddle { -namespace operators { - -class RandomCropOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - auto shape = ctx->Attrs().Get>("shape"); - auto x_dim = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GT( - x_dim.size(), - static_cast(shape.size()), - platform::errors::InvalidArgument( - "The dimensions of Input(X) must be greater than the length of " - "Attr(shape)," - "But received dimensions of Input(X) is [%d], receivecd length" - "of Attr(shape) is [%d].", - x_dim.size(), - static_cast(shape.size()))); - auto out_dim = phi::vectorize(x_dim); - for (size_t i = 1; i <= shape.size(); ++i) { - size_t x_i = x_dim.size() - i; - size_t shape_i = shape.size() - i; - if (ctx->IsRuntime() || - (x_dim[static_cast(x_i)] > 0 && shape[shape_i] > 0)) { - PADDLE_ENFORCE_GE( - x_dim[x_i], - shape[shape_i], - platform::errors::InvalidArgument( - "The dimensions of Input(X) must be larger than Attr(shape)," - "But received dimensions of Input(X) is [%d], received" - "size of Attr(shape) is [%d].", - x_dim[x_i], - shape[shape_i])); - } - out_dim[x_i] = shape[shape_i]; - } - ctx->SetOutputDim("Out", phi::make_ddim(out_dim)); - } - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "A batch of instances to random crop."); - AddInput("Seed", "The random seed."); - AddOutput("Out", "The 
cropped instance batch."); - AddOutput("SeedOut", "The random seed after random cropping.") - .AsIntermediate(); - AddAttr>("shape", "The shape of a cropped instance."); - AddAttr("startup_seed", - "If the input 'Seed' is not initialized, the 'startup_seed' " - "will be used to replace it. Even so, the seed after random " - "crop will also be output to the 'SeedOut'.") - .SetDefault(0); - AddComment(R"DOC( - This operator takes a batch of instances and performs random cropping on each instance. - Cropping positions differ across instances and are determined - by a uniform random generator. All cropped instances have the same shape, which - is determined by the operator's attribute 'shape'. - )DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - random_crop, - ops::RandomCropOp, - ops::RandomCropOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL(random_crop, - CPU, - ALL_LAYOUT, - ops::RandomCropKernel, - float, - int, - double, - uint8_t, - int16_t) {} diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h deleted file mode 100644 index fc625826b9a91b..00000000000000 --- a/paddle/fluid/operators/random_crop_op.h +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
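// Editor's note (not part of the original file): a minimal sketch of the crop
// this header implements. For one input instance of shape [H, W] and
// Attr(shape) = {h, w}, a per-instance offset is drawn uniformly and a
// contiguous window is copied; the names below are illustrative only:
//
//   std::minstd_rand engine(seed);
//   std::uniform_int_distribution<size_t> dist_h(0, H - h);
//   std::uniform_int_distribution<size_t> dist_w(0, W - w);
//   size_t off_h = dist_h(engine), off_w = dist_w(engine);
//   // out[i][j] = x[off_h + i][off_w + j] for 0 <= i < h, 0 <= j < w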
- -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/for_range.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#endif - -namespace paddle { -namespace operators { - -template -struct Random; - -template <> -struct Random { - using Engine = std::minstd_rand; - - template - using UniformIntDist = std::uniform_int_distribution; -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template <> -struct Random { - using Engine = thrust::minstd_rand; - - template - using UniformIntDist = thrust::uniform_int_distribution; -}; -#endif - -template -HOSTDEVICE inline void StridedMemcpy(const T* x, - const size_t* x_dims, - T* out, - const size_t* out_dims, - int i, - int rank, - size_t prod_x_remain, - size_t prod_out_remain, - const size_t* offsets) { - size_t x_dim_i = x_dims[i]; - size_t out_dim_i = out_dims[i]; - size_t x_stride = prod_x_remain / x_dim_i; - size_t out_stride = prod_out_remain / out_dim_i; - size_t offset_i = offsets[i]; - - if (i == rank - 1) { - x += offset_i; - for (size_t j = 0; j < out_dim_i; ++j) { - *out++ = *x++; - } - } else { - x += offset_i * x_stride; - for (size_t j = 0; j < out_dim_i; ++j) { - StridedMemcpy( - x, x_dims, out, out_dims, i + 1, rank, x_stride, out_stride, offsets); - x += x_stride; - out += out_stride; - } - } -} - -template -struct RandomCropFunctor { - const T* x_; - T* out_; - size_t x_dims_[9]; - size_t out_dims_[9]; - int num_batchsize_dims_; - int rank_; - int64_t seed_; - - size_t prod_batchsize_dims_; - size_t prod_x_ins_dims_; - size_t prod_out_ins_dims_; - - RandomCropFunctor(const T* x, - T* out, - const framework::DDim& x_dims, - const framework::DDim& out_dims, - int num_batchsize_dims, - int64_t seed) - : x_(x), - out_(out), - num_batchsize_dims_(num_batchsize_dims), - rank_(x_dims.size()), - seed_(seed) { - PADDLE_ENFORCE_EQ( - x_dims.size(), - out_dims.size(), - platform::errors::InvalidArgument( - "The dimensions of Input(X) must equal to be the dimensions" - "of Output(Out), but received dimensions of Input(X) is [%d]," - "received dimensions of Output(Out) is [%d].", - x_dims.size(), - out_dims.size())); - PADDLE_ENFORCE_GT( - rank_, - num_batchsize_dims_, - platform::errors::InvalidArgument( - "The dimensions of Input(X) must be greater than the diff" - "value of Input(X)'s dimensions minus Atrr(shape)'s dimensions," - "But received Input(X)'s dimensions is [%d], received value of" - "Input(X)'s dimensions minus Attr(shape)'s dimensions is [%d].", - rank_, - num_batchsize_dims_)); - prod_batchsize_dims_ = 1; - prod_x_ins_dims_ = 1; - prod_out_ins_dims_ = 1; - for (size_t i = 0; i < static_cast(rank_); ++i) { - size_t x_dim_i = x_dims[i]; - size_t out_dim_i = out_dims[i]; - x_dims_[i] = x_dim_i; - out_dims_[i] = out_dim_i; - if (i < static_cast(num_batchsize_dims_)) { - PADDLE_ENFORCE_EQ( - x_dim_i, - out_dim_i, - platform::errors::InvalidArgument( - "The first [%d] dimension value of Input(X) and Output(Out)" - "must be equal, but received the [%d] dimension value of" - "Input(X) and Output(Out) respectively are [%d] and [%d].", - num_batchsize_dims_, - i, - x_dim_i, - out_dim_i)); - prod_batchsize_dims_ *= x_dim_i; - } else { - prod_x_ins_dims_ *= x_dim_i; - prod_out_ins_dims_ *= out_dim_i; - } - } - } - - HOSTDEVICE void operator()(size_t ins_idx) { - typename Random::Engine engine(seed_); - engine.discard(ins_idx * (rank_ - num_batchsize_dims_)); - size_t offsets[9] = {}; - for (int i 
= num_batchsize_dims_; i < rank_; ++i) { - typename Random::template UniformIntDist dist( - 0, x_dims_[i] - out_dims_[i]); - offsets[i - num_batchsize_dims_] = dist(engine); - } - - const T* x = x_ + ins_idx * prod_x_ins_dims_; - T* out = out_ + ins_idx * prod_out_ins_dims_; - - StridedMemcpy(x, - x_dims_ + num_batchsize_dims_, - out, - out_dims_ + num_batchsize_dims_, - 0, - rank_ - num_batchsize_dims_, - prod_x_ins_dims_, - prod_out_ins_dims_, - offsets); - } -}; - -template -class RandomCropKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& ctx) const { - int64_t seed = 0; - auto& seed_tensor = GET_DATA_SAFELY( - ctx.Input("Seed"), "Input", "Seed", "RandomCrop"); - if (seed_tensor.IsInitialized()) { - if (platform::is_cpu_place(seed_tensor.place())) { - seed = *seed_tensor.template data(); - } else { - LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify " - "your program"; - phi::DenseTensor cpu_seed; - framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed); - seed = *cpu_seed.data(); - } - } else { - VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute " - "'startup_seed' instead."; - seed = ctx.Attr("startup_seed"); - } - auto shape = ctx.Attr>("shape"); - auto& x = GET_DATA_SAFELY( - ctx.Input("X"), "Input", "X", "RandomCrop"); - auto& out = GET_DATA_SAFELY( - ctx.Output("Out"), "Output", "Out", "RandomCrop"); - - int num_batchsize_dims = x.dims().size() - shape.size(); - RandomCropFunctor functor( - x.template data(), - out.template mutable_data(ctx.GetPlace()), - x.dims(), - out.dims(), - num_batchsize_dims, - seed); - platform::ForRange for_range( - ctx.template device_context(), - functor.prod_batchsize_dims_); - - for_range(functor); - - Random::Engine engine(seed); - engine.discard(functor.prod_batchsize_dims_ * - (functor.rank_ - functor.num_batchsize_dims_)); - *ctx.Output("SeedOut")->mutable_data( - phi::make_ddim({1}), platform::CPUPlace()) = engine(); - } -}; - -// TODO(fengjiayi): Backward of random crop op - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index cf8197a04dd695..5cea8f59631119 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -53,7 +53,7 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { place_str = place_str.substr(0, place_str.length() - 1); std::istringstream sin(place_str); sin.seekg(std::string("PLACE(GPU:").size(), std::ios::beg); // NOLINT - size_t num; + size_t num = 0; sin >> num; place = platform::CUDAPlace(static_cast(num)); } diff --git a/paddle/fluid/operators/reader/py_reader.cc b/paddle/fluid/operators/reader/py_reader.cc index 2db8ac6b1bcb9b..f0c0409a729a5c 100644 --- a/paddle/fluid/operators/reader/py_reader.cc +++ b/paddle/fluid/operators/reader/py_reader.cc @@ -31,7 +31,7 @@ PyReader::PyReader( } void PyReader::ReadNext(paddle::framework::LoDTensorArray* out) { - bool success; + bool success = false; *out = queue_->Pop(&success); if (!success) out->clear(); } diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc deleted file mode 100644 index 48a204c10e4be0..00000000000000 --- a/paddle/fluid/operators/rnn_memory_helper_op.cc +++ /dev/null @@ -1,184 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class OpDesc; -class Scope; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle - -namespace paddle { -namespace operators { -class RNNMemoryHelperOp : public framework::OperatorBase { - public: - RNNMemoryHelperOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto mem_var_name = Input("X"); - auto *mem_var = scope.FindVar(mem_var_name); - PADDLE_ENFORCE_NOT_NULL( - mem_var, - platform::errors::NotFound("Cannot find mem_var: %s in scope.", - mem_var_name)); - - auto out_name = this->Output("Out"); - auto *out_var = scope.FindVar(out_name); - PADDLE_ENFORCE_NOT_NULL(out_var, - platform::errors::NotFound( - "Cannot find out_var: %s in scope.", out_name)); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - - auto *out_tensor = out_var->GetMutable(); - auto &mem_tensor = mem_var->Get(); - framework::TensorCopy(mem_tensor, dev_place, dev_ctx, out_tensor); - out_tensor->set_lod(mem_tensor.lod()); - } -}; - -class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "RNNMemoryHelper"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "RNNMemoryHelper"); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; - -class RNNMemoryHelperOpInfoMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", ""); - AddOutput("Out", ""); - AddAttr("dtype", - "(int, default 5 (FP32)) " - "Output data type") - .SetDefault(framework::proto::VarType::FP32); - AddComment(""); - } -}; - -class RNNMemoryHelperGradOp : public framework::OperatorBase { - public: - RNNMemoryHelperGradOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto out_grad_var_name = Input(framework::GradVarName("Out")); - auto *out_grad_var = scope.FindVar(out_grad_var_name); - - auto in_grad_var_name = Output(framework::GradVarName("X")); - auto *in_grad_var = scope.FindVar(in_grad_var_name); - PADDLE_ENFORCE_NOT_NULL( - in_grad_var, - platform::errors::NotFound("Cannot find in_grad_var: %s in scope.", - in_grad_var_name)); - - 
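// Editor's note (not part of the original file): the remainder of RunImpl
// falls back to a zero gradient when Out@GRAD is missing or uninitialized,
// conceptually equivalent to:
//
//   // in_grad = fill_constant(shape=X.shape, dtype=X.dtype, value=0.0f)
//
// and otherwise copies Out@GRAD into X@GRAD, preserving the LoD.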
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - - // NOTE(xiongkun03): In standalone executor, after each run, the - // var.tensor.holder will be delete instead of variable. So we need exam the - // IsInitialized(). - if (out_grad_var == nullptr || - !out_grad_var->Get().IsInitialized()) { - VLOG(5) << "Using fill constant 0 as starting gradient"; - auto in_var_name = Input("X"); - auto *in_var = scope.FindVar(in_var_name); - auto &in_var_tensor = in_var->Get(); - - framework::AttributeMap attrs; - attrs["dtype"] = framework::TransToProtoVarType(in_var_tensor.dtype()); - attrs["shape"] = phi::vectorize(in_var_tensor.dims()); - attrs["value"] = 0.0f; - - auto zero_op = framework::OpRegistry::CreateOp( - "fill_constant", {}, {{"Out", {in_grad_var_name}}}, attrs); - zero_op->Run(scope, dev_place); - } else { - auto &out_grad_tensor = out_grad_var->Get(); - auto *in_grad_tensor = in_grad_var->GetMutable(); - framework::TensorCopy( - out_grad_tensor, dev_place, dev_ctx, in_grad_tensor); - in_grad_tensor->set_lod(out_grad_tensor.lod()); - } - } -}; - -class RNNMemoryHelperGradOpInfoMaker - : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(framework::GradVarName("Out"), ""); - AddInput("X", ""); - AddInput("Out", ""); - AddOutput(framework::GradVarName("X"), ""); - AddAttr("dtype", - "(int, default 5 (FP32)) " - "Output data type") - .SetDefault(framework::proto::VarType::FP32); - AddComment(""); - } -}; - -class RNNMemoryHelperGradOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - auto x_grad_name = framework::GradVarName("X"); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "RNNMemoryHelperGrad"); - OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), - "Output", - x_grad_name, - "RNNMemoryHelperGrad"); - ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ x_grad_name); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR( - rnn_memory_helper, - paddle::operators::RNNMemoryHelperOp, - paddle::operators::RNNMemoryHelperOpInfoMaker, - paddle::operators::RNNMemoryHelperOpShapeInference, - paddle::framework::DefaultGradOpMaker, - paddle::framework::DefaultGradOpMaker); -REGISTER_OPERATOR(rnn_memory_helper_grad, - paddle::operators::RNNMemoryHelperGradOp, - paddle::operators::RNNMemoryHelperGradOpInfoMaker, - paddle::operators::RNNMemoryHelperGradOpShapeInference); diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc deleted file mode 100644 index 1e2e27f4608714..00000000000000 --- a/paddle/fluid/operators/sample_logits_op.cc +++ /dev/null @@ -1,282 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/sample_logits_op.h" - -#include - -#include "paddle/fluid/operators/math/sample_prob.h" - -namespace paddle { -namespace operators { - -class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Logits", - "(Tensor, default: Tensor), The unscaled log probabilities " - "which is a 2-D tensor with shape [N x K]. N is the batch_size, " - "and K is the class number."); - AddInput("Labels", - "(Tensor) The ground truth which is a 2-D tensor. Labels is a " - "Tensor with shape [N x NT], where NT is the number of" - "true labels for each example."); - AddInput("CustomizedSamples", - "(Tensor, default: Tensor), A 2-D tensor with shape [N, " - "NT + S]," - " where N is the batch size, NT is the number of true labels " - "and S is the number of negtive sample for each example." - "The first NT elements of each row should be the same with true " - "labels, " - "followed by S custom negtive samples. This tensor" - "is only used when use_customized_samples is true.") - .AsDispensable(); - AddInput( - "CustomizedProbabilities", - "(Tensor, default: Tensor), A 2-D tensor with shape [N, NT + S]." - "The tensor has the same shape with CustomSamples," - "and each element represents probability of element in CustomSamples. " - "This " - "tensor is only used when use_customized_samples is true.") - .AsDispensable(); - AddOutput("Samples", - "(Tensor, default: Tensor), A 2-D tensor with shape [N, " - "NT + S]." - "The outputs value of sampler, including NT true lables and S " - "negetive samples " - "for each example. This will be used in" - "backward calculation.") - .AsIntermediate(); - AddOutput( - "Probabilities", - "(Tensor, default: Tensor), A 2-D tensor with shape [N, NT + S]." - "The probabilities of sampled positive and negtive labels.") - .AsIntermediate(); - AddOutput("LogitsDim", "Store dim information of Logits for gradient op") - .AsIntermediate(); - AddOutput("LabelsDim", "Store dim information of Logits for gradient op") - .AsIntermediate(); - AddOutput("SampledLogits", - "(Tensor, default: Tensor), A 2-D tensor with shape" - "[N, NT + S]. The outputs value of sampled logits, which will be" - "used in backward propagation.") - .AsIntermediate(); - AddOutput( - "SampledLabels", - "(Tensor, default: Tensor), A 2-D tensor. The sampled labels" - "with shape [N, NT]. The tonsor contains hard labels as input to " - " softmax op, that is 0, 1, ..., NT-1 because of the first NT elements" - " of Sampels are positive lables."); - AddAttr( - "use_customized_samples", - "An indicator whether to use customized samples with probabilities, if " - "True" - "the operator will use customized samples and customized probabilities" - "otherwise, the operator will generate them by itself.") - .SetDefault(false); - AddAttr( - "uniq", - "An indicator whether to sample non-repetitive negtive labels, if True" - "the operator will sample negtive labels without replacement." 
- "Otherwise, the operator will sample negtive labels with replacement.") - .SetDefault(true); - AddAttr( - "remove_accidental_hits", - "An indicator whether to remove accidental hits when samples hits true" - "labels, the removal is implemented by subtracting the corresponding" - "logits by float_max to subpress their softmax to be zero.") - .SetDefault(true); - AddAttr("num_samples", "The number of negative samples."); - AddAttr("seed", "Random seed for generating samples").SetDefault(0); - - AddComment(R"DOC( - """ - Computes sampled output training logits and labels suitable for implementing - sampled softmax. - """ - -)DOC"); - } -}; - -class SampleLogitsOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Labels"), "Input", "Logits", "SampleLogitsOp"); - OP_INOUT_CHECK( - ctx->HasInput("Labels"), "Input", "Logits", "SampleLogitsOp"); - - OP_INOUT_CHECK( - ctx->HasOutput("Samples"), "Output", "Samples", "SampleLogitsOp"); - OP_INOUT_CHECK(ctx->HasOutput("Probabilities"), - "Output", - "Probabilities", - "SampleLogitsOp"); - OP_INOUT_CHECK(ctx->HasOutput("SampledLogits"), - "Output", - "SampledLogits", - "SampleLogitsOp"); - OP_INOUT_CHECK(ctx->HasOutput("SampledLabels"), - "Output", - "SampledLabels", - "SampleLogitsOp"); - OP_INOUT_CHECK( - ctx->HasOutput("LogitsDim"), "Output", "LogitsDim", "SampleLogitsOp"); - OP_INOUT_CHECK( - ctx->HasOutput("LabelsDim"), "Output", "LabelsDim", "SampleLogitsOp"); - - auto logits_dims = ctx->GetInputDim("Logits"); - auto labels_dims = ctx->GetInputDim("Labels"); - - PADDLE_ENFORCE_EQ(logits_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "Input(Logits) of SampleLogitsOp should be 2D. " - "But received shape = [%s] and dimension is %d.", - logits_dims, - logits_dims.size())); - PADDLE_ENFORCE_EQ(labels_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "Input(Labels) of SampleLogitsOp should be 2D. 
" - "But received shape = [%s] and dimension is %d.", - labels_dims, - labels_dims.size())); - - const int num_samples = ctx->Attrs().Get("num_samples"); - int num_sampled_classes = static_cast(labels_dims[1] + num_samples); - if ((!ctx->IsRuntime()) && labels_dims[1] <= 0) { - num_sampled_classes = -1; - } - ctx->SetOutputDim("Samples", {logits_dims[0], num_sampled_classes}); - ctx->SetOutputDim("Probabilities", {logits_dims[0], num_sampled_classes}); - ctx->SetOutputDim("SampledLogits", {logits_dims[0], num_sampled_classes}); - ctx->SetOutputDim("SampledLabels", {logits_dims[0], labels_dims[1]}); - - // append 0 to shape variable to avoid optimized by memory optimize pass - auto logits_dim_vec = phi::vectorize(logits_dims); - logits_dim_vec.push_back(0); - ctx->SetOutputDim("LogitsDim", phi::make_ddim(logits_dim_vec)); - - auto labels_dim_vec = phi::vectorize(labels_dims); - labels_dim_vec.push_back(0); - ctx->SetOutputDim("LabelsDim", phi::make_ddim(labels_dim_vec)); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Logits"); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -// UNDERSTAND: InferShape for Grad -class SampleLogitsOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("LogitsDim"), "Input", "LogitsDim", "SampleLogitsOpGrad"); - OP_INOUT_CHECK( - ctx->HasInput("LabelsDim"), "Input", "LabelsDim", "SampleLogitsOpGrad"); - OP_INOUT_CHECK(ctx->HasInput("Samples"), - "Input", - "SamplesabelsDim", - "SampleLogitsOpGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("SampledLogits")), - "Input", - "SampledLogits@GRAD", - "SampleLogitsOpGrad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Logits")), - "Output", - "Logits@GRAD", - "SampleLogitsOpGrad"); - - auto logits_dims = ctx->GetInputDim("LogitsDim"); - logits_dims = framework::DDim(logits_dims.Get(), logits_dims.size() - 1); - auto labels_dims = ctx->GetInputDim("LabelsDim"); - labels_dims = framework::DDim(labels_dims.Get(), labels_dims.size() - 1); - PADDLE_ENFORCE_EQ( - logits_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "Input(LogitsDim) of SampleLogitsOpGrad should be 2D. " - "But received shape = [%s] and dimension is %d.", - logits_dims, - logits_dims.size())); - PADDLE_ENFORCE_EQ( - labels_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "Input(LabelsDim) of SampleLogitsOpGrad should be 2D. 
" - "But received shape = [%s] and dimension is %d.", - labels_dims, - labels_dims.size())); - - ctx->SetOutputDim(framework::GradVarName("Logits"), logits_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("SampledLogits")); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -// UNDERSTAND: what's the rule for making a GradMaker TODO - -template -class SampleLogitsGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("sample_logits_grad"); - grad_op->SetInput("LogitsDim", this->Output("LogitsDim")); - grad_op->SetInput("LabelsDim", this->Output("LabelsDim")); - grad_op->SetInput("Samples", this->Output("Samples")); - grad_op->SetInput(framework::GradVarName("SampledLogits"), - this->OutputGrad("SampledLogits")); - grad_op->SetOutput(framework::GradVarName("Logits"), - this->InputGrad("Logits")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(sample_logits, - ops::SampleLogitsOp, - ops::SampleLogitsOpMaker, - ops::SampleLogitsGradMaker, - ops::SampleLogitsGradMaker); -REGISTER_OPERATOR(sample_logits_grad, ops::SampleLogitsOpGrad); -PD_REGISTER_STRUCT_KERNEL( - sample_logits, CPU, ALL_LAYOUT, ops::SampleLogitsKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(sample_logits_grad, - CPU, - ALL_LAYOUT, - ops::SampleLogitsGradKernel, - float, - double) {} diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu deleted file mode 100644 index 6a853f71e6f329..00000000000000 --- a/paddle/fluid/operators/sample_logits_op.cu +++ /dev/null @@ -1,301 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/sample_prob.h" -#include "paddle/fluid/operators/sample_logits_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/softmax.h" - -namespace paddle { -namespace operators { - -// UNDERSTAND: something like take_along_axis in numpy. 
-template -__global__ void GPUTakeAlongD1(size_t size, - const int batch_size, - const int array_slice_size, - const int idx_slice_size, - const T* p_array, - const int64_t* p_index, - T* p_value) { - const auto value_slice_size = idx_slice_size; - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int step_size = blockDim.x * gridDim.x; - - for (; idx < size; idx += step_size) { - int i = idx / idx_slice_size; - auto array_index = p_index[idx]; - p_value[idx] = p_array[i * array_slice_size + array_index]; - } -} - -// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate -// indices, scatter is done in += way. -template -__global__ void GPUPutAlongD1(size_t size, - const int batch_size, - const int array_slice_size, - const int idx_slice_size, - T* p_array, - const int64_t* p_index, - const T* p_value) { - const auto value_slice_size = idx_slice_size; - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int step_size = blockDim.x * gridDim.x; - - // size == batch_size - for (; idx < size; idx += step_size) { - int i = idx; - for (int j = 0; j < idx_slice_size; ++j) { - auto array_index = p_index[i * idx_slice_size + j]; - p_array[i * array_slice_size + array_index] += - p_value[i * idx_slice_size + j]; - } - } -} - -// UNDERSTAND: set label as 0,1,...,num_true-1 -template -__global__ void GPUSetLabel(size_t size, const int num_true, int64_t* p_array) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int step_size = blockDim.x * gridDim.x; - - for (; idx < size; idx += step_size) { - p_array[idx] = idx % num_true; - } -} - -// UNDERSTAND: compute accidentdal hits from samples and minus corresponding -// logits by a float max, here 1e20 -template -__global__ void gpu_compute_remove_accidental_hits(const int size, - const int num_true, - const int idx_slice_size, - const int64_t* p_index, - T* p_value) { - const auto value_slice_size = idx_slice_size; - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int step_size = blockDim.x * gridDim.x; - - for (; idx < size; idx += step_size) { - int i = idx / idx_slice_size; - if (idx % idx_slice_size < num_true) continue; - for (int j = 0; j < num_true; ++j) { - const auto true_idx = i * idx_slice_size + j; - if (p_index[true_idx] == p_index[idx]) { - p_value[idx] -= 1e20; - break; - } - } - } -} - -template -class SampleLogitsCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // get necessary inputs - const phi::DenseTensor* logits = context.Input("Logits"); - const phi::DenseTensor* labels = context.Input("Labels"); - VLOG(3) << "Enter SampleLogitsCUDAKernel"; - - // get necessary outputs - phi::DenseTensor* samples = context.Output("Samples"); - phi::DenseTensor* probabilities = - context.Output("Probabilities"); - phi::DenseTensor* sampled_logits = - context.Output("SampledLogits"); - phi::DenseTensor* sampled_labels = - context.Output("SampledLabels"); - - // shapes - const auto batch_size = logits->dims()[0]; - const auto num_classes = logits->dims()[1]; - const auto labels_dim = labels->dims(); - const auto num_true = labels_dim[1]; - const auto samples_dim = samples->dims(); - - // attrs - const auto num_samples = context.Attr("num_samples"); - const bool use_customized_samples = - context.Attr("use_customized_samples"); - const bool uniq = context.Attr("uniq"); - const bool remove_accidental_hits = - context.Attr("remove_accidental_hits"); - - // device contexts - auto& dev_ctx = context.cuda_device_context(); - - // UNDERSTAND: 
allocate memories for temporaries - sampled_logits->mutable_data(samples_dim, context.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, sampled_logits, static_cast(0)); - - auto sampled_labels_data = - sampled_labels->mutable_data(labels_dim, context.GetPlace()); - int threads = 512; - size_t size = batch_size * num_true; - int grid = (size + threads - 1) / threads; - GPUSetLabel - <<>>( - size, num_true, sampled_labels_data); - - if (use_customized_samples) { - const phi::DenseTensor* customized_samples = - context.Input("CustomizedSamples"); - const phi::DenseTensor* customized_probabilities = - context.Input("CustomizedProbabilities"); - PADDLE_ENFORCE_EQ( - customized_samples, - samples, - platform::errors::InvalidArgument( - "CustomizedSamples must be the same phi::DenseTensor with " - "Samples when use_customized_samples = True")); - PADDLE_ENFORCE_EQ( - customized_probabilities, - probabilities, - platform::errors::InvalidArgument( - "CustomizedProbabilities must be the same phi::DenseTensor with " - "Probabilities when use_customized_samples = True")); - } else { - samples->mutable_data(context.GetPlace()); - probabilities->mutable_data(samples_dim, context.GetPlace()); - // UNDERSTAND: sampling - const auto seed = context.Attr("seed"); - auto sampler_with_prob = math::GPUSampleWithProb(); - sampler_with_prob(context.cuda_device_context(), - seed, - num_classes, - uniq, - num_samples, - labels, - samples, - probabilities); - } - - // UNDERSTAND: gather sampled logits and remove accidental hits if needed - const auto num_take = samples->dims()[1]; - const auto array_dims = logits->dims(); - const auto idx_dims = samples->dims(); - - const T* p_array = logits->data(); - const int64_t* p_index = samples->data(); - T* p_value = sampled_logits->data(); - - // src slice size - const auto array_slice_size = array_dims[1]; - // index slice size - const auto idx_slice_size = idx_dims[1]; - - size = batch_size * num_take; - grid = (size + threads - 1) / threads; - GPUTakeAlongD1 - <<>>( - size, - batch_size, - array_slice_size, - idx_slice_size, - p_array, - p_index, - p_value); - - if (remove_accidental_hits) { - const size_t size = batch_size * (num_true + num_samples); - int grid = (size + threads - 1) / threads; - gpu_compute_remove_accidental_hits - <<>>( - size, num_true, idx_slice_size, p_index, p_value); - } - - // subtracted sampled logits with logQ(y|x) - auto probs = EigenMatrix::From(*probabilities); - auto smp_logits = EigenMatrix::From(*sampled_logits); - smp_logits.device(*dev_ctx.eigen_device()) = - (smp_logits - probs.log().unaryExpr(TolerableValue())) - .unaryExpr(TolerableValue()); - } -}; - -template -class SampleLogitsGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto logits_grad = - context.Output(framework::GradVarName("Logits")); - const phi::DenseTensor* samples = - context.Input("Samples"); - const phi::DenseTensor* sampled_logits_grad = - context.Input( - framework::GradVarName("SampledLogits")); - logits_grad->mutable_data(context.GetPlace()); - - auto& dev_ctx = context.cuda_device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, logits_grad, static_cast(0)); - - // UNDERSTAND: scatter it back to logit_grad - const auto batch_size = samples->dims()[0]; - const auto num_put = samples->dims()[1]; - const auto array_dims = logits_grad->dims(); - const auto idx_dims = samples->dims(); - - T* p_array = logits_grad->data(); - const int64_t* 
p_index = samples->data(); - const T* p_value = sampled_logits_grad->data(); - - // src slice size - const auto array_slice_size = array_dims[1]; - // index slice size - const auto idx_slice_size = idx_dims[1]; - - int threads = 128; - const size_t size = batch_size; - int grid = (size + threads - 1) / threads; - - GPUPutAlongD1 - <<>>( - size, - batch_size, - array_slice_size, - idx_slice_size, - p_array, - p_index, - p_value); - } -}; - -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(sample_logits, - GPU, - ALL_LAYOUT, - ops::SampleLogitsCUDAKernel, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(sample_logits_grad, - GPU, - ALL_LAYOUT, - ops::SampleLogitsGradCUDAKernel, - float, - double) {} diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h deleted file mode 100644 index bf58a054dad2d9..00000000000000 --- a/paddle/fluid/operators/sample_logits_op.h +++ /dev/null @@ -1,330 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/sample_prob.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/softmax.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -template -struct TolerableValue { - HOSTDEVICE T operator()(const T& x) const { - PADDLE_ENFORCE(std::is_floating_point::value, - "TolerableValue should be float in sample_logits_op."); - const T kApproInf = 1e20; - if (x == INFINITY) return kApproInf; - if (x == -INFINITY) return -kApproInf; - return x; - } -}; - -// UNDERSTAND: something like take_along_axis in numpy. -template -static void CPUTakeAlongD1(const platform::DeviceContext& ctx, - const phi::DenseTensor& array, - const phi::DenseTensor& index, - phi::DenseTensor* value) { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument("This kernel only runs on CPU.")); - // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K) - const auto batch_size = index.dims()[0]; - const auto num_take = index.dims()[1]; - const auto array_dims = array.dims(); - const auto idx_dims = index.dims(); - PADDLE_ENFORCE_EQ(idx_dims.size(), - 2, - platform::errors::InvalidArgument( - "index of CPUTakeAlongD1 should be 2D. " - "But received shape = [%s] and dimension is %d.", - idx_dims, - idx_dims.size())); - PADDLE_ENFORCE_EQ(array_dims.size(), - 2, - platform::errors::InvalidArgument( - "array of CPUTakeAlongD1 should be 2D. 
" - "But received shape = [%s] and dimension is %d.", - array_dims, - array_dims.size())); - PADDLE_ENFORCE_EQ(idx_dims[0], - array_dims[0], - platform::errors::InvalidArgument( - "The first dimension of index and array of " - "CPUTakeAlongD1 should be equal. " - "But received index shape = [%s], array shape = [%s], " - "and the first dimensions are %d and %d.", - idx_dims, - array_dims, - idx_dims[0], - array_dims[0])); - PADDLE_ENFORCE_EQ( - idx_dims, - value->dims(), - platform::errors::InvalidArgument( - "index and array of CPUTakeAlongD1 should have the same shape. " - "But received index shape = [%s], array shape = [%s].", - idx_dims, - value->dims())); - - // UNDERSTAND: no allocations here - const T* p_array = array.data(); - const int64_t* p_index = index.data(); - T* p_value = value->data(); - - // src slice size - const auto array_slice_size = array_dims[1]; - - // index slice size - const auto idx_slice_size = idx_dims[1]; - const auto value_slice_size = idx_slice_size; - - for (int i = 0; i < batch_size; ++i) { - for (int j = 0; j < num_take; ++j) { - auto array_index = p_index[i * idx_slice_size + j]; - p_value[i * value_slice_size + j] = - p_array[i * array_slice_size + array_index]; - } - } -} - -// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate -// indices, scatter is done in += way. -template -static void CPUPutAlongD1(const platform::DeviceContext& ctx, - phi::DenseTensor* array, - const phi::DenseTensor& index, - const phi::DenseTensor& value) { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument("This kernel only runs on CPU.")); - // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K) - const auto batch_size = index.dims()[0]; - const auto num_put = index.dims()[1]; - auto array_dims = array->dims(); - auto idx_dims = index.dims(); - PADDLE_ENFORCE_EQ(idx_dims.size(), - 2, - platform::errors::InvalidArgument( - "index of CPUPutAlongD1 should be 2D. " - "But received shape = [%s] and dimension is %d.", - idx_dims, - idx_dims.size())); - PADDLE_ENFORCE_EQ(array_dims.size(), - 2, - platform::errors::InvalidArgument( - "array of CPUPutAlongD1 should be 2D. " - "But received shape = [%s] and dimension is %d.", - array_dims, - array_dims.size())); - PADDLE_ENFORCE_EQ(idx_dims[0], - array_dims[0], - platform::errors::InvalidArgument( - "The first dimension of index and array of " - "CPUPutAlongD1 should be equal. " - "But received index shape = [%s], array shape = [%s], " - "and the first dimensions are %d and %d.", - idx_dims, - array_dims, - idx_dims[0], - array_dims[0])); - PADDLE_ENFORCE_EQ( - idx_dims, - value.dims(), - platform::errors::InvalidArgument( - "index and array of CPUPutAlongD1 should have the same shape. 
" - "But received index shape = [%s], array shape = [%s].", - idx_dims, - value.dims())); - - // UNDERSTAND: no allocations here - T* p_array = array->data(); - const int64_t* p_index = index.data(); - const T* p_value = value.data(); - - // slice sizes - const auto array_slice_size = array_dims[1]; - const auto idx_slice_size = idx_dims[1]; - const auto value_slice_size = idx_slice_size; - - for (int i = 0; i < batch_size; ++i) { - for (int j = 0; j < num_put; ++j) { - auto array_index = p_index[i * idx_slice_size + j]; - p_array[i * array_slice_size + array_index] += - p_value[i * value_slice_size + j]; - } - } -} - -// UNDERSTAND: compute accidentdal hits from samples and minus corresponding -// logits by a float max, here 1e20 -template -static void compute_remove_accidental_hits(const platform::DeviceContext& ctx, - phi::DenseTensor* sampled_logits, - const phi::DenseTensor& samples, - const int num_true) { - const auto batch_size = sampled_logits->dims()[0]; - const auto num_sampled_classes = sampled_logits->dims()[1]; - T* sampled_logits_data = sampled_logits->data(); - const auto samples_data = samples.data(); - - std::unordered_set tmp_true_labels; - for (int i = 0; i < batch_size; ++i) { - tmp_true_labels.clear(); - tmp_true_labels.insert(samples_data + i * num_sampled_classes, - samples_data + i * num_sampled_classes + num_true); - for (int j = num_true; j < num_sampled_classes; ++j) { - const auto idx = i * num_sampled_classes + j; - if (tmp_true_labels.find(samples_data[idx]) != tmp_true_labels.end()) - sampled_logits_data[idx] -= 1e20; - } - } -} - -template -class SampleLogitsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(context.GetPlace()), - true, - platform::errors::InvalidArgument("this kernel only runs on cpu.")); - VLOG(3) << "Enter SampleLogitsKernel"; - // get necessary inputs - const phi::DenseTensor* logits = context.Input("Logits"); - const phi::DenseTensor* labels = context.Input("Labels"); - - // get necessary outputs - phi::DenseTensor* samples = context.Output("Samples"); - phi::DenseTensor* probabilities = - context.Output("Probabilities"); - phi::DenseTensor* sampled_logits = - context.Output("SampledLogits"); - phi::DenseTensor* sampled_labels = - context.Output("SampledLabels"); - - // shapes - const auto batch_size = logits->dims()[0]; - const auto num_classes = logits->dims()[1]; - const auto labels_dim = labels->dims(); - const auto num_true = labels_dim[1]; - const auto samples_dim = samples->dims(); - - // attrs - const auto num_samples = context.Attr("num_samples"); - const bool use_customized_samples = - context.Attr("use_customized_samples"); - const bool remove_accidental_hits = - context.Attr("remove_accidental_hits"); - - // device contexts - auto& dev_ctx = context.template device_context(); - - // UNDERSTAND: allocate memories for temporaries - sampled_logits->mutable_data(samples_dim, context.GetPlace()); - auto sampled_labels_data = - sampled_labels->mutable_data(labels_dim, context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - for (int j = 0; j < num_true; ++j) { - sampled_labels_data[i * num_true + j] = j; - } - } - - if (use_customized_samples) { - const phi::DenseTensor* customized_samples = - context.Input("CustomizedSamples"); - const phi::DenseTensor* customized_probabilities = - context.Input("CustomizedProbabilities"); - PADDLE_ENFORCE_EQ( - customized_samples, - samples, - 
platform::errors::InvalidArgument( - "CustomizedSamples must be the same phi::DenseTensor with " - "Samples when use_customized_samples = True")); - PADDLE_ENFORCE_EQ( - customized_probabilities, - probabilities, - platform::errors::InvalidArgument( - "CustomizedProbabilities must be the same phi::DenseTensor with " - "Probabilities when use_customized_samples = True")); - } else { - samples->mutable_data(context.GetPlace()); - probabilities->mutable_data(samples_dim, context.GetPlace()); - // UNDERSTAND: sampling - const auto seed = context.Attr("seed"); - auto sampler_with_prob = math::SampleWithProb(); - sampler_with_prob(dev_ctx, - math::LogUniformSampler(num_classes, seed), - num_samples, - labels, - samples, - probabilities); - } - - // UNDERSTAND: gather sampled logits and remove accidental hits if needed - CPUTakeAlongD1(dev_ctx, *logits, *samples, sampled_logits); - if (remove_accidental_hits) { - compute_remove_accidental_hits( - dev_ctx, sampled_logits, *samples, num_true); - } - - // subtracted sampled logits with logQ(y|x) - auto probs = EigenMatrix::From(*probabilities); - auto smp_logits = EigenMatrix::From(*sampled_logits); - smp_logits.device(*dev_ctx.eigen_device()) = - (smp_logits - probs.log().unaryExpr(TolerableValue())) - .unaryExpr(TolerableValue()); - } -}; - -template -class SampleLogitsGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto logits_grad = - context.Output(framework::GradVarName("Logits")); - const phi::DenseTensor* samples = - context.Input("Samples"); - const phi::DenseTensor* sampled_logits_grad = - context.Input( - framework::GradVarName("SampledLogits")); - logits_grad->mutable_data(context.GetPlace()); - - auto& dev_ctx = context.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, logits_grad, static_cast(0)); - - // UNDERSTAND: scatter it back to logit_grad - CPUPutAlongD1(dev_ctx, logits_grad, *samples, *sampled_logits_grad); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc deleted file mode 100644 index c8ce5475e545bc..00000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc +++ /dev/null @@ -1,149 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class SequenceTopkAvgPoolingOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SequenceTopkAvgPooling"); - OP_INOUT_CHECK( - ctx->HasInput("ROW"), "Input", "ROW", "SequenceTopkAvgPooling"); - OP_INOUT_CHECK( - ctx->HasInput("COLUMN"), "Input", "COLUMN", "SequenceTopkAvgPooling"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "SequenceTopkAvgPooling"); - OP_INOUT_CHECK( - ctx->HasOutput("pos"), "Output", "pos", "SequenceTopkAvgPooling"); - - auto attr = ctx->Attrs(); - auto channel_num = attr.Get("channel_num"); - PADDLE_ENFORCE_GT( - channel_num, - 0, - platform::errors::InvalidArgument( - "Expected channel_num > 0, but received %d.", channel_num)); - - auto topks = attr.Get>("topks"); - auto num_k = topks.size(); - PADDLE_ENFORCE_GT( - num_k, - 0, - platform::errors::InvalidArgument( - "Expected topks.size() > 0, but received %zu.", num_k)); - - auto row_dim = ctx->GetInputDim("ROW"); - auto row_shape_0 = row_dim[0]; - - std::vector vec_out_shape; - vec_out_shape.push_back(row_shape_0); // NOLINT - vec_out_shape.push_back(channel_num * num_k); // NOLINT - - ctx->SetOutputDim("Out", phi::make_ddim(vec_out_shape)); - ctx->ShareLoD("ROW", "Out"); - } -}; - -class SequenceTopkAvgPoolingOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(LoDTensor) The variable-length input of SequenceTopkPoolingOp"); - AddInput("ROW", "(LoDTensor) the row info"); - AddInput("COLUMN", "(LoDTensor) the column info"); - AddOutput( - "Out", - "(Tensor) The output of SequenceTopkPoolingOp does not contain LoD " - "information."); - AddOutput("pos", "(Tensor) store the topk index ").AsIntermediate(); - AddAttr>("topks", "topks"); - AddAttr("channel_num", "channel number"); - AddComment(R"DOC( - sequecen topk average pooling op - )DOC"); - } -}; - -class SequenceTopkAvgPoolingGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "SequenceTopkAvgPoolingGrad"); - OP_INOUT_CHECK( - ctx->HasInput("X"), "Input", "X", "SequenceTopkAvgPoolingGrad"); - - ctx->ShareDim("X", /*->*/ framework::GradVarName("X")); - ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -template -class SequenceTopkAvgPoolGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op_desc_ptr) const override { - op_desc_ptr->SetType("sequence_topk_avg_pooling_grad"); - op_desc_ptr->SetInput("X", this->Input("X")); - op_desc_ptr->SetInput("ROW", this->Input("ROW")); - op_desc_ptr->SetInput("COLUMN", this->Input("COLUMN")); - op_desc_ptr->SetInput("pos", this->Output("pos")); - op_desc_ptr->SetInput(framework::GradVarName("Out"), - this->OutputGrad("Out")); - 
op_desc_ptr->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op_desc_ptr->SetAttrMap(this->Attrs()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - sequence_topk_avg_pooling, - ops::SequenceTopkAvgPoolingOp, - ops::SequenceTopkAvgPoolingOpMaker, - ops::SequenceTopkAvgPoolGradOpMaker, - ops::SequenceTopkAvgPoolGradOpMaker); -REGISTER_OPERATOR(sequence_topk_avg_pooling_grad, - ops::SequenceTopkAvgPoolingGradOp); -PD_REGISTER_STRUCT_KERNEL(sequence_topk_avg_pooling, - CPU, - ALL_LAYOUT, - ops::SequenceTopkAvgPoolingKernel, - float) {} -PD_REGISTER_STRUCT_KERNEL(sequence_topk_avg_pooling_grad, - CPU, - ALL_LAYOUT, - ops::SequenceTopkAvgPoolingGradKernel, - float) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h deleted file mode 100644 index df69acc7488723..00000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h +++ /dev/null @@ -1,247 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = phi::DenseTensor; -using LoDTensor = phi::DenseTensor; -static constexpr int TopKPosPaddingId = -1; - -namespace details { - -template -static void get_topk_pos(const T* data, int length, int k, int* pos) { - VLOG(3) << "length: " << length << " , k : " << k; - - std::priority_queue, - std::vector>, - std::greater>> - topk_queue; - - for (int i = 0; i < length; ++i) { - T elem = data[i]; - if (topk_queue.size() < static_cast(k)) { - topk_queue.emplace(elem, i); - } else { - if (elem >= topk_queue.top().first) { - // replace top node if found a bigger value - topk_queue.pop(); - topk_queue.emplace(elem, i); - } - } - } - // reversely assign value - int real_k = topk_queue.size(); - for (int i = real_k - 1; i >= 0; --i) { - pos[i] = topk_queue.top().second; - topk_queue.pop(); - } - // if length of data is less than k, fill TopKPosPaddingId at the end of pos. 
- for (int i = real_k; i < k; ++i) { - pos[i] = TopKPosPaddingId; - } -} -} // namespace details - -template -class SequenceTopkAvgPoolingKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* row = context.Input("ROW"); - auto* col = context.Input("COLUMN"); - auto* out = context.Output("Out"); - auto* pos = context.Output("pos"); - - PADDLE_ENFORCE_EQ( - in->lod().empty(), - false, - platform::errors::InvalidArgument( - "Input(X) Tensor of SequenceTopkAvgPoolingOp does not " - "contain LoD information.")); - PADDLE_ENFORCE_EQ( - row->lod().empty(), - false, - platform::errors::InvalidArgument( - "Input(ROW) Tensor of SequenceTopkAvgPoolingOp does not " - "contain LoD information.")); - PADDLE_ENFORCE_EQ( - col->lod().empty(), - false, - platform::errors::InvalidArgument( - "Input(COLUMN) Tensor of SequenceTopkAvgPoolingOp does " - "not contain LoD information.")); - - auto channel_num = context.Attr("channel_num"); - auto topks = context.Attr>("topks"); - auto k_num = topks.size(); - auto max_k = topks[topks.size() - 1]; - PADDLE_ENFORCE_GE(max_k, - 0, - platform::errors::InvalidArgument( - "Expected max_k >= 0, but received %d.", max_k)); - std::vector vec_pos_shape; - auto in_lod = in->lod()[0]; - - auto row_lod = row->lod()[0]; - auto col_lod = col->lod()[0]; - int batch_size = row_lod.size() - 1; - int pos_total_size = row_lod[batch_size] * channel_num * max_k; - vec_pos_shape.push_back(pos_total_size); - pos->Resize({phi::make_ddim(vec_pos_shape)}); - auto pos_data = pos->mutable_data(context.GetPlace()); - - int offset = 0; - phi::Vector vec_out_lod; - vec_out_lod.reserve(batch_size + 1); - for (int i = 0; i <= batch_size; ++i) { - offset = row_lod[i]; - vec_out_lod.push_back(offset); - } - - framework::LoD lod_temp; - lod_temp.push_back(vec_out_lod); - out->set_lod(lod_temp); - - auto din_data = in->data(); - auto dout_data = out->mutable_data(context.GetPlace()); - - T* sum_data = new T[max_k]; - for (int i = 0; i < batch_size; ++i) { - int total_size = in_lod[i + 1] - in_lod[i]; - int row_size = row_lod[i + 1] - row_lod[i]; - int col_size = col_lod[i + 1] - col_lod[i]; - PADDLE_ENFORCE_EQ(total_size, - channel_num * row_size * col_size, - platform::errors::PreconditionNotMet( - "Expected total_size == channel_num * row_size * " - "col_size, but got %d != %d.", - total_size, - channel_num * row_size * col_size)); - - int feature_num = row_size * col_size; - for (int j = 0; j < channel_num; ++j) { - auto input_offset_feature_data = din_data + in_lod[i] + j * feature_num; - - for (int r = 0; r < row_size; ++r) { - auto row_data = input_offset_feature_data + r * col_size; - - auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k + - r * channel_num * max_k + j * max_k; - auto out_slice_data = dout_data + row_lod[i] * channel_num * k_num + - r * channel_num * k_num + j * k_num; - - details::get_topk_pos(row_data, col_size, max_k, pos_slice_data); - if (pos_slice_data[0] == TopKPosPaddingId) { - sum_data[0] = 0.0; - } else { - sum_data[0] = row_data[pos_slice_data[0]]; - } - for (int k = 1; k < max_k; ++k) { - if (pos_slice_data[k] == TopKPosPaddingId) { - sum_data[k] = sum_data[k - 1]; - } else { - sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]]; - } - } - for (size_t k = 0; k < k_num; ++k) { - out_slice_data[k] = sum_data[topks[k] - 1] / topks[k]; - } - } - } - } - delete[] sum_data; - } -}; - -template -class SequenceTopkAvgPoolingGradKernel : public 
framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto* pos_input = context.Input("pos"); - auto* row_input = context.Input("ROW"); - auto* col_input = context.Input("COLUMN"); - auto* forward_input = context.Input("X"); - - int batch_size = row_input->lod()[0].size() - 1; - auto channel_num = context.Attr("channel_num"); - auto topks = context.Attr>("topks"); - auto k_num = topks.size(); - auto max_k = topks[k_num - 1]; - - auto out_lod = forward_input->lod(); - d_in->set_lod(out_lod); - - d_in->mutable_data(context.GetPlace()); - auto pos_data = pos_input->data(); - auto dout_data = d_out->data(); - - auto& dev_ctx = context.template device_context(); - phi::funcs::SetConstant zero; - zero(dev_ctx, d_in, static_cast(0.0)); - - auto din_data = d_in->data(); - - auto out_offset = out_lod[0]; - auto row_lod = row_input->lod()[0]; - auto col_lod = col_input->lod()[0]; - - for (int i = 0; i < batch_size; ++i) { - int row_size = row_lod[i + 1] - row_lod[i]; - int col_size = col_lod[i + 1] - col_lod[i]; - int feature_num = row_size * col_size; - - for (int j = 0; j < channel_num; ++j) { - auto in_offset_feature_data = - din_data + out_offset[i] + j * feature_num; - - for (int r = 0; r < row_size; r++) { - auto row_data = dout_data + row_lod[i] * channel_num * k_num + - r * channel_num * k_num + j * k_num; - auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k + - r * channel_num * max_k + j * max_k; - auto in_slice_data = in_offset_feature_data + r * col_size; - - for (size_t m = 0; m < k_num; ++m) { - for (int k = 0; k < topks[m]; ++k) { - if (pos_slice_data[k] == TopKPosPaddingId) { - break; - } else { - in_slice_data[pos_slice_data[k]] += row_data[m] / topks[m]; - } - } - } - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc deleted file mode 100644 index c1abfcb3e436f1..00000000000000 --- a/paddle/fluid/operators/smooth_l1_loss_op.cc +++ /dev/null @@ -1,231 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
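The template arguments of the priority queue in details::get_topk_pos above were stripped when this hunk was flattened. A self-contained reconstruction of the same selection logic, assuming the conventional std::pair<T, int> element type:

#include <functional>
#include <queue>
#include <utility>
#include <vector>

// Keep a min-heap of (value, index) pairs so the smallest of the current
// top-k sits on top and can be evicted in O(log k) per element.
template <typename T>
void GetTopkPos(const T* data, int length, int k, int* pos) {
  using Pair = std::pair<T, int>;
  std::priority_queue<Pair, std::vector<Pair>, std::greater<Pair>> topk_queue;
  for (int i = 0; i < length; ++i) {
    if (topk_queue.size() < static_cast<size_t>(k)) {
      topk_queue.emplace(data[i], i);
    } else if (data[i] >= topk_queue.top().first) {
      topk_queue.pop();  // replace the smallest retained value
      topk_queue.emplace(data[i], i);
    }
  }
  int real_k = static_cast<int>(topk_queue.size());
  for (int i = real_k - 1; i >= 0; --i) {  // the heap pops smallest first
    pos[i] = topk_queue.top().second;
    topk_queue.pop();
  }
  for (int i = real_k; i < k; ++i) {
    pos[i] = -1;  // TopKPosPaddingId marks missing entries when length < k
  }
}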
*/

-#include "paddle/fluid/operators/smooth_l1_loss_op.h"
-
-#include
-
-namespace paddle {
-namespace operators {
-
-class SmoothL1LossOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SmoothL1Loss");
-    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "SmoothL1Loss");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-    bool check = true;
-    if ((!ctx->IsRuntime()) &&
-        (phi::product(x_dims) <= 0 || phi::product(y_dims) <= 0)) {
-      check = false;
-    }
-    if (check) {
-      PADDLE_ENFORCE_EQ(
-          x_dims,
-          y_dims,
-          platform::errors::InvalidArgument(
-              "Input(X) and Input(Y) of SmoothL1LossOp should "
-              "have the same size, but received X dim is %s, Y dim is %s",
-              x_dims.to_str(),
-              y_dims.to_str()));
-    }
-    PADDLE_ENFORCE_GE(x_dims.size(),
-                      2,
-                      platform::errors::InvalidArgument(
-                          "The tensor rank of Input(X) of SmoothL1LossOp "
-                          "should not be less than 2, but received %d.",
-                          x_dims.size()));
-    if (ctx->HasInput("InsideWeight")) {
-      PADDLE_ENFORCE_EQ(ctx->HasInput("OutsideWeight"),
-                        true,
-                        platform::errors::InvalidArgument(
-                            "If weights are provided, must specify both "
-                            "inside and outside weights."));
-      auto dims = ctx->GetInputDim("InsideWeight");
-      bool check = true;
-      if ((!ctx->IsRuntime()) &&
-          (phi::product(dims) <= 0 || phi::product(x_dims) <= 0)) {
-        check = false;
-      }
-      if (check) {
-        PADDLE_ENFORCE_EQ(x_dims,
-                          dims,
-                          platform::errors::InvalidArgument(
-                              "Input(X) and Input(InsideWeight) of "
-                              "SmoothL1LossOp should have the same size, but "
-                              "received X dim is %s, InsideWeight dim is %s",
-                              x_dims.to_str(),
-                              dims.to_str()));
-      }
-
-      dims = ctx->GetInputDim("OutsideWeight");
-      check = true;
-      if ((!ctx->IsRuntime()) &&
-          (phi::product(dims) <= 0 || phi::product(x_dims) <= 0)) {
-        check = false;
-      }
-      if (check) {
-        PADDLE_ENFORCE_EQ(x_dims,
-                          dims,
-                          platform::errors::InvalidArgument(
-                              "Input(X) and Input(OutsideWeight) of "
-                              "SmoothL1LossOp should have the same size, but "
-                              "received X dim is %s, OutsideWeight dim is %s",
-                              x_dims.to_str(),
-                              dims.to_str()));
-      }
-    }
-
-    ctx->SetOutputDim("Diff", x_dims);
-    // loss is a two-rank tensor
-    ctx->SetOutputDim("Out", {x_dims[0], 1});
-  }
-};
-
-class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor) A tensor with rank at least 2. "
-             "The input value of smooth l1 loss op with shape "
-             "[batch_size, dim1, ..., dimN].");
-    AddInput("Y",
-             "(Tensor, default Tensor) A tensor with rank at least 2. "
-             "The target value of smooth l1 loss op with same shape as X.");
-    AddInput("InsideWeight",
-             "(Tensor, default Tensor) A tensor with rank at least 2. "
-             "This input is optional and should have same shape with X. "
-             "If provided, the result of (X - Y) will be multiplied "
-             "by this tensor element by element.")
-        .AsDispensable();
-    AddInput("OutsideWeight",
-             "(Tensor, default Tensor) A tensor with rank at least 2. "
-             "This input is optional and should have same shape with X. "
-             "If provided, the out smooth l1 loss will be multiplied by this "
-             "tensor element by element.")
-        .AsDispensable();
-    AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).")
-        .AsIntermediate();
-    AddOutput("Out",
-              "(Tensor, default Tensor) A tensor of rank 2. "
-              "The output smooth l1 loss with shape [batch_size, 1].");
-    AddAttr("sigma",
-            "Hyper parameter of smooth l1 loss op. "
-            "A float scalar with default value 1.0.")
-        .SetDefault(1.0);
-    AddComment(R"DOC(
-Smooth L1 Loss Operator.
-
-This operator computes the smooth l1 loss for X and Y.
-The operator takes the first dimension of X and Y as batch size.
-For each instance, it computes the smooth l1 loss element by element first
-and then sums all the losses. So the shape of Out is [batch_size, 1].
-
-The equation is:
-$$
-Out_{\sigma}(X, Y)_i = \begin{cases}
-0.5 * (\sigma * (X_i - Y_i)) ^ 2,
-\quad |X_i - Y_i| \lt \frac{1}{{\sigma}^2} \\
-|X_i - Y_i| - \frac{0.5}{{\sigma}^2},
-\quad otherwise
-\end{cases}
-$$
-
-In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the
-$i$-th element of Out, X and Y.
-
-)DOC");
-  }
-};
-
-class SmoothL1LossGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto in_dims = ctx->GetInputDim("Diff");
-    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-
-    PADDLE_ENFORCE_GE(
-        out_dims.size(),
-        2,
-        platform::errors::InvalidArgument(
-            "The tensor rank of Input(Out@Grad) should not be less than 2, "
-            "but received %d.",
-            out_dims.size()));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          out_dims[0],
-          in_dims[0],
-          platform::errors::InvalidArgument(
-              "The 1st dimension of Input(Out@Grad) must be the "
-              "same as input in SmoothL1LossGradOp, but received %d and %d.",
-              out_dims[0],
-              in_dims[0]));
-      PADDLE_ENFORCE_EQ(out_dims[1],
-                        1,
-                        platform::errors::InvalidArgument(
-                            "The 2nd dimension of Input(Out@Grad) must be 1 in "
-                            "SmoothL1LossGradOp, but received %d.",
-                            out_dims[1]));
-    }
-
-    auto x_grad_name = framework::GradVarName("X");
-    auto y_grad_name = framework::GradVarName("Y");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, in_dims);
-    }
-    if (ctx->HasOutput(y_grad_name)) {
-      ctx->SetOutputDim(y_grad_name, in_dims);
-    }
-  }
-};
-
-template
-class SmoothL1LossGradMaker : public framework::SingleGradOpMaker {
- public:
-  using framework::SingleGradOpMaker::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr op) const override {
-    op->SetType("smooth_l1_loss_grad");
-    op->SetInput("InsideWeight", this->Input("InsideWeight"));
-    op->SetInput("OutsideWeight", this->Input("OutsideWeight"));
-    op->SetInput("Diff", this->Output("Diff"));
-    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-
-    op->SetAttrMap(this->Attrs());
-
-    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y"));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(smooth_l1_loss,
-                  ops::SmoothL1LossOp,
-                  ops::SmoothL1LossOpMaker,
-                  ops::SmoothL1LossGradMaker,
-                  ops::SmoothL1LossGradMaker);
-REGISTER_OPERATOR(smooth_l1_loss_grad, ops::SmoothL1LossGradOp);
-PD_REGISTER_STRUCT_KERNEL(
-    smooth_l1_loss, CPU, ALL_LAYOUT, ops::SmoothL1LossKernel, float) {}
-PD_REGISTER_STRUCT_KERNEL(
-    smooth_l1_loss_grad, CPU, ALL_LAYOUT, ops::SmoothL1LossGradKernel, float) {}
diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cu b/paddle/fluid/operators/smooth_l1_loss_op.cu
deleted file mode 100644
index 31d528855ccbef..00000000000000
--- a/paddle/fluid/operators/smooth_l1_loss_op.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors.
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/smooth_l1_loss_op.h" - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - smooth_l1_loss, GPU, ALL_LAYOUT, ops::SmoothL1LossKernel, float) {} -PD_REGISTER_STRUCT_KERNEL( - smooth_l1_loss_grad, GPU, ALL_LAYOUT, ops::SmoothL1LossGradKernel, float) {} diff --git a/paddle/fluid/operators/smooth_l1_loss_op.h b/paddle/fluid/operators/smooth_l1_loss_op.h deleted file mode 100644 index bc57087d931ae7..00000000000000 --- a/paddle/fluid/operators/smooth_l1_loss_op.h +++ /dev/null @@ -1,185 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -template -using EigenVector = framework::EigenVector; -template -using EigenMatrix = framework::EigenMatrix; - -template -struct SmoothL1LossForward { - HOSTDEVICE SmoothL1LossForward(const T& sigma2) : sigma2(sigma2) {} - - HOSTDEVICE T operator()(const T& val) const { - T abs_val = std::abs(val); - if (abs_val < 1.0 / sigma2) { - return 0.5 * val * val * sigma2; - } else { - return abs_val - 0.5 / sigma2; - } - } - - T sigma2; -}; - -template -class SmoothL1LossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto* in1 = context.Input("Y"); - auto* in2 = context.Input("InsideWeight"); - auto* in3 = context.Input("OutsideWeight"); - auto* out0 = context.Output("Diff"); - auto* out1 = context.Output("Out"); - - out0->mutable_data(context.GetPlace()); - out1->mutable_data(context.GetPlace()); - auto* place = - context.template device_context().eigen_device(); - - auto sigma = static_cast(context.Attr("sigma")); - T sigma2 = sigma * sigma; - bool has_weight = (in2 != nullptr) && (in3 != nullptr); - - auto x = EigenVector::Flatten(*in0); - auto y = EigenVector::Flatten(*in1); - auto diff = EigenVector::Flatten(*out0); - - diff.device(*place) = x - y; - // multiply inside weight - if (has_weight) { - auto inside_weight = EigenVector::Flatten(*in2); - // cache diff, reused in bp - diff.device(*place) = diff * inside_weight; - } - - auto in_counts = in0->numel(); - phi::DenseTensor ptensor_errors; - ptensor_errors.mutable_data({static_cast(in_counts)}, - context.GetPlace()); - auto errors = 
EigenVector::Flatten(ptensor_errors); - // apply smooth l1 forward - errors.device(*place) = diff.unaryExpr(SmoothL1LossForward(sigma2)); - - // multiply outside weight - if (has_weight) { - auto outside_weight = EigenVector::Flatten(*in3); - errors.device(*place) = errors * outside_weight; - } - auto loss = EigenVector::Flatten(*out1); - // first dimension of 'X' is the number of samples - auto mat_dims = - phi::make_ddim({static_cast(in0->dims()[0]), - static_cast(in_counts / in0->dims()[0])}); - auto errors_mat_view = EigenMatrix::From(ptensor_errors, mat_dims); - loss.device(*place) = errors_mat_view.sum(Eigen::array({{1}})); - } -}; - -template -struct SmoothL1LossBackward { - HOSTDEVICE SmoothL1LossBackward(const T& sigma2) : sigma2(sigma2) {} - - HOSTDEVICE T operator()(const T& val) const { - T abs_val = std::abs(val); - if (abs_val < 1.0 / sigma2) { - return sigma2 * val; - } else { - return (0 < val) - (val < 0); - } - } - - T sigma2; -}; - -template -class SmoothL1LossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("InsideWeight"); - auto* in1 = context.Input("OutsideWeight"); - auto* in2 = context.Input("Diff"); - auto* og = context.Input(framework::GradVarName("Out")); - auto sigma = static_cast(context.Attr("sigma")); - T sigma2 = sigma * sigma; - bool has_weight = (in0 != nullptr) && (in1 != nullptr); - - auto* place = - context.template device_context().eigen_device(); - - auto in_dims = in2->dims(); - auto counts = in2->numel(); - auto cols = counts / in_dims[0]; - auto mat_dims = - phi::make_ddim({static_cast(in_dims[0]), static_cast(cols)}); - - phi::DenseTensor ptensor_diff; - ptensor_diff.mutable_data({static_cast(counts)}, - context.GetPlace()); - auto diff = EigenVector::Flatten(ptensor_diff); - // apply smooth l1 backwoard - diff.device(*place) = EigenVector::Flatten(*in2).unaryExpr( - SmoothL1LossBackward(sigma2)); - - // compute weights - phi::DenseTensor ptensor_weights; - ptensor_weights.mutable_data(mat_dims, context.GetPlace()); - auto weights = EigenMatrix::From(ptensor_weights); - // initialize to 1.0 - weights.device(*place) = weights.constant(static_cast(1.0)); - if (has_weight) { - auto inside_weight = EigenMatrix::From(*in0, mat_dims); - auto outside_weight = EigenMatrix::From(*in1, mat_dims); - weights.device(*place) = inside_weight * outside_weight; - } - - // compute gradients - auto out_grad = EigenMatrix::From(*og); - auto diff_mat_view = EigenMatrix::From(ptensor_diff, mat_dims); - auto gradients = out_grad.broadcast( - Eigen::array({{1, static_cast(cols)}})) * - weights * diff_mat_view; - - auto* out0 = context.Output(framework::GradVarName("X")); - auto* out1 = context.Output(framework::GradVarName("Y")); - - if (out0) { - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenMatrix::From(*out0, mat_dims); - x_grad.device(*place) = gradients; - } - - if (out1) { - out1->mutable_data(context.GetPlace()); - auto y_grad = EigenMatrix::From(*out1, mat_dims); - y_grad.device(*place) = -1 * gradients; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc deleted file mode 100644 index da0c2e4a3cbb21..00000000000000 --- a/paddle/fluid/operators/space_to_depth_op.cc +++ /dev/null @@ -1,242 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
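For reference, the piecewise function that the SmoothL1LossForward and SmoothL1LossBackward functors above implement, restated as free functions (a sketch with the template syntax restored, not the original functors):

#include <cmath>

// Smooth L1: quadratic near zero, linear elsewhere, with the crossover
// controlled by sigma2 = sigma * sigma.
template <typename T>
T SmoothL1Forward(T val, T sigma2) {
  T abs_val = std::abs(val);
  if (abs_val < static_cast<T>(1.0) / sigma2) {
    return static_cast<T>(0.5) * val * val * sigma2;
  }
  return abs_val - static_cast<T>(0.5) / sigma2;
}

// Derivative of the above; (0 < val) - (val < 0) is a branch-free sign(val).
template <typename T>
T SmoothL1Backward(T val, T sigma2) {
  T abs_val = std::abs(val);
  if (abs_val < static_cast<T>(1.0) / sigma2) {
    return sigma2 * val;
  }
  return static_cast<T>((0 < val) - (val < 0));
}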
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/space_to_depth_op.h" - -#include -#include -#include - -#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" - -namespace paddle { -namespace operators { - -class SpaceToDepthOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - platform::errors::InvalidArgument( - "Input(X) of SpaceToDepthOp should not be null.")); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - platform::errors::InvalidArgument( - "Output(Out) of SpaceToDepthOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - x_dims.size(), - 4, - platform::errors::InvalidArgument("input should be a 4D tensor")); - auto blocksize = ctx->Attrs().Get("blocksize"); - - PADDLE_ENFORCE_GT(blocksize, - 1, - platform::errors::InvalidArgument( - "The blocksize should be Greater than 1")); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GT(x_dims[1], - 0, - platform::errors::InvalidArgument( - "input channel should be Greater than 0")); - PADDLE_ENFORCE_GT(x_dims[2], - 0, - platform::errors::InvalidArgument( - "input Height should be Greater than 0")); - PADDLE_ENFORCE_GT(x_dims[3], - 0, - platform::errors::InvalidArgument( - "input Width should be Greater than 0")); - - PADDLE_ENFORCE_EQ( - x_dims[1] % (blocksize * blocksize), - 0, - platform::errors::InvalidArgument( - "input channel should be divisible of the square of " - "SpaceToDepthOp blocksize")); - PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), - 0, - platform::errors::InvalidArgument( - "input Height should be divisible of the square of " - "SpaceToDepthOp blocksize")); - PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), - 0, - platform::errors::InvalidArgument( - "input Width should be divisible of the square of " - "SpaceToDepthOp blocksize")); - } else { - if (x_dims[1] != -1) { - PADDLE_ENFORCE_GT(x_dims[1], - 0, - platform::errors::InvalidArgument( - "input channel should be Greater than 0")); - PADDLE_ENFORCE_EQ( - x_dims[1] % (blocksize * blocksize), - 0, - platform::errors::InvalidArgument( - "input channel should be divisible of the square of " - "SpaceToDepthOp blocksize")); - } - if (x_dims[2] != -1) { - PADDLE_ENFORCE_GT(x_dims[2], - 0, - platform::errors::InvalidArgument( - "input Height should be Greater than 0")); - PADDLE_ENFORCE_EQ( - x_dims[2] % (blocksize), - 0, - platform::errors::InvalidArgument( - "input Height should be divisible of the square of " - "SpaceToDepthOp blocksize")); - } - - if (x_dims[3] != -1) { - PADDLE_ENFORCE_GT(x_dims[3], - 0, - platform::errors::InvalidArgument( - "input Width should be Greater than 0")); - - PADDLE_ENFORCE_EQ( - x_dims[3] % (blocksize), - 0, - platform::errors::InvalidArgument( - "input Width should be divisible of the square of " - "SpaceToDepthOp blocksize")); - } - } - - VLOG(3) << "SpaceToDepthOp operator x.shape=" << x_dims - << "Attribute blocksize" << blocksize 
<< std::endl;
-
-    std::vector output_shape(4, 0);  // [B,C,H,W]
-    output_shape[0] = x_dims[0];
-    output_shape[1] = x_dims[1] * blocksize * blocksize;
-    output_shape[2] = x_dims[2] / blocksize;
-    output_shape[3] = x_dims[3] / blocksize;
-
-    auto out_dims = phi::make_ddim(output_shape);
-
-    ctx->SetOutputDim("Out", out_dims);
-
-    if (x_dims[0] == out_dims[0]) {
-      // Only pass LoD when the first dimension of output and Input(X)
-      // are the same.
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
-  }
-};
-
-class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor). The input should be a 4D tensor B * C * H * W of "
-             "SpaceToDepthOp operator.");
-    AddOutput("Out",
-              "(Tensor), The output should be a 4D tensor B * C2 * H2 * W2 of "
-              "SpaceToDepthOp operator.");
-    AddAttr(
-        "blocksize",
-        "(int64_t, default 2) blocksize used to change Space To Depth.")
-        .SetDefault(2)
-        .GreaterThan(1);
-    AddComment(R"DOC(
-        Reorg operator used in YOLOv2.
-        The shape relation is: C2 = C1 * blocksize * blocksize, H2 = H1 / blocksize, W2 = W1 / blocksize.
-
-        Reshape Input(X) into the shape according to Attr(blocksize). The
-        data in Input(X) are unchanged.
-
-        Examples:
-
-            1. Given a 4-D tensor Input(X) with a shape [128, 2048, 26, 26], and the blocksize is 2, the reorg operator will transform Input(X)
-            into a 4-D tensor with shape [128, 8192, 13, 13] and leave Input(X)'s data unchanged.
-
-    )DOC");
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERER(SpaceToDepthGradOpNoBufferVarsInferer, "X");
-
-template
-class SpaceToDepthGradOpMaker : public framework::SingleGradOpMaker {
- public:
-  using framework::SingleGradOpMaker::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr op) const override {
-    op->SetType("space_to_depth_grad");
-
-    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-    op->SetInput("X", this->Input("X"));
-
-    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-
-    op->SetAttrMap(this->Attrs());
-  }
-};
-
-class SpaceToDepthGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(
-        ctx->HasInput("X"),
-        platform::errors::InvalidArgument("Input(X) shouldn't be null."));
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   platform::errors::InvalidArgument(
-                       "Input(Out@GRAD) shouldn't be null."));
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
-                              ctx, framework::GradVarName("Out")),
-                          ctx.GetPlace());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(space_to_depth,
-                  ops::SpaceToDepthOp,
-                  ops::SpaceToDepthOpMaker,
-                  ops::SpaceToDepthGradOpMaker,
-                  ops::SpaceToDepthGradOpMaker);
-REGISTER_OPERATOR(space_to_depth_grad,
-                  ops::SpaceToDepthGradOp,
-                  ops::SpaceToDepthGradOpNoBufferVarsInferer);
-PD_REGISTER_STRUCT_KERNEL(space_to_depth,
-                          CPU,
-                          ALL_LAYOUT,
-                          ops::SpaceToDepthKernel,
-                          int,
-                          int64_t,
-                          float,
-                          double) {}
-PD_REGISTER_STRUCT_KERNEL(space_to_depth_grad,
-                          CPU,
-                          ALL_LAYOUT,
-                          ops::SpaceToDepthGradKernel,
-                          int,
-                          int64_t,
-                          float,
-                          double) {}
diff --git a/paddle/fluid/operators/space_to_depth_op.cu
b/paddle/fluid/operators/space_to_depth_op.cu deleted file mode 100644 index 7f62509ee7d2e2..00000000000000 --- a/paddle/fluid/operators/space_to_depth_op.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/space_to_depth_op.h" - -namespace plat = paddle::platform; -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(space_to_depth, - GPU, - ALL_LAYOUT, - ops::SpaceToDepthKernel, - int, - int64_t, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(space_to_depth_grad, - GPU, - ALL_LAYOUT, - ops::SpaceToDepthGradKernel, - int, - int64_t, - float, - double) {} diff --git a/paddle/fluid/operators/space_to_depth_op.h b/paddle/fluid/operators/space_to_depth_op.h deleted file mode 100644 index 18ff67c6132be7..00000000000000 --- a/paddle/fluid/operators/space_to_depth_op.h +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
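The header deleted below defines space_to_depth_compute, whose index arithmetic is the darknet-style reorg mapping; the forward_ flag only selects whether the mapped index is read from or written to. The mapping for a single flat index, extracted into a standalone sketch (hypothetical function name):

#include <cstdint>

// Decompose a flat index over a (batch, c, h, w) view, then recombine it on
// the opposite side of the block rearrangement, mirroring the arithmetic
// inside space_to_depth_compute.
int64_t ReorgIndex(int64_t in_index, int64_t c, int64_t h, int64_t w,
                   int64_t bs) {
  int64_t out_c = c / (bs * bs);
  int64_t b = in_index / (c * h * w);
  int64_t k = (in_index % (c * h * w)) / (h * w);
  int64_t j = (in_index % (h * w)) / w;
  int64_t i = in_index % w;
  int64_t c2 = k % out_c;
  int64_t offset = k / out_c;
  int64_t w2 = i * bs + offset % bs;
  int64_t h2 = j * bs + offset / bs;
  return w2 + w * bs * (h2 + h * bs * (c2 + out_c * b));
}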
*/ -#ifndef PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_ -#define PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_ -#endif // PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -class space_to_depth_compute { - public: - HOSTDEVICE space_to_depth_compute(const T *x, - int64_t w, - int64_t h, - int64_t c, - int64_t batch, - int64_t blocksize, - int64_t forward, - T *out) - : x_(x), - w_(w), - h_(h), - c_(c), - batch_(batch), - blocksize_(blocksize), - forward_(forward), - out_(out) {} - - HOSTDEVICE void operator()(int64_t in_index) { - int64_t out_c = c_ / (blocksize_ * blocksize_); - // calculate each dim position with index of tensor - int64_t b = in_index / (c_ * h_ * w_); - int64_t k = (in_index % (c_ * h_ * w_)) / (h_ * w_); - int64_t j = ((in_index % (c_ * h_ * w_)) % (h_ * w_)) / w_; - int64_t i = ((in_index % (c_ * h_ * w_)) % (h_ * w_)) % w_; - - int64_t c2 = k % out_c; - int64_t offset = k / out_c; - int64_t w2 = i * blocksize_ + offset % blocksize_; - int64_t h2 = j * blocksize_ + offset / blocksize_; - int64_t out_index = - w2 + w_ * blocksize_ * (h2 + h_ * blocksize_ * (c2 + out_c * b)); - if (forward_) - out_[out_index] = x_[in_index]; - else - out_[in_index] = x_[out_index]; - } - - private: - const T *x_; - int64_t w_, h_, c_, batch_, blocksize_, forward_; - T *out_; -}; - -template -class SpaceToDepthKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *out = context.Output("Out"); - auto *x = context.Input("X"); - auto blocksize = context.Attr("blocksize"); - auto in_dims = x->dims(); - out->mutable_data(context.GetPlace(), x->type()); - - auto out_dims = out->dims(); - auto B = in_dims[0]; - auto C = in_dims[1]; - auto H = in_dims[2]; - auto W = in_dims[3]; - platform::ForRange for_range( - context.template device_context(), - static_cast(x->numel())); - - auto *x_data = x->data(); - auto *out_data = out->data(); - paddle::operators::space_to_depth_compute computer( - x_data, W, H, C, B, blocksize, 1, out_data); - for_range(computer); - - out->Resize(out_dims); - } -}; - -template -class SpaceToDepthGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *d_out = - context.Input(framework::GradVarName("Out")); - auto *d_x = context.Output(framework::GradVarName("X")); - auto blocksize = context.Attr("blocksize"); - auto in_dims = d_x->dims(); - d_x->mutable_data(context.GetPlace(), d_out->type()); - - auto B = in_dims[0]; - auto C = in_dims[1]; - auto H = in_dims[2]; - auto W = in_dims[3]; - - platform::ForRange for_range( - context.template device_context(), - static_cast(d_x->numel())); - - auto *dx_data = d_x->data(); - auto *dout_data = d_out->data(); - - paddle::operators::space_to_depth_compute computer( - dout_data, W, H, C, B, blocksize, 0, dx_data); - for_range(computer); - - d_x->Resize(in_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index e648575a1edca1..6b79d5c35b7838 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -107,7 +107,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { } for (size_t t = 0; t < 2; ++t) { - phi::DenseTensor *out; + phi::DenseTensor *out = 
nullptr; if (t == 0) { out = out_false; } else { diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index 246e9368a7d2fc..f362b05fc6d10c 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -65,12 +65,12 @@ class SplitOp : public framework::OperatorWithKernel { if (ctx->IsRuntime() && ctx->HasInput("AxisTensor")) { Variable *var = PADDLE_GET_CONST(Variable *, ctx->GetInputVarPtrs("AxisTensor")[0]); - axis_final = std::move(framework::MakePhiScalarFromVar(*var)); + axis_final = framework::MakePhiScalarFromVar(*var); } else if (!ctx->IsRuntime() && ctx->HasInput("AxisTensor")) { - axis_final = std::move(phi::Scalar(-1)); + axis_final = phi::Scalar(-1); axis_final.SetFromTensor(true); } else { - axis_final = std::move(phi::Scalar(axis)); + axis_final = phi::Scalar(axis); } // Construct sections_final diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc deleted file mode 100644 index 0f2f727dd9135d..00000000000000 --- a/paddle/fluid/operators/squared_l2_distance_op.cc +++ /dev/null @@ -1,232 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/squared_l2_distance_op.h" - -#include - -#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" - -namespace paddle { -namespace operators { - -class SquaredL2DistanceOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SquaredL2DistanceOp"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "SquaredL2DistanceOp"); - OP_INOUT_CHECK(ctx->HasOutput("sub_result"), - "Output", - "sub_result", - "SquaredL2DistanceOp"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "SquaredL2DistanceOp"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - PADDLE_ENFORCE_EQ(phi::arity(x_dims), - phi::arity(y_dims), - platform::errors::InvalidArgument( - "Input(X) and Input(X) of SquaredL2DistanceOp should " - "have same dimensions. " - "But received X's shape = [%s] and Y's shape = [%s], " - "the dimensions are %d and %d respectively", - x_dims, - y_dims, - phi::arity(x_dims), - phi::arity(y_dims))); - - int rank = phi::arity(x_dims); - PADDLE_ENFORCE_GE( - rank, - 2, - platform::errors::InvalidArgument( - "Input dimensions of SquaredL2DistanceOp should be at least 2." - "But received shape = [%s] and dimension is %d.", - x_dims, - rank)); - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(y_dims) <= 0)) { - check = false; - } - if (check) { - PADDLE_ENFORCE_EQ( - product(x_dims) / x_dims[0], - product(y_dims) / y_dims[0], - platform::errors::InvalidArgument( - "Input(X) and Input(Y) of SquaredL2DistanceOp should " - "have same dimensions." 
-            "But received X's shape = [%s] and Y's shape = [%s]"
-            ", the products are %d and %d respectively",
-            x_dims,
-            y_dims,
-            product(x_dims) / x_dims[0],
-            product(y_dims) / y_dims[0]));
-    }
-    check = true;
-    if ((!ctx->IsRuntime()) && (y_dims[0] <= 0 || x_dims[0] <= 0)) {
-      check = false;
-    }
-    if (check) {
-      PADDLE_ENFORCE_EQ(
-          y_dims[0] == 1 || y_dims[0] == x_dims[0],
-          true,
-          platform::errors::InvalidArgument(
-              "First dimension of Input(Y) of SquaredL2DistanceOp "
-              "must be equal to 1 or to first dimension of Input(X). "
-              "But received X's shape = [%s] and Y's shape = [%s], "
-              "the first dimensions are %d and %d respectively",
-              x_dims,
-              y_dims,
-              x_dims[0],
-              y_dims[0]));
-    }
-    ctx->SetOutputDim("sub_result", {x_dims[0], product(x_dims) / x_dims[0]});
-    ctx->SetOutputDim("Out", {x_dims[0], 1});
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERER(SquaredL2DistanceGradOpNoBufferVarsInferer,
-                                    "X",
-                                    "Y");
-
-template
-class SquaredL2DistanceGradOpMaker : public framework::SingleGradOpMaker {
- public:
-  using framework::SingleGradOpMaker::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr op) const override {
-    op->SetType("squared_l2_distance_grad");
-
-    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-    op->SetInput("sub_result", this->Output("sub_result"));
-    op->SetInput("X", this->Input("X"));
-    op->SetInput("Y", this->Input("Y"));
-
-    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y"));
-
-    op->SetAttrMap(this->Attrs());
-  }
-};
-
-class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) Input of SquaredL2DistanceOp.");
-    AddInput("Y", "(Tensor) Target of SquaredL2DistanceOp.");
-    AddOutput("sub_result",
-              "(Tensor) Buffering subtraction result which "
-              "will be reused in backward.")
-        .AsIntermediate();
-    AddOutput("Out", "(Tensor) Squared l2 distance between input and target.");
-    AddComment(R"DOC(
-SquaredL2Distance operator
-
-This operator will calculate the squared L2 distance for the input and
-the target. The number of distance values equals the first dimension of the
-input. The first dimension of the target can be equal to that of the input
-or to 1. If the first dimension of the target is 1, the operator will
-broadcast the target's first dimension to the input's first dimension.
-During backward propagation, the user can decide whether to calculate the
-gradient of the input or the target or both.
-
-Both the input X and Y can carry the LoD (Level of Details) information.
-However, the output only shares the LoD information with input X.
- )DOC"); - } -}; - -class SquaredL2DistanceGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("sub_result"), - "Input", - "sub_result", - "SquaredL2DistanceGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@GRAD", - "SquaredL2DistanceGradOp"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - out_dims[0], - x_dims[0], - platform::errors::InvalidArgument( - "First dimension of output gradient and Input(X) " - "of SquaredL2DistanceGradOp must be equal " - "But received X's shape = [%s] and grad's shape = [%s], " - "the first dimensions are %d and %d respectively", - x_dims, - out_dims, - x_dims[0], - out_dims[0])); - PADDLE_ENFORCE_EQ(out_dims[1], - 1, - platform::errors::InvalidArgument( - "Second dimension of output gradient of " - "SquaredL2DistanceGradOp must be 1. " - "But received grad's shape = [%s], " - "with second dimension %d", - out_dims, - out_dims[1])); - } - auto x_grad_name = framework::GradVarName("X"); - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput(x_grad_name)) ctx->SetOutputDim(x_grad_name, x_dims); - if (ctx->HasOutput(y_grad_name)) ctx->SetOutputDim(y_grad_name, y_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "sub_result"), - ctx.GetPlace()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - squared_l2_distance, - ops::SquaredL2DistanceOp, - ops::SquaredL2DistanceOpMaker, - ops::SquaredL2DistanceGradOpMaker, - ops::SquaredL2DistanceGradOpMaker); -REGISTER_OPERATOR(squared_l2_distance_grad, - ops::SquaredL2DistanceGradOp, - ops::SquaredL2DistanceGradOpNoBufferVarsInferer); - -PD_REGISTER_STRUCT_KERNEL( - squared_l2_distance, CPU, ALL_LAYOUT, ops::SquaredL2DistanceKernel, float) { -} -PD_REGISTER_STRUCT_KERNEL(squared_l2_distance_grad, - CPU, - ALL_LAYOUT, - ops::SquaredL2DistanceGradKernel, - float) {} diff --git a/paddle/fluid/operators/squared_l2_distance_op.cu b/paddle/fluid/operators/squared_l2_distance_op.cu deleted file mode 100644 index 4411df4d9ab7f0..00000000000000 --- a/paddle/fluid/operators/squared_l2_distance_op.cu +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
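A back-of-the-envelope version of the forward and backward passes described in the DOC above, with Y broadcast over the batch when its first dimension is 1. Plain loops instead of Eigen; the names and the pre-sized, zero-initialized output buffers are assumptions:

#include <vector>

// out[i] = sum_j (x[i][j] - y[broadcast ? 0 : i][j])^2; sub caches the
// differences exactly like the "sub_result" intermediate output.
template <typename T>
void SquaredL2Distance(const std::vector<std::vector<T>>& x,
                       const std::vector<std::vector<T>>& y,
                       std::vector<std::vector<T>>* sub,
                       std::vector<T>* out) {
  bool broadcast_y = (y.size() == 1 && x.size() > 1);
  for (size_t i = 0; i < x.size(); ++i) {
    const std::vector<T>& yi = broadcast_y ? y[0] : y[i];
    T acc = 0;
    for (size_t j = 0; j < x[i].size(); ++j) {
      T d = x[i][j] - yi[j];
      (*sub)[i][j] = d;
      acc += d * d;
    }
    (*out)[i] = acc;
  }
}

// dX = 2 * dout[i] * sub[i][j] and dY = -dX; when Y's first dimension was
// broadcast, its gradient is the column sum over all rows (*dy must be
// zero-initialized for the accumulation).
template <typename T>
void SquaredL2DistanceGrad(const std::vector<std::vector<T>>& sub,
                           const std::vector<T>& dout,
                           std::vector<std::vector<T>>* dx,
                           std::vector<std::vector<T>>* dy) {
  bool broadcast_y = (dy->size() == 1 && sub.size() > 1);
  for (size_t i = 0; i < sub.size(); ++i) {
    for (size_t j = 0; j < sub[i].size(); ++j) {
      T g = 2 * dout[i] * sub[i][j];
      (*dx)[i][j] = g;
      if (broadcast_y) {
        (*dy)[0][j] -= g;
      } else {
        (*dy)[i][j] = -g;
      }
    }
  }
}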
*/ -#include "paddle/fluid/operators/squared_l2_distance_op.h" - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - squared_l2_distance, GPU, ALL_LAYOUT, ops::SquaredL2DistanceKernel, float) { -} -PD_REGISTER_STRUCT_KERNEL(squared_l2_distance_grad, - GPU, - ALL_LAYOUT, - ops::SquaredL2DistanceGradKernel, - float) {} diff --git a/paddle/fluid/operators/squared_l2_distance_op.h b/paddle/fluid/operators/squared_l2_distance_op.h deleted file mode 100644 index 18039835c55c3f..00000000000000 --- a/paddle/fluid/operators/squared_l2_distance_op.h +++ /dev/null @@ -1,132 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SquaredL2DistanceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto* in1 = context.Input("Y"); - auto* out0 = context.Output("sub_result"); - auto* out1 = context.Output("Out"); - - auto in0_dims = in0->dims(); - auto in1_dims = in1->dims(); - - int cols = in0->numel() / in0_dims[0]; - // reduce dimensions except the first - auto x = framework::EigenMatrix::From( - *in0, phi::make_ddim({in0_dims[0], cols})); - auto y = framework::EigenMatrix::From( - *in1, phi::make_ddim({in1_dims[0], cols})); - - out0->mutable_data(context.GetPlace()); - out1->mutable_data(context.GetPlace()); - auto sub_result = framework::EigenMatrix::From(*out0); - auto z = framework::EigenVector::Flatten(*out1); - - auto& place = - *context.template device_context().eigen_device(); - auto x_dims = x.dimensions(); - auto y_dims = y.dimensions(); - // buffer the substraction result - if (y_dims[0] == 1 && x_dims[0] > y_dims[0]) { - sub_result.device(place) = - x - - y.broadcast(Eigen::array({{static_cast(x_dims[0]), 1}})); - } else { - sub_result.device(place) = x - y; - } - auto sub_res_pow2 = sub_result * sub_result; - z.device(place) = sub_res_pow2.sum(Eigen::array({{1}})); - } -}; - -template -class SquaredL2DistanceGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("sub_result"); - auto* in1 = context.Input(framework::GradVarName("Out")); - auto* x_g = context.Output(framework::GradVarName("X")); - auto* y_g = context.Output(framework::GradVarName("Y")); - - PADDLE_ENFORCE_NOT_NULL( - x_g, - platform::errors::NotFound( - "variable(%s) cannot be found " - "in scope for operator 'squared_l2_distance_grad'.", - framework::GradVarName("X"))); - PADDLE_ENFORCE_NOT_NULL( - y_g, - platform::errors::NotFound( - "variable(%s) cannot be found " - "in scope for operator 'squared_l2_distance_grad'.", - framework::GradVarName("Y"))); - - auto sub_result = framework::EigenMatrix::From(*in0); - auto out_grad = framework::EigenMatrix::From(*in1); - - auto x_dims = 
x_g->dims(); - auto y_dims = y_g->dims(); - - int cols = x_g->numel() / x_dims[0]; - // calculate gradient - auto grad_mat = 2 * - (out_grad.broadcast(Eigen::array({{1, cols}}))) * - sub_result; - - // propagate back to input - auto& eigen_place = - *context.template device_context().eigen_device(); - - x_g->mutable_data(context.GetPlace()); - // eigen matrix - auto x_grad = framework::EigenMatrix::From( - *x_g, phi::make_ddim({x_dims[0], cols})); - // dimensions are same with subResult - x_grad.device(eigen_place) = grad_mat; - - y_g->mutable_data(context.GetPlace()); - - PADDLE_ENFORCE_GE(sub_result.dimensions()[0], - y_dims[0], - platform::errors::InvalidArgument( - "First dimension of gradient must be greater or " - "equal than first dimension of target. But received " - "gradient dimension = %d and target dimension is %d.", - sub_result.dimensions()[0], - y_dims[0])); - - if (sub_result.dimensions()[0] == y_dims[0]) { - auto y_grad = framework::EigenMatrix::From( - *y_g, phi::make_ddim({y_dims[0], cols})); - y_grad.device(eigen_place) = -1 * grad_mat; - } else { - auto col_sum_res = -1 * (grad_mat.sum(Eigen::array({{0}}))); - auto y_grad = framework::EigenVector::Flatten(*y_g); - y_grad.device(eigen_place) = col_sum_res; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc deleted file mode 100644 index 115901d3ee2ee5..00000000000000 --- a/paddle/fluid/operators/squeeze_op.cc +++ /dev/null @@ -1,269 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/squeeze_op.h" - -#include -#include -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -framework::DDim GetOutputShape(const std::vector squeeze_dims, - const framework::DDim &in_dims, - bool is_runtime) { - size_t num_squeeze_dims = squeeze_dims.size(); - std::vector should_squeeze(in_dims.size(), false); - - // Mark dimensions need to be squeezed. - if (num_squeeze_dims == 0) { - for (int i = 0; i < in_dims.size(); ++i) { - if (in_dims[i] == 1) { - should_squeeze[i] = true; - } - } - } else { - for (size_t i = 0; i < num_squeeze_dims; ++i) { - int current = squeeze_dims[i] < 0 ? 
squeeze_dims[i] + in_dims.size() - : squeeze_dims[i]; - - PADDLE_ENFORCE_GE( - current, - 0, - platform::errors::InvalidArgument( - "Each axis in Attr(axes) should be in the range of [%d, %d]" - "But current axis is:%d, input tensor's shape = [%s].", - -in_dims.size(), - in_dims.size() - 1, - current, - in_dims)); - PADDLE_ENFORCE_LT( - current, - in_dims.size(), - platform::errors::InvalidArgument( - "Each axis in Attr(axes) should be in the range of [%d, %d]" - "But current axis is:%d, input tensor's shape = [%s].", - -in_dims.size(), - in_dims.size() - 1, - current, - in_dims)); - - if (!should_squeeze[current]) { - if (is_runtime) { - // At run time, dim of 1 is allowed to squeeze - if (in_dims[current] == 1) { - should_squeeze[current] = true; - } - } else { - // At compile time, dim of -1 or 1 is allowed to squeeze - if (in_dims[current] == 1 || in_dims[current] == -1) { - should_squeeze[current] = true; - } - } - } - } - } - // Make output dimensions - std::vector output_shape; - for (int i = 0; i < in_dims.size(); ++i) { - if (!should_squeeze[i]) { - output_shape.push_back(in_dims[i]); - } - } - return phi::make_ddim(output_shape); -} - -class SqueezeOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Squeeze"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Squeeze"); - - const auto &x_dims = ctx->GetInputDim("X"); - // Check input tensor dims (<6) Eigen limit. - PADDLE_ENFORCE_LE(x_dims.size(), - 6, - platform::errors::InvalidArgument( - "The dimensions of Input(X) " - "should be in the range of [1, 6] (Eigen limit)." - "But received X's dimensions = %d, X's shape=[%s].", - x_dims.size(), - x_dims)); - - const auto &axes = ctx->Attrs().Get>("axes"); - auto out_dims = GetOutputShape(axes, x_dims, false); - ctx->SetOutputDim("Out", out_dims); - if (x_dims[0] == out_dims[0]) { - // Only pass LoD when the first dimension of output and Input(X) - // are the same. - ctx->ShareLoD("X", "Out"); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class SqueezeGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *context) const override { - context->SetOutputDim(framework::GradVarName("X"), - context->GetInputDim("X")); - context->ShareLoD("X", framework::GradVarName("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor). The input tensor of squeeze operator."); - AddOutput("Out", "(Tensor). The output tensor of squeeze operator."); - AddAttr>("axes", - "(std::vector). 
List of integers," - " indicating the dimensions to squeeze.") - .SetDefault({}) - .SupportTensor(); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false) - .AsExtra(); - AddAttr( - "mkldnn_data_type", - "(string, default \"float32\"). Data type of mkldnn kernel") - .SetDefault("float32") - .InEnum({"float32", "bfloat16"}) - .AsExtra(); - AddComment(R"DOC( - Squeeze Operator. - - Remove single-dimensional entries from the shape of a tensor. - Takes a parameter axes with a list of axes to squeeze. - If axes is not provided, all the single dimensions will be removed from the shape. - If an axis is selected with shape entry not equal to one, an error is raised. - - Examples: - Case 1: - Given - X.shape = (1, 3, 1, 5) - and - axes = [0] - we get: - Out.shape = (3, 1, 5) - - Case 2: - Given - X.shape = (1, 3, 1, 5) - and - axes = [] - we get: - Out.shape = (3, 5) - )DOC"); - } -}; - -template -class SqueezeGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("squeeze_grad"); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -template -class SqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("squeeze"); - grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); - grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_INPLACE_OP_INFERER(SqueezeInplaceInferer, {"X", "Out"}); -DECLARE_INPLACE_OP_INFERER(SqueezeGradInplaceInferer, - {framework::GradVarName("Out"), - framework::GradVarName("X")}); -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SqueezeGradNoNeedBufferVarsInferer, "X"); -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(squeeze, - ops::SqueezeOp, - ops::SqueezeOpMaker, - ops::SqueezeGradOpMaker, - ops::SqueezeGradOpMaker); -REGISTER_OPERATOR(squeeze_grad, - ops::SqueezeGradOp, - ops::SqueezeDoubleGradOpMaker, - ops::SqueezeDoubleGradOpMaker, - ops::SqueezeGradNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL( - squeeze, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel>, - ops::SqueezeKernel>, - ops::SqueezeKernel); -REGISTER_OP_CPU_KERNEL( - squeeze_grad, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel>, - ops::SqueezeGradKernel>, - ops::SqueezeGradKernel); diff --git a/paddle/fluid/operators/squeeze_op.cu.cc b/paddle/fluid/operators/squeeze_op.cu.cc deleted file mode 100644 index a77b369c403732..00000000000000 --- a/paddle/fluid/operators/squeeze_op.cu.cc +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
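GetOutputShape, defined in this deleted .cc and duplicated into squeeze_op.h in the hunk further below, reduces to the following standalone sketch (hypothetical name). It reproduces the two DOC cases above, e.g. SqueezeShape({0}, {1, 3, 1, 5}, true) yields {3, 1, 5}:

#include <cstdint>
#include <vector>

// With empty axes, drop every dim equal to 1; otherwise drop only the
// (normalized) listed axes, and at compile time also let -1 pass for a
// to-be-squeezed axis.
std::vector<int64_t> SqueezeShape(const std::vector<int>& axes,
                                  const std::vector<int64_t>& in,
                                  bool is_runtime) {
  std::vector<bool> drop(in.size(), false);
  if (axes.empty()) {
    for (size_t i = 0; i < in.size(); ++i) drop[i] = (in[i] == 1);
  } else {
    for (int a : axes) {
      int cur = a < 0 ? a + static_cast<int>(in.size()) : a;
      // the real op enforces 0 <= cur < in.size() here
      if (in[cur] == 1 || (!is_runtime && in[cur] == -1)) drop[cur] = true;
    }
  }
  std::vector<int64_t> out;
  for (size_t i = 0; i < in.size(); ++i) {
    if (!drop[i]) out.push_back(in[i]);
  }
  return out;
}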
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/squeeze_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - squeeze, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel>, - ops::SqueezeKernel>); -REGISTER_OP_CUDA_KERNEL( - squeeze_grad, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel>, - ops::SqueezeGradKernel>); diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h index 0c5b5dfd4c8b0c..6f0da1d42e5467 100644 --- a/paddle/fluid/operators/squeeze_op.h +++ b/paddle/fluid/operators/squeeze_op.h @@ -26,42 +26,67 @@ namespace operators { framework::DDim GetOutputShape(const std::vector squeeze_dims, const framework::DDim &in_dims, - bool is_runtime); - -template -class SqueezeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *in = context.Input("X"); - auto *out = context.Output("Out"); - - auto &axes = context.Attr>("axes"); - auto x_dims = in->dims(); - auto out_dims = GetOutputShape(axes, x_dims, true); - - out->mutable_data(context.GetPlace(), in->type()); - framework::TensorCopy( - *in, - context.GetPlace(), - context.template device_context(), - out); - out->Resize(out_dims); + bool is_runtime) { + size_t num_squeeze_dims = squeeze_dims.size(); + std::vector should_squeeze(in_dims.size(), false); + + // Mark dimensions need to be squeezed. + if (num_squeeze_dims == 0) { + for (int i = 0; i < in_dims.size(); ++i) { + if (in_dims[i] == 1) { + should_squeeze[i] = true; + } + } + } else { + for (size_t i = 0; i < num_squeeze_dims; ++i) { + int current = squeeze_dims[i] < 0 ? 
squeeze_dims[i] + in_dims.size() + : squeeze_dims[i]; + + PADDLE_ENFORCE_GE( + current, + 0, + platform::errors::InvalidArgument( + "Each axis in Attr(axes) should be in the range of [%d, %d]" + "But current axis is:%d, input tensor's shape = [%s].", + -in_dims.size(), + in_dims.size() - 1, + current, + in_dims)); + PADDLE_ENFORCE_LT( + current, + in_dims.size(), + platform::errors::InvalidArgument( + "Each axis in Attr(axes) should be in the range of [%d, %d]" + "But current axis is:%d, input tensor's shape = [%s].", + -in_dims.size(), + in_dims.size() - 1, + current, + in_dims)); + + if (!should_squeeze[current]) { + if (is_runtime) { + // At run time, dim of 1 is allowed to squeeze + if (in_dims[current] == 1) { + should_squeeze[current] = true; + } + } else { + // At compile time, dim of -1 or 1 is allowed to squeeze + if (in_dims[current] == 1 || in_dims[current] == -1) { + should_squeeze[current] = true; + } + } + } + } } -}; - -template -class SqueezeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_out = ctx.Input(framework::GradVarName("Out")); - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto in_dims = ctx.Input("X")->dims(); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); - d_x->Resize(in_dims); + // Make output dimensions + std::vector output_shape; + for (int i = 0; i < in_dims.size(); ++i) { + if (!should_squeeze[i]) { + output_shape.push_back(in_dims[i]); + } } -}; + return phi::make_ddim(output_shape); +} template class Squeeze2Kernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc index dd1e421e6cb1ae..68126e187b4e58 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.cc +++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc @@ -131,13 +131,13 @@ void WordPieceTokenizer::Tokenize(const wstring& text, vector* token_ids) const { size_t len = text.size(); if (len > max_input_chars_per_word_) { - token_ids->emplace_back(std::move(unk_token_id_)); + token_ids->emplace_back(unk_token_id_); return; } auto it = vocab_->find(text); if (it != vocab_->end()) { - token_ids->emplace_back(std::move(it->second)); + token_ids->emplace_back(it->second); return; } @@ -146,7 +146,7 @@ void WordPieceTokenizer::Tokenize(const wstring& text, while (start < len) { size_t end = len; std::wstring cur_substr; - int64_t cur_substr_id; + int64_t cur_substr_id = 0; while (start < end) { std::wstring sub = text.substr(start, end - start); if (start > 0) { @@ -162,15 +162,15 @@ void WordPieceTokenizer::Tokenize(const wstring& text, } if (cur_substr.empty()) { - token_ids->emplace_back(std::move(unk_token_id_)); + token_ids->emplace_back(unk_token_id_); return; } else { start = end; - wordpiece_ids.emplace_back(std::move(cur_substr_id)); + wordpiece_ids.emplace_back(cur_substr_id); } } for (auto& token_id : wordpiece_ids) { - token_ids->emplace_back(std::move(token_id)); + token_ids->emplace_back(token_id); } } @@ -219,9 +219,9 @@ void BertTokenizer::Tokenize(const string& text, if (IsChineseChar(w_token[0])) { auto vocab_it = vocab_->find(w_token); if (vocab_it != vocab_->end()) { - split_token_ids->emplace_back(std::move(vocab_it->second)); + split_token_ids->emplace_back(vocab_it->second); } else { - split_token_ids->emplace_back(std::move(unk_token_id_)); + split_token_ids->emplace_back(unk_token_id_); } } else { 
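// Tokens that do not begin with a Chinese character fall through to the
// WordPiece tokenizer, which greedily matches the longest vocabulary
// sub-word and emits unk_token_id_ when no prefix of the token matches.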
word_piece_tokenizer_.Tokenize(w_token, split_token_ids); @@ -241,29 +241,29 @@ void BertTokenizer::BuildInputsWithSpecialTokens( if (token_ids_1.empty()) { inputs->clear(); inputs->resize(token_ids_0.size() + 2); - inputs->at(0) = std::move(cls_token_id_); + inputs->at(0) = cls_token_id_; size_t i = 1; for (auto& token_id : token_ids_0) { - inputs->at(i) = std::move(token_id); + inputs->at(i) = token_id; ++i; } - inputs->at(i) = std::move(sep_token_id_); + inputs->at(i) = sep_token_id_; } else { inputs->clear(); inputs->resize(token_ids_0.size() + token_ids_1.size() + 3); - inputs->at(0) = std::move(cls_token_id_); + inputs->at(0) = cls_token_id_; size_t i = 1; for (auto& token_id : token_ids_0) { - inputs->at(i) = std::move(token_id); + inputs->at(i) = token_id; ++i; } - inputs->at(i) = std::move(sep_token_id_); + inputs->at(i) = sep_token_id_; ++i; for (auto& token_id : token_ids_1) { - inputs->at(i) = std::move(token_id); + inputs->at(i) = token_id; ++i; } - inputs->at(i) = std::move(sep_token_id_); + inputs->at(i) = sep_token_id_; } } @@ -333,9 +333,9 @@ int BertTokenizer::Encode( wstring token = unicode_text.substr(i, 1); auto it = vocab_->find(token); if (it != vocab_->end()) { - ids.emplace_back(std::move(it->second)); + ids.emplace_back(it->second); } else { - ids.emplace_back(std::move(unk_token_id_)); + ids.emplace_back(unk_token_id_); } } } diff --git a/paddle/fluid/operators/tensorrt/CMakeLists.txt b/paddle/fluid/operators/tensorrt/CMakeLists.txt index 9333535a7ec2e9..bbccad1d2a1495 100644 --- a/paddle/fluid/operators/tensorrt/CMakeLists.txt +++ b/paddle/fluid/operators/tensorrt/CMakeLists.txt @@ -8,6 +8,7 @@ if(NOT WIN32) SRCS tensorrt_engine_op_test.cc DEPS tensorrt_engine_op analysis fleet_executor python) else() + get_property(paddle_lib GLOBAL PROPERTY PADDLE_LIB_NAME) nv_test( test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc diff --git a/paddle/fluid/operators/tree_conv_op.cc b/paddle/fluid/operators/tree_conv_op.cc deleted file mode 100644 index 7265d966b9e2a4..00000000000000 --- a/paddle/fluid/operators/tree_conv_op.cc +++ /dev/null @@ -1,240 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/tree_conv_op.h" - -#include -#include - -namespace paddle { -namespace operators { -class TreeConvOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("NodesVector", - "(Tensor) The feature vector of every node on the tree. " - "The shape of the feature vector must be " - "[max_tree_node_size, feature_size]."); - AddInput("EdgeSet", - "(Tensor) The Edges of Tree. The edge must be directional. " - "The shape of the edge set must be [max_tree_node_size, 2]."); - AddInput("Filter", - "(Tensor) The feature detector. " - "The shape of the filter is " - "[feature_size, 3, output_size, num_filters]."); - AddOutput("Out", - "(Tensor) The feature vector of subtrees. 
" - "The shape of the output tensor is [max_tree_node_size, " - "output_size, num_filters]. " - "The output tensor could be a new feature " - "vector for next tree convolution layers."); - AddAttr("max_depth", - "(int, default: 2) The depth of feature detector.") - .SetDefault(2) - .GreaterThan(1); - AddComment(R"DOC( -**Tree-Based Convolution Operator** - -Tree-Based Convolution is a kind of convolution based on tree structure. -Tree-Based Convolution is a part of Tree-Based Convolution Neural Network(TBCNN), -which is used to classify tree structures, such as Abstract Syntax Tree. -Tree-Based Convolution proposed a kind of data structure called continuous binary tree, -which regards multiway tree as binary tree. -The paper of Tree-Based Convolution Operator is here: -https://arxiv.org/abs/1409.5718v1 -)DOC"); - } -}; -class TreeConvOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("NodesVector"), "Input", "NodesVector", "TreeConv"); - OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "TreeConv"); - OP_INOUT_CHECK(ctx->HasInput("EdgeSet"), "Input", "EdgeSet", "TreeConv"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TreeConv"); - - auto edge_dims = ctx->GetInputDim("EdgeSet"); - auto vector_dims = ctx->GetInputDim("NodesVector"); - auto filter_dims = ctx->GetInputDim("Filter"); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(edge_dims[2], - 2, - platform::errors::InvalidArgument( - "Input(EdgeSet) dim[2] should be 2. " - "But received Input(EdgeSet) dim[2] is %d.", - edge_dims[2])); - } else { - if (edge_dims[2] != -1) { - PADDLE_ENFORCE_EQ(edge_dims[2], - 2, - platform::errors::InvalidArgument( - "Input(EdgeSet) dim[2] should be 2. " - "But received Input(EdgeSet) dim[2] is %d.", - edge_dims[2])); - } - } - PADDLE_ENFORCE_EQ(edge_dims.size(), - 3, - platform::errors::InvalidArgument( - "The dimension of EdgeSet Tensor should be 3. " - "But received the dimension of EdgeSet Tensor is %d.", - edge_dims.size())); - PADDLE_ENFORCE_EQ( - vector_dims.size(), - 3, - platform::errors::InvalidArgument( - "The dimension of NodesVector Tensor should be 3. " - "But received the dimension of NodesVector Tensor is %d.", - vector_dims.size())); - PADDLE_ENFORCE_EQ(filter_dims.size(), - 4, - platform::errors::InvalidArgument( - "The dimension of Filter Tensor should be 4. " - "But received the dimension of Filter Tensor is %d.", - filter_dims.size())); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(filter_dims[1], - 3, - platform::errors::InvalidArgument( - "Input(Filter) dim[1] should be 3. " - "But received Input(Filter) dim[1] is %d.", - filter_dims[1])); - PADDLE_ENFORCE_EQ( - filter_dims[0], - vector_dims[2], - platform::errors::InvalidArgument( - "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]. " - "But received Input(Filter) dim[0] = %d, Input(NodesVector) " - "dim[2] = %d.", - filter_dims[0], - vector_dims[2])); - } else { - if (filter_dims[1] != -1) { - PADDLE_ENFORCE_EQ(filter_dims[1], - 3, - platform::errors::InvalidArgument( - "Input(Filter) dim[1] should be 3. " - "But received Input(Filter) dim[1] is %d.", - filter_dims[1])); - } - - if (filter_dims[0] != -1 && vector_dims[2] != -1) { - PADDLE_ENFORCE_EQ( - filter_dims[0], - vector_dims[2], - platform::errors::InvalidArgument( - "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]. 
" - "But received Input(Filter) dim[0] = %d, Input(NodesVector) " - "dim[2] = %d.", - filter_dims[0], - vector_dims[2])); - } - } - auto output_dims = phi::make_ddim( - {vector_dims[0], vector_dims[1], filter_dims[2], filter_dims[3]}); - ctx->SetOutputDim("Out", output_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "NodesVector"), - ctx.GetPlace()); - } -}; - -template -class TreeConvGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("tree_conv_grad"); - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("Filter", this->Input("Filter")); - op->SetInput("EdgeSet", this->Input("EdgeSet")); - op->SetInput("NodesVector", this->Input("NodesVector")); - - op->SetOutput(framework::GradVarName("NodesVector"), - this->InputGrad("NodesVector")); - op->SetOutput(framework::GradVarName("Filter"), this->InputGrad("Filter")); - - op->SetAttrMap(this->Attrs()); - } -}; - -class TreeConvGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "grad_TreeConv"); - OP_INOUT_CHECK( - ctx->HasInput("EdgeSet"), "Input", "EdgeSet", "grad_TreeConv"); - OP_INOUT_CHECK( - ctx->HasInput("NodesVector"), "Input", "NodesVector", "grad_TreeConv"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "grad_TreeConv"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("NodesVector")), - "Output", - framework::GradVarName("NodesVector"), - "grad_TreeConv"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Filter")), - "Output", - framework::GradVarName("Filter"), - "grad_TreeConv"); - - auto vectors_dims = ctx->GetInputDim("NodesVector"); - auto filter_dims = ctx->GetInputDim("Filter"); - if (ctx->HasOutput(framework::GradVarName("Filter"))) { - ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); - } - if (ctx->HasOutput(framework::GradVarName("NodesVector"))) { - ctx->SetOutputDim(framework::GradVarName("NodesVector"), vectors_dims); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "NodesVector"), - ctx.GetPlace()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(tree_conv, - ops::TreeConvOp, - ops::TreeConvOpMaker, - ops::TreeConvGradOpMaker, - ops::TreeConvGradOpMaker); - -REGISTER_OPERATOR(tree_conv_grad, ops::TreeConvGradOp); - -PD_REGISTER_STRUCT_KERNEL( - tree_conv, CPU, ALL_LAYOUT, ops::TreeConvKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL( - tree_conv_grad, CPU, ALL_LAYOUT, ops::TreeConvGradKernel, float, double) {} diff --git a/paddle/fluid/operators/tree_conv_op.h b/paddle/fluid/operators/tree_conv_op.h deleted file mode 100644 index 18fd5bea29d308..00000000000000 --- a/paddle/fluid/operators/tree_conv_op.h +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/tree2col.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -using DDim = framework::DDim; -template -class TreeConvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - math::Tree2ColFunctor tree2col; - phi::funcs::SetConstant constant; - - auto *Edges = ctx.Input("EdgeSet"); - auto *Embeddings = ctx.Input("NodesVector"); - auto *Filter = ctx.Input("Filter"); - auto *output_emb = ctx.Output("Out"); - int max_depth = ctx.Attr("max_depth"); - - auto &dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - phi::DenseTensor W; - W.ShareDataWith(*Filter); - W.Resize(phi::flatten_to_2d(Filter->dims(), 2)); - - int batch_size = static_cast(Edges->dims()[0]); - int n = static_cast(Embeddings->dims()[1]); - int out_size = static_cast(Filter->dims()[2]); - int num_filters = static_cast(Filter->dims()[3]); - output_emb->mutable_data({batch_size, n, out_size, num_filters}, - ctx.GetPlace()); - - auto edge_set_slicedim = phi::slice_ddim( - Edges->dims(), 1, static_cast(Edges->dims().size())); - - auto embedding_slicedim = phi::slice_ddim( - Embeddings->dims(), 1, static_cast(Embeddings->dims().size())); - - auto output_slicedim = phi::slice_ddim( - output_emb->dims(), 1, static_cast(output_emb->dims().size())); - - output_slicedim = phi::flatten_to_2d(output_slicedim, 1); - - for (int idx = 0; idx < batch_size; idx++) { - auto edge_set = Edges->Slice(idx, idx + 1).Resize(edge_set_slicedim); - auto embeddings = - Embeddings->Slice(idx, idx + 1).Resize(embedding_slicedim); - auto out_vec = output_emb->Slice(idx, idx + 1).Resize(output_slicedim); - phi::DenseTensor patch; - tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth); - constant(dev_ctx, &out_vec, 0); - blas.MatMul(patch, W, &out_vec); - } - } -}; -template -class TreeConvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *out_g = ctx.Input(framework::GradVarName("Out")); - auto *in_g = - ctx.Output(framework::GradVarName("NodesVector")); - auto *filter_g = - ctx.Output(framework::GradVarName("Filter")); - int max_depth = ctx.Attr("max_depth"); - auto *Embeddings = ctx.Input("NodesVector"); - auto *edges = ctx.Input("EdgeSet"); - auto *Filter = ctx.Input("Filter"); - math::Tree2ColFunctor tree2col; - math::Col2TreeFunctor col2tree; - phi::funcs::SetConstant constant; - auto &dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - phi::DenseTensor W; - W.ShareDataWith(*Filter); - W.Resize(phi::flatten_to_2d(Filter->dims(), 1)); - - int batch_size = static_cast(Embeddings->dims()[0]); - - auto edge_set_slicedim = phi::slice_ddim( - edges->dims(), 1, static_cast(edges->dims().size())); - - auto embedding_slicedim = 
phi::slice_ddim( - Embeddings->dims(), 1, static_cast(Embeddings->dims().size())); - - auto out_grad_dims = phi::slice_ddim( - out_g->dims(), 1, static_cast(out_g->dims().size())); - out_grad_dims = phi::flatten_to_2d(out_grad_dims, 1); - if (filter_g) { - filter_g->mutable_data(Filter->dims(), ctx.GetPlace()); - phi::DenseTensor f_g; - f_g.ShareDataWith(*filter_g); - f_g.Resize(phi::flatten_to_2d(Filter->dims(), 2)); - constant(dev_ctx, filter_g, 0); - for (int batch_id = 0; batch_id < batch_size; batch_id++) { - auto edge_set = - edges->Slice(batch_id, batch_id + 1).Resize(edge_set_slicedim); - auto embeddings = Embeddings->Slice(batch_id, batch_id + 1) - .Resize(embedding_slicedim); - auto out_grad = - out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims); - phi::DenseTensor patch; - tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth); - blas.MatMul(patch, true, out_grad, false, T(1.0), &f_g, T(1.0)); - } - } - if (in_g) { - auto input_grad_dims = phi::slice_ddim( - in_g->dims(), 1, static_cast(in_g->dims().size())); - in_g->mutable_data(Embeddings->dims(), ctx.GetPlace()); - constant(dev_ctx, in_g, 0); - for (int batch_id = 0; batch_id < batch_size; batch_id++) { - auto edge_set = - edges->Slice(batch_id, batch_id + 1).Resize(edge_set_slicedim); - auto out_grad = - out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims); - auto in_grad = - in_g->Slice(batch_id, batch_id + 1).Resize(input_grad_dims); - phi::DenseTensor in_grad_temp; - col2tree(dev_ctx, edge_set, out_grad, &in_grad_temp, max_depth); - blas.MatMul(in_grad_temp, false, W, true, &in_grad); - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 3cfb53ea14b65a..2e1b6f86d6370c 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -97,8 +97,7 @@ register_unity_group( fill_constant_batch_size_like_op.cc fill_constant_op.cc fill_op.cc - fill_zeros_like_op.cc - filter_by_instag_op.cc) + fill_zeros_like_op.cc) register_unity_group( cc flatten_op.cc @@ -124,7 +123,6 @@ register_unity_group( isfinite_v2_op.cc) register_unity_group( cc - inplace_abn_op.cc interpolate_v2_op.cc inverse_op.cc is_empty_op.cc @@ -148,13 +146,11 @@ register_unity_group( lookup_table_dequant_op.cc lrn_op.cc mkldnn/lrn_mkldnn_op.cc - lstm_unit_op.cc - lstmp_op.cc) + lstm_unit_op.cc) register_unity_group( cc log_loss_op.cc lookup_table_v2_op.cc - margin_rank_loss_op.cc masked_select_op.cc match_matrix_tensor_op.cc matmul_op.cc @@ -167,7 +163,6 @@ register_unity_group( register_unity_group( cc concat_op.cc - conv_shift_op.cc dequantize_log_op.cc dropout_op.cc expand_op.cc @@ -178,7 +173,6 @@ register_unity_group( matmul_v2_op.cc) register_unity_group( cc - mean_iou_op.cc mean_op.cc minus_op.cc mish_op.cc @@ -203,7 +197,6 @@ register_unity_group( positive_negative_pair_op.cc prelu_op.cc print_op.cc - prroi_pool_op.cc psroi_pool_op.cc pull_box_extended_sparse_op.cc pull_box_sparse_op.cc @@ -215,7 +208,6 @@ register_unity_group( quantize_op.cc mkldnn/quantize_mkldnn_op.cc queue_generator_op.cc - random_crop_op.cc range_op.cc rank_attention_op.cc rank_loss_op.cc @@ -227,11 +219,9 @@ register_unity_group( reverse_op.cc) register_unity_group( cc - rnn_memory_helper_op.cc roi_align_op.cc roll_op.cc run_program_op.cc - sample_logits_op.cc sampling_id_op.cc save_combine_op.cc save_op.cc @@ -258,7 +248,6 @@ register_unity_group( slice_op.cc) register_unity_group( cc - 
space_to_depth_op.cc spectral_norm_op.cc split_lod_tensor_op.cc split_op.cc @@ -283,12 +272,10 @@ register_unity_group( trace_op.cc transpose_op.cc mkldnn/transpose_mkldnn_op.cc - tree_conv_op.cc unbind_op.cc unfold_op.cc) register_unity_group( cc - smooth_l1_loss_op.cc uniform_random_batch_size_like_op.cc unique_op.cc unique_with_counts_op.cc @@ -328,7 +315,7 @@ register_unity_group( unbind_op.cu.cc unpool_op.cu.cc unsqueeze_op.cu.cc) -register_unity_group(cc arg_max_op.cc arg_min_op.cc squared_l2_distance_op.cc) +register_unity_group(cc arg_max_op.cc arg_min_op.cc) register_unity_group( cc linear_chain_crf_op.cc @@ -415,7 +402,6 @@ register_unity_group( isfinite_v2_op.cu) register_unity_group( cu - inplace_abn_op.cu interpolate_v2_op.cu isfinite_op.cu l1_norm_op.cu @@ -434,18 +420,10 @@ register_unity_group( rank_loss_op.cu real_op.cu) register_unity_group( - cu - log_loss_op.cu - lookup_table_v2_op.cu - margin_rank_loss_op.cu - masked_select_op.cu - lstmp_op.cu - shuffle_channel_op.cu - softmax_cudnn_op.cu - squared_l2_distance_op.cu) + cu log_loss_op.cu lookup_table_v2_op.cu masked_select_op.cu + shuffle_channel_op.cu softmax_cudnn_op.cu) register_unity_group( cu - conv_shift_op.cu dequantize_log_op.cu dropout_op.cu fake_quantize_op.cu @@ -455,7 +433,6 @@ register_unity_group( softmax_with_cross_entropy_op.cu) register_unity_group( cu - mean_iou_op.cu mean_op.cu minus_op.cu mish_op.cu @@ -474,23 +451,15 @@ register_unity_group( partial_sum_op.cu pixel_shuffle_op.cu prelu_op.cu - prroi_pool_op.cu run_program_op.cu pull_box_extended_sparse_op.cu pull_box_sparse_op.cu) -register_unity_group( - cu - random_crop_op.cu - range_op.cu - reverse_op.cu - partial_concat_op.cu - kldiv_loss_op.cu - instance_norm_op.cu) +register_unity_group(cu range_op.cu reverse_op.cu partial_concat_op.cu + kldiv_loss_op.cu instance_norm_op.cu) register_unity_group( cu roi_align_op.cu roll_op.cu - sample_logits_op.cu sampling_id_op.cu save_combine_op.cu save_op.cu @@ -509,7 +478,6 @@ register_unity_group( slice_op.cu) register_unity_group( cu - space_to_depth_op.cu spectral_norm_op.cu split_op.cu split_selected_rows_op.cu @@ -517,23 +485,12 @@ register_unity_group( sum_op.cu temporal_shift_op.cu arg_max_op.cu) -register_unity_group( - cu - row_conv_op.cu - tree_conv_op.cu - tril_triu_op.cu - unfold_op.cu - arg_min_op.cu - crop_tensor_op.cu) -register_unity_group( - cu - smooth_l1_loss_op.cu - uniform_random_batch_size_like_op.cu - unstack_op.cu - where_index_op.cu - where_op.cu - layer_norm_op.cu) +register_unity_group(cu row_conv_op.cu tril_triu_op.cu unfold_op.cu + arg_min_op.cu crop_tensor_op.cu) +register_unity_group(cu uniform_random_batch_size_like_op.cu unstack_op.cu + where_index_op.cu where_op.cu layer_norm_op.cu) register_unity_group(cu expand_as_op.cu stack_op.cu) + # The following groups are to make better use of `/MP` which MSVC's parallel # compilation instruction when compiling in Unity Build. register_unity_group(cu activation_op.cu) diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc deleted file mode 100644 index 5c6816a171fbc5..00000000000000 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ /dev/null @@ -1,311 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/unsqueeze_op.h" - -#include -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class UnsqueezeOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - platform::errors::InvalidArgument( - "Input(X) of " - "Unsqueeze operator should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), - true, - platform::errors::InvalidArgument( - "Output(Out) of " - "Unsqueeze operator should not be null.")); - - const auto &axes = ctx->Attrs().Get>("axes"); - const auto &x_dims = ctx->GetInputDim("X"); - // Validity Check: input tensor dims (<6). - PADDLE_ENFORCE_LE(x_dims.size(), - 6, - platform::errors::InvalidArgument( - "Invalid " - "dimensions, the rank of Input(X) " - "should be in the range of [1, 6] (Eigen limit)")); - if (!axes.empty()) { - auto out_dims = GetOutputShape(axes, x_dims); - ctx->SetOutputDim("Out", out_dims); - if (x_dims[0] == out_dims[0]) { - // Only pass LoD when the first dimension of output and Input(X) - // are the same. - ctx->ShareLoD("X", "Out"); - } - } else if (ctx->HasInputs("AxesTensorList")) { - auto AxesTensorList = ctx->Inputs("AxesTensorList"); - int output_size = x_dims.size() + static_cast(AxesTensorList.size()); - PADDLE_ENFORCE_LE(output_size, - 6, - platform::errors::InvalidArgument( - "The output tensor's rank should be less than 6.")); - std::vector vec_out_dims(output_size, -1); - ctx->SetOutputDim("Out", phi::make_ddim(vec_out_dims)); - } else if (ctx->HasInput("AxesTensor")) { - auto axes_dims = ctx->GetInputDim("AxesTensor"); - PADDLE_ENFORCE_EQ(axes_dims.size(), - 1, - platform::errors::InvalidArgument( - "Input(AxesTensor)'s dimension of " - "Op(unsqueeze) must be 1. " - "But received AxesTensor's shape = [%s], " - "AxesTensor's dimension = %d.", - axes_dims, - axes_dims.size())); - PADDLE_ENFORCE_GE( - axes_dims[0], - 0, - platform::errors::InvalidArgument( - "Input(AxesTensor)'s shape must be known. But received " - "AxesTensor's shape = [%s]", - axes_dims)); - int output_size = x_dims.size() + static_cast(axes_dims[0]); - PADDLE_ENFORCE_LE(output_size, - 6, - platform::errors::InvalidArgument( - "The output tensor's rank should be less than 6.")); - std::vector vec_out_dims(output_size, -1); - ctx->SetOutputDim("Out", phi::make_ddim(vec_out_dims)); - } - } - - static framework::DDim GetOutputShape(const std::vector unsqz_dims, - const framework::DDim &in_dims) { - int output_size = in_dims.size() + static_cast(unsqz_dims.size()); - int cur_output_size = in_dims.size(); - std::vector output_shape(output_size, 0); - - // Validity Check: rank range. - PADDLE_ENFORCE_LE(output_size, - 6, - platform::errors::InvalidArgument( - "The output tensor's rank should be less than 6.")); - - for (int axis : unsqz_dims) { - int cur = axis < 0 ? 
axis + cur_output_size + 1 : axis; - // Vaildity Check: the axis bound - PADDLE_ENFORCE_GE( - cur, - 0, - platform::errors::InvalidArgument("The insert dimension value should " - "not be less than 0")); - PADDLE_ENFORCE_LE(cur, - cur_output_size, - platform::errors::InvalidArgument( - "The insert dimension value shoud not be larger " - "than the dimension size of input tensor")); - // Move old axis, and insert new axis - for (int i = cur_output_size; i >= cur; --i) { - if (output_shape[i] == 1) { - // Move axis - output_shape[i + 1] = 1; - output_shape[i] = 0; - } - } - output_shape[cur] = 1; - // Add the output size. - cur_output_size++; - } - - // Make output shape - for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) { - if (output_shape[out_idx] == 0) { - output_shape[out_idx] = in_dims[in_idx++]; - } - } - - return phi::make_ddim(output_shape); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(framework::TransToProtoVarType( - ctx.Input("X")->type()), - ctx.GetPlace()); - } - - phi::KernelKey GetKernelTypeForVar( - const std::string &var_name, - const phi::DenseTensor &tensor, - const phi::KernelKey &expected_kernel_type) const override { - if (var_name == "AxesTensor" || var_name == "AxesTensorList") { - return phi::KernelKey(phi::Backend::ALL_BACKEND, - expected_kernel_type.layout(), - expected_kernel_type.dtype()); - } - return phi::KernelKey( - tensor.place(), tensor.layout(), expected_kernel_type.dtype()); - } -}; - -class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor). The input tensor of unsqueeze operator."); - AddInput("AxesTensor", - "(Tensor, optional). The dimensions to be inserted. " - "If it exists, it will replace Attr(axes).") - .AsDispensable(); - AddInput( - "AxesTensorList", - "(vector>, optional). The dimensions to be inserted. " - "If it exists, it will replace Attr(axes)." - "The shape of the element in vector must be [1].") - .AsDuplicable() - .AsDispensable(); - AddOutput("Out", "(Tensor). The output tensor of unsqueeze operator."); - AddAttr>("axes", - "(std::vector). List of integers," - " indicating the dimensions to be inserted") - .SetDefault({}) - .AddCustomChecker([](const std::vector &axes) { - // Validity Check: axes dims (<6). - PADDLE_ENFORCE_LT(static_cast(axes.size()), - 6, - platform::errors::InvalidArgument( - "Invalid " - "dimensions, dynamic dimensions should be " - "within [1, 6] dimensions (Eigen limit).")); - // Validity Check: the range of unsqueeze axis. - for (int axis : axes) { - PADDLE_ENFORCE_LT(axis, - 6, - platform::errors::InvalidArgument( - "Invalid " - "dimensions, input axis should be" - "within [1, 6] dimensions (Eigen limit).")); - } - }); - AddComment(R"DOC( - Unsqueeze Operator. - - Insert single-dimensional entries to the shape of a tensor. - Takes one required argument axes, a list of dimensions that will be inserted. - Dimension indices in axes are as seen in the output tensor. 
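A negative axis is resolved against the shape built so far
(computed as axis + cur_output_size + 1 in GetOutputShape),
so dimensions can also be addressed from the end.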
- - For example: - Given a tensor such that tensor with shape [3, 4, 5], - then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1] - )DOC"); - } -}; - -class UnsqueezeGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->ShareLoD("X", framework::GradVarName("X")); - } - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; - -template -class UnsqueezeGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("unsqueeze_grad"); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -template -class UnsqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("unsqueeze"); - grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); - grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_INPLACE_OP_INFERER(UnsqueezeInplaceInferer, {"X", "Out"}); -DECLARE_INPLACE_OP_INFERER(UnsqueezeGradInplaceInferer, - {framework::GradVarName("Out"), - framework::GradVarName("X")}); -DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnsqueezeGradOpNoNeedBufferVarInferer, "X"); -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(unsqueeze, - ops::UnsqueezeOp, - ops::UnsqueezeOpMaker, - ops::UnsqueezeGradOpMaker, - ops::UnsqueezeGradOpMaker); - -REGISTER_OPERATOR(unsqueeze_grad, - ops::UnsqueezeGradOp, - ops::UnsqueezeDoubleGradOpMaker, - ops::UnsqueezeDoubleGradOpMaker, - ops::UnsqueezeGradOpNoNeedBufferVarInferer); - -REGISTER_OP_CPU_KERNEL( - unsqueeze, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel>, - ops::UnsqueezeKernel>, - ops::UnsqueezeKernel); -REGISTER_OP_CPU_KERNEL( - unsqueeze_grad, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel>, - ops::UnsqueezeGradKernel>, - ops::UnsqueezeGradKernel); diff --git a/paddle/fluid/operators/unsqueeze_op.cu.cc b/paddle/fluid/operators/unsqueeze_op.cu.cc deleted file mode 100644 index 3a98a64d858a5d..00000000000000 --- a/paddle/fluid/operators/unsqueeze_op.cu.cc +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/unsqueeze_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - unsqueeze, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel>, - ops::UnsqueezeKernel>); -REGISTER_OP_CUDA_KERNEL( - unsqueeze_grad, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel>, - ops::UnsqueezeGradKernel>); diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h deleted file mode 100644 index 94d8ede8e134c2..00000000000000 --- a/paddle/fluid/operators/unsqueeze_op.h +++ /dev/null @@ -1,145 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class UnsqueezeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto axes = context.Attr>("axes"); - auto *in = context.Input("X"); - auto *out = context.Output("Out"); - auto x_dims = in->dims(); - - bool need_resize_out_dims = false; - if (axes.empty()) { - auto axes_tensor_list = - context.MultiInput("AxesTensorList"); - if (axes_tensor_list.size() > 0) { - axes = GetDataFromTensorList(axes_tensor_list); - } else if (context.HasInput("AxesTensor")) { - auto *axes_tensor = context.Input("AxesTensor"); - axes = phi::GetVectorFromTensor(axes_tensor); - } - need_resize_out_dims = true; - } - framework::DDim out_dims = out->dims(); - if (need_resize_out_dims) { - out_dims = GetOutputShape(axes, x_dims); - out->Resize(out_dims); - } - out->mutable_data(context.GetPlace(), in->type()); - framework::TensorCopy( - *in, - context.GetPlace(), - context.template device_context(), - out); - out->Resize(out_dims); - } - - static framework::DDim GetOutputShape(const std::vector unsqz_dims, - const framework::DDim &in_dims) { - int output_size = in_dims.size() + static_cast(unsqz_dims.size()); - int cur_output_size = in_dims.size(); - std::vector output_shape(output_size, 0); - - // Validity Check: rank range. - PADDLE_ENFORCE_LE(output_size, - 6, - platform::errors::InvalidArgument( - "The output " - "tensor's rank should be less than 6.")); - - for (int axis : unsqz_dims) { - int cur = axis < 0 ? axis + cur_output_size + 1 : axis; - // Vaildity Check: the axis bound - PADDLE_ENFORCE_GE( - cur, - 0, - platform::errors::InvalidArgument("The insert dimension value should " - "not be less than 0")); - PADDLE_ENFORCE_LE(cur, - cur_output_size, - platform::errors::InvalidArgument( - "The insert dimension value shoule not be larger " - "than the dimension size of input tensor")); - // Move old axis, and insert new axis - for (int i = cur_output_size; i >= cur; --i) { - if (output_shape[i] == 1) { - // Move axis - output_shape[i + 1] = 1; - output_shape[i] = 0; - } - } - output_shape[cur] = 1; - // Add the output size. 
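// Incrementing cur_output_size here keeps later negative axes resolving
// against the partially expanded shape rather than the original input rank.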
- cur_output_size++; - } - - // Make output shape - for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) { - if (output_shape[out_idx] == 0) { - output_shape[out_idx] = in_dims[in_idx++]; - } - } - - return phi::make_ddim(output_shape); - } -}; - -template -class UnsqueezeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_out = ctx.Input(framework::GradVarName("Out")); - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto in_dims = ctx.Input("X")->dims(); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); - d_x->Resize(in_dims); - } -}; - -template -class Unsqueeze2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_out = ctx.Input(framework::GradVarName("Out")); - auto *d_x = ctx.Output(framework::GradVarName("X")); - // auto in_dims = d_x->dims(); - - auto xshape_dims = ctx.Input("XShape")->dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); - d_x->Resize(x_dims); - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/pir/CMakeLists.txt b/paddle/fluid/pir/CMakeLists.txt index 1ff77c6d7187e0..24f5e2892de8e2 100644 --- a/paddle/fluid/pir/CMakeLists.txt +++ b/paddle/fluid/pir/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(dialect) add_subdirectory(transforms) +add_subdirectory(drr) diff --git a/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc index 62c1129f846209..8ad46bc8906adb 100644 --- a/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc @@ -25,7 +25,7 @@ const char* PhiKernelOp::attributes_name[attributes_num] = { // NOLINT "kernel_name", "kernel_key"}; -void PhiKernelOp::Verify() { +void PhiKernelOp::VerifySig() { VLOG(4) << "Verifying inputs, outputs and attributes for: PhiKernelOp."; auto& attributes = this->attributes(); @@ -64,7 +64,7 @@ const char* LegacyKernelOp::attributes_name[attributes_num] = { // NOLINT "kernel_name", "kernel_key"}; -void LegacyKernelOp::Verify() { +void LegacyKernelOp::VerifySig() { VLOG(4) << "Verifying inputs, outputs and attributes for: LegacyKernelOp."; auto& attributes = this->attributes(); diff --git a/paddle/fluid/pir/dialect/kernel/ir/kernel_op.h b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.h index 8a18959665e0c7..a96aa5732d5806 100644 --- a/paddle/fluid/pir/dialect/kernel/ir/kernel_op.h +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.h @@ -29,7 +29,7 @@ class PhiKernelOp : public pir::Op { std::string op_name(); std::string kernel_name(); phi::KernelKey kernel_key(); - void Verify(); + void VerifySig(); }; class LegacyKernelOp : public pir::Op { @@ -41,7 +41,7 @@ class LegacyKernelOp : public pir::Op { std::string op_name(); std::string kernel_name(); phi::KernelKey kernel_key(); - void Verify(); + void VerifySig(); }; } // namespace dialect diff --git a/paddle/fluid/pir/dialect/op_generator/api_gen.py b/paddle/fluid/pir/dialect/op_generator/api_gen.py index 9f51351f6ea044..c336dc7b61be18 100644 --- a/paddle/fluid/pir/dialect/op_generator/api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/api_gen.py @@ -197,7 +197,9 @@ def _is_optional_input(self, op_info, input_name): return True return False - def _is_optinonal_output(self, 
op_info, output_name): + def _is_optional_output(self, op_info, op_name, output_name): + if op_name.endswith(('_grad', '_grad_')): + return False inplace_map = op_info.inplace_map input_optional_list = op_info.input_optional_list input_name_list = op_info.input_name_list @@ -271,7 +273,7 @@ def _gen_api_args( ) return (inputs + ', ' + attrs).strip(', ') - def _gen_ret_type(self, op_info): + def _gen_ret_type(self, op_info, op_name): name_list = op_info.output_name_list type_list = op_info.output_type_list intermediate_list = op_info.output_intermediate_list @@ -285,7 +287,7 @@ def _gen_ret_type(self, op_info): ): if intermediate == 'true': continue - if self._is_optinonal_output(op_info, name): + if self._is_optional_output(op_info, op_name, name): ret.append(OPTIONAL_OUTPUT_TYPE_MAP[type]) else: ret.append(OUTPUT_TYPE_MAP[type]) @@ -293,7 +295,7 @@ def _gen_ret_type(self, op_info): elif output_num == 1: index = intermediate_list.index('false') name = name_list[index] - if self._is_optinonal_output(op_info, name): + if self._is_optional_output(op_info, op_name, name): return OPTIONAL_OUTPUT_TYPE_MAP[type_list[index]] else: return OUTPUT_TYPE_MAP[type_list[index]] @@ -304,7 +306,7 @@ def _gen_one_declare( self, op_info, op_name, is_mutable_attr, is_vector_mutable_attr ): return API_DECLARE_TEMPLATE.format( - ret_type=self._gen_ret_type(op_info), + ret_type=self._gen_ret_type(op_info, op_name), api_name=op_name, args=self._gen_api_args( op_info, True, is_mutable_attr, is_vector_mutable_attr @@ -367,7 +369,7 @@ def _gen_handle_optional_outputs(self, op_info, op_name): ): if intermediate == 'true': continue - if self._is_optinonal_output(op_info, name): + if self._is_optional_output(op_info, op_name, name): if VECTOR_TYPE in type: ret += OPTIONAL_VECTOR_OPRESULT_OUTPUT_TEMPLATE.format( name=name, @@ -461,7 +463,7 @@ def _gen_compute_op( op_inst_name, ) - def _gen_out_split_and_ret_list(self, op_info, op_inst_name): + def _gen_out_split_and_ret_list(self, op_info, op_name, op_inst_name): name_list = op_info.output_name_list type_list = op_info.output_type_list intermediate_list = op_info.output_intermediate_list @@ -480,7 +482,7 @@ def _gen_out_split_and_ret_list(self, op_info, op_inst_name): ): if intermediate == 'true': continue - if self._is_optinonal_output(op_info, name): + if self._is_optional_output(op_info, op_name, name): ret_list.append(f'optional_{name}') elif VECTOR_TYPE in type: split_op_name = f'{name}_split_op' @@ -503,7 +505,7 @@ def _gen_return_result(self, ret_list): def _gen_one_impl( self, op_info, op_name, is_mutable_attr, is_vector_mutable_attr ): - ret_type = self._gen_ret_type(op_info) + ret_type = self._gen_ret_type(op_info, op_name) in_combine, in_combine_op_list = self._gen_in_combine( op_info, is_mutable_attr, is_vector_mutable_attr ) @@ -514,7 +516,7 @@ def _gen_one_impl( compute_op += f' (void){op_inst_name};' out_split, ret_list = self._gen_out_split_and_ret_list( - op_info, op_inst_name + op_info, op_name, op_inst_name ) ret = API_IMPL_TEMPLATE.format( diff --git a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py new file mode 100644 index 00000000000000..2c559330eec99c --- /dev/null +++ b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py @@ -0,0 +1,22 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ===================================== +# DecompInterface gen op list +# ===================================== + + +decomp_interface_declare_gen_op_list = ['mean'] + +decomp_interface_implementation_gen_op_list = ["mean"] diff --git a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py index e24902c712c1a7..85cb1742e8edf9 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py @@ -21,6 +21,7 @@ 'ReduceIntArrayAxisInferMeta', 'ReshapeWithXShapeInferMeta', 'SliceRawInferMeta', + 'StackInferMeta', } _PREPARE_DATA_WITH_VECTOR_INT64_MTTABLE_ATTRIBUTE = {'FrobeniusNormOp'} @@ -144,6 +145,7 @@ def GenBuildInputArgsStr( 'int': 'phi::DataType::INT32', 'int64_t': 'phi::DataType::INT64', 'float': 'phi::DataType::FLOAT32', + 'double': 'phi::DataType::FLOAT64', 'std::vector': 'phi::DataType::INT64', 'const std::vector&': 'phi::DataType::INT64', 'bool': 'phi::DataType::BOOL', @@ -694,41 +696,36 @@ def gen_build_func_str( ) GET_ATTRIBUTES_FROM_MAP_TEMPLATE = """ - PADDLE_ENFORCE( + IR_ENFORCE( attributes.find("{attribute_name}") != attributes.end(), - phi::errors::NotFound( - "'{attribute_name}' Attribute is expected for {op_name}. ")); + "'{attribute_name}' Attribute is expected for {op_name}. "); {attr_type} {attribute_name} = attributes.at("{attribute_name}").dyn_cast<{attr_ir_type}>().data(); """ GET_STR_ATTRIBUTES_FROM_MAP_TEMPLATE = """ - PADDLE_ENFORCE( + IR_ENFORCE( attributes.find("{attribute_name}") != attributes.end(), - phi::errors::NotFound( - "'{attribute_name}' Attribute is expected for {op_name}. ")); + "'{attribute_name}' Attribute is expected for {op_name}. "); {attr_type} {attribute_name} = attributes.at("{attribute_name}").dyn_cast().AsString(); """ GET_ARRAY_ATTRIBUTE_FROM_MAP_TEMPLATE = """ - PADDLE_ENFORCE( + IR_ENFORCE( attributes.find("{attribute_name}") != attributes.end(), - phi::errors::NotFound( - "'{attribute_name}' Attribute is expected for {op_name}. ")); + "'{attribute_name}' Attribute is expected for {op_name}. "); {attr_type} {attribute_name}; for (size_t i = 0; i < attributes.at("{attribute_name}").dyn_cast().size(); i++) {{ {attribute_name}.push_back(attributes.at("{attribute_name}").dyn_cast().at(i).dyn_cast<{inner_type}>().{data_name}()); }} """ GET_INTARRAY_ATTRIBUTE_FROM_MAP_TEMPLATE = """ - PADDLE_ENFORCE( + IR_ENFORCE( attributes.find("{attribute_name}") != attributes.end(), - phi::errors::NotFound( - "'{attribute_name}' Attribute is expected for {op_name}. ")); + "'{attribute_name}' Attribute is expected for {op_name}. "); {attr_type} {attribute_name} = attributes.at("{attribute_name}").dyn_cast().data().GetData(); """ GET_SCALAR_ATTRIBUTE_FROM_MAP_TEMPLATE = """ - PADDLE_ENFORCE( + IR_ENFORCE( attributes.find("{attribute_name}") != attributes.end(), - phi::errors::NotFound( - "'{attribute_name}' Attribute is expected for {op_name}. 
")); + "'{attribute_name}' Attribute is expected for {op_name}. "); {attr_type} {attribute_name} = attributes.at("{attribute_name}").dyn_cast().data().to<{attr_type}>(); """ diff --git a/paddle/fluid/pir/dialect/op_generator/op_creator_drr_gen.py b/paddle/fluid/pir/dialect/op_generator/op_creator_drr_gen.py new file mode 100644 index 00000000000000..c760d7fb85b84e --- /dev/null +++ b/paddle/fluid/pir/dialect/op_generator/op_creator_drr_gen.py @@ -0,0 +1,166 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +import yaml +from op_gen import OpCompatParser, OpInfoParser, to_pascal_case + +CPP_FILE_TEMPLATE = """ +#include "paddle/fluid/pir/drr/ir_operation_factory.h" + +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" + +namespace pir {{ +namespace drr {{ + +void OperationFactory::RegisterGeneratedOpCreator() {{ +{body} +}} + +}} // namespace drr +}} // namespace pir + +""" + +NORMAL_FUNCTION_TEMPLATE = """ + RegisterOperationCreator( + "{op_name}", + [](const std::vector& inputs, + const pir::AttributeMap& attrs, + pir::PatternRewriter& rewriter) {{ + return rewriter.Build( + {params_code}); + }}); +""" + +MUTABLE_ATTR_FUNCTION_TEMPLATE = """ + RegisterOperationCreator( + "{op_name}", + [](const std::vector& inputs, + const pir::AttributeMap& attrs, + pir::PatternRewriter& rewriter) {{ + // mutable_attr is tensor + if (inputs.size() > {inputs_num}) {{ + return rewriter.Build( + {params_code_with_mutable_attr}); + }} else {{ + return rewriter.Build( + {params_code_no_mutable_attr}); + }} + }}); +""" + + +class OpCreatorCodeGen: + def __init__(self, op_yaml_files, op_compat_yaml_file, dialect_name): + self.op_info_items = self.parse_yaml(op_yaml_files, op_compat_yaml_file) + self.dialect_name = dialect_name + + def parse_yaml(self, op_yaml_files, op_compat_yaml_file): + op_compat_parser = OpCompatParser(op_compat_yaml_file) + + op_yaml_items = [] + for yaml_file in op_yaml_files: + with open(yaml_file, "r") as f: + ops = yaml.safe_load(f) + op_yaml_items = op_yaml_items + ops + op_info_items = [] + for op in op_yaml_items: + op_compat_item = op_compat_parser.get_compat(op['name']) + if ( + op_compat_item is not None + and op_compat_item['op'] == "pow" + and 'scalar' in op_compat_item + ): + op_compat_item = op_compat_item.pop('scalar') + op_info_items.append(OpInfoParser(op, op_compat_item)) + return op_info_items + + def gen_cpp_file_code(self, cpp_file_path): + body_code = "" + for op_info_item in self.op_info_items: + if op_info_item.infer_meta_map is None: + continue + for phi_op_name in op_info_item.op_phi_name: + ir_op_name = self.dialect_name + "." 
+ phi_op_name + params_no_mutable_attr = [] + for i in range(len(op_info_item.input_name_list)): + params_no_mutable_attr.append( + f"inputs[{i}].dyn_cast()" + ) + if len(op_info_item.attribute_name_list) > 0: + params_no_mutable_attr.append("attrs") + + if len(op_info_item.mutable_attribute_name_list) == 0: + body_code += NORMAL_FUNCTION_TEMPLATE.format( + op_name=ir_op_name, + op_class_name=(to_pascal_case(phi_op_name) + "Op"), + params_code=", ".join(params_no_mutable_attr), + ) + else: + params_with_mutable_attr = [] + for i in range( + len(op_info_item.input_name_list) + + len(op_info_item.mutable_attribute_name_list) + ): + params_with_mutable_attr.append( + f"inputs[{i}].dyn_cast()" + ) + if len(op_info_item.attribute_name_list) > len( + op_info_item.mutable_attribute_name_list + ): + # TODO(zyfncg): Currently Op::Build Interface doesn't support this case. + continue + # params_with_mutable_attr.append("attrs") + + body_code += MUTABLE_ATTR_FUNCTION_TEMPLATE.format( + op_name=ir_op_name, + inputs_num=len(op_info_item.input_name_list), + op_class_name=(to_pascal_case(phi_op_name) + "Op"), + params_code_with_mutable_attr=",".join( + params_with_mutable_attr + ), + params_code_no_mutable_attr=", ".join( + params_no_mutable_attr + ), + ) + + with open(cpp_file_path, 'w') as f: + f.write(CPP_FILE_TEMPLATE.format(body=body_code)) + + +def ParseArguments(): + parser = argparse.ArgumentParser( + description='Generate Op Creator Files By Yaml' + ) + parser.add_argument('--op_yaml_files', type=str) + parser.add_argument('--op_compat_yaml_file', type=str) + parser.add_argument('--dialect_name', type=str) + parser.add_argument('--op_creator_file', type=str) + return parser.parse_args() + + +if __name__ == '__main__': + args = ParseArguments() + op_yaml_files = args.op_yaml_files.split(",") + op_compat_yaml_file = args.op_compat_yaml_file + op_creator_file = args.op_creator_file + dialect_name = args.dialect_name + + code_gen = OpCreatorCodeGen( + op_yaml_files, op_compat_yaml_file, dialect_name + ) + code_gen.gen_cpp_file_code(op_creator_file) diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py index 167b950ee95e7c..8983ffa38b5629 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py @@ -19,6 +19,7 @@ import sys import yaml +from decomp_interface_gen_op_list import decomp_interface_declare_gen_op_list from op_build_gen import gen_build_func_str, gen_build_func_str_by_invoke from op_interface_gen import ( gen_exclusive_interface_str, @@ -27,10 +28,7 @@ ) from op_member_func_gen import gen_op_get_inputs_outputs_str from op_verify_gen import gen_verify_func_str -from vjp_interface_gen_op_list import ( - vjp_interface_declare_gen_op_list, - vjp_interface_implementation_gen_op_list, -) +from vjp_interface_black_list import vjp_interface_black_list # import from paddle/fluid/primitive/code_gen/gen.py sys.path.append( @@ -61,6 +59,7 @@ #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/interface/infermeta.h" #include "paddle/fluid/pir/dialect/operator/interface/vjp.h" +#include "paddle/fluid/pir/dialect/operator/interface/decomp.h" #include "paddle/fluid/pir/dialect/operator/trait/inplace.h" #include "paddle/fluid/pir/dialect/operator/trait/custom_vjp.h" #include "paddle/fluid/framework/infershape_utils.h" @@ -99,7 +98,7 @@ class {op_name} : public pir::Op<{op_name}{interfaces}{traits}> {{ {build_mutable_attr_is_input} 
{build_attr_num_over_1} {build_mutable_attr_is_input_attr_num_over_1} - void Verify(); + void VerifySig(); {get_inputs_and_outputs} {exclusive_interface} }}; @@ -477,7 +476,7 @@ def parse_mutable_attribute(self): if (self.op_compat_item['op'] == "isclose") or ( self.op_compat_item['op'] == "allclose" ): - data_type = "float" + data_type = "double" mutable_attribute_type_list.append( [ "paddle::dialect::ScalarAttribute", @@ -1036,9 +1035,11 @@ def OpGenerator( if ( op_info.backward_name - and op_info.op_phi_name[0] in vjp_interface_declare_gen_op_list + and op_info.op_phi_name[0] not in vjp_interface_black_list ): op_interfaces += ["paddle::dialect::VjpInterface"] + if op_info.op_phi_name[0] in decomp_interface_declare_gen_op_list: + op_interfaces += ["paddle::dialect::DecompInterface"] exclusive_interface_str = gen_exclusive_interface_str( op_info, op_info_items ) @@ -1444,7 +1445,7 @@ def OpGenerator( if ( op_info.backward_name and op_info.op_phi_name[0] - in vjp_interface_implementation_gen_op_list + not in vjp_interface_black_list ): op_vjp_str = gen_op_vjp_str( op_class_name, diff --git a/paddle/fluid/pir/dialect/op_generator/op_interface_gen.py b/paddle/fluid/pir/dialect/op_generator/op_interface_gen.py index 9c8ff889f2b219..299d4197b79475 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_interface_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_interface_gen.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from decomp_interface_gen_op_list import decomp_interface_declare_gen_op_list + # generator interfaces -from vjp_interface_gen_op_list import vjp_interface_declare_gen_op_list +from vjp_interface_black_list import vjp_interface_black_list OP_INFER_SHAPE_TEMPLATE = """ void {op_name}::InferMeta( phi::InferMetaContext *infer_meta ) {{ @@ -26,12 +28,12 @@ {input_type} {input_name}(std::make_shared(op_obj.{input_name}()));""" OP_VJP_FORWARD_MULTI_INPUT_TEMPLATE = """ - pir::CombineOp combine_op_obj = + pir::CombineOp combine_op_obj_{input_name} = op_obj.{input_name}().dyn_cast().owner()->dyn_cast(); std::vector {input_name}; - for (size_t idx = 0; idx < combine_op_obj.inputs().size(); idx++) {{ + for (size_t idx = 0; idx < combine_op_obj_{input_name}.inputs().size(); idx++) {{ {input_name}.emplace_back( - std::make_shared(combine_op_obj.inputs()[idx])); + std::make_shared(combine_op_obj_{input_name}.inputs()[idx])); }}""" OP_VJP_FORWARD_OPTIONAL_INPUT_TEMPLATE = """ @@ -63,6 +65,23 @@ std::make_shared(out_grads[{index}][idx])); }}""" +OP_VJP_FORWARD_OPTIONAL_OUTPUT_GRAD_TEMPLATE = """ + paddle::optional {output_grad_name}; + if (!IsEmptyValue(out_grads[{idx1}][{idx2}])){{ + {output_grad_name} = paddle::make_optional(Tensor(std::make_shared(out_grads[{idx1}][{idx2}]))); + }}""" + +OP_VJP_FORWARD_OPTIONAL_VECTOR_OUTPUT_GRAD_TEMPLATE = """ + paddle::optional> {output_grad_name}; + std::vector optional_{output_grad_name}; + if (!IsEmptyValue(out_grads[{index}])){{ + for (size_t idx = 0; idx < out_grads[{index}].size(); idx++) {{ + optional_{output_grad_name}.emplace_back( + std::make_shared(out_grads[{index}][idx])); + }} + {output_grad_name} = paddle::make_optional>(optional_{output_grad_name}); + }}""" + OP_VJP_ATTRIBUTE_TEMPLATE = """ {attr_type} {attr_name} = op->attribute("{attr_name}").dyn_cast<{attr_parse_type}>().{func}();""" @@ -131,26 +150,25 @@ def gen_op_vjp_str( grad_idx = -1 for idx in range(len(bw_input_list)): build_args_str += bw_input_list[idx] + ", " - if 
op_grad_info.input_optional_list[idx] == 'true': - input_type = input_types_map[op_grad_info.input_type_list[idx]] - if input_type == 'Tensor': - forward_input_output_code += ( - OP_VJP_FORWARD_OPTIONAL_INPUT_TEMPLATE.format( - input_name=bw_input_list[idx], + input_type = input_types_map[op_grad_info.input_type_list[idx]] + if ( + bw_input_list[idx] in op_info.input_name_list + or bw_input_list[idx] in op_info.output_name_list + ): + if op_grad_info.input_optional_list[idx] == 'true': + if input_type == 'Tensor': + forward_input_output_code += ( + OP_VJP_FORWARD_OPTIONAL_INPUT_TEMPLATE.format( + input_name=bw_input_list[idx], + ) ) - ) - else: - forward_input_output_code += ( - OP_VJP_FORWARD_OPTIONAL_VECTOR_INPUT_TEMPLATE.format( - input_name=bw_input_list[idx], + else: + forward_input_output_code += ( + OP_VJP_FORWARD_OPTIONAL_VECTOR_INPUT_TEMPLATE.format( + input_name=bw_input_list[idx], + ) ) - ) - else: - if ( - bw_input_list[idx] in op_info.input_name_list - or bw_input_list[idx] in op_info.output_name_list - ): - input_type = input_types_map[op_grad_info.input_type_list[idx]] + else: if input_type == 'Tensor': forward_input_output_code += ( OP_VJP_FORWARD_INPUT_OR_OUTPUT_TEMPLATE.format( @@ -164,9 +182,22 @@ def gen_op_vjp_str( input_name=bw_input_list[idx], ) ) + else: + grad_idx += 1 + if op_grad_info.input_optional_list[idx] == 'true': + if input_type == 'Tensor': + forward_input_output_code += ( + OP_VJP_FORWARD_OPTIONAL_OUTPUT_GRAD_TEMPLATE.format( + output_grad_name=bw_input_list[idx], + idx1=grad_idx, + idx2=0, + ) + ) + else: + forward_input_output_code += OP_VJP_FORWARD_OPTIONAL_VECTOR_OUTPUT_GRAD_TEMPLATE.format( + output_grad_name=bw_input_list[idx], index=grad_idx + ) else: - grad_idx += 1 - input_type = input_types_map[op_grad_info.input_type_list[idx]] if input_type == 'Tensor': forward_output_grad_code += ( OP_VJP_FORWARD_OUTPUT_GRAD_TEMPLATE.format( @@ -285,6 +316,8 @@ def gen_exclusive_interface_str(op_info, op_info_items): exclusive_interface_str += ( " static void InferMeta( phi::InferMetaContext *infer_meta );" ) - if op_info.op_phi_name[0] in vjp_interface_declare_gen_op_list: + if op_info.op_phi_name[0] not in vjp_interface_black_list: exclusive_interface_str += "\n static std::vector> Vjp(pir::Operation* op, const std::vector>& out_grads, const std::vector>& stop_gradients);" + if op_info.op_phi_name[0] in decomp_interface_declare_gen_op_list: + exclusive_interface_str += "\n static std::vector> Decomp(pir::Operation* op);" return exclusive_interface_str diff --git a/paddle/fluid/pir/dialect/op_generator/op_verify_gen.py b/paddle/fluid/pir/dialect/op_generator/op_verify_gen.py index 1b8c82b27d90be..f42a73347d13ad 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_verify_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_verify_gen.py @@ -14,13 +14,13 @@ # verify OP_VERIFY_TEMPLATE = """ -void {op_name}::Verify() {{ +void {op_name}::VerifySig() {{ VLOG(4) << "Start Verifying inputs, outputs and attributes for: {op_name}."; VLOG(4) << "Verifying inputs:"; {{ auto input_size = num_operands(); - PADDLE_ENFORCE_EQ(input_size, {inputs_size}u, - phi::errors::PreconditionNotMet("The size %d of inputs must be equal to {inputs_size}.", input_size));{inputs_type_check} + IR_ENFORCE(input_size == {inputs_size}u, + "The size %d of inputs must be equal to {inputs_size}.", input_size);{inputs_type_check} }} VLOG(4) << "Verifying attributes:"; {{{attributes_check} @@ -28,95 +28,95 @@ VLOG(4) << "Verifying outputs:"; {{ auto output_size = num_results(); - 
PADDLE_ENFORCE_EQ(output_size, {outputs_size}u, - phi::errors::PreconditionNotMet("The size %d of outputs must be equal to {outputs_size}.", output_size));{outputs_type_check} + IR_ENFORCE(output_size == {outputs_size}u, + "The size %d of outputs must be equal to {outputs_size}.", output_size);{outputs_type_check} }} VLOG(4) << "End Verifying for: {op_name}."; }} """ GRAD_OP_VERIFY_TEMPLATE = """ -void {op_name}::Verify() {{}} +void {op_name}::VerifySig() {{}} """ INPUT_TYPE_CHECK_TEMPLATE = """ - PADDLE_ENFORCE((*this)->operand_source({index}).type().isa<{standard}>(), - phi::errors::PreconditionNotMet("Type validation failed for the {index}th input."));""" + IR_ENFORCE((*this)->operand_source({index}).type().isa<{standard}>(), + "Type validation failed for the {index}th input.");""" INPUT_VECTORTYPE_CHECK_TEMPLATE = """ if (auto vec_type = (*this)->operand_source({index}).type().dyn_cast()) {{ for (size_t i = 0; i < vec_type.size(); ++i) {{ - PADDLE_ENFORCE(vec_type[i].isa<{standard}>(), - phi::errors::PreconditionNotMet("Type validation failed for the {index}th input.")); + IR_ENFORCE(vec_type[i].isa<{standard}>(), + "Type validation failed for the {index}th input."); }} }} else {{ - PADDLE_ENFORCE((*this)->operand_source({index}).type().isa<{standard}>(), - phi::errors::PreconditionNotMet("Type validation failed for the {index}th input.")); + IR_ENFORCE((*this)->operand_source({index}).type().isa<{standard}>(), + "Type validation failed for the {index}th input."); }}""" INPUT_OPTIONAL_TYPE_CHECK_TEMPLATE = """ if (auto val = (*this)->operand({index})) {{ - PADDLE_ENFORCE(val.type().isa<{standard}>(), - phi::errors::PreconditionNotMet("Type validation failed for the {index}th input.")); + IR_ENFORCE(val.type().isa<{standard}>(), + "Type validation failed for the {index}th input."); }}""" INPUT_OPTIONAL_VECTORTYPE_CHECK_TEMPLATE = """ if (auto val = (*this)->operand({index})) {{ if (auto vec_type = val.type().dyn_cast()) {{ for (size_t i = 0; i < vec_type.size(); i++) {{ - PADDLE_ENFORCE(vec_type[i].isa<{standard}>(), - phi::errors::PreconditionNotMet("Type validation failed for the {index}th input.")); + IR_ENFORCE(vec_type[i].isa<{standard}>(), + "Type validation failed for the {index}th input."); }} }} else {{ - PADDLE_ENFORCE(val.type().isa<{standard}>(), - phi::errors::PreconditionNotMet("Type validation failed for the {index}th input.")); + IR_ENFORCE(val.type().isa<{standard}>(), + "Type validation failed for the {index}th input."); }} }}""" ATTRIBUTE_CHECK_TEMPLATE = """ - PADDLE_ENFORCE(attributes.count("{attribute_name}")>0, - phi::errors::PreconditionNotMet("{attribute_name} does not exist.")); - PADDLE_ENFORCE(attributes.at("{attribute_name}").isa<{standard}>(), - phi::errors::PreconditionNotMet("Type of attribute: {attribute_name} is not {standard}.")); + IR_ENFORCE(attributes.count("{attribute_name}")>0, + "{attribute_name} does not exist."); + IR_ENFORCE(attributes.at("{attribute_name}").isa<{standard}>(), + "Type of attribute: {attribute_name} is not {standard}."); """ ATTRIBUTE_VECTOR_CHECK_TEMPLATE = """ - PADDLE_ENFORCE(attributes.count("{attribute_name}")>0, - phi::errors::PreconditionNotMet("{attribute_name} does not exist.")); - PADDLE_ENFORCE(attributes.at("{attribute_name}").isa(), - phi::errors::PreconditionNotMet("Type of attribute: {attribute_name} is not pir::ArrayAttribute.")); + IR_ENFORCE(attributes.count("{attribute_name}")>0, + "{attribute_name} does not exist."); + IR_ENFORCE(attributes.at("{attribute_name}").isa(), + "Type of attribute: {attribute_name} 
is not pir::ArrayAttribute."); for (size_t i = 0; i < attributes.at("{attribute_name}").dyn_cast().size(); i++) {{ - PADDLE_ENFORCE(attributes.at("{attribute_name}").dyn_cast().at(i).isa<{standard}>(), - phi::errors::PreconditionNotMet("Type of attribute: {attribute_name} is not right.")); + IR_ENFORCE(attributes.at("{attribute_name}").dyn_cast().at(i).isa<{standard}>(), + "Type of attribute: {attribute_name} is not right."); }}""" OUTPUT_TYPE_CHECK_TEMPLATE = """ - PADDLE_ENFORCE((*this)->result({index}).type().isa<{standard}>(), - phi::errors::PreconditionNotMet("Type validation failed for the {index}th output."));""" + IR_ENFORCE((*this)->result({index}).type().isa<{standard}>(), + "Type validation failed for the {index}th output.");""" OUTPUT_VECTORTYPE_CHECK_TEMPLATE = """ auto output_{index}_type = (*this)->result({index}).type(); if (auto vec_type = output_{index}_type.dyn_cast()) {{ for (size_t i = 0; i < vec_type.size(); i++) {{ - PADDLE_ENFORCE(vec_type[i].isa<{standard}>(), - phi::errors::PreconditionNotMet("Type validation failed for the {index}th output.")); + IR_ENFORCE(vec_type[i].isa<{standard}>(), + "Type validation failed for the {index}th output."); }} }} else {{ - PADDLE_ENFORCE(output_{index}_type.isa<{standard}>(), - phi::errors::PreconditionNotMet("Type validation failed for the {index}th output.")); + IR_ENFORCE(output_{index}_type.isa<{standard}>(), + "Type validation failed for the {index}th output."); }}""" OUTPUT_OPTIONAL_TYPE_CHECK_TEMPLATE = """ if (auto output_{index}_type = (*this)->result({index}).type()) {{ - PADDLE_ENFORCE(output_{index}_type.isa<{standard}>(), - phi::errors::PreconditionNotMet("Type validation failed for the {index}th output.")); + IR_ENFORCE(output_{index}_type.isa<{standard}>(), + "Type validation failed for the {index}th output."); }}""" OUTPUT_OPTIONAL_VECTORTYPE_CHECK_TEMPLATE = """ if (auto output_{index}_type = (*this)->result({index}).type()) {{ if (auto vec_type = output_{index}_type.dyn_cast()) {{ for (size_t i = 0; i < vec_type.size(); ++i) {{ - PADDLE_ENFORCE(vec_type[i].isa<{standard}>(), - phi::errors::PreconditionNotMet("Type validation failed for the {index}th output.")); + IR_ENFORCE(vec_type[i].isa<{standard}>(), + "Type validation failed for the {index}th output."); }} }} else {{ - PADDLE_ENFORCE(output_{index}_type.isa<{standard}>(), - phi::errors::PreconditionNotMet("Type validation failed for the {index}th output.")); + IR_ENFORCE(output_{index}_type.isa<{standard}>(), + "Type validation failed for the {index}th output."); }} }}""" diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py index fec69b8ce5a4ec..e2d17e7f118023 100644 --- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py @@ -71,32 +71,33 @@ NEED_GEN_STATIC_ONLY_APIS = ['fetch'] NO_NEED_GEN_STATIC_ONLY_APIS = [ - 'set_value_with_tensor', - 'set_value_with_tensor_', - 'fused_bn_add_activation_', - 'fused_batch_norm_act_', 'add_n_', - 'set_value', - 'assign_value', - 'set_value_', - 'embedding_grad_sparse', 'add_n_with_kernel', - 'print', - 'send_v2', - 'shadow_feed', - 'recv_v2', - 'rnn_', - 'fused_scale_bias_relu_conv_bnstats', + 'assign_value', 'batch_norm_', + 'c_allgather', + 'c_allreduce_max', 'c_allreduce_sum', 'c_embedding', 'c_identity', 'c_reduce_sum', - 'c_allreduce_max', - 'c_allgather', + 'dpsgd', + 'embedding_grad_sparse', + 'fused_attention', + 'fused_batch_norm_act_', + 'fused_bn_add_activation_', + 
'fused_feedforward', + 'fused_scale_bias_relu_conv_bnstats', + 'print', + 'recv_v2', + 'rnn_', 'seed', - "fused_attention", - "fused_feedforward", + 'send_v2', + 'set_value', + 'set_value_', + 'set_value_with_tensor', + 'set_value_with_tensor_', + 'shadow_feed', ] diff --git a/paddle/fluid/pir/dialect/op_generator/vjp_interface_black_list.py b/paddle/fluid/pir/dialect/op_generator/vjp_interface_black_list.py new file mode 100644 index 00000000000000..c63e0c4e418338 --- /dev/null +++ b/paddle/fluid/pir/dialect/op_generator/vjp_interface_black_list.py @@ -0,0 +1,36 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ===================================== +# VjpInterface gen op black list +# ===================================== +# we don't support vjp function code +# gen for these ops yet; this black list +# skips generating their Vjp methods. +# TODO(wanghao107) +# remove this file and support Vjp methods +# code gen. + + +vjp_interface_black_list = [ + 'frobenius_norm', + 'write_to_array', + 'fused_attention', + 'fused_feedforward', + 'set_value', + 'set_value_with_tensor', + 'silu_grad', + 'fused_dropout_add', + 'fused_rotary_position_embedding', +] diff --git a/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py deleted file mode 100644 index 3a559ef8dedf84..00000000000000 --- a/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# ===================================== -# VjpInterface gen op list -# ===================================== -# we don't support vjp function code -# gen now, so we use a whitelist to -# control the generation of Vjp methods. -# TODO(wanghao107) -# remove this file and support Vjp methods -# code gen.
- - -vjp_interface_declare_gen_op_list = [ - 'where', - "tanh", - "mean", - "divide", - "sum", - "add", - "concat", - "split", - "split_with_num", - "gelu", - "matmul", - "erf", - "multiply", - "pow", - "rsqrt", - "subtract", - "square", - "dropout", - 'exp', - 'expm1', - 'expand', - 'layer_norm', - 'reshape', - 'cast', - "scale", - 'softmax', - 'silu', - 'elementwise_pow', - 'embedding', - 'fused_softmax_mask_upper_triangle', - 'slice', - 'transpose', - 'slice_grad', - 'gather_nd', - 'stack', - 'poisson', - 'gumbel_softmax', - 'pad', - 'pad3d', - 'squeeze', - 'unsqueeze', - 'tril', - 'triu', - 'squeeze', - 'unsqueeze', - 'conv2d', - 'depthwise_conv2d', - 'sqrt', - 'flatten', - 'relu', - 'abs', - 'log', - 'clip', - 'ceil', - 'p_norm', - 'maximum', - 'argsort', - 'min', - 'batch_norm', - 'max_pool2d_with_index', - 'pool2d', - 'minimum', - 'prod', - 'round', - 'sin', - 'cos', - 'dot', - 'floor', - 'topk', - 'square', - 'gather', - 'label_smooth', - 'cross_entropy_with_softmax', - 'mean_all', - 'cumsum', - 'linear_interp', - 'bilinear_interp', - 'trilinear_interp', - 'nearest_interp', - 'bicubic_interp', - 'assign', - 'assign_out_', - 'real', - 'flip', - 'softmax', - 'expand', - 'conv2d_transpose', - 'depthwise_conv2d_transpose', - 'sigmoid', - 'pad', - 'pad3d', - 'einsum', - 'leaky_relu', - 'log10', - 'conv3d', - 'solve', - 'diag', - 'trace', - 'tile', -] -vjp_interface_implementation_gen_op_list = [ - 'where', - "tanh", - "mean", - "divide", - "sum", - "add", - "concat", - "split", - "split_with_num", - "gelu", - "matmul", - "erf", - "multiply", - "subtract", - "pow", - "rsqrt", - "square", - "dropout", - 'exp', - 'expm1', - 'expand', - 'layer_norm', - 'reshape', - 'cast', - "scale", - 'softmax', - 'silu', - 'elementwise_pow', - 'embedding', - 'fused_softmax_mask_upper_triangle', - 'slice', - 'transpose', - 'slice_grad', - 'gather_nd', - 'stack', - 'poisson', - 'gumbel_softmax', - 'pad', - 'pad3d', - 'squeeze', - 'unsqueeze', - 'tril', - 'triu', - 'squeeze', - 'unsqueeze', - 'conv2d', - 'depthwise_conv2d', - 'sqrt', - 'flatten', - 'relu', - 'abs', - 'log', - 'clip', - 'ceil', - 'p_norm', - 'maximum', - 'argsort', - 'min', - 'batch_norm', - 'max_pool2d_with_index', - 'pool2d', - 'minimum', - 'prod', - 'round', - 'sin', - 'cos', - 'dot', - 'floor', - 'topk', - 'square', - 'gather', - 'label_smooth', - 'cross_entropy_with_softmax', - 'mean_all', - 'cumsum', - 'linear_interp', - 'bilinear_interp', - 'trilinear_interp', - 'nearest_interp', - 'bicubic_interp', - 'assign', - 'assign_out_', - 'real', - 'flip', - 'softmax', - 'expand', - 'conv2d_transpose', - 'depthwise_conv2d_transpose', - 'sigmoid', - 'pad', - 'pad3d', - 'einsum', - 'leaky_relu', - 'log10', - 'conv3d', - 'solve', - 'diag', - 'trace', - 'tile', -] diff --git a/paddle/fluid/pir/dialect/operator/interface/decomp.h b/paddle/fluid/pir/dialect/operator/interface/decomp.h new file mode 100644 index 00000000000000..10a6e51e7db3c6 --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/interface/decomp.h @@ -0,0 +1,52 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/pir/core/op_base.h" + +namespace paddle { +namespace dialect { +class DecompInterface : public pir::OpInterfaceBase { + public: + struct Concept { + explicit Concept( + std::vector> (*decomp)(pir::Operation* op)) + : decomp_(decomp) {} + std::vector> (*decomp_)(pir::Operation* op); + }; + + template + struct Model : public Concept { + static std::vector> Decomp(pir::Operation* op) { + return ConcreteOp::Decomp(op); + } + Model() : Concept(Decomp) {} + }; + + /// Constructor + DecompInterface(pir::Operation* op, Concept* impl) + : pir::OpInterfaceBase(op), impl_(impl) {} + + std::vector> Decomp(pir::Operation* op) { + return impl_->decomp_(op); + } + + private: + Concept* impl_; +}; + +} // namespace dialect +} // namespace paddle + +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::DecompInterface) diff --git a/paddle/fluid/pir/dialect/operator/interface/interface.cc b/paddle/fluid/pir/dialect/operator/interface/interface.cc index ce8bdb6c6829f8..8a4049ff09544b 100644 --- a/paddle/fluid/pir/dialect/operator/interface/interface.cc +++ b/paddle/fluid/pir/dialect/operator/interface/interface.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/pir/dialect/operator/interface/decomp.h" #include "paddle/fluid/pir/dialect/operator/interface/infermeta.h" #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/interface/vjp.h" @@ -37,3 +38,4 @@ std::vector> VjpInterface::Vjp( IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::InferMetaInterface) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::OpYamlInfoInterface) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::VjpInterface) +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::DecompInterface) diff --git a/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt b/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt index befbb84a7117df..6c07f558e61abc 100644 --- a/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt @@ -52,18 +52,12 @@ set(op_source_file_tmp ${op_source_file}.tmp) set(op_vjp_source_file ${PD_DIALECT_BINARY_DIR}/pd_op_vjp.cc) set(op_vjp_source_file_tmp ${op_vjp_source_file}.tmp) -add_custom_command( - OUTPUT ${op_yaml_file3} ${op_yaml_file4} +execute_process( COMMAND ${CMAKE_COMMAND} -E make_directory ${parsed_op_dir} COMMAND ${PYTHON_EXECUTABLE} ${op_gen_parsed_yaml_file} --op_yaml_path ${pd_op_forward_yaml_file} --output_path ${op_yaml_file3} - COMMENT "Generate pd_ops.parsed.yaml" COMMAND ${PYTHON_EXECUTABLE} ${op_gen_parsed_yaml_file} --op_yaml_path - ${pd_op_backward_yaml_file} --output_path ${op_yaml_file4} --backward - COMMENT "Generate pd_ops_backward.parsed.yaml" - DEPENDS ${op_gen_parsed_yaml_file} ${pd_op_forward_yaml_file} - ${pd_op_backward_yaml_file} - VERBATIM) + ${pd_op_backward_yaml_file} --output_path ${op_yaml_file4} --backward) add_custom_command( OUTPUT ${op_header_file} ${op_source_file} ${op_vjp_source_file} @@ -190,8 +184,8 @@ cc_library( DEPS phi pd_interface pd_trait type_info) cc_library( pd_op_dialect_op - SRCS ${op_source_file} manual_op.cc - DEPS pd_op_dialect_core) + SRCS ${op_source_file} manual_op.cc control_flow_op.cc + DEPS pd_op_dialect_core pir_control_flow) cc_library( api_builder SRCS api_builder.cc @@ -209,6 +203,6 @@ target_include_directories(pd_op_dialect_api INTERFACE 
${PD_DIALECT_BINARY_DIR}) cc_library( pd_op_dialect - SRCS op_dialect.cc manual_op_vjp.cc ${op_vjp_source_file} + SRCS op_dialect.cc manual_op_decomp.cc manual_op_vjp.cc ${op_vjp_source_file} DEPS pd_op_dialect_api param_to_variable primitive_vjp_experimental pd_op_dialect_utils op_yaml_info_parser) diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc new file mode 100644 index 00000000000000..c235799633896b --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc @@ -0,0 +1,232 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifdef GET_OP_LIST +#undef GET_OP_LIST +paddle::dialect::IfOp, paddle::dialect::WhileOp +#else +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" + +#include "paddle/phi/core/enforce.h" +#include "paddle/pir/core/builder.h" +#include "paddle/pir/core/builtin_type.h" +#include "paddle/pir/core/ir_printer.h" +#include "paddle/pir/core/operation_utils.h" +#include "paddle/pir/core/utils.h" +#include "paddle/pir/dialect/control_flow/ir/cf_ops.h" + +namespace paddle { +namespace dialect { + +void IfOp::Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value cond, + std::vector &&output_types) { + VLOG(4) << "Start build IfOp"; + argument.AddRegions(2u); + argument.AddInput(cond); + argument.output_types.swap(output_types); +} + +void IfOp::Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value cond, + std::unique_ptr &&true_block, + std::unique_ptr &&false_block) { + VLOG(4) << "Start build IfOp"; + if (true_block && !true_block->empty() && + true_block->back()->isa()) { + auto *op = true_block->back(); + for (size_t i = 0; i < op->num_operands(); ++i) { + argument.AddOutput(op->operand(i).type()); + } + } + if (false_block && !false_block->empty() && + false_block->back()->isa()) { + auto *op = false_block->back(); + PADDLE_ENFORCE_EQ(op->num_operands(), + argument.output_types.size(), + phi::errors::PreconditionNotMet( + "The output size of true block and false block must " + "be equal, but they are %u and %u, respectively", + argument.output_types.size(), + op->num_operands())); + for (size_t i = 0; i < op->num_operands(); ++i) { + PADDLE_ENFORCE_EQ( + op->operand(i).type(), + argument.output_types[i], + phi::errors::PreconditionNotMet("The output[%d] type of true block " + "and false block must be equal.", + i)); + } + } else { + PADDLE_ENFORCE(argument.output_types.empty(), + phi::errors::PreconditionNotMet( + "The output size of true block and false block must be " + "equal, 
but they are %u and 0, respectively", + argument.output_types.size())); + } + argument.AddRegion()->push_back(true_block.release()); + argument.AddRegion()->push_back(false_block.release()); + argument.AddInput(cond); +} + +pir::Block *IfOp::true_block() { + pir::Region &true_region = (*this)->region(0); + if (true_region.empty()) true_region.emplace_back(); + return true_region.front(); +} +pir::Block *IfOp::false_block() { + pir::Region &false_region = (*this)->region(1); + if (false_region.empty()) false_region.emplace_back(); + return false_region.front(); +} +void IfOp::Print(pir::IrPrinter &printer) { + auto &os = printer.os; + auto op = operation(); + printer.PrintOpResult(op); + os << " = pd_op.if"; + printer.PrintOpOperands(op); + os << " -> "; + printer.PrintOpReturnType(op); + os << "{"; + for (auto item : *true_block()) { + os << "\n "; + printer.PrintOperation(item); + } + os << "\n } else {"; + for (auto item : *false_block()) { + os << "\n "; + printer.PrintOperation(item); + } + os << "\n }"; +} +void IfOp::VerifySig() { + VLOG(4) << "Start Verifying inputs, outputs and attributes for: IfOp."; + auto input_size = num_operands(); + PADDLE_ENFORCE_EQ( + input_size, + 1u, + phi::errors::PreconditionNotMet( + "The size %d of inputs must be equal to 1.", input_size)); + + if ((*this)->operand_source(0).type().isa()) { + PADDLE_ENFORCE( + (*this) + ->operand_source(0) + .type() + .dyn_cast() + .dtype() + .isa(), + phi::errors::PreconditionNotMet( + "Type validation failed for the 1th input, it should be a " + "bool DenseTensorType.")); + } + + PADDLE_ENFORCE_EQ((*this)->num_regions(), + 2u, + phi::errors::PreconditionNotMet( + "The size %d of regions must be equal to 2.", + (*this)->num_regions())); +} + +void IfOp::VerifyRegion() { + VLOG(4) << "Start Verifying sub regions for: IfOp."; + PADDLE_ENFORCE_EQ( + (*this)->region(0).size(), + 1u, + phi::errors::PreconditionNotMet("The size %d of true_region must be 1.", + (*this)->region(0).size())); + + if ((*this)->num_results() != 0) { + PADDLE_ENFORCE_EQ( + (*this)->region(0).size(), + (*this)->region(1).size(), + phi::errors::PreconditionNotMet("The size %d of true_region must be " + "equal to the size %d of false_region.", + (*this)->region(0).size(), + (*this)->region(1).size())); + + auto *true_last_op = (*this)->region(0).front()->back(); + auto *false_last_op = (*this)->region(1).front()->back(); + PADDLE_ENFORCE_EQ(true_last_op->isa(), + true, + phi::errors::PreconditionNotMet( + "The last of true block must be YieldOp")); + PADDLE_ENFORCE_EQ(true_last_op->num_operands(), + (*this)->num_results(), + phi::errors::PreconditionNotMet( + "The size of last of true block op's input must be " + "equal to IfOp's outputs num.")); + PADDLE_ENFORCE_EQ(false_last_op->isa(), + true, + phi::errors::PreconditionNotMet( + "The last of false block must be YieldOp")); + PADDLE_ENFORCE_EQ(false_last_op->num_operands(), + (*this)->num_results(), + phi::errors::PreconditionNotMet( + "The size of last of false block op's input must be " + "equal to IfOp's outputs num.")); + } +} + +void WhileOp::Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value cond, + const std::vector &inputs) { + argument.AddInput(cond); + argument.AddInputs(inputs); + for (auto val : inputs) { + argument.AddOutput(val.type()); + } + argument.AddRegion(nullptr); +} +pir::Block *WhileOp::body_block() { + pir::Region &body_region = (*this)->region(0); + if (body_region.empty()) body_region.emplace_back(); + return 
body_region.front(); +} +pir::Value WhileOp::cond() { return (*this)->operand_source(0); } + +void WhileOp::Print(pir::IrPrinter &printer) { + auto &os = printer.os; + auto op = operation(); + printer.PrintOpResult(op); + os << " = \"" << name() << "\"("; + printer.PrintValue(cond()); + os << ") ["; + auto operands = (*this)->operands_source(); + pir::PrintInterleave( + operands.begin() + 1, + operands.end(), + [&](pir::Value v) { printer.PrintValue(v); }, + [&]() { os << ", "; }); + os << "] { \n ^"; + pir::PrintInterleave( + body_block()->args_begin(), + body_block()->args_end(), + [&](pir::Value v) { printer.PrintValue(v); }, + [&]() { os << ", "; }); + for (auto item : *body_block()) { + os << "\n "; + printer.PrintOperation(item); + } + os << "\n }"; +} +} // namespace dialect +} // namespace paddle + +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::IfOp) +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::WhileOp) + +#endif diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h new file mode 100644 index 00000000000000..3ad3a7c4215c22 --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h @@ -0,0 +1,80 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/pir/core/op_base.h" + +namespace paddle { +namespace dialect { + +class IfOp : public pir::Op { + public: + using Op::Op; + static const char *name() { return "pd_op.if"; } + static constexpr const char **attributes_name = nullptr; + static constexpr uint32_t attributes_num = 0; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value cond, + std::vector &&output_types); + + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value cond, + std::unique_ptr &&true_block, + std::unique_ptr &&false_block); + + pir::Value cond() { return operand_source(0); } + pir::Block *true_block(); + pir::Block *false_block(); + void Print(pir::IrPrinter &printer); // NOLINT + void VerifySig(); + void VerifyRegion(); +}; + +/// +/// \brief The WhileOp is an operation that iterates over a loop body based on a +/// condition. It takes two inputs: cond_value and loop_vars. The output of the +/// WhileOp must have the same arity (length and structure) with loop_vars." 
The +/// semantics of WhileOp[outputs = while_op(cond, inputs)] are as below: +/// outputs = inputs +/// while(cond){ +/// cond, outputs = body(outputs) +/// } +/// +class WhileOp : public pir::Op { + public: + using Op::Op; + static const char *name() { return "pd_op.while"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value cond, + const std::vector &inputs); + pir::Block *body_block(); + pir::Value cond(); + void Print(pir::IrPrinter &printer); // NOLINT + void VerifySig() {} + void VerifyRegion() {} +}; + +} // namespace dialect +} // namespace paddle + +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::IfOp) +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::WhileOp) diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc index eb5acbf2388ea8..be652e48263301 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc @@ -100,5 +100,24 @@ pir::OpResult split_with_num_grad(const std::vector& out_grad, out_grad_combine_op.out(), axis); return split_grad_op.result(0); } + +pir::OpResult ones(const std::vector& shape, + phi::DataType dtype, + const Place& place) { + return paddle::dialect::full(shape, 1, dtype, place); +} + +pir::OpResult ones_like(pir::Value x_, + phi::DataType dtype, + const Place& place) { + return paddle::dialect::full_like(x_, 1, dtype, place); +} + +pir::OpResult zeros(const std::vector& shape, + phi::DataType dtype, + const Place& place) { + return paddle::dialect::full(shape, 0, dtype, place); +} + } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.h b/paddle/fluid/pir/dialect/operator/ir/manual_api.h index fe579295ad5a09..a9df64a905b24d 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_api.h +++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.h @@ -47,5 +47,18 @@ pir::OpResult split_with_num_grad(const std::vector& out_grad, pir::OpResult split_with_num_grad(const std::vector& out_grad, const pir::Value& axis); + +pir::OpResult ones(const std::vector& shape, + phi::DataType dtype = phi::DataType::FLOAT32, + const Place& place = phi::CPUPlace()); + +pir::OpResult ones_like(pir::Value x_, + phi::DataType dtype = phi::DataType::UNDEFINED, + const Place& place = {}); + +pir::OpResult zeros(const std::vector& shape, + phi::DataType dtype = phi::DataType::FLOAT32, + const Place& place = phi::CPUPlace()); + } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc index 66a52f99c6b44e..00ba7da80aa253 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc @@ -50,7 +50,7 @@ OpInfoTuple AddNOp::GetOpInfo() { return std::make_tuple(inputs, attributes, outputs, run_time_info, "add_n"); } -void AddNOp::Verify() { +void AddNOp::VerifySig() { VLOG(4) << "Start Verifying inputs, outputs and attributes for: AddNOp."; VLOG(4) << "Verifying inputs:"; { @@ -222,7 +222,7 @@ void AddN_Op::Build(pir::Builder &builder, argument.AddOutputs(argument_outputs.begin(), argument_outputs.end()); } -void AddN_Op::Verify() { +void AddN_Op::VerifySig() { VLOG(4) << "Start Verifying inputs, outputs and attributes for: AddN_Op."; VLOG(4) << "Verifying inputs:"; { @@ -345,7 +345,7 @@ void 
AddNWithKernelOp::Build(pir::Builder &builder, argument.AddOutputs(argument_outputs.begin(), argument_outputs.end()); } -void AddNWithKernelOp::Verify() { +void AddNWithKernelOp::VerifySig() { VLOG(4) << "Start Verifying inputs, outputs and attributes for: " "AddNWithKernelOp."; VLOG(4) << "Verifying inputs:"; @@ -429,9 +429,9 @@ OpInfoTuple FusedGemmEpilogueOp::GetOpInfo() { paddle::dialect::OpRunTimeInfo run_time_info( "FusedGemmEpilogueInferMeta", {"x", "y", "bias", "trans_x", "trans_y", "activation"}, - "", - {""}, - {""}, + {"fused_gemm_epilogue"}, + {"x", "y", "bias", "trans_x", "trans_y", "activation"}, + {}, {}, {}, {}); @@ -561,7 +561,7 @@ void FusedGemmEpilogueOp::Build(pir::Builder &builder, argument.AddOutputs(argument_outputs.begin(), argument_outputs.end()); } -void FusedGemmEpilogueOp::Verify() { +void FusedGemmEpilogueOp::VerifySig() { VLOG(4) << "Start Verifying inputs, outputs and attributes for: " "FusedGemmEpilogueOp."; VLOG(4) << "Verifying inputs:"; @@ -674,9 +674,15 @@ OpInfoTuple FusedGemmEpilogueGradOp::GetOpInfo() { "trans_x", "trans_y", "activation_grad"}, - "", - {""}, - {""}, + {"fused_gemm_epilogue_grad"}, + {"x", + "y", + "reserve_space", + "out_grad", + "trans_x", + "trans_y", + "activation_grad"}, + {}, {}, {}, {}); @@ -833,7 +839,7 @@ void FusedGemmEpilogueGradOp::Build(pir::Builder &builder, argument.AddOutputs(argument_outputs.begin(), argument_outputs.end()); } -void FusedGemmEpilogueGradOp::Verify() {} +void FusedGemmEpilogueGradOp::VerifySig() {} void FusedGemmEpilogueGradOp::InferMeta(phi::InferMetaContext *infer_meta) { auto fn = PD_INFER_META(phi::FusedGemmEpilogueGradInferMeta); @@ -983,7 +989,7 @@ void SplitGradOp::Build(pir::Builder &builder, argument.AddOutputs(argument_outputs.begin(), argument_outputs.end()); } -void SplitGradOp::Verify() { +void SplitGradOp::VerifySig() { VLOG(4) << "Start Verifying inputs, outputs and attributes for: SplitGradOp."; VLOG(4) << "Verifying inputs:"; { @@ -1040,68 +1046,6 @@ void SplitGradOp::InferMeta(phi::InferMetaContext *infer_meta) { fn(infer_meta); } -void IfOp::Build(pir::Builder &builder, // NOLINT - pir::OperationArgument &argument, // NOLINT - pir::Value cond, - std::vector &&output_types) { - VLOG(4) << "Start build IfOp"; - - argument.AddRegions(2u); - argument.AddInput(cond); - argument.output_types.swap(output_types); -} -pir::Block *IfOp::true_block() { - pir::Region &true_region = (*this)->region(0); - if (true_region.empty()) true_region.emplace_back(); - return true_region.front(); -} -pir::Block *IfOp::false_block() { - pir::Region &false_region = (*this)->region(1); - if (false_region.empty()) false_region.emplace_back(); - return false_region.front(); -} -void IfOp::Print(pir::IrPrinter &printer) { - auto &os = printer.os; - auto op = operation(); - printer.PrintOpResult(op); - os << " = pd_op.if"; - printer.PrintOpOperands(op); - os << " -> "; - printer.PrintOpReturnType(op); - os << "{"; - for (auto item : *true_block()) { - os << "\n "; - printer.PrintOperation(item); - } - os << "\n } else {"; - for (auto item : *false_block()) { - os << "\n "; - printer.PrintOperation(item); - } - os << "\n }"; -} -void IfOp::Verify() {} - -void WhileOp::Build(pir::Builder &builder, // NOLINT - pir::OperationArgument &argument, // NOLINT - const std::vector &inputs, - const std::vector &output_types) { - // auto insert_point = builder.insert_point(); - argument.AddInputs(inputs); - argument.AddOutputs(output_types); - argument.AddRegion(nullptr); - argument.AddRegion(nullptr); -} -pir::Block 
*WhileOp::cond_block() { - pir::Region &cond_region = (*this)->region(0); - if (cond_region.empty()) cond_region.emplace_back(); - return cond_region.front(); -} -pir::Block *WhileOp::body_block() { - pir::Region &body_region = (*this)->region(1); - if (body_region.empty()) body_region.emplace_back(); - return body_region.front(); -} } // namespace dialect } // namespace paddle @@ -1111,5 +1055,3 @@ IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::AddN_Op) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::AddNWithKernelOp) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueOp) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueGradOp) -IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::IfOp) -IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::WhileOp) diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.h b/paddle/fluid/pir/dialect/operator/ir/manual_op.h index 93f24e80cb5248..317ce64feea084 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.h +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.h @@ -45,7 +45,7 @@ class AddNOp : public pir::Op { pir::Value out_grad_, pir::Value axis_); - void Verify(); + void VerifySig(); pir::Value out_grad() { return operand_source(0); } pir::Value axis() { return operand_source(1); } pir::OpResult x_grad() { return result(0); } static void InferMeta(phi::InferMetaContext *infer_meta); }; -class IfOp : public pir::Op { - public: - using Op::Op; - static const char *name() { return "pd_op.if"; } - static constexpr const char **attributes_name = nullptr; - static constexpr uint32_t attributes_num = 0; - static void Build(pir::Builder &builder, // NOLINT - pir::OperationArgument &argument, // NOLINT - pir::Value cond, - std::vector &&output_types); - pir::Value cond() { return operand_source(0); } - pir::Block *true_block(); - pir::Block *false_block(); - void Print(pir::IrPrinter &printer); // NOLINT - void Verify(); -}; - -class WhileOp : public pir::Op { - public: - using Op::Op; - static const char *name() { return "pd.while"; } - static constexpr uint32_t attributes_num = 0; - static constexpr const char **attributes_name = nullptr; - - static void Build(pir::Builder &builder, // NOLINT - pir::OperationArgument &argument, // NOLINT - const std::vector &inputs, - const std::vector &output_types); - void Verify() {} - pir::Block *cond_block(); - pir::Block *body_block(); -}; - } // namespace dialect } // namespace paddle @@ -218,5 +185,3 @@ IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::AddN_Op) IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::AddNWithKernelOp) IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueOp) IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueGradOp) -IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::IfOp) -IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::WhileOp) diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp.cc new file mode 100644 index 00000000000000..e6c84ca2934774 --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include "paddle/fluid/primitive/composite/composite.h" +#include "paddle/fluid/primitive/type/lazy_tensor.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/common/int_array.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/op_base.h" + +// TODO(chenzhuo) +// this file will be generated in pd_op_decomp.cc + +namespace paddle { +namespace dialect { +using IntArray = paddle::experimental::IntArray; + +std::vector> MeanOp::Decomp(pir::Operation* op) { + MeanOp op_obj = op->dyn_cast(); + (void)op_obj; + + VLOG(4) << "Decomp Prepare inputs of mean"; + + Tensor x(std::make_shared(op_obj.x())); + + VLOG(4) << "Decomp prepare attributes of mean"; + + IntArray axis = op->attribute("axis") + .dyn_cast() + .data(); + + bool keepdim = op->attribute("keepdim").dyn_cast().data(); + VLOG(4) << "Decomp mean keep_dim " << keepdim; + + VLOG(4) << "Decomp prepare call mean's decomp interface"; + + Tensor op_res = + paddle::primitive::details::mean_decomp( + x, axis, keepdim); + + auto org_res = op->results(); + std::vector> res(org_res.size()); + res[0].push_back( + std::static_pointer_cast(op_res.impl()) + ->value() + .dyn_cast()); + return res; +} + +} // namespace dialect +} // namespace paddle diff --git a/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc b/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc index 3b69d68eb65f3d..f10db043d1523d 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc @@ -50,10 +50,10 @@ phi::Scalar ScalarAttribute::data() { IntArrayAttribute IntArrayAttribute::Parse(pir::IrParser &parser) { // NOLINT Token buket_token = parser.ConsumeToken(); - std::vector vec{}; + std::vector vec{}; while (parser.PeekToken().val_ != "]") { Token val_token = parser.ConsumeToken(); - vec.push_back(atoll(val_token.val_.c_str())); + vec.push_back(atoi(val_token.val_.c_str())); if (parser.PeekToken().val_ == "]") break; parser.ConsumeToken(); } diff --git a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc index ac62747026ed06..e484d7812d2daa 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" // NOTE(zhangbo9674): File pd_op.h is generated by op_gen.py, see details in // paddle/fluid/pir/dialect/CMakeLists.txt. 
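The MeanOp::Decomp implementation above is the producer side of the new DecompInterface; a consumer-side sketch may help. This is illustrative only, not part of the diff: it assumes the same op->dyn_cast<...>() dispatch this codebase uses for VjpInterface, that the returned interface handle is boolean-testable, and the helper name TryDecompose is invented.

```cpp
#include "paddle/fluid/pir/dialect/operator/interface/decomp.h"

// Hypothetical helper: decompose an op if it registered DecompInterface.
std::vector<std::vector<pir::OpResult>> TryDecompose(pir::Operation* op) {
  auto decomp = op->dyn_cast<paddle::dialect::DecompInterface>();
  if (!decomp) {
    return {};  // this op has no registered decomposition
  }
  // Dispatches through the interface's Concept/Model glue to the concrete
  // op's static Decomp, e.g. MeanOp::Decomp above.
  return decomp.Decomp(op);
}
```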
+#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/dialect/operator/ir/type_storage.h" @@ -50,14 +51,16 @@ void OperatorDialect::initialize() { #define GET_OP_LIST #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" // NOLINT >(); + RegisterOps< +#define GET_OP_LIST +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc" // NOLINT + >(); RegisterOps(); + paddle::dialect::SplitGradOp>(); RegisterInterfaces(); } @@ -123,7 +126,7 @@ pir::Type OperatorDialect::ParseType(pir::IrParser &parser) { // NOLINT break; } parser.ConsumeToken(); - parser.lexer->Unget(peek_token_val.size() - 1); + parser.lexer->Unget(static_cast(peek_token_val.size() - 1)); if (parser.PeekToken().token_type_ != DIGIT) { break; } @@ -163,6 +166,8 @@ void OperatorDialect::PrintOperation(pir::Operation *op, pir::IrPrinter &printer) const { if (auto if_op = op->dyn_cast()) { if_op.Print(printer); + } else if (auto while_op = op->dyn_cast()) { + while_op.Print(printer); } else { printer.PrintGeneralOperation(op); } diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml index 29835f84908194..899863d58aba12 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml @@ -7,7 +7,6 @@ kernel: func: add_n param: [inputs] - backward: add_n_grad - op : add_n_with_kernel args : (Tensor[] inputs) @@ -18,7 +17,6 @@ kernel: func: add_n param: [inputs] - backward: add_n_grad - op : assert args : (Tensor cond, Tensor[] data, int64_t summarize = -1) @@ -175,16 +173,25 @@ - op : write_to_array args : (Tensor i, Tensor x) output : Tensor[](out) - backward: write_to_array_grad + +- op: dpsgd + args: (Tensor param, Tensor grad, Tensor learning_rate, float clip = 10.0f, float batch_size = 16.0f, float sigma = 1.0f, int seed = 0) + output: Tensor(param_out) + infer_meta: + func: DpsgdInferMeta + kernel: + func: dpsgd + data_type: param - op: fused_attention args: (Tensor x, Tensor ln_scale, Tensor ln_bias, Tensor qkv_weight, Tensor qkv_bias, Tensor cache_kv, Tensor src_mask, Tensor out_linear_weight, Tensor out_linear_bias, Tensor ln_scale_2, Tensor ln_bias_2, int num_heads, bool transpose_qkv_wb, bool pre_layer_norm, float epsilon, float attn_dropout_rate, bool is_test, bool attn_dropout_fix_seed, int attn_dropout_seed, str attn_dropout_implementation, float dropout_rate, bool dropout_fix_seed, int dropout_seed, str dropout_implementation, float ln_epsilon, bool add_residual, int ring_id) output: Tensor(ln_mean), Tensor(ln_var), Tensor(ln_out), Tensor(qkv_out), Tensor(qkv_bias_out), Tensor(transpose_out_2), Tensor(qk_out), Tensor(qktv_out), Tensor(softmax_out), Tensor(attn_dropout_mask_out), Tensor(attn_dropout_out), Tensor(src_mask_out), Tensor(fmha_out), Tensor(out_linear_out), Tensor(dropout_mask_out), Tensor(ln_mean_2), Tensor(ln_var_2), Tensor(bias_dropout_residual_out), Tensor(cache_kv_out), Tensor(out) kernel: func: fused_attention + data_type : x infer_meta: func: FusedAttentionInferMeta - optional: cache_kv, ln_scale, ln_bias, qkv_bias, src_mask, out_linear_bias, ln_scale_2, ln_bias_2, ln_mean_2 + optional: cache_kv, ln_scale, ln_bias, qkv_bias, src_mask, out_linear_bias, ln_scale_2, ln_bias_2, ln_mean_2, ln_var_2, bias_dropout_residual_out, cache_kv_out backward: fused_attention_grad - op: fused_feedforward @@ -192,6 +199,7 @@ output: Tensor(out), 
Tensor(dropout1_mask), Tensor(dropout2_mask), Tensor(ln1_mean), Tensor(ln1_variance), Tensor(ln2_mean), Tensor(ln2_variance), Tensor(linear1_out), Tensor(ln1_out), Tensor(dropout1_out), Tensor(dropout2_out) kernel: func: fused_feedforward + data_type : x infer_meta: func: FusedFeedForwardInferMeta optional: dropout1_seed, dropout2_seed, linear1_bias, linear2_bias, ln1_scale, ln1_bias, ln2_scale, ln2_bias, ln2_mean, ln2_variance, ln1_mean, ln1_variance, ln1_out diff --git a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc index 48e69b689c4981..5452cd6f47f30e 100644 --- a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc +++ b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc @@ -222,5 +222,17 @@ const std::string& OpYamlInfoParser::GetOriginOpName() const { return std::get<4>(op_info_tuple_); } +int OpYamlInfoParser::GetTensorParamIndexByArgsName( + const std::string& args_name) const { + const auto& iter = std::find(kernel_fn_tensor_params_.begin(), + kernel_fn_tensor_params_.end(), + args_name); + if (iter != kernel_fn_tensor_params_.end()) { + return std::distance(kernel_fn_tensor_params_.begin(), iter); + } else { + return -1; + } +} + } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h index 6a4bec08c2b3dc..0a972ced0ef41d 100644 --- a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h +++ b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h @@ -65,6 +65,8 @@ class OpYamlInfoParser { const std::string& GetOriginOpName() const; + int GetTensorParamIndexByArgsName(const std::string& args_name) const; + private: void parse(); inline const std::vector& InputInfo() const { diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc index e95ff4b44fcb34..0aa2eaf143f7e9 100644 --- a/paddle/fluid/pir/dialect/operator/utils/utils.cc +++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc @@ -13,6 +13,7 @@ // limitations under the License. 
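A short usage sketch of the GetTensorParamIndexByArgsName lookup added to OpYamlInfoParser above (illustrative; the parser instance and the "x" argument name are assumptions):

```cpp
#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h"

void UseTensorParamIndex(const paddle::dialect::OpYamlInfoParser& parser) {
  // Returns the position of "x" among the kernel's tensor parameters,
  // or -1 when no tensor parameter has that name.
  int index = parser.GetTensorParamIndexByArgsName("x");
  if (index >= 0) {
    // "x" is the index-th tensor argument of the kernel signature.
  }
}
```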
#include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" namespace paddle { @@ -22,10 +23,11 @@ const std::unordered_set LegacyOpList = { "pd_op.load_combine", "pd_op.c_concat", "pd_op.c_broadcast_", - "pd_op.fused_bn_add_activation_", - "pd_op.fused_bn_add_activation_grad", "pd_op.c_sync_calc_stream_", "pd_op.c_sync_comm_stream_", + "pd_op.fused_gemm_epilogue", + "pd_op.fused_gemm_epilogue_grad", + "pd_op.dpsgd", "pd_op.send_v2", "pd_op.recv_v2", "pd_op.c_allreduce_sum", diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.h b/paddle/fluid/pir/dialect/operator/utils/utils.h index 6da122af99716c..1c228e7e850834 100644 --- a/paddle/fluid/pir/dialect/operator/utils/utils.h +++ b/paddle/fluid/pir/dialect/operator/utils/utils.h @@ -14,14 +14,13 @@ #pragma once -// #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/pir/dialect/operator/ir/type_storage.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/attribute.h" #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/core/builtin_type.h" +#include "paddle/pir/core/value.h" namespace paddle { namespace dialect { @@ -128,44 +127,6 @@ static inline pir::Attribute TransToIrAttribute(phi::Scalar scalar, } } -inline DataType VarTypeToDataType( - ::paddle::framework::proto::VarType_Type var_type) { - switch (var_type) { - case paddle::framework::proto::VarType_Type::VarType_Type_BOOL: - return DataType::BOOL; - case paddle::framework::proto::VarType_Type::VarType_Type_INT16: - return DataType::INT16; - case paddle::framework::proto::VarType_Type::VarType_Type_INT32: - return DataType::INT32; - case paddle::framework::proto::VarType_Type::VarType_Type_INT64: - return DataType::INT64; - case paddle::framework::proto::VarType_Type::VarType_Type_FP16: - return DataType::FLOAT16; - case paddle::framework::proto::VarType_Type::VarType_Type_FP32: - return DataType::FLOAT32; - case paddle::framework::proto::VarType_Type::VarType_Type_FP64: - return DataType::FLOAT64; - case paddle::framework::proto::VarType_Type::VarType_Type_SIZE_T: - return DataType::UINT64; - case paddle::framework::proto::VarType_Type::VarType_Type_UINT8: - return DataType::UINT8; - case paddle::framework::proto::VarType_Type::VarType_Type_INT8: - return DataType::INT8; - case paddle::framework::proto::VarType_Type::VarType_Type_BF16: - return DataType::BFLOAT16; - case paddle::framework::proto::VarType_Type::VarType_Type_COMPLEX64: - return DataType::COMPLEX64; - case paddle::framework::proto::VarType_Type::VarType_Type_COMPLEX128: - return DataType::COMPLEX128; - case paddle::framework::proto::VarType_Type::VarType_Type_PSTRING: - return DataType::PSTRING; - default: - PADDLE_THROW(phi::errors::Unimplemented( - "Unsupported proto::VarType_Type `%s` when casting it into DataType.", - var_type)); - } -} - VariantType GetAttributeData(const pir::Attribute& attr); bool IsLegacyOp(const std::string& name); diff --git a/paddle/fluid/pir/drr/CMakeLists.txt b/paddle/fluid/pir/drr/CMakeLists.txt new file mode 100644 index 00000000000000..c1b524dda69a6a --- /dev/null +++ b/paddle/fluid/pir/drr/CMakeLists.txt @@ -0,0 +1,65 @@ +file(GLOB DRR_SRCS "*.cc" "api/*.cc") + +set(op_creator_gen_file + ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/dialect/op_generator/op_creator_drr_gen.py +) +set(op_compat_yaml_file ${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/op_compat.yaml) 
+set(op_forward_yaml_file1 + ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/ops.parsed.yaml +) +set(op_forward_yaml_file2 + ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/legacy_ops.parsed.yaml +) +set(op_backward_yaml_file1 + ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/backward_ops.parsed.yaml +) +set(op_backward_yaml_file2 + ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/legacy_backward_ops.parsed.yaml +) +set(fused_op_forward_yaml_file + ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/fused_ops.parsed.yaml +) +set(fused_op_backward_yaml_file + ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/fused_backward.parsed.yaml +) + +set(parsed_op_dir + ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/dialect/operator/ir/generated) + +set(op_yaml_file3 ${parsed_op_dir}/ops.parsed.yaml) +set(op_yaml_file4 ${parsed_op_dir}/ops_backward.parsed.yaml) + +set(op_yaml_files + ${op_forward_yaml_file1},${op_forward_yaml_file2},${op_backward_yaml_file1},${op_backward_yaml_file2},${fused_op_forward_yaml_file},${fused_op_backward_yaml_file},${op_yaml_file3},${op_yaml_file4} +) + +set(op_creator_file + ${PADDLE_BINARY_DIR}/paddle/fluid/pir/drr/ir_op_factory_generated.cc) +set(op_creator_file_tmp ${op_creator_file}.tmp) + +set(dialect_name pd_op) + +add_custom_command( + OUTPUT ${op_creator_file} + COMMAND + ${PYTHON_EXECUTABLE} ${op_creator_gen_file} --op_yaml_files ${op_yaml_files} + --op_compat_yaml_file ${op_compat_yaml_file} --dialect_name ${dialect_name} + --op_creator_file ${op_creator_file_tmp} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${op_creator_file_tmp} + ${op_creator_file} + COMMENT "copy_if_different ${op_creator_file}" + DEPENDS ${op_creator_gen_file} + ${op_forward_yaml_file1} + ${op_forward_yaml_file2} + ${op_backward_yaml_file1} + ${op_backward_yaml_file2} + ${op_compat_yaml_file} + ${op_yaml_file3} + ${op_yaml_file4} + pd_op_dialect_op + VERBATIM) + +cc_library( + drr + SRCS ${DRR_SRCS} ${op_creator_file} + DEPS pd_op_dialect pir) diff --git a/paddle/fluid/pir/drr/api/drr_pattern_base.h b/paddle/fluid/pir/drr/api/drr_pattern_base.h new file mode 100644 index 00000000000000..d5f19ff3e6e9be --- /dev/null +++ b/paddle/fluid/pir/drr/api/drr_pattern_base.h @@ -0,0 +1,41 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/pir/drr/api/drr_pattern_context.h" +#include "paddle/fluid/pir/drr/drr_rewrite_pattern.h" + +namespace pir { +namespace drr { + +template +class DrrPatternBase { + public: + virtual ~DrrPatternBase() = default; + + // Define the Drr Pattern. 
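To make the DrrPatternBase contract concrete before the pure-virtual hook below, here is a hedged sketch of a client pattern. The SourcePattern helpers (Op, Tensor, ResultPattern) and the CRTP template parameter are assumptions inferred from this diff (the stripped template argument of DrrRewritePattern and the DrrPatternContext API shown later); the pattern class and op names are invented.

```cpp
#include "paddle/fluid/pir/drr/api/drr_pattern_base.h"

// Invented example: fold two consecutive casts into a single cast.
class FoldDoubleCastPattern
    : public pir::drr::DrrPatternBase<FoldDoubleCastPattern> {
 public:
  void operator()(pir::drr::DrrPatternContext* ctx) const override {
    // Source graph to match (helper names assumed from the context API).
    pir::drr::SourcePattern pat = ctx->SourcePattern();
    const auto& cast1 = pat.Op("pd_op.cast");
    const auto& cast2 = pat.Op("pd_op.cast");
    pat.Tensor("mid") = cast1(pat.Tensor("in"));
    pat.Tensor("out") = cast2(pat.Tensor("mid"));
    // Result graph to rewrite to: one cast straight from "in" to "out".
    auto res = pat.ResultPattern();
    res.Tensor("out") = res.Op("pd_op.cast")(res.Tensor("in"));
  }
};
```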
+ virtual void operator()(pir::drr::DrrPatternContext* ctx) const = 0; + + std::unique_ptr<DrrRewritePattern<DrrPattern>> Build( + pir::IrContext* ir_context, pir::PatternBenefit benefit = 1) const { + DrrPatternContext drr_context; + this->operator()(&drr_context); + return std::make_unique<DrrRewritePattern<DrrPattern>>( + drr_context, ir_context, benefit); + } +}; + +} // namespace drr +} // namespace pir
diff --git a/paddle/fluid/pir/drr/api/drr_pattern_context.cc b/paddle/fluid/pir/drr/api/drr_pattern_context.cc new file mode 100644 index 00000000000000..5f74b986f1a5e7 --- /dev/null +++ b/paddle/fluid/pir/drr/api/drr_pattern_context.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/drr/api/drr_pattern_context.h" + +#include "paddle/fluid/pir/drr/pattern_graph.h" +#include "paddle/pir/core/enforce.h" + +namespace pir { +namespace drr { + +DrrPatternContext::DrrPatternContext() { + source_pattern_graph_ = std::make_shared<SourcePatternGraph>(); + result_pattern_graph_ = std::make_shared<ResultPatternGraph>(); +} + +drr::SourcePattern DrrPatternContext::SourcePattern() { + return drr::SourcePattern(this); +} +const Op& DrrPatternContext::SourceOpPattern( + const std::string& op_type, + const std::unordered_map<std::string, Attribute>& attributes) { + owned_ops_.push_back(std::shared_ptr<drr::Op>( + new drr::Op(op_type, attributes, source_pattern_graph_.get()))); + return *owned_ops_.back(); +} + +const drr::Tensor& DrrPatternContext::SourceTensorPattern( + const std::string& name) { + return source_pattern_graph_->AddTensor(std::shared_ptr<drr::Tensor>( + new drr::Tensor(name, source_pattern_graph_.get()))); +} + +const Op& DrrPatternContext::ResultOpPattern( + const std::string& op_type, + const std::unordered_map<std::string, Attribute>& attributes) { + owned_ops_.push_back(std::shared_ptr<drr::Op>( + new drr::Op(op_type, attributes, result_pattern_graph_.get()))); + return *owned_ops_.back(); +} + +drr::Tensor& DrrPatternContext::ResultTensorPattern(const std::string& name) { + return result_pattern_graph_->AddTensor(std::shared_ptr<drr::Tensor>( + new drr::Tensor(name, result_pattern_graph_.get()))); +} + +std::vector<Constraint> DrrPatternContext::constraints() const { + return constraints_; +} + +// void DrrPatternContext::RequireEqual(const Attribute& first, const Attribute& +// second) { +// auto constrain_fn = [&](const MatchContext& match_context) { +// return match_context.Attr(first.id()) == match_context.Attr(second.id()); +// }; +// constraints_.emplace_back(constrain_fn); +// } + +void DrrPatternContext::RequireEqual(const TensorShape& first, + const TensorShape& second) { + // Note: we capture the data by value for constrain_fn, + // because they are destructed before constrain_fn runs.
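// (TensorShape objects such as `first`/`second` are typically temporaries
// produced by Tensor::shape(); a by-reference capture would leave the lambda
// holding dangling references by the time the constraint runs during
// matching.)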
+ auto constrain_fn = [=](const MatchContext& match_context) { + return match_context.Tensor(first.tensor_name()).Shape() == + match_context.Tensor(second.tensor_name()).Shape(); + }; + constraints_.emplace_back(constrain_fn); +} + +void DrrPatternContext::RequireEqual(const TensorDataType& first, + const TensorDataType& second) { + // Note: we capture the data by value for constrain_fn, + // because they are destructed before constrain_fn runs. + auto constrain_fn = [=](const MatchContext& match_context) { + return match_context.Tensor(first.tensor_name()).Dtype() == + match_context.Tensor(second.tensor_name()).Dtype(); + }; + constraints_.emplace_back(constrain_fn); +} + +void DrrPatternContext::RequireNativeCall( + const std::function<bool(const MatchContext& match_context)>& custom_fn) { + constraints_.emplace_back(custom_fn); +} + +void Op::operator()(const Tensor& arg, const Tensor* out) const { + std::vector<const Tensor*> inputs{&arg}; + std::vector<const Tensor*> outputs{out}; + pattern_graph_->AddOpCall(std::make_shared<OpCall>(this, inputs, outputs)); +} + +void Op::operator()(const std::vector<const Tensor*>& args, + const std::vector<const Tensor*>& outputs) const { + pattern_graph_->AddOpCall(std::make_shared<OpCall>(this, args, outputs)); +} + +Tensor& Op::operator()(const Tensor& arg) const { + std::vector<const Tensor*> inputs{&arg}; + auto& out = pattern_graph_->AddTmpTensor(std::shared_ptr<Tensor>(new Tensor( + prefix + op_type_name_ + "_" + std::to_string(count++), pattern_graph_))); + std::vector<const Tensor*> outputs{&out}; + pattern_graph_->AddOpCall(std::make_shared<OpCall>(this, inputs, outputs)); + return out; +} + +Tensor& Op::operator()(const Tensor& arg1, const Tensor& arg2) const { + std::vector<const Tensor*> inputs{&arg1, &arg2}; + auto& out = pattern_graph_->AddTmpTensor(std::shared_ptr<Tensor>(new Tensor( + prefix + op_type_name_ + "_" + std::to_string(count++), pattern_graph_))); + std::vector<const Tensor*> outputs{&out}; + pattern_graph_->AddOpCall(std::make_shared<OpCall>(this, inputs, outputs)); + return out; +} + +Tensor& Op::operator()() const { + std::vector<const Tensor*> inputs{}; + auto& out = pattern_graph_->AddTmpTensor(std::shared_ptr<Tensor>(new Tensor( + prefix + op_type_name_ + "_" + std::to_string(count++), pattern_graph_))); + std::vector<const Tensor*> outputs{&out}; + pattern_graph_->AddOpCall(std::make_shared<OpCall>(this, inputs, outputs)); + return out; +} + +thread_local int64_t Op::count = 0; +const char* Op::prefix = "@drr_temp@_"; + +const char Tensor::NONE_TENSOR_NAME[] = "__@none_tensor@__"; + +void Tensor::Assign(const Tensor& other) { + dynamic_cast<ResultPatternGraph*>(pattern_graph_)->AssignTensor(*this, other); +} + +void Tensor::operator=(const Tensor& other) const { // NOLINT + // The two tensors must be in the same pattern graph. + IR_ENFORCE(this->pattern_graph_ == other.pattern_graph_); + if (other.name_.find(Op::prefix) == 0 && + name_.find(Op::prefix) == std::string::npos) { + other.pattern_graph_->UpdateTmpTensor(other.name_, this->name_); + } +} + +} // namespace drr +} // namespace pir
diff --git a/paddle/fluid/pir/drr/api/drr_pattern_context.h b/paddle/fluid/pir/drr/api/drr_pattern_context.h new file mode 100644 index 00000000000000..b4156bd54bf414 --- /dev/null +++ b/paddle/fluid/pir/drr/api/drr_pattern_context.h @@ -0,0 +1,334 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <any> +#include <functional> +#include <memory> +#include <string> +#include <unordered_map> +#include <variant> +#include <vector> + +#include "paddle/fluid/pir/drr/api/match_context.h" + +namespace pir { +namespace drr { + +class Op; +class Tensor; +class OpCall; +class SourcePattern; +class ResultPattern; +class PatternGraph; +class SourcePatternGraph; +class ResultPatternGraph; + +class NormalAttribute { + public: + explicit NormalAttribute(const std::string& name) : attr_name_(name) {} + + const std::string& name() const { return attr_name_; } + + private: + std::string attr_name_; +}; + +using AttrComputeFunc = std::function<std::any(const MatchContext&)>; + +class ComputeAttribute { + public: + explicit ComputeAttribute(const AttrComputeFunc& attr_compute_func) + : attr_compute_func_(attr_compute_func) {} + + const AttrComputeFunc& attr_compute_func() const { + return attr_compute_func_; + } + + private: + AttrComputeFunc attr_compute_func_; +}; + +using Attribute = std::variant<NormalAttribute, ComputeAttribute>; + +class TensorShape { + public: + explicit TensorShape(const std::string& tensor_name) + : tensor_name_(tensor_name) {} + + const std::string& tensor_name() const { return tensor_name_; } + + private: + std::string tensor_name_; +}; + +class TensorDataType { + public: + explicit TensorDataType(const std::string& tensor_name) + : tensor_name_(tensor_name) {} + + const std::string& tensor_name() const { return tensor_name_; } + + private: + std::string tensor_name_; +}; + +class Constraint { + public: + explicit Constraint( + const std::function<bool(const MatchContext&)>& constrain_fn) + : IsContextMatchConstraint_(constrain_fn) {} + bool operator()(const MatchContext& match_context) const { + return IsContextMatchConstraint_(match_context); + } + + private: + std::function<bool(const MatchContext&)> IsContextMatchConstraint_; +}; + +class DrrPatternContext { + public: + DrrPatternContext(); + ~DrrPatternContext() = default; + + drr::SourcePattern SourcePattern(); + + std::shared_ptr<SourcePatternGraph> source_pattern_graph() const { + return source_pattern_graph_; + } + + std::vector<Constraint> constraints() const; + + std::shared_ptr<ResultPatternGraph> result_pattern_graph() const { + return result_pattern_graph_; + } + + private: + friend class drr::SourcePattern; + friend class drr::ResultPattern; + + const Op& SourceOpPattern( + const std::string& op_type, + const std::unordered_map<std::string, Attribute>& attributes = {}); + const drr::Tensor& SourceTensorPattern(const std::string& name); + + const Op& ResultOpPattern( + const std::string& op_type, + const std::unordered_map<std::string, Attribute>& attributes = {}); + drr::Tensor& ResultTensorPattern(const std::string& name); + + // void RequireEqual(const Attribute& first, const Attribute& second); + void RequireEqual(const TensorShape& first, const TensorShape& second); + void RequireEqual(const TensorDataType& first, const TensorDataType& second); + void RequireNativeCall( + const std::function<bool(const MatchContext& match_context)>& custom_fn); + + std::shared_ptr<SourcePatternGraph> source_pattern_graph_; + std::vector<Constraint> constraints_; + std::shared_ptr<ResultPatternGraph> result_pattern_graph_; + + std::vector<std::shared_ptr<const drr::Op>> owned_ops_; +}; + +class Op { + public: + const std::string& name() const { return op_type_name_; } + + void operator()(const Tensor& arg, const Tensor* out) const; + + Tensor& operator()() const; + + Tensor&
operator()(const Tensor& arg) const; + Tensor& operator()(const Tensor& arg0, const Tensor& arg1) const; + void operator()(const std::vector<const Tensor*>& args, + const std::vector<const Tensor*>& outputs) const; + // const Tensor& operator()(const Tensor& arg0, const Tensor& arg1, const + // Tensor& arg2) const; const Tensor& operator()(const Tensor& arg0, const + // Tensor& arg1, const Tensor& arg2, const Tensor& arg3) const; const Tensor& + // operator()(const Tensor& arg0, const Tensor& arg1, const Tensor& arg2, + // const Tensor& arg3, const Tensor& arg4) const; + + static const char* prefix; + + private: + friend class DrrPatternContext; + friend class OpCall; + + Op(const std::string& op_type_name, + const std::unordered_map<std::string, Attribute>& attributes, + PatternGraph* pattern_graph) + : op_type_name_(op_type_name), + attributes_(attributes), + pattern_graph_(pattern_graph) {} + + const std::unordered_map<std::string, Attribute>& attributes() const { + return attributes_; + } + + thread_local static int64_t count; + + std::string op_type_name_; + std::unordered_map<std::string, Attribute> attributes_; + PatternGraph* pattern_graph_{nullptr}; +}; + +class Tensor { + public: + static const char NONE_TENSOR_NAME[]; + + const std::string& DebugName() const; + + TensorShape shape() const { return TensorShape(name()); } + + TensorDataType dtype() const { return TensorDataType(name()); } + + bool is_none() const { return name_ == NONE_TENSOR_NAME; } + + void Assign(const Tensor& other); + + void operator=(const Tensor& other) const; // NOLINT + + const std::string& name() const { return name_; } + + void set_name(const std::string& name) { name_ = name; } + + OpCall* producer() const { return producer_; } + + void set_producer(OpCall* producer) { producer_ = producer; } + + const std::vector<const OpCall*>& consumers() const { return consumers_; } + + void set_consumables(const std::vector<const OpCall*>& consumers) { + consumers_ = consumers; + } + + void AddConsumer(const OpCall* consumer) { consumers_.push_back(consumer); } + + private: + friend class DrrPatternContext; + friend class Op; + + Tensor(const std::string& name, PatternGraph* pattern_graph) + : name_(name), pattern_graph_(pattern_graph) {} + + std::string name_; + OpCall* producer_{nullptr}; + std::vector<const OpCall*> consumers_; + PatternGraph* pattern_graph_{nullptr}; +}; + +class OpCall { + public: + OpCall(const Op* op, + const std::vector<const Tensor*>& inputs, + const std::vector<const Tensor*>& outputs) + : op_name_(op->op_type_name_), + inputs_(inputs), + outputs_(outputs), + attributes_(op->attributes_) {} + + const std::string& name() const { return op_name_; } + + const std::vector<const Tensor*>& inputs() const { return inputs_; } + + const std::vector<const Tensor*>& outputs() const { return outputs_; } + + const std::unordered_map<std::string, Attribute>& attributes() const { + return attributes_; + } + + private: + std::string op_name_; + std::vector<const Tensor*> inputs_; + std::vector<const Tensor*> outputs_; + std::unordered_map<std::string, Attribute> attributes_; +}; + +class ResultPattern { + public: + const drr::Op& Op( + const std::string& op_type, + const std::unordered_map<std::string, Attribute>& attributes = {}) { + return ctx_->ResultOpPattern(op_type, attributes); + } + + drr::Tensor& Tensor(const std::string& name) { + return ctx_->ResultTensorPattern(name); + } + + // Represents an input tensor that is none. + // Example: + // instance_norm has the following input tensors: (x, scale, bias); scale + // and bias are optional (they may be none).
+ // When scale is none, we can write an instance_norm op in drr as follows: + // res.Op("instance_norm")(res.Tensor("x"), res.NoneTensor(), + // res.Tensor("bias")); + drr::Tensor& NoneTensor() { + return ctx_->ResultTensorPattern(Tensor::NONE_TENSOR_NAME); + } + + Attribute Attr(const std::string& attr_name) const { + return NormalAttribute(attr_name); + } + Attribute Attr(const AttrComputeFunc& attr_compute_func) const { + return ComputeAttribute(attr_compute_func); + } + + private: + friend class SourcePattern; + + explicit ResultPattern(DrrPatternContext* ctx) : ctx_(ctx) {} + + DrrPatternContext* ctx_{nullptr}; +}; + +class SourcePattern { + public: + drr::ResultPattern ResultPattern() const { return drr::ResultPattern(ctx_); } + + const drr::Op& Op( + const std::string& op_type, + const std::unordered_map<std::string, Attribute>& attributes = {}) { + return ctx_->SourceOpPattern(op_type, attributes); + } + + const drr::Tensor& Tensor(const std::string& name) { + return ctx_->SourceTensorPattern(name); + } + + Attribute Attr(const std::string& attr_name) const { + return NormalAttribute(attr_name); + } + + void RequireEqual(const TensorShape& first, const TensorShape& second) { + ctx_->RequireEqual(first, second); + } + void RequireEqual(const TensorDataType& first, const TensorDataType& second) { + ctx_->RequireEqual(first, second); + } + + void RequireNativeCall( + const std::function<bool(const MatchContext& match_context)>& custom_fn) { + ctx_->RequireNativeCall(custom_fn); + } + + private: + friend class DrrPatternContext; + explicit SourcePattern(DrrPatternContext* ctx) : ctx_(ctx) {} + DrrPatternContext* ctx_{nullptr}; +}; + +} // namespace drr +} // namespace pir
diff --git a/paddle/fluid/pir/drr/api/match_context.cc b/paddle/fluid/pir/drr/api/match_context.cc new file mode 100644 index 00000000000000..35b28db13254ed --- /dev/null +++ b/paddle/fluid/pir/drr/api/match_context.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
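// MatchContext is the read-only view of a matched source pattern that user
// code receives, e.g. inside a RequireNativeCall constraint. Hypothetical
// sketch (tensor/attribute names invented for illustration):
//   src.RequireNativeCall([](const pir::drr::MatchContext& ctx) -> bool {
//     return ctx.Attr<int32_t>("axis") == 1 &&
//            ctx.Tensor("x").Shape().size() == 2;
//   });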
+ +#include "paddle/fluid/pir/drr/api/match_context.h" + +#include + +#include "paddle/fluid/pir/drr/ir_operation.h" +#include "paddle/fluid/pir/drr/match_context_impl.h" + +namespace pir { +namespace drr { + +MatchContext::MatchContext(std::shared_ptr impl) + : impl_(impl) {} + +const TensorInterface& MatchContext::Tensor( + const std::string& tensor_name) const { + return impl_->Tensor(tensor_name); +} + +template +T MatchContext::Attr(const std::string& attr_name) const { + return impl_->Attr(attr_name); +} + +template bool MatchContext::Attr(const std::string&) const; +template int32_t MatchContext::Attr(const std::string&) const; +template int64_t MatchContext::Attr(const std::string&) const; +template float MatchContext::Attr(const std::string&) const; +template std::string MatchContext::Attr(const std::string&) const; +template std::vector MatchContext::Attr>( + const std::string&) const; +template std::vector MatchContext::Attr>( + const std::string&) const; + +} // namespace drr +} // namespace pir diff --git a/paddle/fluid/pir/drr/api/match_context.h b/paddle/fluid/pir/drr/api/match_context.h new file mode 100644 index 00000000000000..a1699ccb5bddf6 --- /dev/null +++ b/paddle/fluid/pir/drr/api/match_context.h @@ -0,0 +1,43 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/pir/drr/api/tensor_interface.h" +#include "paddle/fluid/pir/drr/ir_operation.h" + +namespace pir { +namespace drr { + +class TensorInterface; +class MatchContextImpl; + +class MatchContext final { + public: + MatchContext(std::shared_ptr impl); + + const TensorInterface& Tensor(const std::string& tensor_name) const; + + template + T Attr(const std::string& attr_name) const; + + private: + std::shared_ptr impl_; +}; + +} // namespace drr +} // namespace pir diff --git a/paddle/fluid/pir/drr/api/tensor_interface.cc b/paddle/fluid/pir/drr/api/tensor_interface.cc new file mode 100644 index 00000000000000..1b81b3a5672117 --- /dev/null +++ b/paddle/fluid/pir/drr/api/tensor_interface.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/pir/drr/api/tensor_interface.h" +#include "paddle/fluid/pir/drr/ir_value.h" + +namespace pir { +namespace drr { + +bool ShapeInterface::operator==(const ShapeInterface& other) const { + return *shape_ == *other.shape_; +} + +int ShapeInterface::size() const { return shape_->size(); } + +int64_t ShapeInterface::at(int idx) const { return shape_->at(idx); } + +bool DtypeInterface::operator==(const DtypeInterface& other) const { + return *dtype_ == *other.dtype_; +} + +} // namespace drr +} // namespace pir diff --git a/paddle/fluid/pir/drr/api/tensor_interface.h b/paddle/fluid/pir/drr/api/tensor_interface.h new file mode 100644 index 00000000000000..7629857591bf33 --- /dev/null +++ b/paddle/fluid/pir/drr/api/tensor_interface.h @@ -0,0 +1,61 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace pir { +namespace drr { + +class IrValue; +class IrShape; +class IrDtype; + +class ShapeInterface final { + public: + bool operator==(const ShapeInterface& other) const; + + int size() const; + + int64_t at(int idx) const; + + private: + explicit ShapeInterface(const IrShape* shape) : shape_(shape) {} + + friend class IrValue; + + const IrShape* shape_; +}; + +class DtypeInterface final { + public: + bool operator==(const DtypeInterface& other) const; + + private: + explicit DtypeInterface(const IrDtype* dtype) : dtype_(dtype) {} + + friend class IrValue; + + const IrDtype* dtype_; +}; + +class TensorInterface { + public: + virtual ShapeInterface Shape() const = 0; + virtual DtypeInterface Dtype() const = 0; +}; + +} // namespace drr +} // namespace pir diff --git a/paddle/fluid/pir/drr/attr_type_uilts.h b/paddle/fluid/pir/drr/attr_type_uilts.h new file mode 100644 index 00000000000000..fb989fe063b771 --- /dev/null +++ b/paddle/fluid/pir/drr/attr_type_uilts.h @@ -0,0 +1,116 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include <type_traits> + +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/pir/core/builtin_attribute.h" + +namespace pir { +namespace drr { + +template <typename T> +struct CppTypeToIrAttribute; + +#define PD_SPECIALIZE_CppTypeToIrAttribute(cpp_type, ir_attr_type) \ + template <> \ + struct CppTypeToIrAttribute< \ + std::remove_const_t<std::decay_t<cpp_type>>> { \ + using type = ir_attr_type; \ + }; + +PD_SPECIALIZE_CppTypeToIrAttribute(bool, BoolAttribute); +PD_SPECIALIZE_CppTypeToIrAttribute(int32_t, Int32Attribute); +PD_SPECIALIZE_CppTypeToIrAttribute(int64_t, Int64Attribute); +PD_SPECIALIZE_CppTypeToIrAttribute(float, FloatAttribute); +PD_SPECIALIZE_CppTypeToIrAttribute(std::string, StrAttribute); +PD_SPECIALIZE_CppTypeToIrAttribute(phi::DataType, + paddle::dialect::DataTypeAttribute); +PD_SPECIALIZE_CppTypeToIrAttribute(phi::Place, paddle::dialect::PlaceAttribute); +PD_SPECIALIZE_CppTypeToIrAttribute(std::vector<int32_t>, pir::ArrayAttribute); +PD_SPECIALIZE_CppTypeToIrAttribute(std::vector<int64_t>, + paddle::dialect::IntArrayAttribute); + +template <typename T> +struct IrAttrbuteCreator { + typename CppTypeToIrAttribute<T>::type operator()(T obj) const { + return CppTypeToIrAttribute<T>::type::template get( + pir::IrContext::Instance(), obj); + } +}; + +template <> +struct IrAttrbuteCreator<std::vector<int32_t>> { + pir::ArrayAttribute operator()(std::vector<int32_t> obj) const { + std::vector<pir::Attribute> attr_vec; + attr_vec.reserve(obj.size()); + for (int32_t x : obj) { + attr_vec.push_back(Int32Attribute::get(pir::IrContext::Instance(), x)); + } + return pir::ArrayAttribute::get(pir::IrContext::Instance(), attr_vec); + } +}; + +template <typename T> +struct IrAttrTypeCast { + static T To(const pir::Attribute& attr) { + return attr.dyn_cast<typename CppTypeToIrAttribute<T>::type>().data(); + } +}; + +template <> +struct IrAttrTypeCast<std::string> { + static std::string To(const pir::Attribute& attr) { + return attr.dyn_cast<typename CppTypeToIrAttribute<std::string>::type>() + .AsString(); + } +}; + +template <> +struct IrAttrTypeCast<std::vector<int32_t>> { + static std::vector<int32_t> To(const pir::Attribute& attr) { + std::vector<int32_t> result; + auto array_attr = attr.dyn_cast<pir::ArrayAttribute>(); + for (size_t i = 0; i < array_attr.size(); i++) { + result.push_back(array_attr.at(i).dyn_cast<Int32Attribute>().data()); + } + return result; + } +}; + +template <> +struct IrAttrTypeCast<std::vector<int64_t>> { + static std::vector<int64_t> To(const pir::Attribute& attr) { + std::vector<int64_t> result; + if (attr.dyn_cast<pir::ArrayAttribute>()) { + auto array_attr = attr.dyn_cast<pir::ArrayAttribute>(); + for (size_t i = 0; i < array_attr.size(); i++) { + result.push_back( + array_attr.at(i).dyn_cast<Int64Attribute>().data()); + } + } else if (attr.dyn_cast<paddle::dialect::IntArrayAttribute>()) { + result = + attr.dyn_cast<paddle::dialect::IntArrayAttribute>().data().GetData(); + } else { + PADDLE_THROW(phi::errors::Unavailable( + "Dynamic cast failed for IR attribute vector")); + } + return result; + } +}; + +} // namespace drr +} // namespace pir
diff --git a/paddle/fluid/pir/drr/drr_rewrite_pattern.h b/paddle/fluid/pir/drr/drr_rewrite_pattern.h new file mode 100644 index 00000000000000..c17feb0eaad052 --- /dev/null +++ b/paddle/fluid/pir/drr/drr_rewrite_pattern.h @@ -0,0 +1,568 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/pir/drr/api/drr_pattern_context.h" +#include "paddle/fluid/pir/drr/api/match_context.h" +#include "paddle/fluid/pir/drr/ir_operation.h" +#include "paddle/fluid/pir/drr/ir_operation_factory.h" +#include "paddle/fluid/pir/drr/match_context_impl.h" +#include "paddle/fluid/pir/drr/pattern_graph.h" +#include "paddle/pir/core/enforce.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/type_name.h" +#include "paddle/pir/pattern_rewrite/pattern_match.h" + +namespace pir { +namespace drr { + +template +class DrrRewritePattern : public pir::RewritePattern { + public: + explicit DrrRewritePattern(const DrrPatternContext& drr_context, + pir::IrContext* context, + pir::PatternBenefit benefit = 1) + : pir::RewritePattern( + drr_context.source_pattern_graph()->AnchorNode()->name(), + benefit, + context, + {}), + source_pattern_graph_(drr_context.source_pattern_graph()), + constraints_(drr_context.constraints()), + result_pattern_graph_(drr_context.result_pattern_graph()) { + IR_ENFORCE(!source_pattern_graph_->owned_op_call().empty(), + "source_pattern_graph is empty, please check the drr pattern " + "define code."); + } + + bool MatchAndRewrite(pir::Operation* op, + PatternRewriter& rewriter) const override { // NOLINT + std::shared_ptr src_match_ctx = + std::make_shared(); + if (PatternGraphMatch(op, src_match_ctx.get())) { + VLOG(4) << "DRR pattern (" << pir::get_type_name() + << ") is matched in program."; + PatternGraphRewrite(*src_match_ctx, rewriter); + return true; + } + return false; + } + + private: + bool PatternGraphMatch(pir::Operation* op, + MatchContextImpl* source_pattern_match_ctx) const { + VLOG(6) << "PatternGraphMatch Start: op(" << op->name() << ")"; + const OpCall* anchor = source_pattern_graph_->AnchorNode(); + std::unordered_map> + bind_map = + FindCandidateIrOutputOp(op, anchor, *(source_pattern_graph_.get())); + if (bind_map.empty()) { + return false; + } + std::vector drr_output_sequence; + std::vector ir_output_sequence; + std::unordered_map output_op_map; + for (auto pair : bind_map) { + drr_output_sequence.push_back(pair.first); + } + // using dfs to obtain the arrangement of all candidate ir ops + auto permute = [&](auto&& permute, size_t index) -> bool { + if (index == drr_output_sequence.size()) { + // avoiding duplicate binding of ir op + std::unordered_set ir_output_set; + for (Operation* op : ir_output_sequence) { + auto pr = ir_output_set.insert(op); + if (pr.second == false) { + return false; + } + } + // new match_ctx + std::shared_ptr match_ctx = + std::make_shared(); + std::transform(drr_output_sequence.begin(), + drr_output_sequence.end(), + ir_output_sequence.begin(), + std::inserter(output_op_map, output_op_map.end()), + [](const OpCall* drr_op, Operation* ir_op) { + return std::make_pair(drr_op, ir_op); + }); + if (MatchFromOutputToInput( + output_op_map, *(source_pattern_graph_.get()), match_ctx)) { + *source_pattern_match_ctx = *match_ctx; + return true; + } + return false; + } + for (auto* ir_op : bind_map[drr_output_sequence[index]]) { + ir_output_sequence.push_back(ir_op); + if (permute(permute, index + 1)) { + return true; + } + ir_output_sequence.pop_back(); + } + return false; + }; + + return permute(permute, 0); + } + + std::unordered_map> + FindCandidateIrOutputOp( + pir::Operation* op, + const OpCall* anchor, + const 
SourcePatternGraph& source_pattern_graph) const { + // get source pattern output op + std::unordered_set drr_output_op_set = + source_pattern_graph.OutputNodes(); + std::unordered_map> + output_op_bind_map{{anchor, {op}}}; + if (drr_output_op_set.size() == 1) { + return output_op_bind_map; + } + std::unordered_set drr_visited_ops{anchor}; + DfsVisitor( + anchor, op, drr_output_op_set, &drr_visited_ops, &output_op_bind_map); + if (output_op_bind_map.size() != drr_output_op_set.size()) { + return {}; + } + return output_op_bind_map; + } + + void DfsVisitor( + const OpCall* drr_op, + pir::Operation* ir_op, + const std::unordered_set& drr_output_op_set, + std::unordered_set* drr_visited_ops, + std::unordered_map>* + output_op_bind_map) const { + VLOG(6) << "DfsVisitor Start: drr op(" << drr_op->name() << ")" + << "ir op(" << ir_op->name() << ")"; + if (drr_op->name() != ir_op->name()) { + return; + } + // check op input's size + const auto& drr_op_input_tensors = drr_op->inputs(); + auto ir_op_input_value_size = ir_op->num_operands(); + if (drr_op_input_tensors.size() != ir_op_input_value_size) { + return; + } + // check op output's size + const auto& drr_op_output_tensors = drr_op->outputs(); + auto ir_op_output_value_size = ir_op->num_results(); + if (drr_op_output_tensors.size() != ir_op_output_value_size) { + return; + } + // check producer op + for (size_t i = 0; i < drr_op_input_tensors.size(); ++i) { + // case 1: drr_op_input_tensor is the input tensor of source pattern + if (drr_op_input_tensors[i]->producer() == nullptr) { + // dfs source pattern input tensor other child op + auto ir_input_tensor = ir_op->operand(i).source(); + for (auto drr_bro_op : drr_op_input_tensors[i]->consumers()) { + if (drr_visited_ops->count(drr_bro_op)) { + continue; + } + for (auto it = ir_input_tensor.use_begin(); + it != ir_input_tensor.use_end(); + ++it) { + auto* ir_bro_op = it.owner(); + if (drr_bro_op->name() == ir_bro_op->name()) { + drr_visited_ops->insert(drr_bro_op); + DfsVisitor(drr_bro_op, + ir_bro_op, + drr_output_op_set, + drr_visited_ops, + output_op_bind_map); + drr_visited_ops->erase(drr_bro_op); + } + } + } + continue; + } + // case 2: have producer op + const auto& drr_producer_op = drr_op_input_tensors[i]->producer(); + if (drr_visited_ops->count(drr_producer_op)) { + continue; + } + auto ir_operand_value = ir_op->operand(i).source(); + if (drr_op_input_tensors[i]->consumers().size() != + ir_operand_value.use_count()) { + return; + } + auto* ir_producer_op = ir_operand_value.dyn_cast().owner(); + drr_visited_ops->insert(drr_producer_op); + DfsVisitor(drr_producer_op, + ir_producer_op, + drr_output_op_set, + drr_visited_ops, + output_op_bind_map); + drr_visited_ops->erase(drr_producer_op); + } + if (drr_output_op_set.count(drr_op)) { + (*output_op_bind_map)[drr_op].insert(ir_op); + return; + } + // check child ops + for (size_t i = 0; i < drr_op_output_tensors.size(); ++i) { + const auto& drr_child_ops = drr_op_output_tensors[i]->consumers(); + auto ir_output_value = ir_op->result(i); + if (drr_child_ops.size() != ir_output_value.use_count()) { + return; + } + for (auto* drr_child_op : drr_child_ops) { + for (auto it = ir_output_value.use_begin(); + it != ir_output_value.use_end(); + ++it) { + auto* ir_child_op = it.owner(); + if (drr_child_op->name() == ir_child_op->name()) { + if (drr_visited_ops->count(drr_child_op)) { + continue; + } + drr_visited_ops->insert(drr_child_op); + DfsVisitor(drr_child_op, + ir_child_op, + drr_output_op_set, + drr_visited_ops, + output_op_bind_map); + 
drr_visited_ops->erase(drr_child_op); + } + } + } + } // check child ops + return; + } + + bool MatchFromOutputToInput( + std::unordered_map output_op_map, + const SourcePatternGraph& source_pattern_graph, + const std::shared_ptr& source_pattern_match_ctx) const { + VLOG(6) << "MatchFromOutputToInput Start"; + std::unordered_set drr_visited; + std::unordered_set ir_visited; + std::queue drr_q; + std::queue ir_q; + bool matched = true; + size_t step = 0; + for (auto it = output_op_map.begin(); it != output_op_map.end(); ++it) { + VLOG(6) << "match (" << it->first->name() << " @" << it->first << " : @" + << it->second << ") in source_pattern_graph "; + drr_q.push(it->first); + drr_visited.insert(it->first); + ir_q.push(it->second); + ir_visited.insert(it->second); + } + while (!drr_q.empty()) { + if (!matched) break; + auto* drr_node = drr_q.front(); + auto* ir_node = ir_q.front(); + drr_q.pop(); + ir_q.pop(); + if (drr_node->name() != ir_node->name()) { + matched = false; + break; + } + const auto& drr_input_tensors = drr_node->inputs(); + auto ir_input_value_size = ir_node->num_operands(); + if (drr_input_tensors.size() != ir_input_value_size) { + matched = false; + break; + } + if (drr_node->outputs().size() != ir_node->num_results()) { + matched = false; + break; + } + source_pattern_match_ctx->BindIrOperation( + drr_node, std::make_shared(ir_node)); + // binding input_tensor of current_op + for (size_t i = 0; i < drr_input_tensors.size(); ++i) { + source_pattern_match_ctx->BindIrValue( + drr_input_tensors[i]->name(), + std::make_shared(ir_node->operand(i).source())); + auto* drr_producer_op = drr_input_tensors[i]->producer(); + if (drr_producer_op == nullptr) { + continue; + } + auto* ir_producer_op = + ir_node->operand(i).source().dyn_cast().owner(); + if (drr_input_tensors[i]->consumers().size() != + ir_node->operand(i).source().use_count()) { + matched = false; + break; + } + // bfs producer_op of current_op + if (!drr_visited.count(drr_producer_op)) { + drr_q.push(drr_producer_op); + ir_q.push(ir_producer_op); + drr_visited.insert(drr_producer_op); + ir_visited.insert(ir_producer_op); + } + } + // binding output tensor of current_op + auto drr_op_output_tensor = drr_node->outputs(); + for (size_t j = 0; j < drr_op_output_tensor.size(); j++) { + source_pattern_match_ctx->BindIrValue( + drr_op_output_tensor[j]->name(), + std::make_shared(ir_node->result(j))); + } + ++step; + } + + if (matched) { + IR_ENFORCE(step == source_pattern_graph.CountOfOpCalls()); + } else { + return matched; + } + + MatchContext match_context{source_pattern_match_ctx}; + for (const auto& constraint : constraints_) { + matched = constraint(match_context); + if (!matched) break; + } + + return matched; + } + + void PatternGraphRewrite(const MatchContextImpl& source_pattern_match_ctx, + pir::PatternRewriter& rewriter) const { // NOLINT + VLOG(6) << "Create Operations in result_pattern_graph"; + MatchContextImpl res_match_ctx = CreateOperations(*source_pattern_graph_, + *result_pattern_graph_, + source_pattern_match_ctx, + rewriter); + VLOG(6) << "Process Assign Tensor"; + RebindIrTensorForAssignTensor(*result_pattern_graph_, &res_match_ctx); + VLOG(6) << "Replace Output Values in source_pattern_graph by Output Values " + "in result_pattern_graph"; + ReplaceOutputTensor(source_pattern_match_ctx, res_match_ctx, rewriter); + VLOG(6) << "Delete Operations in source_pattern_graph"; + DeleteSourcePatternOp(*source_pattern_graph_, + *result_pattern_graph_, + source_pattern_match_ctx, + rewriter); + } + + private: + 
MatchContextImpl CreateOperations( + const SourcePatternGraph& source_pattern_graph, + const ResultPatternGraph& result_pattern_graph, + const MatchContextImpl& src_match_ctx, + pir::PatternRewriter& rewriter) const { // NOLINT + MatchContextImpl res_match_ctx; + // add input tensors info for res_match_ctx + for (const auto& in_tensor : result_pattern_graph.input_tensors()) { + IR_ENFORCE(result_pattern_graph.id2owend_tensor().count(in_tensor), + "Drr input tensor [%s] must exists in result pattern graph.", + in_tensor); + if (!result_pattern_graph.id2owend_tensor().at(in_tensor)->is_none()) { + res_match_ctx.BindIrValue( + in_tensor, + std::make_shared(src_match_ctx.GetIrValue(in_tensor))); + } + } + + if (result_pattern_graph.CountOfOpCalls() == 1) { + CreateOperation(*result_pattern_graph.owned_op_call()[0], + src_match_ctx, + rewriter, + &res_match_ctx); + return res_match_ctx; + } + + std::vector> temp_program; + std::unordered_map op_2_temp_program_index; + for (Operation* op : *rewriter.block()) { + op_2_temp_program_index[op] = temp_program.size(); + temp_program.push_back({op}); + } + + // topo order visit result_pattern_graph + GraphTopo graph_topo_visit(&result_pattern_graph); + graph_topo_visit.WalkGraphNodesTopoOrder([&](const OpCall& op_call) { + // set insert point + size_t max_input_op_index = 0; + Operation* max_index_op = nullptr; + for (const Tensor* input : op_call.inputs()) { + if (input->is_none()) { + continue; + } + Value ir_val = res_match_ctx.GetIrValue(input->name()).get(); + if (ir_val) { + Operation* ir_input_op = ir_val.dyn_cast().owner(); + if (max_input_op_index < op_2_temp_program_index[ir_input_op]) { + max_input_op_index = op_2_temp_program_index[ir_input_op]; + max_index_op = ir_input_op; + } else if (max_input_op_index == + op_2_temp_program_index[ir_input_op]) { + const auto& ops_vec = temp_program[max_input_op_index]; + for (auto it = ops_vec.rbegin(); it != ops_vec.rend(); it++) { + if (*it == max_index_op) { + break; + } else if (*it == ir_input_op) { + max_index_op = ir_input_op; + break; + } else { + // do nothing + } + } + } else { + // do nothing + } + } + } + if (max_input_op_index == 0UL) { + VLOG(6) << "Not found producer op for (" << op_call.name() << ")"; + Operation* source_patter_first_op = + src_match_ctx + .Operation(source_pattern_graph.owned_op_call()[0].get()) + .get(); + max_input_op_index = op_2_temp_program_index[source_patter_first_op]; + rewriter.SetInsertionPoint(source_patter_first_op); + } else { + rewriter.SetInsertionPointAfter(max_index_op); + } + + Operation* new_op = + CreateOperation(op_call, src_match_ctx, rewriter, &res_match_ctx); + op_2_temp_program_index[new_op] = max_input_op_index + 1; + temp_program[max_input_op_index + 1].push_back(new_op); + }); + + return res_match_ctx; + } + + void RebindIrTensorForAssignTensor( + const ResultPatternGraph& result_pattern_graph, + MatchContextImpl* res_match_ctx) const { + const auto& tensor_assign_map = result_pattern_graph.tensor_assign_map(); + for (const auto& kv : tensor_assign_map) { + const auto& src_tensor_name = kv.first; + const auto& dst_tensor_name = kv.second; + res_match_ctx->BindIrValue( + src_tensor_name, + std::make_shared( + res_match_ctx->GetIrValue(dst_tensor_name))); + } + } + + void ReplaceOutputTensor(const MatchContextImpl& src_match_ctx, + const MatchContextImpl& res_match_ctx, + pir::PatternRewriter& rewriter) const { // NOLINT + for (const auto& output_name : result_pattern_graph_->output_tensors()) { + if 
(source_pattern_graph_->id2owend_tensor().count(output_name)) { + const auto& src_ir_tensor = src_match_ctx.GetIrValue(output_name); + const auto& res_ir_tensor = res_match_ctx.GetIrValue(output_name); + rewriter.ReplaceAllUsesWith(src_ir_tensor.get(), res_ir_tensor.get()); + } else { + LOG(WARNING) << "The output tensor (" << output_name + << ") in the result_pattern_graph is not a tensor" + " in the source_pattern_graph."; + } + } + } + + void DeleteSourcePatternOp(const SourcePatternGraph& source_pattern_graph, + const ResultPatternGraph& result_pattern_graph, + const MatchContextImpl& src_match_ctx, + pir::PatternRewriter& rewriter) const { // NOLINT + std::vector<const OpCall*> topo_order_ops; + GraphTopo graph_topo_visit(&source_pattern_graph); + graph_topo_visit.WalkGraphNodesTopoOrder( + [&topo_order_ops](const OpCall& op_call) { + topo_order_ops.push_back(&op_call); + }); + + // Filter the operations which are replaced by the result pattern. + // 1. Filter operations by forward walk. + std::unordered_set<std::string> forward_visited_tensor_set( + result_pattern_graph.input_tensors()); + std::unordered_set<const OpCall*> forward_deleted_ops; + std::for_each(topo_order_ops.begin(), + topo_order_ops.end(), + [&forward_deleted_ops, + &forward_visited_tensor_set](const OpCall* op_call) { + if (op_call->inputs().empty()) { + forward_deleted_ops.insert(op_call); + for (const auto* output : op_call->outputs()) { + forward_visited_tensor_set.insert(output->name()); + } + } + for (const auto* input : op_call->inputs()) { + if (forward_visited_tensor_set.count(input->name())) { + forward_deleted_ops.insert(op_call); + for (const auto* output : op_call->outputs()) { + forward_visited_tensor_set.insert(output->name()); + } + break; + } + } + }); + // 2. Filter operations by backward walk and merge the forward result. + std::unordered_set<std::string> backward_visited_tensor_set( + result_pattern_graph.output_tensors()); + std::vector<const OpCall*> deleted_ops; + std::unordered_set<const OpCall*> deleted_ops_set; + std::for_each(topo_order_ops.rbegin(), + topo_order_ops.rend(), + [&deleted_ops, + &deleted_ops_set, + &backward_visited_tensor_set, + &forward_deleted_ops](const OpCall* op_call) { + bool all_consumer_deleted = true; + bool from_backward_visited_tensor = false; + for (const auto* output : op_call->outputs()) { + if (backward_visited_tensor_set.count(output->name())) { + from_backward_visited_tensor = true; + } else if (output->consumers().empty()) { + continue; + } else { + all_consumer_deleted = false; + } + } + if (all_consumer_deleted && from_backward_visited_tensor && + forward_deleted_ops.count(op_call)) { + deleted_ops_set.insert(op_call); + deleted_ops.push_back(op_call); + for (const auto* input : op_call->inputs()) { + backward_visited_tensor_set.insert(input->name()); + } + } + }); + + // Delete operations in topological order, starting from the output tensors.
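// (Because deleted_ops was collected on a reverse topological walk, erasing
// in this order removes every consumer before its producer, so
// rewriter.EraseOp never deletes an op whose results still have live uses.)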
+ for (const auto* op_call : deleted_ops) { + IR_ENFORCE(src_match_ctx.operation_map().count(op_call), + "Drr OpCall [%s] must exist in match context.", + op_call->name()); + auto* op = src_match_ctx.operation_map().at(op_call)->get(); + VLOG(6) << "Delete (" << op_call->name() << " @" << op_call << " :@" << op + << ") in source_pattern_graph "; + rewriter.EraseOp(op); + } + } + + private: + const std::shared_ptr<SourcePatternGraph> source_pattern_graph_; + const std::vector<Constraint> constraints_; + const std::shared_ptr<ResultPatternGraph> result_pattern_graph_; +}; + +} // namespace drr +} // namespace pir
diff --git a/paddle/fluid/pir/drr/ir_operation.h b/paddle/fluid/pir/drr/ir_operation.h new file mode 100644 index 00000000000000..2764bc92454170 --- /dev/null +++ b/paddle/fluid/pir/drr/ir_operation.h @@ -0,0 +1,33 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pir/core/operation.h" + +namespace pir { +namespace drr { + +class IrOperation { + public: + explicit IrOperation(pir::Operation* op) : op_(op) {} + + pir::Operation* get() const { return op_; } + + private: + pir::Operation* op_; +}; + +} // namespace drr +} // namespace pir
diff --git a/paddle/fluid/pir/drr/ir_operation_factory.cc b/paddle/fluid/pir/drr/ir_operation_factory.cc new file mode 100644 index 00000000000000..5355a8977e8c53 --- /dev/null +++ b/paddle/fluid/pir/drr/ir_operation_factory.cc @@ -0,0 +1,166 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
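// This file is the bridge from a matched result pattern back to real pir
// operations: CreateOperation() resolves the drr tensors to pir::Values,
// looks up a creator registered under the op name, and binds the new op's
// results into the result MatchContextImpl. Ops whose creators cannot be
// generated from the op yaml files get the hand-written lambdas below.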
+ +#include "paddle/fluid/pir/drr/ir_operation_factory.h" + +#include + +#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/drr/attr_type_uilts.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/enforce.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/value.h" + +namespace pir { +namespace drr { + +void OperationFactory::RegisterManualOpCreator() { + RegisterOperationCreator( + "pd_op.fused_gemm_epilogue", + [](const std::vector& inputs, + const pir::AttributeMap& attrs, + pir::PatternRewriter& rewriter) { + return rewriter.Build( + inputs[0].dyn_cast(), + inputs[1].dyn_cast(), + inputs[2].dyn_cast(), + attrs); + }); + RegisterOperationCreator( + "pd_op.fused_gemm_epilogue_grad", + [](const std::vector& inputs, + const pir::AttributeMap& attrs, + pir::PatternRewriter& rewriter) { + return rewriter.Build( + inputs[0].dyn_cast(), + inputs[1].dyn_cast(), + inputs[2].dyn_cast(), + inputs[3].dyn_cast(), + attrs); + }); + RegisterOperationCreator("builtin.combine", + [](const std::vector& inputs, + const pir::AttributeMap& attrs, + pir::PatternRewriter& rewriter) { + return rewriter.Build(inputs); + }); +} + +static pir::Attribute CreateIrAttribute(const std::any& obj) { + if (obj.type() == typeid(bool)) { + return IrAttrbuteCreator()(std::any_cast(obj)); + } else if (obj.type() == typeid(int32_t)) { + return IrAttrbuteCreator()(std::any_cast(obj)); + } else if (obj.type() == typeid(int64_t)) { + return IrAttrbuteCreator()(std::any_cast(obj)); + } else if (obj.type() == typeid(float)) { + return IrAttrbuteCreator()(std::any_cast(obj)); + } else if (obj.type() == typeid(std::string)) { + return IrAttrbuteCreator()(std::any_cast(obj)); + } else if (obj.type() == typeid(const char*)) { + return IrAttrbuteCreator()(std::any_cast(obj)); + } else if (obj.type() == typeid(phi::DataType)) { + return IrAttrbuteCreator()( + std::any_cast(obj)); + } else if (obj.type() == typeid(phi::Place)) { + return IrAttrbuteCreator()(std::any_cast(obj)); + } else if (obj.type() == typeid(std::vector)) { + return IrAttrbuteCreator>()( + std::any_cast>(obj)); + } else if (obj.type() == typeid(std::vector)) { + return IrAttrbuteCreator>()( + std::any_cast>(obj)); + } else { + PADDLE_THROW( + phi::errors::Unimplemented("Type error. 
CreateIrAttribute for type(%s) " + "is unimplemented CreateInCurrently.", + obj.type().name())); + } +} + +pir::AttributeMap CreateAttributeMap(const OpCall& op_call, + const MatchContextImpl& src_match_ctx) { + pir::AttributeMap attr_map; + for (const auto& kv : op_call.attributes()) { + std::visit( + [&](auto&& arg) { + if constexpr (std::is_same_v, + NormalAttribute>) { + attr_map[kv.first] = src_match_ctx.GetIrAttr(arg.name()); + } + if constexpr (std::is_same_v, + ComputeAttribute>) { + MatchContext ctx(std::make_shared(src_match_ctx)); + attr_map[kv.first] = + CreateIrAttribute(arg.attr_compute_func()(ctx)); + } + }, + kv.second); + } + return attr_map; +} + +Value GetIrValueByDrrTensor(const Tensor& tensor, + const MatchContextImpl& res_match_ctx) { + if (tensor.is_none()) { + return Value{}; + } + return res_match_ctx.GetIrValue(tensor.name()).get(); +} + +std::vector GetIrValuesByDrrTensors( + const std::vector& tensors, + const MatchContextImpl& res_match_ctx) { + std::vector ir_values; + ir_values.reserve(tensors.size()); + for (const auto* tensor : tensors) { + ir_values.push_back(GetIrValueByDrrTensor(*tensor, res_match_ctx)); + } + return ir_values; +} + +void BindIrOutputs(const OpCall& op_call, + pir::Operation* op, + MatchContextImpl* match_ctx) { + for (size_t i = 0; i < op_call.outputs().size(); ++i) { + std::shared_ptr ir_value = nullptr; + if (op->result(i)) { + ir_value = std::make_shared(op->result(i)); + } + match_ctx->BindIrValue(op_call.outputs()[i]->name(), ir_value); + } +} + +pir::Operation* CreateOperation(const OpCall& op_call, + const MatchContextImpl& src_match_ctx, + pir::PatternRewriter& rewriter, // NOLINT + MatchContextImpl* res_match_ctx) { + VLOG(6) << "Drr create [" << op_call.name() << "] op..."; + const auto& inputs = op_call.inputs(); + std::vector ir_values = + GetIrValuesByDrrTensors(inputs, *res_match_ctx); + pir::Operation* op = OperationFactory::Instance().CreateOperation( + op_call.name(), + ir_values, + CreateAttributeMap(op_call, src_match_ctx), + rewriter); + BindIrOutputs(op_call, op, res_match_ctx); + VLOG(6) << "Drr create [" << op_call.name() << "] op done."; + return op; +} + +} // namespace drr +} // namespace pir diff --git a/paddle/fluid/pir/drr/ir_operation_factory.h b/paddle/fluid/pir/drr/ir_operation_factory.h new file mode 100644 index 00000000000000..b38b5cd6a12b32 --- /dev/null +++ b/paddle/fluid/pir/drr/ir_operation_factory.h @@ -0,0 +1,73 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
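// Creators are plain lambdas keyed by op name; a dialect could register an
// extra one the same way the manual creators do. Hypothetical sketch (op and
// class names invented for illustration):
//   OperationFactory::Instance().RegisterOperationCreator(
//       "my_dialect.my_fused_op",
//       [](const std::vector<pir::Value>& inputs,
//          const pir::AttributeMap& attrs,
//          pir::PatternRewriter& rewriter) {
//         return rewriter.Build<MyFusedOp>(
//             inputs[0].dyn_cast<pir::OpResult>(), attrs);
//       });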
+ +#pragma once + +#include + +#include "paddle/fluid/pir/drr/api/drr_pattern_context.h" +#include "paddle/fluid/pir/drr/match_context_impl.h" +#include "paddle/pir/pattern_rewrite/pattern_match.h" + +namespace pir { +namespace drr { + +class OperationFactory { + public: + static OperationFactory& Instance() { + static OperationFactory operation_factory; + return operation_factory; + } + + using operation_create_fn = + std::function&, + const pir::AttributeMap&, + pir::PatternRewriter&)>; + + void RegisterOperationCreator(const std::string& op_name, + const operation_create_fn& create_fn) { + op_creator_map.emplace(op_name, create_fn); + } + + pir::Operation* CreateOperation( + const std::string& op_name, + const std::vector& inputs, + const pir::AttributeMap& attrs, + pir::PatternRewriter& rewriter) const { // NOLINT + auto iter = op_creator_map.find(op_name); + IR_ENFORCE(iter != op_creator_map.end(), + "The create function for op: (%s) is not found.", + op_name); + return iter->second(inputs, attrs, rewriter); + } + + private: + OperationFactory() { + RegisterGeneratedOpCreator(); + RegisterManualOpCreator(); + } + + void RegisterManualOpCreator(); + void RegisterGeneratedOpCreator(); + + std::unordered_map op_creator_map; +}; + +pir::Operation* CreateOperation(const OpCall& op_call, + const MatchContextImpl& src_match_ctx, + pir::PatternRewriter& rewriter, // NOLINT + MatchContextImpl* res_match_ctx); + +} // namespace drr +} // namespace pir diff --git a/paddle/fluid/pir/drr/ir_value.h b/paddle/fluid/pir/drr/ir_value.h new file mode 100644 index 00000000000000..907df9dfd24ebc --- /dev/null +++ b/paddle/fluid/pir/drr/ir_value.h @@ -0,0 +1,82 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/drr/api/tensor_interface.h" +#include "paddle/pir/core/type.h" +#include "paddle/pir/core/value.h" + +namespace pir { +namespace drr { + +class IrShape { + public: + explicit IrShape(const phi::DDim& dims) : dims_(dims) {} + + bool operator==(const IrShape& other) const { return dims_ == other.dims_; } + + int size() const { return dims_.size(); } + + int64_t at(int idx) const { return dims_.at(idx); } + + private: + const phi::DDim dims_; +}; + +class IrDtype { + public: + explicit IrDtype(pir::Type dtype) : dtype_(dtype) {} + + bool operator==(IrDtype other) const { return dtype_ == other.dtype_; } + + private: + const pir::Type dtype_; +}; + +class IrValue : public TensorInterface { + public: + explicit IrValue(const pir::Value& value) + : value_(value), + shape_((value && value.type() && + value.type().dyn_cast()) + ? value.type() + .dyn_cast() + .dims() + : phi::DDim{}), + dtype_((value && value.type() && + value.type().dyn_cast()) + ? 
value.type() + .dyn_cast() + .dtype() + : pir::Type{}) {} + + ShapeInterface Shape() const override { return ShapeInterface(&shape_); } + DtypeInterface Dtype() const override { return DtypeInterface(&dtype_); } + + const Value& get() const { return value_; } + + private: + const Value value_; + const IrShape shape_; + const IrDtype dtype_; +}; + +class IrAttr; + +} // namespace drr +} // namespace pir diff --git a/paddle/fluid/pir/drr/match_context_impl.h b/paddle/fluid/pir/drr/match_context_impl.h new file mode 100644 index 00000000000000..a04efbbfaf444b --- /dev/null +++ b/paddle/fluid/pir/drr/match_context_impl.h @@ -0,0 +1,124 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/drr/api/drr_pattern_context.h" +#include "paddle/fluid/pir/drr/api/tensor_interface.h" +#include "paddle/fluid/pir/drr/attr_type_uilts.h" +#include "paddle/fluid/pir/drr/ir_operation.h" +#include "paddle/fluid/pir/drr/ir_value.h" +#include "paddle/pir/core/builtin_attribute.h" + +namespace pir { +namespace drr { + +class MatchContextImpl final { + public: + MatchContextImpl() = default; + ~MatchContextImpl() = default; + + const TensorInterface& Tensor(const std::string& tensor_name) const { + IR_ENFORCE(tensor_map_.count(tensor_name), + "Drr tensor [%s] must exists in pattern graph.", + tensor_name); + return *tensor_map_.at(tensor_name); + } + + const IrOperation& Operation(const OpCall* op_call) const { + IR_ENFORCE(operation_map_.count(op_call), + "Drr operation [%s] must exists in pattern graph.", + op_call->name()); + return *operation_map_.at(op_call); + } + + template + T Attr(const std::string& attr_name) const { + return IrAttrTypeCast::To(GetIrAttr(attr_name)); + } + + const IrValue& GetIrValue(const std::string& tensor_name) const { + auto iter = tensor_map_.find(tensor_name); + PADDLE_ENFORCE_NE( + iter, + tensor_map_.end(), + phi::errors::OutOfRange( + "the drr tensor(%s) is not found in the map to ir value.", + tensor_name)); + return *iter->second; + } + + pir::Attribute GetIrAttr(const std::string& attr_name) const { + auto iter = attr_map_.find(attr_name); + PADDLE_ENFORCE_NE( + iter, + attr_map_.end(), + phi::errors::OutOfRange( + "the drr attr(%s) is not found in the map to ir attribute.", + attr_name)); + return iter->second; + } + + const std::unordered_map>& + operation_map() const { + return operation_map_; + } + + const std::unordered_map& attr_map() const { + return attr_map_; + } + + const std::unordered_map>& tensor_map() + const { + return tensor_map_; + } + + void BindIrValue(const std::string& value_name, + const std::shared_ptr& value) { + tensor_map_.emplace(value_name, value); + } + + void BindIrOperation(const OpCall* op_call, + const std::shared_ptr& op) { + operation_map_.emplace(op_call, op); + const auto& attrs = op_call->attributes(); + for (const auto& kv : attrs) { + 
std::visit(
+          [&](auto&& arg) {
+            if constexpr (std::is_same_v<std::decay_t<decltype(arg)>,
+                                         NormalAttribute>) {
+              BindIrAttr(arg.name(), op->get()->attribute(kv.first));
+            }
+          },
+          kv.second);
+    }
+  }
+
+ private:
+  void BindIrAttr(const std::string& attr_name, pir::Attribute attr) {
+    attr_map_.emplace(attr_name, attr);
+  }
+
+  std::unordered_map<std::string, std::shared_ptr<IrValue>> tensor_map_;
+  std::unordered_map<const OpCall*, std::shared_ptr<IrOperation>>
+      operation_map_;
+  std::unordered_map<std::string, pir::Attribute> attr_map_;
+};
+
+}  // namespace drr
+}  // namespace pir
diff --git a/paddle/fluid/pir/drr/pattern_graph.cc b/paddle/fluid/pir/drr/pattern_graph.cc
new file mode 100644
index 00000000000000..0b63f398a790bd
--- /dev/null
+++ b/paddle/fluid/pir/drr/pattern_graph.cc
@@ -0,0 +1,223 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pir/drr/pattern_graph.h"
+
+#include <queue>
+
+#include "paddle/fluid/pir/drr/api/drr_pattern_context.h"
+#include "paddle/pir/core/enforce.h"
+
+namespace pir {
+namespace drr {
+
+const drr::OpCall &PatternGraph::AddOpCall(
+    const std::shared_ptr<drr::OpCall> &op_call) {
+  owned_op_call_.push_back(op_call);
+  for (const auto *input : op_call->inputs()) {
+    const auto &tensor_name = input->name();
+    IR_ENFORCE(id2owned_tensor_.count(tensor_name),
+               "input tensor [%s] does not exist.",
+               tensor_name);
+    id2owned_tensor_.at(tensor_name)->AddConsumer(op_call.get());
+
+    if (input->producer() == nullptr) {
+      input_tensors_.insert(tensor_name);
+    }
+    if (output_tensors_.find(tensor_name) != output_tensors_.end()) {
+      output_tensors_.erase(tensor_name);
+    }
+  }
+  for (auto &output : op_call->outputs()) {
+    const auto &out_tensor_name = output->name();
+    IR_ENFORCE(id2owned_tensor_.count(out_tensor_name));
+    id2owned_tensor_[output->name()]->set_producer(op_call.get());
+  }
+  return *owned_op_call_.back();
+}
+
+drr::Tensor &PatternGraph::AddTensor(
+    const std::shared_ptr<drr::Tensor> &tensor) {
+  if (id2owned_tensor_.find(tensor->name()) == id2owned_tensor_.end()) {
+    id2owned_tensor_[tensor->name()] = tensor;
+    output_tensors_.insert(tensor->name());
+  }
+  return *id2owned_tensor_[tensor->name()];
+}
+
+drr::Tensor &PatternGraph::AddTmpTensor(
+    const std::shared_ptr<drr::Tensor> &tensor) {
+  IR_ENFORCE(id2owned_tensor_.count(tensor->name()) == 0);
+  id2owned_tensor_[tensor->name()] = tensor;
+  output_tensors_.insert(tensor->name());
+  return *id2owned_tensor_[tensor->name()];
+}
+
+void PatternGraph::UpdateTmpTensor(const std::string &tmp_tensor_name,
+                                   const std::string &new_tensor_name) {
+  if (input_tensors_.count(tmp_tensor_name)) {
+    input_tensors_.erase(tmp_tensor_name);
+    input_tensors_.insert(new_tensor_name);
+  }
+
+  output_tensors_.erase(new_tensor_name);
+  if (output_tensors_.count(tmp_tensor_name)) {
+    output_tensors_.erase(tmp_tensor_name);
+    output_tensors_.insert(new_tensor_name);
+  }
+
+  auto tmp_tensor = id2owned_tensor_[tmp_tensor_name];
+  id2owned_tensor_.erase(tmp_tensor_name);
+  tmp_tensor->set_name(new_tensor_name);
+  id2owned_tensor_[new_tensor_name] = tmp_tensor;
+}
+
+size_t
PatternGraph::CountOfOpCalls() const { return owned_op_call_.size(); } + +OpCall *SourcePatternGraph::AnchorNode() const { + for (const auto &output_tensor : output_tensors_) { + OpCall *output_op_candidate = + id2owned_tensor_.at(output_tensor)->producer(); + if (std::all_of(output_op_candidate->outputs().begin(), + output_op_candidate->outputs().end(), + [this](const Tensor *output) -> bool { + return this->output_tensors().count(output->name()); + })) + return output_op_candidate; + } + IR_THROW("Unable to find a valid anchor"); +} + +std::unordered_set SourcePatternGraph::OutputNodes() const { + std::unordered_set output_op_set; + for (const auto &output_tensor : output_tensors_) { + OpCall *output_op_candidate = + id2owned_tensor_.at(output_tensor)->producer(); + if (std::all_of(output_op_candidate->outputs().begin(), + output_op_candidate->outputs().end(), + [this](const Tensor *output) -> bool { + return this->output_tensors().count(output->name()); + })) + output_op_set.insert(output_op_candidate); + } + return output_op_set; +} + +void ResultPatternGraph::AssignTensor(const Tensor &from, const Tensor &to) { + if (to.producer() == nullptr) { + input_tensors_.insert(to.name()); + } + output_tensors_.erase(to.name()); + IR_ENFORCE(output_tensors_.count(from.name()) == 1, + "The Tensor (%s) which be assigned must be the output of result " + "pattern graph.", + from.name()); + tensor_assign_map_[from.name()] = to.name(); +} + +void GraphTopo::WalkGraphNodesTopoOrder( + const std::function &VisitNode) const { + // graph data + const std::unordered_set &inputs_tensor = + graph_->input_tensors(); + const std::unordered_map> + &id2owned_tensor = graph_->id2owend_tensor(); + const std::vector> &owend_opcall = + graph_->owned_op_call(); + + std::queue opcall_queue; + std::unordered_map> + opcall_dependent; + + // init opcall_dependent + for (const std::shared_ptr &opcall_sptr : owend_opcall) { + if (opcall_sptr.get()->inputs().empty()) { // opcall inputs is empty + opcall_queue.push(opcall_sptr.get()); + } else { + for (const auto &pre_depd_tensor : opcall_sptr.get()->inputs()) { + opcall_dependent[opcall_sptr.get()].insert(pre_depd_tensor->name()); + } + } + } + + // init queue + for (const auto &tensor_name : inputs_tensor) { + IR_ENFORCE(id2owned_tensor.count(tensor_name), + "Drr input tensor [%s] must exists in pattern graph.", + tensor_name); + for (const auto &tensor_comsumer : + id2owned_tensor.at(tensor_name).get()->consumers()) { + opcall_dependent[tensor_comsumer].erase(tensor_name); + if (opcall_dependent[tensor_comsumer].empty()) { + opcall_queue.push(tensor_comsumer); + } + } + } + + while (!opcall_queue.empty()) { + const OpCall *opcall = opcall_queue.front(); + opcall_queue.pop(); + VisitNode(*opcall); + + // update opcall_dependent + for (const auto &output_tensor : opcall->outputs()) { + for (const auto &tensor_comsumer : output_tensor->consumers()) { + opcall_dependent[tensor_comsumer].erase(output_tensor->name()); + if (opcall_dependent[tensor_comsumer].empty()) { + opcall_queue.push(tensor_comsumer); + } + } + } + } +} + +std::ostream &operator<<(std::ostream &os, const PatternGraph &pattern_graph) { + os << "\nAll Tensors:\n"; + for (const auto &kv : pattern_graph.id2owend_tensor()) { + os << " " << kv.first; + } + os << "\n\n"; + + os << "Input Tensors:\n"; + for (const auto &tensor_name : pattern_graph.input_tensors()) { + os << " " << tensor_name; + } + os << "\n\n"; + + os << "Output Tensors:\n"; + for (const auto &tensor_name : pattern_graph.output_tensors()) { + os 
<< " " << tensor_name; + } + os << "\n\n"; + + for (const auto &op_call : pattern_graph.owned_op_call()) { + os << " " << op_call->name() << " : "; + os << "inputs[ "; + for (const auto *input : op_call->inputs()) { + os << input->name() << " "; + } + os << "], "; + + os << "outputs[ "; + for (const auto &output : op_call->outputs()) { + os << output->name() << " "; + } + os << "]\n"; + } + os << "\n"; + return os; +} + +} // namespace drr +} // namespace pir diff --git a/paddle/fluid/pir/drr/pattern_graph.h b/paddle/fluid/pir/drr/pattern_graph.h new file mode 100644 index 00000000000000..63bd60eadf17f3 --- /dev/null +++ b/paddle/fluid/pir/drr/pattern_graph.h @@ -0,0 +1,108 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace pir { +namespace drr { + +class Constraint; +class MatchContext; +class OpCall; +class Tensor; + +class PatternGraph { + public: + virtual ~PatternGraph() {} + + const drr::OpCall& AddOpCall(const std::shared_ptr& op_call); + + drr::Tensor& AddTensor(const std::shared_ptr& tensor); + + drr::Tensor& AddTmpTensor(const std::shared_ptr& tensor); + + void UpdateTmpTensor(const std::string& tmp_tensor_name, + const std::string& new_tensor_name); + + const std::unordered_set& input_tensors() const { + return input_tensors_; + } + + const std::unordered_set& output_tensors() const { + return output_tensors_; + } + + size_t CountOfOpCalls() const; + + const std::vector>& owned_op_call() const { + return owned_op_call_; + } + + const std::unordered_map>& + id2owend_tensor() const { + return id2owned_tensor_; + } + + protected: + std::unordered_map> id2owned_tensor_; + std::vector> owned_op_call_; + std::unordered_set input_tensors_; + std::unordered_set output_tensors_; +}; + +std::ostream& operator<<(std::ostream& os, const PatternGraph& pattern_graph); + +class SourcePatternGraph : public PatternGraph { + public: + OpCall* AnchorNode() const; + + std::unordered_set OutputNodes() const; + + private: + friend class DrrPatternContext; +}; + +class ResultPatternGraph : public PatternGraph { + public: + void AssignTensor(const Tensor& from, const Tensor& to); + + const std::unordered_map& tensor_assign_map() + const { + return tensor_assign_map_; + } + + private: + std::unordered_map tensor_assign_map_; +}; + +class GraphTopo { + public: + explicit GraphTopo(const PatternGraph* graph) : graph_(graph) {} + + void WalkGraphNodesTopoOrder( + const std::function& VisitNode) const; + + private: + const PatternGraph* graph_; +}; + +} // namespace drr +} // namespace pir diff --git a/paddle/fluid/pir/transforms/CMakeLists.txt b/paddle/fluid/pir/transforms/CMakeLists.txt index e1903c903de349..082af7b827ead0 100644 --- a/paddle/fluid/pir/transforms/CMakeLists.txt +++ b/paddle/fluid/pir/transforms/CMakeLists.txt @@ -1,3 +1,10 @@ +file(GLOB FUSION_PASS_SRCS "fusion/*.cc") + +cc_library( + fusion_passes + SRCS ${FUSION_PASS_SRCS} + 
DEPS drr) + cc_library( transform_general_functions SRCS transform_general_functions.cc @@ -9,7 +16,7 @@ cc_library( DEPS pd_kernel_dialect pd_op_dialect pd_op_dialect_utils) cc_library( - _constant_folding_pass + pd_constant_folding_pass SRCS constant_folding_pass.cc DEPS standalone_executor pd_op_to_kernel_pass transform_general_functions) diff --git a/paddle/fluid/pir/transforms/build_cinn_pass.cc b/paddle/fluid/pir/transforms/build_cinn_pass.cc index 4ad820fe03b6a7..f15183fd1af036 100644 --- a/paddle/fluid/pir/transforms/build_cinn_pass.cc +++ b/paddle/fluid/pir/transforms/build_cinn_pass.cc @@ -32,7 +32,7 @@ #include "paddle/pir/pass/pass_registry.h" #include "paddle/cinn/frontend/op_mapper_registry.h" -#include "paddle/cinn/hlir/framework/new_ir/utils.h" +#include "paddle/cinn/hlir/framework/pir/utils.h" #include "paddle/utils/flags.h" PD_DECLARE_string(allow_cinn_ops); @@ -43,7 +43,7 @@ using GroupOpsVec = std::vector; // The delim(`;`) that is used to split the FLAGS_allow_cinn_ops // & FLAGS_deny_cinn_ops. constexpr char kDelim[] = ";"; -using CompatibleInfo = cinn::hlir::framework::newir::CompatibleInfo; +using CompatibleInfo = cinn::hlir::framework::pir::CompatibleInfo; // OpTransInfo contains informations used to detect subgraphs // supported by the CINN compiler. @@ -551,7 +551,6 @@ void ReplaceWithGroupOp(pir::Block* block, // step 1: Ensure the insert point and create GroupOp here. auto* laste_input_op = group_ops.back(); builder.SetInsertionPointAfter(laste_input_op); - // TODO(Aurelius84): Need confirm how many YieldOps we need. std::vector output_types; std::vector outputs = AnalysisOutputs(group_ops); for (auto& value : outputs) { @@ -559,7 +558,7 @@ void ReplaceWithGroupOp(pir::Block* block, } // step 2: Replace the old op with GroupOp. auto new_group_op = builder.Build(output_types); - pir::Block* group_block = new_group_op.Block(); + pir::Block* group_block = new_group_op.block(); for (auto* op : group_ops) { op->MoveTo(group_block, group_block->begin()); } @@ -575,11 +574,11 @@ void ReplaceWithGroupOp(pir::Block* block, class BuildCinnPass : public pir::Pass { public: - BuildCinnPass() : pir::Pass("BuildCinnPass", /*opt_level=*/1) {} + BuildCinnPass() : pir::Pass("build_cinn_pass", /*opt_level=*/1) {} void Run(pir::Operation* op) override { auto module_op = op->dyn_cast(); - IR_ENFORCE(module_op, "InplacePass should run on module op."); + IR_ENFORCE(module_op, "build_cinn_pass should run on module op."); auto* block = module_op.block(); std::vector groups = diff --git a/paddle/fluid/pir/transforms/constant_folding_pass.cc b/paddle/fluid/pir/transforms/constant_folding_pass.cc index 3b40960373a2f1..dfa26c950212f3 100644 --- a/paddle/fluid/pir/transforms/constant_folding_pass.cc +++ b/paddle/fluid/pir/transforms/constant_folding_pass.cc @@ -192,8 +192,7 @@ class ConstantFoldingPattern : public pir::RewritePattern { class ConstantFoldingPass : public pir::Pass { public: - // TODO(liuyuanle): Naming convention for pass. - ConstantFoldingPass() : pir::Pass("ConstantFoldingPass", 1) {} + ConstantFoldingPass() : pir::Pass("constant_folding_pass", 1) {} bool Initialize(pir::IrContext* context) override { pir::RewritePatternSet ps(context); diff --git a/paddle/fluid/pir/transforms/fusion/attention_fuse_pass.cc b/paddle/fluid/pir/transforms/fusion/attention_fuse_pass.cc new file mode 100644 index 00000000000000..0bd8c5e29e7efc --- /dev/null +++ b/paddle/fluid/pir/transforms/fusion/attention_fuse_pass.cc @@ -0,0 +1,253 @@ +// Copyright (c) 2023 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/transforms/fusion/attention_fuse_pass.h" + +#include "paddle/fluid/pir/drr/api/drr_pattern_base.h" +#include "paddle/pir/pass/pass.h" +#include "paddle/pir/pass/pass_registry.h" +#include "paddle/pir/pattern_rewrite/pattern_rewrite_driver.h" + +namespace { + +class MultiHeadMatmulFusePattern + : public pir::drr::DrrPatternBase { + public: + void operator()(pir::drr::DrrPatternContext *ctx) const override { + // + // Source Pattern. + // + pir::drr::SourcePattern src = ctx->SourcePattern(); + // The first path to matmul with scale (q). + const auto &matmul_1 = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("matmul_1_transpose_x")}, + {"transpose_y", src.Attr("matmul_1_transpose_y")}}); + src.Tensor("matmul_1_out") = + matmul_1(src.Tensor("matmul_1_in_1"), src.Tensor("matmul_1_in_2")); + const auto &add_1 = src.Op("pd_op.add"); + src.Tensor("add_1_out") = + add_1(src.Tensor("matmul_1_out"), src.Tensor("add_1_in_2")); + const auto &full_int_array_1 = + src.Op("pd_op.full_int_array", + {{"value", src.Attr("full_int_array_1_value")}}); + const auto &reshape_1 = src.Op("pd_op.reshape"); + reshape_1({&src.Tensor("add_1_out"), &full_int_array_1()}, + {&src.Tensor("reshape_1_out"), &src.Tensor("reshape_1_xshape")}); + const auto &transpose_1 = src.Op("pd_op.transpose"); + src.Tensor("transpose_1_out") = transpose_1(src.Tensor("reshape_1_out")); + const auto &full_1 = + src.Op("pd_op.full", {{"value", src.Attr("full_1_value")}}); + const auto &scale = src.Op("pd_op.scale"); + src.Tensor("scale_out") = scale(src.Tensor("transpose_1_out"), full_1()); + + // The second path to matmul (k). + const auto &matmul_2 = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("matmul_2_transpose_x")}, + {"transpose_y", src.Attr("matmul_2_transpose_y")}}); + src.Tensor("matmul_2_out") = + matmul_2(src.Tensor("matmul_1_in_1"), src.Tensor("matmul_2_in_2")); + const auto &add_2 = src.Op("pd_op.add"); + src.Tensor("add_2_out") = + add_2(src.Tensor("matmul_2_out"), src.Tensor("add_2_in_2")); + const auto &full_int_array_2 = src.Op("pd_op.full_int_array"); + const auto &reshape_2 = src.Op("pd_op.reshape"); + reshape_2({&src.Tensor("add_2_out"), &full_int_array_2()}, + {&src.Tensor("reshape_2_out"), &src.Tensor("reshape_2_xshape")}); + const auto &transpose_2 = src.Op("pd_op.transpose"); + src.Tensor("transpose_2_out") = transpose_2(src.Tensor("reshape_2_out")); + + // The third path to matmul (v). 
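+    // Mirrors the q/k paths: matmul + bias add + reshape + transpose,
+    // but without the trailing scale op.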
+ const auto &matmul_3 = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("matmul_3_transpose_x")}, + {"transpose_y", src.Attr("matmul_3_transpose_y")}}); + src.Tensor("matmul_3_out") = + matmul_3(src.Tensor("matmul_1_in_1"), src.Tensor("matmul_3_in_2")); + const auto &add_3 = src.Op("pd_op.add"); + src.Tensor("add_3_out") = + add_3(src.Tensor("matmul_3_out"), src.Tensor("add_3_in_2")); + const auto &full_int_array_3 = src.Op("pd_op.full_int_array"); + const auto &reshape_3 = src.Op("pd_op.reshape"); + reshape_3({&src.Tensor("add_3_out"), &full_int_array_3()}, + {&src.Tensor("reshape_3_out"), &src.Tensor("reshape_3_xshape")}); + const auto &transpose_3 = src.Op("pd_op.transpose"); + src.Tensor("transpose_3_out") = transpose_3(src.Tensor("reshape_3_out")); + + // softmax(qk)v + const auto &matmul_4 = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("matmul_4_transpose_x")}, + {"transpose_y", src.Attr("matmul_4_transpose_y")}}); + src.Tensor("matmul_4_out") = + matmul_4(src.Tensor("scale_out"), src.Tensor("transpose_2_out")); + const auto &add_4 = src.Op("pd_op.add"); + src.Tensor("add_4_out") = + add_4(src.Tensor("matmul_4_out"), src.Tensor("add_4_in_2")); + const auto &softmax = + src.Op("pd_op.softmax", {{"axis", src.Attr("softmax_axis")}}); + src.Tensor("softmax_out") = softmax(src.Tensor("add_4_out")); + const auto &matmul_5 = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("matmul_5_transpose_x")}, + {"transpose_y", src.Attr("matmul_5_transpose_y")}}); + src.Tensor("matmul_5_out") = + matmul_5(src.Tensor("softmax_out"), src.Tensor("transpose_3_out")); + const auto &transpose_4 = src.Op("pd_op.transpose"); + src.Tensor("transpose_4_out") = transpose_4(src.Tensor("matmul_5_out")); + const auto &full_int_array_4 = src.Op("pd_op.full_int_array"); + const auto &reshape_4 = src.Op("pd_op.reshape"); + reshape_4({&src.Tensor("transpose_4_out"), &full_int_array_4()}, + {&src.Tensor("reshape_4_out"), &src.Tensor("reshape_4_xshape")}); + + // + // Constraints. + // + src.RequireNativeCall([](const pir::drr::MatchContext &match_ctx) -> bool { + const auto &softmax_axis = match_ctx.Attr("softmax_axis"); + if (softmax_axis != -1 && softmax_axis != 3) return false; + + bool matmul_1_transpose_x = match_ctx.Attr("matmul_1_transpose_x"); + bool matmul_1_transpose_y = match_ctx.Attr("matmul_1_transpose_y"); + if (matmul_1_transpose_x || matmul_1_transpose_y) return false; + + bool matmul_2_transpose_x = match_ctx.Attr("matmul_2_transpose_x"); + bool matmul_2_transpose_y = match_ctx.Attr("matmul_2_transpose_y"); + if (matmul_2_transpose_x || matmul_2_transpose_y) return false; + + bool matmul_3_transpose_x = match_ctx.Attr("matmul_3_transpose_x"); + bool matmul_3_transpose_y = match_ctx.Attr("matmul_3_transpose_y"); + if (matmul_3_transpose_x || matmul_3_transpose_y) return false; + + bool matmul_4_transpose_x = match_ctx.Attr("matmul_4_transpose_x"); + bool matmul_4_transpose_y = match_ctx.Attr("matmul_4_transpose_y"); + if (matmul_4_transpose_x || !matmul_4_transpose_y) return false; + + bool matmul_5_transpose_x = match_ctx.Attr("matmul_5_transpose_x"); + bool matmul_5_transpose_y = match_ctx.Attr("matmul_5_transpose_y"); + if (matmul_5_transpose_x || matmul_5_transpose_y) return false; + + return true; + }); + + // + // Result Pattern. + // + pir::drr::ResultPattern res = src.ResultPattern(); + // W combine. 
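+    // Concatenate the three q/k/v weight tensors into the single packed
+    // weight tensor that pd_op.multihead_matmul expects.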
+ const auto &combine_1 = res.Op("builtin.combine"); + combine_1({&res.Tensor("matmul_1_in_2"), + &res.Tensor("matmul_2_in_2"), + &res.Tensor("matmul_3_in_2")}, + {&res.Tensor("combine_1_out")}); + const auto &concat_axis = res.Attr( + [](const pir::drr::MatchContext &match_ctx) -> int { return 0; }); + const auto &concat_1 = res.Op("pd_op.concat", {{"axis", concat_axis}}); + res.Tensor("concat_1_out") = concat_1(res.Tensor("combine_1_out")); + const auto &reshape_5_shape = res.Attr( + [](const pir::drr::MatchContext &match_ctx) -> std::vector { + auto matmul_1_in_2 = match_ctx.Tensor("matmul_1_in_2").Shape(); + return {-1, 3, matmul_1_in_2.at(1)}; + }); + const auto &reshape_5 = + res.Op("pd_op.reshape", {{"shape", reshape_5_shape}}); + reshape_5({&res.Tensor("concat_1_out")}, + {&res.Tensor("reshape_5_out"), &res.NoneTensor()}); + + // Bias combine. + const auto &combine_2 = res.Op("builtin.combine"); + combine_2({&res.Tensor("add_1_in_2"), + &res.Tensor("add_2_in_2"), + &res.Tensor("add_3_in_2")}, + {&res.Tensor("combine_2_out")}); + const auto &concat_2 = res.Op("pd_op.concat", {{"axis", concat_axis}}); + res.Tensor("concat_2_out") = concat_2(res.Tensor("combine_2_out")); + const auto &reshape_6_shape = res.Attr( + [](const pir::drr::MatchContext &match_ctx) -> std::vector { + return {3, -1}; + }); + const auto &reshape_6 = + res.Op("pd_op.reshape", {{"shape", reshape_6_shape}}); + reshape_6({&res.Tensor("concat_2_out")}, + {&res.Tensor("reshape_6_out"), &res.NoneTensor()}); + + const auto &head_number = + res.Attr([](const pir::drr::MatchContext &match_ctx) -> int { + const auto &full_int_array_1_value = + match_ctx.Attr>("full_int_array_1_value"); + return full_int_array_1_value.at(2); + }); + const auto &alpha = + res.Attr([](const pir::drr::MatchContext &match_ctx) -> float { + return match_ctx.Attr("full_1_value"); + }); + const auto &multihead_matmul = res.Op( + "pd_op.multihead_matmul", + {{"transpose_q", res.Attr([](const pir::drr::MatchContext &match_ctx) { + return false; + })}, + {"transpose_k", res.Attr([](const pir::drr::MatchContext &match_ctx) { + return true; + })}, + {"transpose_v", res.Attr([](const pir::drr::MatchContext &match_ctx) { + return false; + })}, + {"head_number", head_number}, + {"alpha", alpha}}); + multihead_matmul({&res.Tensor("matmul_1_in_1"), + &res.Tensor("reshape_5_out"), + &res.Tensor("reshape_6_out"), + &res.Tensor("add_4_in_2")}, + {&res.Tensor("reshape_4_out")}); + } +}; + +class AttentionFusePass : public pir::Pass { + public: + AttentionFusePass() : pir::Pass("attention_fuse_pass", 2) {} + + bool Initialize(pir::IrContext *context) override { + pir::RewritePatternSet ps(context); + ps.Add(MultiHeadMatmulFusePattern().Build(context)); + // Add other attention variant fuse pattern. 
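+    // The pattern set is frozen once here; Run() applies the same frozen
+    // set on every invocation.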
+
+    patterns_ = pir::FrozenRewritePatternSet(std::move(ps));
+    return true;
+  }
+
+  void Run(pir::Operation *op) override {
+    pir::GreedyRewriteConfig cfg;
+    cfg.use_top_down_traversal = true;
+    cfg.max_iterations = 10;
+    pir::ApplyPatternsGreedily(op->region(0), patterns_, cfg);
+  }
+
+  bool CanApplyOn(pir::Operation *op) const override {
+    return op->isa<::pir::ModuleOp>() && op->num_regions() > 0;
+  }
+
+ private:
+  pir::FrozenRewritePatternSet patterns_;
+};
+
+}  // namespace
+
+namespace pir {
+std::unique_ptr<Pass> CreateAttentionFusePass() {
+  return std::make_unique<AttentionFusePass>();
+}
+}  // namespace pir
+
+REGISTER_IR_PASS(attention_fuse_pass, AttentionFusePass);
diff --git a/paddle/cinn/hlir/framework/convert_to_dialect.h b/paddle/fluid/pir/transforms/fusion/attention_fuse_pass.h
similarity index 73%
rename from paddle/cinn/hlir/framework/convert_to_dialect.h
rename to paddle/fluid/pir/transforms/fusion/attention_fuse_pass.h
index 7ea0a2ace40c7a..0c0d2e84952ca4 100644
--- a/paddle/cinn/hlir/framework/convert_to_dialect.h
+++ b/paddle/fluid/pir/transforms/fusion/attention_fuse_pass.h
@@ -15,19 +15,12 @@
 #pragma once

 #include <memory>
+#include "paddle/pir/core/dll_decl.h"

 namespace pir {
-class Program;
-}  // namespace pir
-namespace cinn {
-namespace hlir {
-namespace framework {
-class Program;
+class Pass;

-std::unique_ptr<::pir::Program> ConvertToRuntimeDialect(
-    const hlir::framework::Program& program);
+IR_API std::unique_ptr<Pass> CreateAttentionFusePass();

-}  // namespace framework
-}  // namespace hlir
-}  // namespace cinn
+}  // namespace pir
diff --git a/paddle/fluid/pir/transforms/fusion/fused_gemm_epilogue_pass.cc b/paddle/fluid/pir/transforms/fusion/fused_gemm_epilogue_pass.cc
new file mode 100644
index 00000000000000..0823867b444888
--- /dev/null
+++ b/paddle/fluid/pir/transforms/fusion/fused_gemm_epilogue_pass.cc
@@ -0,0 +1,295 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
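+
+// This pass uses drr to rewrite matmul + add (and the matching grad ops)
+// into pd_op.fused_gemm_epilogue / pd_op.fused_gemm_epilogue_grad, and to
+// fold a following gelu/relu into the epilogue's activation attribute.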
+ +#include "paddle/fluid/pir/transforms/fusion/fused_gemm_epilogue_pass.h" + +#include "paddle/fluid/pir/drr/api/drr_pattern_base.h" +#include "paddle/pir/pass/pass.h" +#include "paddle/pir/pass/pass_registry.h" +#include "paddle/pir/pattern_rewrite/pattern_rewrite_driver.h" + +namespace { + +class FusedLinearPattern : public pir::drr::DrrPatternBase { + public: + void operator()(pir::drr::DrrPatternContext *ctx) const override { + pir::drr::SourcePattern pat = ctx->SourcePattern(); + const auto &matmul = pat.Op("pd_op.matmul", + {{"transpose_x", pat.Attr("trans_x")}, + {"transpose_y", pat.Attr("trans_y")}}); + const auto &add = pat.Op("pd_op.add"); + + pat.Tensor("tmp") = matmul(pat.Tensor("x"), pat.Tensor("w")); + pat.Tensor("out") = add(pat.Tensor("tmp"), pat.Tensor("bias")); + + pat.RequireNativeCall([&](const pir::drr::MatchContext &match_ctx) { + return (match_ctx.Tensor("w").Shape().size() == 2 && + match_ctx.Tensor("x").Shape().size() >= 2); + }); + + pir::drr::ResultPattern res = pat.ResultPattern(); + const auto &act_attr = + res.Attr([](const pir::drr::MatchContext &match_ctx) -> std::any { + return "none"; + }); + const auto &fused_gemm_epilogue = res.Op("pd_op.fused_gemm_epilogue", + {{{"trans_x", pat.Attr("trans_x")}, + {"trans_y", pat.Attr("trans_y")}, + {"activation", act_attr}}}); + fused_gemm_epilogue( + {&res.Tensor("x"), &res.Tensor("w"), &res.Tensor("bias")}, + {&res.Tensor("out")}); + } +}; + +class FusedLinearGradPattern + : public pir::drr::DrrPatternBase { + public: + void operator()(pir::drr::DrrPatternContext *ctx) const override { + pir::drr::SourcePattern pat = ctx->SourcePattern(); + const auto &matmul = pat.Op("pd_op.matmul", + {{"transpose_x", pat.Attr("trans_x")}, + {"transpose_y", pat.Attr("trans_y")}}); + const auto &matmul_grad = pat.Op("pd_op.matmul_grad", + {{"transpose_x", pat.Attr("trans_x")}, + {"transpose_y", pat.Attr("trans_y")}}); + const auto &add = pat.Op("pd_op.add"); + const auto &add_grad = pat.Op("pd_op.add_grad"); + + pat.Tensor("tmp") = matmul(pat.Tensor("x"), pat.Tensor("w")); + pat.Tensor("out") = add(pat.Tensor("tmp"), pat.Tensor("bias")); + add_grad({&pat.Tensor("tmp"), &pat.Tensor("bias"), &pat.Tensor("out_grad")}, + {&pat.Tensor("tmp_grad"), &pat.Tensor("bias_grad")}); + matmul_grad({&pat.Tensor("x"), &pat.Tensor("w"), &pat.Tensor("tmp_grad")}, + {&pat.Tensor("x_grad"), &pat.Tensor("w_grad")}); + + pat.RequireNativeCall([&](const pir::drr::MatchContext &match_ctx) { + return (match_ctx.Tensor("w").Shape().size() == 2 && + match_ctx.Tensor("x").Shape().size() >= 2); + }); + + pir::drr::ResultPattern res = pat.ResultPattern(); + const auto &act_attr = + res.Attr([](const pir::drr::MatchContext &match_ctx) -> std::any { + return "none"; + }); + const auto &fused_gemm_epilogue = res.Op("pd_op.fused_gemm_epilogue", + {{{"trans_x", pat.Attr("trans_x")}, + {"trans_y", pat.Attr("trans_y")}, + {"activation", act_attr}}}); + const auto &fused_gemm_epilogue_grad = + res.Op("pd_op.fused_gemm_epilogue_grad", + {{{"trans_x", pat.Attr("trans_x")}, + {"trans_y", pat.Attr("trans_y")}, + {"activation_grad", act_attr}}}); + fused_gemm_epilogue( + {&res.Tensor("x"), &res.Tensor("w"), &res.Tensor("bias")}, + {&res.Tensor("out")}); + fused_gemm_epilogue_grad({&res.Tensor("x"), + &res.Tensor("w"), + &res.NoneTensor(), + &res.Tensor("out_grad")}, + {&res.Tensor("x_grad"), + &res.Tensor("w_grad"), + &res.Tensor("bias_grad")}); + } +}; + +class FusedLinearGeluGradPattern + : public pir::drr::DrrPatternBase { + public: + void 
operator()(pir::drr::DrrPatternContext *ctx) const override { + pir::drr::SourcePattern pat = ctx->SourcePattern(); + const auto &fused_gemm_epilogue = + pat.Op("pd_op.fused_gemm_epilogue", + {{{"trans_x", pat.Attr("trans_x1")}, + {"trans_y", pat.Attr("trans_y1")}, + {"activation", pat.Attr("act1")}}}); + const auto &fused_gemm_epilogue_grad1 = + pat.Op("pd_op.fused_gemm_epilogue_grad", + {{{"trans_x", pat.Attr("trans_x2")}, + {"trans_y", pat.Attr("trans_y2")}, + {"activation_grad", pat.Attr("act2")}}}); + fused_gemm_epilogue( + {&pat.Tensor("x"), &pat.Tensor("w"), &pat.Tensor("bias")}, + {&pat.Tensor("fuse_out"), &pat.Tensor("reserve_space")}); + pat.Tensor("out") = pat.Op("pd_op.gelu")(pat.Tensor("fuse_out")); + + fused_gemm_epilogue_grad1({&pat.Tensor("x1"), + &pat.Tensor("w1"), + &pat.Tensor("reserve_space1"), + &pat.Tensor("out_grad")}, + {&pat.Tensor("x1_grad"), + &pat.Tensor("w1_grad"), + &pat.Tensor("bias1_grad")}); + pat.Tensor("gelu_dx") = pat.Op("pd_op.gelu_grad")(pat.Tensor("fuse_out"), + pat.Tensor("x1_grad")); + + pat.RequireNativeCall([&](const pir::drr::MatchContext &match_ctx) { + return match_ctx.Attr("act1") == "none" && + match_ctx.Attr("act2") == "none"; + }); + + pir::drr::ResultPattern res = pat.ResultPattern(); + const auto &act_attr = + res.Attr([](const pir::drr::MatchContext &match_ctx) -> std::any { + return "gelu"; + }); + const auto &fused_gemm_epilogue_new = + res.Op("pd_op.fused_gemm_epilogue", + {{{"trans_x", pat.Attr("trans_x1")}, + {"trans_y", pat.Attr("trans_y1")}, + {"activation", act_attr}}}); + const auto &act_grad_attr = + res.Attr([](const pir::drr::MatchContext &match_ctx) -> std::any { + return "gelu_grad"; + }); + const auto &fused_gemm_epilogue_grad_new = + res.Op("pd_op.fused_gemm_epilogue_grad", + {{{"trans_x", pat.Attr("trans_x2")}, + {"trans_y", pat.Attr("trans_y2")}, + {"activation_grad", act_grad_attr}}}); + fused_gemm_epilogue_new( + {&res.Tensor("x"), &res.Tensor("w"), &res.Tensor("bias")}, + {&res.Tensor("out"), &res.Tensor("reserve_space2")}); + fused_gemm_epilogue_grad_new({&res.Tensor("x1"), + &res.Tensor("w1"), + &res.Tensor("reserve_space2"), + &res.Tensor("out_grad")}, + {&res.Tensor("gelu_dx"), + &res.Tensor("w1_grad"), + &res.Tensor("bias1_grad")}); + } +}; + +class FusedLinearReluGradPattern + : public pir::drr::DrrPatternBase { + public: + void operator()(pir::drr::DrrPatternContext *ctx) const override { + pir::drr::SourcePattern pat = ctx->SourcePattern(); + const auto &fused_gemm_epilogue = + pat.Op("pd_op.fused_gemm_epilogue", + {{{"trans_x", pat.Attr("trans_x1")}, + {"trans_y", pat.Attr("trans_y1")}, + {"activation", pat.Attr("act1")}}}); + const auto &fused_gemm_epilogue_grad = + pat.Op("pd_op.fused_gemm_epilogue_grad", + {{{"trans_x", pat.Attr("trans_x2")}, + {"trans_y", pat.Attr("trans_y2")}, + {"activation_grad", pat.Attr("act2")}}}); + const auto &fused_gemm_epilogue_grad1 = + pat.Op("pd_op.fused_gemm_epilogue_grad", + {{{"trans_x", pat.Attr("trans_x3")}, + {"trans_y", pat.Attr("trans_y3")}, + {"activation_grad", pat.Attr("act3")}}}); + fused_gemm_epilogue( + {&pat.Tensor("x"), &pat.Tensor("w"), &pat.Tensor("bias")}, + {&pat.Tensor("fuse_out"), &pat.Tensor("reserve_space")}); + pat.Tensor("out") = pat.Op("pd_op.relu")(pat.Tensor("fuse_out")); + + fused_gemm_epilogue_grad1({&pat.Tensor("x1"), + &pat.Tensor("w1"), + &pat.Tensor("reserve_space2"), + &pat.Tensor("out_grad")}, + {&pat.Tensor("x1_grad"), + &pat.Tensor("w1_grad"), + &pat.Tensor("bias1_grad")}); + pat.Tensor("relu_dx") = + 
pat.Op("pd_op.relu_grad")(pat.Tensor("x1"), pat.Tensor("x1_grad")); + fused_gemm_epilogue_grad({&pat.Tensor("x"), + &pat.Tensor("w"), + &pat.Tensor("reserve_space1"), + &pat.Tensor("relu_dx")}, + {&pat.Tensor("x_grad"), + &pat.Tensor("w_grad"), + &pat.Tensor("bias_grad")}); + + pat.RequireNativeCall([&](const pir::drr::MatchContext &match_ctx) { + return match_ctx.Attr("act1") == "none" && + match_ctx.Attr("act3") == "none"; + }); + + pir::drr::ResultPattern res = pat.ResultPattern(); + const auto &act_attr = + res.Attr([](const pir::drr::MatchContext &match_ctx) -> std::any { + return "relu"; + }); + const auto &fused_gemm_epilogue_new = + res.Op("pd_op.fused_gemm_epilogue", + {{{"trans_x", pat.Attr("trans_x1")}, + {"trans_y", pat.Attr("trans_y1")}, + {"activation", act_attr}}}); + const auto &act_grad_attr = + res.Attr([](const pir::drr::MatchContext &match_ctx) -> std::any { + return "relu_grad"; + }); + const auto &fused_gemm_epilogue_grad1_new = + res.Op("pd_op.fused_gemm_epilogue_grad", + {{{"trans_x", pat.Attr("trans_x2")}, + {"trans_y", pat.Attr("trans_y2")}, + {"activation_grad", act_grad_attr}}}); + fused_gemm_epilogue_new( + {&res.Tensor("x"), &res.Tensor("w"), &res.Tensor("bias")}, + {&res.Tensor("out"), &res.Tensor("reserve_space3")}); + fused_gemm_epilogue_grad1_new({&res.Tensor("x1"), + &res.Tensor("w1"), + &res.Tensor("reserve_space3"), + &res.Tensor("out_grad")}, + {&res.Tensor("relu_dx"), + &res.Tensor("w1_grad"), + &res.Tensor("bias1_grad")}); + } +}; + +class FusedGemmEpiloguePass : public pir::Pass { + public: + FusedGemmEpiloguePass() : pir::Pass("fused_gemm_epilogue_pass", 2) {} + + bool Initialize(pir::IrContext *context) override { + pir::RewritePatternSet ps(context); + ps.Add(FusedLinearGradPattern().Build(context)); + ps.Add(FusedLinearPattern().Build(context)); + ps.Add(FusedLinearGeluGradPattern().Build(context)); + ps.Add(FusedLinearReluGradPattern().Build(context)); + + patterns_ = pir::FrozenRewritePatternSet(std::move(ps)); + return true; + } + + void Run(pir::Operation *op) override { + pir::GreedyRewriteConfig cfg; + cfg.use_top_down_traversal = true; + cfg.max_iterations = 10; + pir::ApplyPatternsGreedily(op->region(0), patterns_, cfg); + } + + bool CanApplyOn(pir::Operation *op) const override { + return op->name() == "builtin.module" && op->num_regions() > 0; + } + + private: + pir::FrozenRewritePatternSet patterns_; +}; + +} // namespace + +namespace pir { + +std::unique_ptr CreateFusedGemmEpiloguePass() { + return std::make_unique(); +} + +} // namespace pir + +REGISTER_IR_PASS(fused_gemm_epilogue_pass, FusedGemmEpiloguePass); diff --git a/paddle/fluid/operators/tree_conv_op.cu b/paddle/fluid/pir/transforms/fusion/fused_gemm_epilogue_pass.h similarity index 61% rename from paddle/fluid/operators/tree_conv_op.cu rename to paddle/fluid/pir/transforms/fusion/fused_gemm_epilogue_pass.h index 1bfcb94013c2bd..61f503a530f729 100644 --- a/paddle/fluid/operators/tree_conv_op.cu +++ b/paddle/fluid/pir/transforms/fusion/fused_gemm_epilogue_pass.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,11 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/tree_conv_op.h" +#pragma once -namespace ops = paddle::operators; +#include +#include "paddle/pir/core/dll_decl.h" -PD_REGISTER_STRUCT_KERNEL( - tree_conv, GPU, ALL_LAYOUT, ops::TreeConvKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL( - tree_conv_grad, GPU, ALL_LAYOUT, ops::TreeConvGradKernel, float, double) {} +namespace pir { + +class Pass; + +IR_API std::unique_ptr CreateFusedGemmEpiloguePass(); + +} // namespace pir diff --git a/paddle/fluid/pir/transforms/inplace_pass.cc b/paddle/fluid/pir/transforms/inplace_pass.cc index 6010af208fae6c..760a78c1952ab1 100644 --- a/paddle/fluid/pir/transforms/inplace_pass.cc +++ b/paddle/fluid/pir/transforms/inplace_pass.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/trait/inplace.h" #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" @@ -110,7 +111,7 @@ static std::unordered_set GetSkipDeletionValues(pir::Block* block) { continue; } if (upper_op_name == "pd_op.fetch" || - upper_op_name == "pd_op.shadow_output") { + upper_op_name == "builtin.shadow_output") { skip_dels.insert(op->operand_source(0)); continue; } @@ -121,11 +122,10 @@ static std::unordered_set GetSkipDeletionValues(pir::Block* block) { // NOTE(zhangbo): For inplace Pass, currently only the kernel_dialect operator // is supported. Therefore, this function only returns the values in the // kernel_dialect operator that can be eager deleted. -static std::unordered_map> -GetEagerDeletionValues(pir::Block* block) { - std::unordered_set skip_dels = GetSkipDeletionValues(block); - - std::unordered_map del_value_2_op; +static void GetEagerDelValueOfOp( + pir::Block* block, + const std::unordered_set& skip_dels, + std::unordered_map* del_value_2_op) { for (auto& op : *block) { std::string upper_op_name = op->name(); if (op->dialect()->name().compare(paddle::dialect::KernelDialect::name()) == @@ -150,16 +150,32 @@ GetEagerDeletionValues(pir::Block* block) { VLOG(8) << " -- is no_need_buffer: " << IsNoNeedBuffer(op, input); continue; } - del_value_2_op[input] = op; + (*del_value_2_op)[input] = op; } for (size_t i = 0; i < op->num_results(); ++i) { pir::Value output = op->result(i); if (output && CanBeDeleted(output)) { - del_value_2_op[output] = op; + (*del_value_2_op)[output] = op; } } + + if (op->isa()) { + auto if_op = op->dyn_cast(); + GetEagerDelValueOfOp(if_op.true_block(), skip_dels, del_value_2_op); + VLOG(8) << "GetEagerDelValueOfOp for IfOp true block"; + GetEagerDelValueOfOp(if_op.false_block(), skip_dels, del_value_2_op); + VLOG(8) << "GetEagerDelValueOfOp for IfOp false block"; + } } +} + +static std::unordered_map> +GetEagerDeletionValues(pir::Block* block) { + std::unordered_set skip_dels = GetSkipDeletionValues(block); + + std::unordered_map del_value_2_op; + GetEagerDelValueOfOp(block, skip_dels, &del_value_2_op); std::unordered_map> eager_dels; @@ -304,11 +320,11 @@ static std::unordered_map GetInplaceOps( class InplacePass : public pir::Pass { public: - InplacePass() : pir::Pass("InplacePass", 3) {} + InplacePass() : pir::Pass("inplace_pass", 3) {} void Run(pir::Operation* op) override { auto module_op = op->dyn_cast(); - IR_ENFORCE(module_op, "InplacePass should run on module op."); + 
IR_ENFORCE(module_op, "inplace_pass should run on module op."); auto* block = module_op.block(); auto inplace_ops = details::GetInplaceOps(block); @@ -333,7 +349,7 @@ class InplacePass : public pir::Pass { pir::BoolAttribute::get(pir::IrContext::Instance(), true)); } LOG_FIRST_N(INFO, 1) - << "Apply inplace pass on lowering ::ir::Program to Kernel Dialect."; + << "Apply inplace pass on lowering ::pir::Program to Kernel Dialect."; } bool CanApplyOn(pir::Operation* op) const override { @@ -349,4 +365,4 @@ std::unique_ptr CreateInplacePass() { } // namespace pir -REGISTER_IR_PASS(inplace, InplacePass); +REGISTER_IR_PASS(inplace_pass, InplacePass); diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index 00597318091084..3ac3db56cfd41d 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" + #include #include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" @@ -19,6 +21,7 @@ #include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" @@ -27,7 +30,6 @@ #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" #include "paddle/fluid/pir/dialect/operator/utils/utils.h" -#include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" @@ -35,7 +37,10 @@ #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/dialect/control_flow/ir/cf_ops.h" +#include "paddle/utils/flags.h" +PHI_DECLARE_bool(print_ir); namespace paddle { namespace dialect { @@ -60,13 +65,17 @@ const std::unordered_set UnchangeOutputOps = { "pd_op.fetch", "builtin.set_parameter", "builtin.get_parameter", - "pd_op.shadow_output"}; - -const std::unordered_set SpecialLowerOps = {"builtin.combine", - "builtin.slice", - "builtin.split", - "pd_op.if", - "cf.yield"}; + "builtin.shadow_output", + "cinn_runtime.jit_kernel"}; +const std::unordered_set SpecialLowerOps = { + "builtin.combine", + "builtin.slice", + "builtin.split", + "pd_op.if", + "pd_op.while", + "cf.yield", + "cf.cond_yield", + "cinn_runtime.jit_kernel"}; bool NeedFallBackCpu(const pir::Operation* op, const std::string& kernel_fn_name, @@ -105,7 +114,8 @@ phi::Backend GetDstBackend(const std::string& op_name, const OpYamlInfoParser* op_yaml_info_parser, phi::Backend kernel_def_backend, size_t input_index) { - if (op_name == "builtin.set_parameter" && + if ((op_name == "builtin.set_parameter" || + op_name == "builtin.shadow_output") && place.GetType() == phi::AllocationType::GPU) { // NOTE: align old executor, all the paramter are initilizered // on backend of executor place defined @@ -222,16 +232,16 @@ std::vector> GetFakeTensorList( return vec_res; } -pir::OpResult 
AddPlaceTransferOp(pir::OpResult in, +pir::OpResult AddPlaceTransferOp(pir::Value in, pir::Type out_type, const phi::Place& src_place, const phi::Place& dst_place, const phi::KernelKey& kernel_key, pir::Block* block) { pir::IrContext* ctx = pir::IrContext::Instance(); - std::string op_name = paddle::dialect::PhiKernelOp::name(); - pir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name); + pir::OpInfo kernel_op_info = + ctx->GetRegisteredOpInfo(paddle::dialect::PhiKernelOp::name()); if ((src_place.GetType() == phi::AllocationType::CPU) && (dst_place.GetType() == phi::AllocationType::GPU)) { @@ -244,11 +254,11 @@ pir::OpResult AddPlaceTransferOp(pir::OpResult in, {"dst_place_type", pir::Int32Attribute::get(ctx, 1)}}; pir::Operation* op = - pir::Operation::Create({in}, op_attribute, {out_type}, op_info); + pir::Operation::Create({in}, op_attribute, {out_type}, kernel_op_info); - if (in.owner()->HasAttribute(kAttrIsPersisable)) { - op->set_attribute(kAttrIsPersisable, - in.owner()->attribute(kAttrIsPersisable)); + auto in_op = in.dyn_cast().owner(); + if (in_op && in_op->HasAttribute(kAttrIsPersisable)) { + op->set_attribute(kAttrIsPersisable, in_op->attribute(kAttrIsPersisable)); } block->push_back(op); @@ -266,7 +276,7 @@ pir::OpResult AddPlaceTransferOp(pir::OpResult in, {"dst_place_type", pir::Int32Attribute::get(ctx, 0)}}; pir::Operation* op = - pir::Operation::Create({in}, op_attribute, {out_type}, op_info); + pir::Operation::Create({in}, op_attribute, {out_type}, kernel_op_info); block->push_back(op); @@ -324,7 +334,7 @@ pir::Type BuildOutputType(pir::Type type, phi::DataType GetKernelDataTypeByYamlInfo( const pir::Operation* op, - const std::unordered_map& map_value_pair, + const std::unordered_map& map_value_pair, const dialect::OpYamlInfoParser* op_info_parser) { auto& attr_map = op->attributes(); auto& data_type_info = op_info_parser->OpRuntimeInfo().kernel_key_dtype; @@ -404,7 +414,7 @@ phi::DataType GetKernelDataTypeByYamlInfo( phi::Backend GetKernelBackendByYamlInfo( const pir::Operation* op, - const std::unordered_map& map_value_pair, + const std::unordered_map& map_value_pair, const dialect::OpYamlInfoParser* op_info_parser, const phi::Place& place) { auto& attr_map = op->attributes(); @@ -481,11 +491,12 @@ phi::KernelKey GetKernelKey( pir::Operation* op, const phi::Place& place, const std::string& kernel_fn_str, - const std::unordered_map& map_value_pair, + const std::unordered_map& map_value_pair, dialect::OpYamlInfoParser* op_info_parser = nullptr) { if (op->isa()) { // NOTE, for now feed op don't need a kernel, so the data type from Op // Result the next op use base program datatype + VLOG(6) << "FeedOp doesn't need a kernel. 
Backend: CPU, DataLayout: ANY"; return {phi::Backend::CPU, phi::DataLayout::ANY, TransToPhiDataType( @@ -495,6 +506,7 @@ phi::KernelKey GetKernelKey( if (op->isa()) { // NOTE, for now feed op don't need a kernel, so the data type from Op // Result the next op use base program datatype + VLOG(6) << "DataOp doesn't need a kernel"; auto data_place = op->attributes().at("place").dyn_cast().data(); @@ -506,7 +518,8 @@ phi::KernelKey GetKernelKey( op->result(0).type().dyn_cast().dtype())}; } - if (op->name() == "pd_op.seed") { + if (op->isa()) { + VLOG(6) << "SeedOp doesn't need a kernel"; auto backend = paddle::experimental::ParseBackend(place); return {backend, phi::DataLayout::ANY, @@ -514,6 +527,17 @@ phi::KernelKey GetKernelKey( op->result(0).type().dyn_cast().dtype())}; } + if (op->isa()) { + VLOG(6) << "FullWithTensorOp doesn't need a kernel"; + auto backend = paddle::experimental::ParseBackend(place); + auto dtype = op->attributes() + .at("dtype") + .dyn_cast() + .data(); + + return {backend, phi::DataLayout::ANY, dtype}; + } + phi::Backend kernel_backend = phi::Backend::UNDEFINED; phi::DataLayout kernel_layout = phi::DataLayout::UNDEFINED; phi::DataType kernel_data_type = phi::DataType::UNDEFINED; @@ -522,31 +546,24 @@ phi::KernelKey GetKernelKey( // only suppurt non vector input for now int tensor_input_number = static_cast(op_info_parser->InputTensorNumber()); - + VLOG(8) << "Begin to infer kernel key from op_info_parser(defined by yaml " + "info)"; // get datatype info kernel_data_type = GetKernelDataTypeByYamlInfo(op, map_value_pair, op_info_parser); + VLOG(8) << "Infer kernel data_type: [" << kernel_data_type + << "] from yaml info"; kernel_backend = GetKernelBackendByYamlInfo(op, map_value_pair, op_info_parser, place); - + VLOG(8) << "Infer kernel backend: [" << kernel_backend + << "] from yaml info"; // parse all the input tensor if (tensor_input_number == 0 || op->isa()) { // all the information have to get from attribute and context - - if (op->isa()) { - // try to process uniform, use shape to determin backend - // TODO(phlrain): shuold support other initilize op - auto define_op = - op->operand_source(0).dyn_cast().owner(); - if (define_op->isa()) { - auto shape = define_op->attribute("value") - .data() - .GetData(); - } - } - if (kernel_backend == phi::Backend::UNDEFINED) { kernel_backend = paddle::experimental::ParseBackend(place); + VLOG(8) << "Infer kernel backend: [" << kernel_backend + << "] when tensor_input_number == 0 or is Full_Op"; } } } @@ -555,15 +572,17 @@ phi::KernelKey GetKernelKey( kernel_data_type == phi::DataType::UNDEFINED) && op->num_operands() > 0) { paddle::experimental::detail::KernelKeyParser kernel_key_parser; - + VLOG(8) << "Begin to infer kernel key from op operands"; for (size_t i = 0; i < op->num_operands(); ++i) { // NOTE, only op with OpYamlInfo can have TensorArr if (op_info_parser != nullptr && op_info_parser->IsTensorAttribute(i)) { + VLOG(8) << "input (" << i << ") doesn't have TensorArr"; continue; } auto input_tmp = op->operand_source(i); // NOTE: if not input_tmp, it's an optional input if (!input_tmp) { + VLOG(8) << "input (" << i << ") is NULL (optional input)"; continue; } auto new_input_tmp = map_value_pair.at(input_tmp); @@ -578,10 +597,12 @@ phi::KernelKey GetKernelKey( // don't know how to select the kernel in the next of op that // uses data op outout as inputs. So, we need set kernel backend // manually. 
- if (op->operand_source(i) - .dyn_cast() - .owner() - ->isa()) { + auto op_res = op->operand_source(i).dyn_cast(); + + if (!op_res) { + continue; + } + if (op_res.owner()->isa()) { auto data_op = op->operand_source(i).dyn_cast().owner(); auto data_place = data_op->attribute("place").data(); @@ -593,6 +614,8 @@ phi::KernelKey GetKernelKey( kernel_key_parser.key_set.backend_set = kernel_key_parser.key_set.backend_set | paddle::experimental::BackendSet(data_op_backend); + VLOG(8) << "Update kernel backend set from owner op (DataOp): " + << data_op_backend; } else if (op->operand_source(i) .dyn_cast() .owner() @@ -617,6 +640,8 @@ phi::KernelKey GetKernelKey( kernel_key_parser.key_set.backend_set = kernel_key_parser.key_set.backend_set | paddle::experimental::BackendSet(data_op_backend); + VLOG(8) << "Update kernel backend set from owner op (CombineOp): " + << data_op_backend; break; } } @@ -629,16 +654,26 @@ phi::KernelKey GetKernelKey( if (kernel_backend == phi::Backend::UNDEFINED) { kernel_backend = kernel_key.backend(); + if (kernel_backend != phi::Backend::UNDEFINED) { + VLOG(8) << "Infer kernel backend from op operands"; + } } if (kernel_layout == phi::DataLayout::UNDEFINED) { kernel_layout = kernel_key.layout(); + if (kernel_layout != phi::DataLayout::UNDEFINED) { + VLOG(8) << "Infer kernel layout from op operands"; + } } if (kernel_data_type == phi::DataType::UNDEFINED) { kernel_data_type = kernel_key.dtype(); + if (kernel_data_type != phi::DataType::UNDEFINED) { + VLOG(8) << "Infer kernel data_type from op operands"; + } } } if (kernel_backend == phi::Backend::UNDEFINED) { + VLOG(8) << "Kernel backend cannot be infered from op operands"; kernel_backend = paddle::experimental::ParseBackend(place); } @@ -646,13 +681,17 @@ phi::KernelKey GetKernelKey( if (op->isa()) { res.set_dtype(phi::DataType::FLOAT32); + VLOG(8) << "LoadCombineOp's kernel data type must be FLOAT32"; } if (NeedFallBackCpu((op), kernel_fn_str, res)) { res.set_backend(phi::Backend::CPU); + VLOG(8) << "kernel backend must be on CPU when need fallback"; } if (NeedFallBackFromGPUDNN2GPU(op, res)) { res.set_backend(phi::Backend::GPU); + VLOG(8) << "kernel backend must be on GPU when need fallback from GPUDNN " + "to GPU"; } return res; @@ -664,53 +703,125 @@ void HandleForIfOp( pir::Block* block, pir::IrContext* ctx, std::unordered_map* map_op_pair, - std::unordered_map* map_value_pair) { - auto cur_in = op_item->operand_source(0); + std::unordered_map* map_value_pair) { + auto old_cond = op_item->operand_source(0); PADDLE_ENFORCE_EQ( - map_value_pair->count(cur_in), + map_value_pair->count(old_cond), true, phi::errors::PreconditionNotMet( "[%d]'s input of [%s] op MUST in map pair", 0, op_item->name())); - auto new_in = map_value_pair->at(cur_in); + auto new_cond = map_value_pair->at(old_cond); + + // NOTE(zhangbo): IfOp's input cond should be a cpu type. 
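+  // Control flow reads the cond flag on the host, so a GPU-resident cond
+  // is first copied back to CPU through a place-transfer op.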
+ AllocatedDenseTensorType new_cond_type = + new_cond.type().dyn_cast(); + if (new_cond_type) { + if (new_cond_type.place().GetType() == phi::AllocationType::GPU) { + auto out_type = dialect::AllocatedDenseTensorType::get( + ctx, + phi::CPUPlace(), + old_cond.type().dyn_cast()); + phi::KernelKey kernel_key( + phi::Backend::GPU, phi::DataLayout::ALL_LAYOUT, phi::DataType::BOOL); + new_cond = AddPlaceTransferOp(new_cond, + out_type, + new_cond_type.place(), + phi::CPUPlace(), + kernel_key, + block); + } + } else { + PADDLE_THROW( + phi::errors::Unimplemented("IfOp onlu support DenseTensorType")); + } + // Create IfOp and insert to kernel dialect program pir::Builder builder(ctx, block); - - auto base_if_op = op_item->dyn_cast(); - std::vector op_output_types; - for (size_t i = 0; i < base_if_op.num_results(); ++i) { - op_output_types.push_back(paddle::dialect::AllocatedDenseTensorType::get( + auto old_ifop = op_item->dyn_cast(); + std::vector new_ifop_outputs; + for (size_t i = 0; i < old_ifop.num_results(); ++i) { + new_ifop_outputs.push_back(paddle::dialect::AllocatedDenseTensorType::get( ctx, place, - base_if_op.result(i).type().dyn_cast())); + old_ifop.result(i).type().dyn_cast())); } - auto new_if_op = - builder.Build(new_in, std::move(op_output_types)); + auto new_ifop = builder.Build( + new_cond, std::move(new_ifop_outputs)); // process true block - pir::Block* true_block = new_if_op.true_block(); + pir::Block* true_block = new_ifop.true_block(); ProcessBlock(place, - base_if_op.true_block(), + old_ifop.true_block(), true_block, ctx, map_op_pair, map_value_pair); // process false block - pir::Block* false_block = new_if_op.false_block(); + pir::Block* false_block = new_ifop.false_block(); ProcessBlock(place, - base_if_op.false_block(), + old_ifop.false_block(), false_block, ctx, map_op_pair, map_value_pair); + + // update map + (*map_op_pair)[op_item] = new_ifop; + for (size_t i = 0; i < op_item->num_results(); ++i) { + (*map_value_pair)[op_item->result(i)] = new_ifop->result(i); + } } -pir::OpResult GetNewInput( +void HandleForWhileOp( + const phi::Place& place, + pir::Operation* op_item, + pir::Block* block, + pir::IrContext* ctx, + std::unordered_map* map_op_pair, + std::unordered_map* map_value_pair) { + std::vector vec_in; + pir::Value cond_val; + for (size_t i = 0; i < op_item->num_operands(); ++i) { + auto cur_in = op_item->operand_source(i); + + PADDLE_ENFORCE_EQ( + map_value_pair->count(cur_in), + true, + phi::errors::PreconditionNotMet( + "[%d]'s input of [%s] op MUST in map pair", 0, op_item->name())); + auto new_in = map_value_pair->at(cur_in); + if (i == 0) + cond_val = new_in; + else + vec_in.push_back(new_in); + } + + pir::Builder builder(ctx, block); + + auto base_while_op = op_item->dyn_cast(); + auto new_while_op = builder.Build(cond_val, vec_in); + pir::Block* body_block = new_while_op.body_block(); + for (size_t i = 0; i < vec_in.size(); ++i) { + auto block_arg = body_block->AddArgument(vec_in[i].type()); + (*map_value_pair)[base_while_op.body_block()->argument(i)] = block_arg; + } + + // process body block + ProcessBlock(place, + base_while_op.body_block(), + body_block, + ctx, + map_op_pair, + map_value_pair); +} + +pir::Value GetNewInput( const pir::Value cur_in, - const std::unordered_map& map_value_pair, + const std::unordered_map& map_value_pair, const int index, - const std::string op_name) { + const std::string& op_name) { PADDLE_ENFORCE_EQ( map_value_pair.count(cur_in), true, @@ -726,11 +837,16 @@ void HandleForSpecialOp( pir::Block* block, pir::IrContext* 
ctx, std::unordered_map* map_op_pair, - std::unordered_map* map_value_pair) { + std::unordered_map* map_value_pair) { if (op_item->isa()) { HandleForIfOp(place, op_item, block, ctx, map_op_pair, map_value_pair); return; } + + if (op_item->isa()) { + HandleForWhileOp(place, op_item, block, ctx, map_op_pair, map_value_pair); + return; + } std::vector vec_inputs; std::vector op_output_types; if (op_item->isa<::pir::CombineOp>()) { @@ -743,7 +859,8 @@ void HandleForSpecialOp( vec_inputs.emplace_back(); continue; } - auto new_in = GetNewInput(cur_in, *map_value_pair, i, op_item->name()); + auto new_in = GetNewInput( + cur_in, *map_value_pair, static_cast(i), op_item->name()); vec_inputs.push_back(new_in); vec_inner_types.push_back(new_in.type()); } @@ -762,7 +879,8 @@ void HandleForSpecialOp( vec_inputs.emplace_back(); continue; } - auto new_in = GetNewInput(cur_in, *map_value_pair, i, op_item->name()); + auto new_in = GetNewInput( + cur_in, *map_value_pair, static_cast(i), op_item->name()); vec_inputs.push_back(new_in); if (new_in.type().isa()) { @@ -787,7 +905,8 @@ void HandleForSpecialOp( vec_inputs.emplace_back(); continue; } - auto new_in = GetNewInput(cur_in, *map_value_pair, i, op_item->name()); + auto new_in = GetNewInput( + cur_in, *map_value_pair, static_cast(i), op_item->name()); vec_inputs.push_back(new_in); if (new_in.type().isa()) { @@ -803,7 +922,7 @@ void HandleForSpecialOp( } } - if (op_item->name() == "cf.yield") { + if (op_item->isa<::pir::YieldOp>()) { if (op_item->num_operands() > 0) { for (size_t i = 0; i < op_item->num_operands(); ++i) { auto cur_in = op_item->operand_source(i); @@ -811,12 +930,35 @@ void HandleForSpecialOp( vec_inputs.emplace_back(); continue; } - auto new_in = GetNewInput(cur_in, *map_value_pair, i, op_item->name()); + auto new_in = GetNewInput( + cur_in, *map_value_pair, static_cast(i), op_item->name()); vec_inputs.push_back(new_in); } } } + if (op_item->name() == "cinn_runtime.jit_kernel") { + if (op_item->num_operands() > 0) { + for (size_t i = 0; i < op_item->num_operands(); ++i) { + auto cur_in = op_item->operand_source(i); + if (!cur_in) { + vec_inputs.emplace_back(); + continue; + } + auto new_in = GetNewInput( + cur_in, *map_value_pair, static_cast(i), op_item->name()); + vec_inputs.push_back(new_in); + } + } + + for (size_t i = 0; i < op_item->num_results(); ++i) { + op_output_types.push_back(paddle::dialect::AllocatedDenseTensorType::get( + ctx, + place, + op_item->result(i).type().dyn_cast())); + } + } + pir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_item->name()); // Generate new op pir::Operation* op = pir::Operation::Create( @@ -921,7 +1063,7 @@ std::vector BuildOpInputList( const OpYamlInfoParser* op_info_parser, pir::IrContext* ctx, std::unordered_map* map_op_pair, - std::unordered_map* map_value_pair, + std::unordered_map* map_value_pair, pir::Block* block) { if (op_item->num_operands() == 0) { return {}; @@ -940,6 +1082,7 @@ std::vector BuildOpInputList( true, phi::errors::PreconditionNotMet( "[%d]'s input of [%s] op MUST in map pair", i, op_item->name())); + auto new_in = map_value_pair->at(cur_in); auto new_in_type = new_in.type(); @@ -947,6 +1090,17 @@ std::vector BuildOpInputList( auto& kernel = phi::KernelFactory::Instance().SelectKernelWithGPUDNN( kernel_fn_str, kernel_key); + int tensor_param_index = i; + if (kernel.IsValid()) { + tensor_param_index = op_info_parser->GetTensorParamIndexByArgsName( + op_info_parser->InputNames()[i]); + // the input of op args is not the kernel parameter + if (tensor_param_index == -1) { + 
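+          // This operand is not consumed as a kernel tensor parameter, so
+          // forward it unchanged; no place transfer can be inferred for it.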
+        vec_inputs.emplace_back(new_in);
+        continue;
+      }
+    }
+
     bool check_place_transfer =
         (op_item->isa<::pir::SetParameterOp>()) ||
         (kernel.IsValid() && (!UnchangeOutputOps.count(op_item->name())));
@@ -961,11 +1115,13 @@ std::vector BuildOpInputList(
         auto args_def = kernel.args_def();
         auto input_defs = args_def.input_defs();

-        auto dst_backend = GetDstBackend(op_item->name(),
-                                         place,
-                                         op_info_parser,
-                                         kernel.InputAt(i).backend,
-                                         i);
+        auto dst_backend =
+            GetDstBackend(op_item->name(),
+                          place,
+                          op_info_parser,
+                          kernel.InputAt(tensor_param_index).backend,
+                          i);
+        VLOG(6) << "Infer kernel backend from input " << i << " of op "
+                << op_item->name();

         bool need_trans =
             (in_place.GetType() != phi::AllocationType::UNDEFINED) &&
@@ -1024,12 +1180,13 @@ std::vector BuildOpInputList(
             (op_info_parser != nullptr &&
              !op_info_parser->IsTensorAttribute(i)) &&
             (paddle::experimental::NeedTransformPlace(
-                place, kernel.InputAt(i).backend, {}));
+                place, kernel.InputAt(tensor_param_index).backend, {}));
         if (need_trans) {
           VLOG(6) << "need trans from " << place << " to "
                   << kernel_key.backend();
           // build memcopy op
-          auto out_place = phi::TransToPhiPlace(kernel.InputAt(i).backend);
+          auto out_place = phi::TransToPhiPlace(
+              kernel.InputAt(tensor_param_index).backend);
           pir::Type out_type;
           if (in_i_type.isa<dialect::AllocatedDenseTensorType>()) {
             out_type = dialect::AllocatedDenseTensorType::get(
@@ -1082,12 +1239,13 @@ std::vector BuildOpInputList(
         auto args_def = kernel.args_def();
         auto input_defs = args_def.input_defs();

-        auto dst_backend = GetDstBackend(op_item->name(),
-                                         place,
-                                         op_info_parser,
-                                         kernel.InputAt(i).backend,
-                                         i);
-
+        auto dst_backend =
+            GetDstBackend(op_item->name(),
+                          place,
+                          op_info_parser,
+                          kernel.InputAt(tensor_param_index).backend,
+                          i);
+        VLOG(6) << "Infer kernel backend from input " << i << " of op "
+                << op_item->name();
         bool need_trans =
             (in_place.GetType() != phi::AllocationType::UNDEFINED) &&
             (paddle::experimental::NeedTransformPlace(
@@ -1111,8 +1269,9 @@ std::vector BuildOpInputList(
                 new_in, out_type, in_place, out_place, kernel_key, block);
           }
         } else {
-          PADDLE_THROW(phi::errors::Unimplemented(
-              "only support allocated dense tensor type for now"));
+          PADDLE_THROW(
+              phi::errors::Unimplemented("only supports allocated dense tensor "
                                         "type and selected rows for now"));
         }
       }
       vec_inputs.push_back(new_in);
@@ -1128,7 +1287,7 @@ void AddShadowFeed(
     pir::Block* block,
     pir::IrContext* ctx,
     std::unordered_map<pir::Operation*, pir::Operation*>* map_op_pair,
-    std::unordered_map<pir::Value, pir::OpResult>* map_value_pair) {
+    std::unordered_map<pir::Value, pir::Value>* map_value_pair) {
   bool feed_op_add_shadow_feed =
       (op_item->isa<paddle::dialect::FeedOp>()) && platform::is_gpu_place(place);
   bool data_op_add_shadow_feed =
@@ -1210,7 +1369,7 @@ pir::Operation* BuildPhiKernelOp(
     pir::Block* block,
     pir::IrContext* ctx,
     std::unordered_map<pir::Operation*, pir::Operation*>* map_op_pair,
-    std::unordered_map<pir::Value, pir::OpResult>* map_value_pair) {
+    std::unordered_map<pir::Value, pir::Value>* map_value_pair) {
   std::unordered_map<std::string, pir::Attribute> op_attribute{
       {"op_name", pir::StrAttribute::get(ctx, op_item->name())},
       {"kernel_name", pir::StrAttribute::get(ctx, kernel_fn_str)},
@@ -1230,7 +1389,7 @@ pir::Operation* BuildPhiKernelOp(
   pir::OpInfo legacy_kernel_op_info =
      ctx->GetRegisteredOpInfo(paddle::dialect::LegacyKernelOp::name());

-  pir::Operation* op;
+  pir::Operation* op = nullptr;
   if (dialect::IsLegacyOp(op_item->name())) {
     op = pir::Operation::Create(
         vec_inputs, op_attribute, op_output_types, legacy_kernel_op_info);
@@ -1258,18 +1417,21 @@ void ProcessBlock(
     pir::Block* new_block,
     pir::IrContext* ctx,
     std::unordered_map<pir::Operation*, pir::Operation*>* map_op_pair,
-    std::unordered_map<pir::Value, pir::OpResult>* map_value_pair) {
+    std::unordered_map<pir::Value, pir::Value>* map_value_pair) {
   auto skip_feed_names = GetSkipFeedNames(block);

   for (auto op_item : *block) {
     VLOG(6) << "op name " << op_item->name();
     if ((op_item->isa<paddle::dialect::FeedOp>()) &&
         SkipFeedOp(op_item, skip_feed_names)) {
+      VLOG(6) << "Skip FeedOp while lowering to kernel pass";
       continue;
     }

     // HandleSpecialOp
     if (SpecialLowerOps.count(op_item->name())) {
+      VLOG(6) << "Handle Special Op: [" << op_item->name()
+              << "] while lowering to kernel pass";
       HandleForSpecialOp(
           place, op_item, new_block, ctx, map_op_pair, map_value_pair);
       continue;
     }
@@ -1318,6 +1480,10 @@ void ProcessBlock(
 std::unique_ptr<pir::Program> PdOpLowerToKernelPass(pir::Program* prog,
                                                     phi::Place place) {
+  if (FLAGS_print_ir) {
+    std::cout << "IR before lowering = " << *prog << std::endl;
+  }
+
   auto program = std::make_unique<pir::Program>(pir::IrContext::Instance());
   auto block = prog->block();
@@ -1327,16 +1493,15 @@ std::unique_ptr<pir::Program> PdOpLowerToKernelPass(pir::Program* prog,
   ctx->GetOrRegisterDialect<paddle::dialect::KernelDialect>();

   std::unordered_map<pir::Operation*, pir::Operation*> map_op_pair;
-  std::unordered_map<pir::Value, pir::OpResult> map_value_pair;
+  std::unordered_map<pir::Value, pir::Value> map_value_pair;

   ProcessBlock(
       place, block, program->block(), ctx, &map_op_pair, &map_value_pair);

-  if (VLOG_IS_ON(2)) {
-    std::stringstream ss1;
-    program->Print(ss1);
-    VLOG(2) << "Program after lowering to kernel pass : " << ss1.str();
+  if (FLAGS_print_ir) {
+    std::cout << "IR after lowering = " << *program << std::endl;
   }
+
   return program;
 }
 }  // namespace dialect
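The `tensor_param_index` change above decouples an op input's position from the kernel's tensor-parameter position: when `GetTensorParamIndexByArgsName` returns -1, that input is not a kernel parameter and is forwarded without a place transfer. A rough Python model of the lookup (the names `input_names` and `kernel_params` are invented stand-ins for the YAML-parsed op info, not Paddle's API):

```python
# Hypothetical model of GetTensorParamIndexByArgsName: map an op input
# name to its position in the kernel's tensor-parameter list, or -1.
def tensor_param_index(arg_name, kernel_params):
    try:
        return kernel_params.index(arg_name)
    except ValueError:
        return -1  # op arg is not a kernel parameter: skip place transfer

input_names = ["x", "index", "out_size"]   # op inputs, in op order
kernel_params = ["x", "out_size"]          # kernel tensor params

for i, name in enumerate(input_names):
    idx = tensor_param_index(name, kernel_params)
    if idx == -1:
        print(f"input {i} ({name}): forwarded as-is")
    else:
        print(f"input {i} ({name}): backend taken from kernel param {idx}")
```

This is why indexing the kernel with the raw loop index `i` was wrong once op inputs and kernel parameters stopped lining up one-to-one.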
diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h
index 35b5484508a6f2..c1f0fe0cb85d94 100644
--- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h
+++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h
@@ -28,6 +28,6 @@ void ProcessBlock(
     pir::Block* new_block,
     pir::IrContext* ctx,
     std::unordered_map<pir::Operation*, pir::Operation*>* map_op_pair,
-    std::unordered_map<pir::Value, pir::OpResult>* map_value_pair);
+    std::unordered_map<pir::Value, pir::Value>* map_value_pair);
 }  // namespace dialect
 }  // namespace paddle

diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc
index 48962456d4ca74..f645a5862c7161 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc
@@ -717,4 +717,3 @@ REGISTER_HANDLER(bce_loss, binary_cross_entropy_handler);
 REGISTER_HANDLER(huber_loss, huber_loss_handler);
 REGISTER_HANDLER(warpctc, warpctc_handler);
 REGISTER_HANDLER(rank_loss, rank_loss_handler);
-REGISTER_HANDLER(margin_rank_loss, margin_rank_loss_handler);

diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
index abe5bcd8c6c852..d585f2f4c64f9f 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
@@ -101,9 +101,15 @@ std::vector get_xpu_kp_op_support_type(
 std::vector<vartype::Type> get_xpu_op_support_type(
     const std::string& op_name, phi::backends::xpu::XPUVersion version) {
-  auto& ops = version == phi::backends::xpu::XPUVersion::XPU1
-                  ?
phi::backends::xpu::get_kl1_ops() - : phi::backends::xpu::get_kl2_ops(); + phi::backends::xpu::XPUOpMap ops; + if (version == phi::backends::xpu::XPUVersion::XPU1) { + ops = phi::backends::xpu::get_kl1_ops(); + } else if (version == phi::backends::xpu::XPUVersion::XPU2) { + ops = phi::backends::xpu::get_kl2_ops(); + } else { + ops = phi::backends::xpu::get_kl3_ops(); + } + std::vector res; if (ops.find(op_name) != ops.end()) { auto& dtypes = ops[op_name]; @@ -115,9 +121,15 @@ std::vector get_xpu_op_support_type( } XPUOpListMap get_xpu_op_list(phi::backends::xpu::XPUVersion version) { - auto& ops = version == phi::backends::xpu::XPUVersion::XPU1 - ? phi::backends::xpu::get_kl1_ops() - : phi::backends::xpu::get_kl2_ops(); + phi::backends::xpu::XPUOpMap ops; + if (version == phi::backends::xpu::XPUVersion::XPU1) { + ops = phi::backends::xpu::get_kl1_ops(); + } else if (version == phi::backends::xpu::XPUVersion::XPU2) { + ops = phi::backends::xpu::get_kl2_ops(); + } else { + ops = phi::backends::xpu::get_kl3_ops(); + } + XPUOpListMap res; for (auto& op : ops) { std::vector op_types; diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc b/paddle/fluid/platform/gen_comm_id_helper.cc index 3eb7f2d9f22721..a77e396adee5f4 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.cc +++ b/paddle/fluid/platform/gen_comm_id_helper.cc @@ -419,7 +419,7 @@ void SendBroadCastCommID(std::vector servers, // connect with server std::vector connects; - for (auto server : servers) { + for (auto const& server : servers) { VLOG(3) << "connecting endpoint: " << server; int conn = ConnectAddr(server, head); connects.push_back(conn); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 67512474567d30..44c17c32fa8d56 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -139,8 +139,8 @@ RecordMemEvent::RecordMemEvent(const void *ptr, } if (type == TracerMemEventType::Allocate) { - uint64_t current_allocated; - uint64_t peak_allocated; + uint64_t current_allocated = 0; + uint64_t peak_allocated = 0; uint64_t current_reserved = 0; // 0 means keep the same as before uint64_t peak_reserved = 0; // 0 means keep the same as before if (platform::is_cpu_place(place) || @@ -223,8 +223,8 @@ RecordMemEvent::RecordMemEvent(const void *ptr, peak_allocated, peak_reserved); } else if (type == TracerMemEventType::ReservedAllocate) { - uint64_t current_reserved; - uint64_t peak_reserved; + uint64_t current_reserved = 0; + uint64_t peak_reserved = 0; uint64_t current_allocated = 0; // 0 means keep the same as before uint64_t peak_allocated = 0; // 0 means keep the same as before if (platform::is_cpu_place(place) || @@ -306,8 +306,8 @@ RecordMemEvent::RecordMemEvent(const void *ptr, peak_allocated, peak_reserved); } else if (type == TracerMemEventType::Free) { - uint64_t current_allocated; - uint64_t peak_allocated; + uint64_t current_allocated = 0; + uint64_t peak_allocated = 0; uint64_t current_reserved = 0; // 0 means keep the same as before uint64_t peak_reserved = 0; // 0 means keep the same as before if (platform::is_cpu_place(place) || @@ -389,8 +389,8 @@ RecordMemEvent::RecordMemEvent(const void *ptr, peak_allocated, peak_reserved); } else if (type == TracerMemEventType::ReservedFree) { - uint64_t current_reserved; - uint64_t peak_reserved; + uint64_t current_reserved = 0; + uint64_t peak_reserved = 0; uint64_t current_allocated = 0; // 0 means keep the same as before uint64_t peak_allocated = 0; // 0 means keep the same as before if 
(platform::is_cpu_place(place) || diff --git a/paddle/fluid/platform/profiler/custom_device/custom_tracer.cc b/paddle/fluid/platform/profiler/custom_device/custom_tracer.cc index 7ea473dfdc1505..795aab1e128fd4 100644 --- a/paddle/fluid/platform/profiler/custom_device/custom_tracer.cc +++ b/paddle/fluid/platform/profiler/custom_device/custom_tracer.cc @@ -28,6 +28,10 @@ namespace platform { CustomTracer::CustomTracer(const std::string& dev_type) : dev_type_(dev_type) { #ifdef PADDLE_WITH_CUSTOM_DEVICE + auto selected_devices = phi::DeviceManager::GetSelectedDeviceList(dev_type_); + if (selected_devices.size()) { + phi::DeviceManager::SetDevice(dev_type_, selected_devices[0]); + } phi::DeviceManager::ProfilerInitialize(dev_type_, &collector_, &context_); #endif } @@ -105,7 +109,7 @@ void CustomTracer::CollectTraceData(TraceEventCollector* collector) { for (auto de : collector_.DeviceEvents()) { collector->AddDeviceEvent(std::move(de)); } - for (auto tn : collector_.ThreadNames()) { + for (auto const& tn : collector_.ThreadNames()) { collector->AddThreadName(tn.first, tn.second); } collector_.ClearAll(); diff --git a/paddle/fluid/prim/api/api.yaml b/paddle/fluid/prim/api/api.yaml index ec3bd5741371eb..5a1a6e335abeb5 100644 --- a/paddle/fluid/prim/api/api.yaml +++ b/paddle/fluid/prim/api/api.yaml @@ -48,3 +48,4 @@ - reshape - erf - tanh +- sign diff --git a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h index 53369e956d7b82..64c431b3d237fe 100644 --- a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h @@ -585,9 +585,8 @@ void sigmoid_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) { template void abs_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) { if (x_grad) { - auto abs_tmp = abs(x); - auto divide_tmp = divide(x, abs_tmp); - set_output(out_grad * divide_tmp, x_grad); + auto sign_tmp = sign(x); + set_output(out_grad * sign_tmp, x_grad); } } diff --git a/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc b/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc index d76a8ad5523bb9..f89a898ca1a58e 100644 --- a/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc +++ b/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc @@ -27,12 +27,12 @@ namespace paddle { namespace prim { using Tensor = paddle::Tensor; template <> -Tensor empty(const paddle::experimental::IntArray& shape, - phi::DataType dtype, - const paddle::Place& place) { +TEST_API Tensor empty(const paddle::experimental::IntArray& shape, + phi::DataType dtype, + const paddle::Place& place) { framework::VarDesc* new_var = StaticCompositeContext::Instance().GetBlock()->Var( - std::move(StaticCompositeContext::Instance().GenerateUniqueName())); + StaticCompositeContext::Instance().GenerateUniqueName()); new_var->SetShape(shape.GetData()); new_var->SetDataType(framework::TransToProtoVarType(dtype)); // Place is not supported in static mode diff --git a/paddle/fluid/prim/utils/static/static_global_utils.h b/paddle/fluid/prim/utils/static/static_global_utils.h index c08405bb18dbed..b88292d488ab69 100644 --- a/paddle/fluid/prim/utils/static/static_global_utils.h +++ b/paddle/fluid/prim/utils/static/static_global_utils.h @@ -25,7 +25,6 @@ #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/type_defs.h" - namespace paddle { namespace prim { @@ -109,7 +108,7 @@ 
class StaticCompositeContext { static thread_local bool enable_bwd_prim_; static thread_local bool enable_fwd_prim_; static thread_local bool enable_eager_prim_; - static StaticCompositeContext* static_composite_context_; + TEST_API static StaticCompositeContext* static_composite_context_; DISABLE_COPY_AND_ASSIGN(StaticCompositeContext); }; diff --git a/paddle/fluid/primitive/backend/manual/manual_backend.h b/paddle/fluid/primitive/backend/manual/manual_backend.h index 3c9340164ac012..4faabab79f6852 100644 --- a/paddle/fluid/primitive/backend/manual/manual_backend.h +++ b/paddle/fluid/primitive/backend/manual/manual_backend.h @@ -24,7 +24,7 @@ namespace primitive { namespace backend { using Tensor = paddle::Tensor; -using Scalar = paddle::experimental::Scalar; +using Scalar = phi::Scalar; using IntArray = paddle::experimental::IntArray; using DataType = phi::DataType; @@ -32,6 +32,13 @@ template std::vector add_n_grad(const std::vector& x, const Tensor& out_grad); +template +Tensor embedding_grad(const Tensor& x, + const Tensor& weight, + const Tensor& out_grad, + int64_t padding_idx = -1, + bool sparse = false); + } // namespace backend } // namespace primitive } // namespace paddle diff --git a/paddle/fluid/primitive/backend/manual/manual_static_backend.cc b/paddle/fluid/primitive/backend/manual/manual_static_backend.cc index 7b33200336d000..b115e6a0210974 100644 --- a/paddle/fluid/primitive/backend/manual/manual_static_backend.cc +++ b/paddle/fluid/primitive/backend/manual/manual_static_backend.cc @@ -45,6 +45,23 @@ std::vector add_n_grad(const std::vector& x, return x_grad; } +template <> +Tensor embedding_grad(const Tensor& x, + const Tensor& weight, + const Tensor& out_grad, + int64_t padding_idx, + bool sparse) { + pir::Value x_res = std::static_pointer_cast(x.impl())->value(); + pir::Value weight_res = + std::static_pointer_cast(weight.impl())->value(); + pir::Value out_grad_res = + std::static_pointer_cast(out_grad.impl())->value(); + auto op_res = paddle::dialect::embedding_grad( + x_res, weight_res, out_grad_res, padding_idx, sparse); + Tensor out(std::make_shared(op_res)); + return out; +} + } // namespace backend } // namespace primitive } // namespace paddle diff --git a/paddle/fluid/primitive/codegen/CMakeLists.txt b/paddle/fluid/primitive/codegen/CMakeLists.txt index d01d21829ca1e2..e081da5b5dfe02 100644 --- a/paddle/fluid/primitive/codegen/CMakeLists.txt +++ b/paddle/fluid/primitive/codegen/CMakeLists.txt @@ -4,6 +4,12 @@ set(fwd_path ${parsed_yaml_path}/ops.parsed.yaml) set(fwd_legacy_path ${parsed_yaml_path}/legacy_ops.parsed.yaml) set(rev_path ${parsed_yaml_path}/backward_ops.parsed.yaml) set(rev_legacy_path ${parsed_yaml_path}/legacy_backward_ops.parsed.yaml) +set(fwd_pd_op_path + ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/dialect/operator/ir/generated/ops.parsed.yaml +) +set(rev_pd_op_path + ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/dialect/operator/ir/generated/ops_backward.parsed.yaml +) set(prim_path "${PADDLE_SOURCE_DIR}/paddle/fluid/primitive/primitive.yaml") set(templates_dir "${PADDLE_SOURCE_DIR}/paddle/fluid/primitive/codegen/templates/") @@ -17,7 +23,8 @@ execute_process( COMMAND ${PYTHON_EXECUTABLE} ${scripts} --fwd_path ${fwd_path} --fwd_legacy_path ${fwd_legacy_path} --rev_path ${rev_path} --rev_legacy_path - ${rev_legacy_path} --prim_path ${prim_path} --templates_dir ${templates_dir} + ${rev_legacy_path} --fwd_pd_op_path ${fwd_pd_op_path} --rev_pd_op_path + ${rev_pd_op_path} --prim_path ${prim_path} --templates_dir ${templates_dir} --compat_path ${compat_path} 
--destination_dir ${destination_dir} RESULT_VARIABLE _result) if(${_result}) diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py index da9e12fa817c59..5ec3edd9838ec5 100644 --- a/paddle/fluid/primitive/codegen/gen.py +++ b/paddle/fluid/primitive/codegen/gen.py @@ -37,102 +37,19 @@ # fmt: on -VJPS = [ - 'where_grad', - 'tril_grad', - 'triu_grad', - 'tile_grad', - 'tanh_grad', - 'mean_grad', - 'add_grad', - 'divide_grad', - 'sum_grad', - 'concat_grad', - 'split_grad', - 'split_with_num_grad', - 'gelu_grad', - 'softmax_grad', - 'silu_grad', - 'multiply_grad', - 'subtract_grad', - 'erf_grad', - 'expand_grad', - 'exp_grad', - 'expm1_grad', - 'elementwise_pow_grad', - 'fused_softmax_mask_upper_triangle_grad', - 'matmul_grad', - 'pow_grad', - 'rsqrt_grad', - 'slice_grad', - 'transpose_grad', - 'square_grad', - 'dropout_grad', - 'cast_grad', - 'slice_double_grad', - 'layer_norm_grad', - 'embedding_grad', - 'scale_grad', - 'gather_nd_grad', - 'stack_grad', - 'squeeze_grad', - 'unsqueeze_grad', - 'poisson_grad', - 'gumbel_softmax_grad', - 'conv2d_grad', - 'depthwise_conv2d_grad', - 'sqrt_grad', - 'flatten_grad', - 'relu_grad', - 'abs_grad', - 'log_grad', - 'clip_grad', - 'ceil_grad', - 'frobenius_norm_grad', - 'p_norm_grad', - 'maximum_grad', - 'argsort_grad', - 'min_grad', - 'batch_norm_grad', - 'max_pool2d_with_index_grad', - 'pool2d_grad', - 'minimum_grad', - 'prod_grad', - 'round_grad', - 'sin_grad', - 'cos_grad', - 'dot_grad', - 'floor_grad', - 'topk_grad', - 'square_grad', - 'gather_grad', - 'label_smooth_grad', - 'cross_entropy_with_softmax_grad', - 'mean_all_grad', - 'cumsum_grad', - 'linear_interp_grad', - 'bilinear_interp_grad', - 'trilinear_interp_grad', - 'nearest_interp_grad', - 'bicubic_interp_grad', - 'assign_grad', - 'assign_out__grad', - 'real_grad', - 'flip_grad', - 'softmax_grad', - 'expand_grad', - 'conv2d_transpose_grad', - 'depthwise_conv2d_transpose_grad', - 'sigmoid_grad', - 'pad_grad', - 'pad3d_grad', - 'einsum_grad', - 'leaky_relu_grad', - 'log10_grad', - 'conv3d_grad', - 'solve_grad', - 'diag_grad', - 'trace_grad', +VJPS_BLACK_LIST = [ + 'reshape_grad', + 'add_n_grad', +] + +BACKENDS_BLACK_LIST = [ + 'copy_to', + 'add_n_grad', + "allclose", + "isclose", + "send_v2", + "assert", + "embedding_grad_sparse", ] @@ -148,153 +65,25 @@ 'tanh_grad', 'transpose_grad', 'concat_grad', + 'erf_grad', + 'exp_grad', + 'expand_grad', + 'log_grad', + 'gather_nd_grad', + 'pad_grad', + 'max_grad', + 'slice_grad', + 'tile_grad', ] # vjp list of primitive op CUSTOM_VJP = [ 'gelu_grad', 'layer_norm_grad', 'dropout_grad', -] # custom vjp list of composite op -VJP_COMPS = PRIM_VJP + CUSTOM_VJP - -BACKENDS = [ - 'where_grad', - 'tril_grad', - 'triu_grad', - 'tile_grad', - 'add_n', - 'mean', - 'sum', - 'divide', - 'full', - 'tanh', - 'tanh_grad', - 'mean_grad', - 'concat', - 'add', - 'multiply', - 'elementwise_pow', - 'scale', - 'reshape', - 'expand', - 'tile', - 'add_grad', - 'divide_grad', - 'sum_grad', - 'concat_grad', - 'split_grad', - 'split_with_num_grad', - 'gelu_grad', - 'softmax_grad', 'silu_grad', - 'multiply_grad', - 'subtract_grad', - 'erf_grad', - 'expand_grad', - 'exp_grad', - 'expm1_grad', - 'multiply', - 'exp', - 'erf', - 'cast', - 'elementwise_pow_grad', - 'fused_softmax_mask_upper_triangle_grad', - 'matmul_grad', - 'pow_grad', - 'reshape_grad', - 'rsqrt_grad', - 'slice_grad', - 'transpose_grad', - 'subtract', - 'assign', - 'equal', - 'greater_equal', - 'greater_than', - 'less_equal', - 'less_than', - 'matmul', - 'max', - 'maximum', - 
'minimum', - 'not_equal', - 'abs', - 'bitwise_and', - 'bitwise_not', - 'bitwise_or', - 'bitwise_xor', - 'floor', - 'gather_nd', - 'log', - 'roll', - 'scatter', - 'scatter_nd_add', - 'square_grad', - 'dropout_grad', - 'slice', - 'layer_norm_grad', - 'embedding_grad', - 'sqrt', - 'uniform', - 'poisson_grad', - 'gumbel_softmax_grad', - 'split', - 'transpose', - 'gather_nd_grad', - 'stack_grad', - 'squeeze_grad', - 'unsqueeze_grad', - 'conv2d_grad', - 'depthwise_conv2d_grad', - 'sqrt_grad', - 'flatten_grad', - 'relu_grad', - 'abs_grad', - 'log_grad', - 'clip_grad', - 'ceil_grad', - 'frobenius_norm_grad', - 'p_norm_grad', - 'maximum_grad', - 'argsort_grad', - 'min_grad', - 'batch_norm_grad', - 'max_pool2d_with_index_grad', - 'pool2d_grad', - 'minimum_grad', - 'prod_grad', - 'round_grad', - 'sin_grad', - 'cos_grad', - 'dot_grad', - 'floor_grad', - 'topk_grad', - 'square_grad', - 'gather_grad', - 'label_smooth_grad', - 'cross_entropy_with_softmax_grad', - 'mean_all_grad', - 'cumsum_grad', - 'linear_interp_grad', - 'bilinear_interp_grad', - 'trilinear_interp_grad', - 'nearest_interp_grad', - 'bicubic_interp_grad', - 'assign_out__grad', - 'real_grad', 'softmax_grad', - 'conv2d_transpose_grad', - 'depthwise_conv2d_transpose_grad', - 'sigmoid_grad', - 'pad_grad', - 'pad3d_grad', - 'einsum_grad', - 'leaky_relu_grad', - 'log10_grad', - 'conv3d_grad', - 'solve_grad', - 'diag_grad', - 'trace_grad', - 'flip', -] + 'sqrt_grad', +] # custom vjp list of composite op +VJP_COMPS = PRIM_VJP + CUSTOM_VJP def load(path: pathlib.Path): @@ -346,6 +135,7 @@ def render(src_dir: pathlib.Path, dst_dir: pathlib.Path, *args, **kwargs): 'datatype': op_gen_tests.is_datatype, 'exist_mutable_attribute': op_gen_tests.exist_mutable_attribute, 'mutable_attribute': op_gen_tests.is_mutable_attribute, + 'only_composite_op': op_gen_tests.is_only_composite_op, } ) for tpl in env.list_templates( @@ -496,6 +286,23 @@ def process_backward_invoke_info(apis): api['invoke']['args'] = ', '.join(args) +def process_optional_output_info(apis): + for api in apis: + inputs_dict = to_named_dict(api['inputs']) + for output in api['outputs']: + if not api['is_fwd']: + output['optional'] = False + else: + if ( + api.get("inplace", None) + and output['name'] in api['inplace'] + and inputs_dict[api['inplace'][output['name']]]['optional'] + ): + output['optional'] = True + else: + output['optional'] = False + + def gen( prim_path: pathlib.Path, fwd_path: pathlib.Path, @@ -503,6 +310,8 @@ def gen( rev_path: pathlib.Path, rev_legacy_path: pathlib.Path, compat_path: pathlib.Path, + fwd_pd_op_path: pathlib.Path, + rev_pd_op_path: pathlib.Path, templates_dir: pathlib.Path, destination_dir: pathlib.Path, ): @@ -518,23 +327,38 @@ def gen( rev_legacy_path (pathlib.Path): The YAML file path of the legacy backward API. compat_path: (pathlib.Path): The YAML file path of the ops compat. + fwd_pd_op_path (pathlib.Path): The YAML file path of the ir forward API. + rev_pd_op_path (pathlib.Path): The YAML file path of the ir backward API. templates_dir (pathlib.Path): The directory of the templates. destination_dir (pathlib.Path): The Directory of the generated file. 
Returns: None """ - prims, fwds, legacy_fwds, revs, legacy_revs, compats = ( + ( + prims, + fwds, + legacy_fwds, + revs, + legacy_revs, + compats, + ir_fwds, + ir_revs, + ) = ( load(prim_path), load(fwd_path), load(fwd_legacy_path), load(rev_path), load(rev_legacy_path), load(compat_path), + load(fwd_pd_op_path), + load(rev_pd_op_path), ) filter_compat_info(compats) - apis = [{**api, **{'is_fwd': True}} for api in fwds + legacy_fwds] - apis = apis + [{**api, **{'is_fwd': False}} for api in revs + legacy_revs] + apis = [{**api, **{'is_fwd': True}} for api in fwds + legacy_fwds + ir_fwds] + apis = apis + [ + {**api, **{'is_fwd': False}} for api in revs + legacy_revs + ir_revs + ] apis = [ {**api, **{'is_prim': True}} if api['name'] in prims @@ -544,12 +368,13 @@ def gen( apis = extend_compat_info(apis, compats) apis = apis + get_inplace_api(apis) process_backward_invoke_info(apis) + process_optional_output_info(apis) render( templates_dir, destination_dir, apis=apis, - backend_white_list=BACKENDS, - vjp_white_list=VJPS, + backend_black_list=BACKENDS_BLACK_LIST, + vjp_black_list=VJPS_BLACK_LIST, vjp_comp_white_list=VJP_COMPS, ) @@ -584,6 +409,16 @@ def gen( type=str, help='The parsed ops compat yaml file.', ) + parser.add_argument( + '--fwd_pd_op_path', + type=str, + help='The ir forward ops parsed yaml file.', + ) + parser.add_argument( + '--rev_pd_op_path', + type=str, + help='The ir backward ops parsed yaml file.', + ) parser.add_argument( '--templates_dir', type=str, @@ -603,6 +438,8 @@ def gen( pathlib.Path(args.rev_path), pathlib.Path(args.rev_legacy_path), pathlib.Path(args.compat_path), + pathlib.Path(args.fwd_pd_op_path), + pathlib.Path(args.rev_pd_op_path), pathlib.Path(args.templates_dir), pathlib.Path(args.destination_dir), ) diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_backend.h.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_backend.h.j2 index 25443f52fe8af7..e422bd61a9618a 100644 --- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_backend.h.j2 +++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_backend.h.j2 @@ -15,24 +15,22 @@ namespace primitive { namespace backend { using Tensor = paddle::Tensor; -using Scalar = paddle::experimental::Scalar; +using Scalar = phi::Scalar; using IntArray = paddle::experimental::IntArray; using DataType = phi::DataType; {% for api in apis %} - {%- if api.name in backend_white_list -%} - {% set inplace_map = {} %} - {% if 'inplace' in api and api.inplace != None %} - {% for source, target in api.inplace.items() %} - {% do inplace_map.update({source: target}) %} - {% endfor %} - {% endif %} - {% if api.attrs is exist_mutable_attribute %} -{{common.sig(api.name, api.inputs, api.outputs|trip_intermediate , api.attrs, inplace_map, True, True)}}; + {%- if api is only_composite_op or "infer_meta" not in api and "composite" not in api and "invoke" not in api -%}{#- render nothing -#} + {%- elif api.name not in backend_black_list -%} + {%- if 'invoke' not in api or 'invoke' in api and api.is_fwd -%} + {% if api.attrs is exist_mutable_attribute %} +{{common.sig(api.name, api.inputs, api.outputs|trip_intermediate , api.attrs, True, True)}}; - {% endif %} -{{common.sig(api.name, api.inputs, api.outputs|trip_intermediate , api.attrs, inplace_map, False, True)}}; + {% endif %} +{{common.sig(api.name, api.inputs, api.outputs|trip_intermediate , api.attrs, False, True)}}; + {% endif %} + {% else %}{#- render nothing -#} {% endif %} {% endfor %} } // 
namespace backend

diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2
index 34e427f0c2e03b..7f9f4b5b8676ff 100644
--- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2
+++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2
@@ -12,13 +12,14 @@ namespace backend {
 {%- macro args(inputs, attrs) -%}
   {#- Arguments are variable pass into method -#}
   {{common.sequence('', '', ', ', inputs)}}
-  {%- if attrs|length > 0 -%} {{", "}} {%- endif -%} {#- append comma between inputs and attrs -#}
+  {%- if attrs|length > 0 -%} {{", "}} {%- endif -%} {#- append comma between
+  inputs and attrs -#}
   {{common.sequence('', '', ', ', attrs)}}
 {%- endmacro -%}

-{%- macro sig(name, inputs, attrs, outputs, inplace_map) -%}
+{%- macro sig(name, inputs, attrs, outputs) -%}
 template <>
-{{common.ret(outputs, inplace_map)}} {{name}}({{common.params(inputs, attrs, False)}})
+{{common.ret(outputs)}} {{name}}({{common.params(inputs, attrs, False)}})
 {%- endmacro -%}

 {% macro body(name, inputs, attrs, outputs) %}
@@ -27,21 +28,15 @@ template <>
   {%- set attr_names = [] -%}
   {%- for i in attrs -%} {%- do attr_names.append(i.name) -%} {%-endfor-%}
   {% filter indent(2, True) %}
-VLOG(4) << "Eager Prim API {name}_ad_func call";
+VLOG(4) << "Eager Prim API {{name}}_ad_func call";
 return ::{{name}}_ad_func({{common.args(input_names, attr_names)}});
   {% endfilter %}
 {% endmacro %}

 {% for api in apis %}
-  {%- if api.is_prim and api.name in backend_white_list -%}
-    {% set inplace_map = {} %}
-    {% if 'inplace' in api and api.inplace != None %}
-      {% for source, target in api.inplace.items() %}
-        {% do inplace_map.update({source: target}) %}
-      {% endfor %}
-    {% endif %}
-{{sig(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate, inplace_map)}} {
+  {%- if api.is_prim and api.name not in backend_black_list and api.name[-1] != '_' -%}
+{{sig(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate)}} {
 {{body(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate)}}
 }

diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2
index 152cd241ad8333..36adc8ac964c41 100644
--- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2
+++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2
@@ -12,9 +12,9 @@ namespace backend {

 using LazyTensor = paddle::primitive::LazyTensor;

-{%- macro sig(name, inputs, outputs, attrs, inplace_map, mutable_attribute_as_inputs=False) -%}
+{%- macro sig(name, inputs, outputs, attrs, mutable_attribute_as_inputs=False) -%}
 template <>
-{{common.ret(outputs, inplace_map)}} {{name}}({{common.params(inputs, attrs, mutable_attribute_as_inputs, False)}})
+{{common.ret(outputs)}} {{name}}({{common.params(inputs, attrs, mutable_attribute_as_inputs, False)}})
 {%- endmacro -%}

 {%- macro prepare_ir_api_inputs(inputs)-%}
@@ -48,13 +48,13 @@ if({{input.name}}) {

 {%- macro get_static_backend_outputs(outputs)-%}
   {%- if outputs|length == 1 -%}
-    {%- if outputs[0].typename == 'Tensor' and not outputs[0].optional-%}
+    {%- if outputs[0].typename == 'Tensor' and not outputs[0].optional -%}
 Tensor {{outputs[0].name}}(std::make_shared<LazyTensor>(op_res));
 return {{outputs[0].name}};
    {%- elif
outputs[0].typename == 'Tensor' and outputs[0].optional -%} paddle::optional {{outputs[0].name}}; if(op_res){ - {{outputs[0].name}} = paddle::make_optional(Tensor(std::make_shared(op_res.get())); + {{outputs[0].name}} = paddle::make_optional(Tensor(std::make_shared(op_res.get()))); } return {{outputs[0].name}}; {%- elif outputs[0].typename == 'Tensor[]' and not outputs[0].optional -%} @@ -80,7 +80,7 @@ return {{outputs[0].name}}; auto op_res_{{i}} = std::get<{{i}}>(op_res); {% if outputs[i].typename == 'Tensor' and not outputs[i].optional %} Tensor {{outputs[i].name}}(std::make_shared(op_res_{{i}})); - {% elif outputs[i].typename == 'Tensor' and outputs[i].optional %} + {% elif outputs[i].typename == 'Tensor' and outputs[i].optional %} paddle::optional {{outputs[i].name}}; if(op_res_{{i}}){ {{outputs[i].name}} = paddle::make_optional(Tensor(std::make_shared(op_res_{{i}}.get()))); @@ -139,28 +139,26 @@ auto op_res = paddle::dialect::{{name}}({{common.args(input_names, attr_names)}} {% for api in apis %} -{% if api.name in backend_white_list %} +{%- if api is only_composite_op or "infer_meta" not in api and "composite" not in api and "invoke" not in api -%}{#- render nothing -#} +{% elif api.name not in backend_black_list %} + {%- if 'invoke' not in api or 'invoke' in api and api.is_fwd-%} {% set api_outputs = api.outputs | trip_intermediate %} - {% set inplace_map = {} %} - {% if 'inplace' in api and api.inplace != None %} - {% for source, target in api.inplace.items() %} - {% do inplace_map.update({source: target}) %} - {% endfor %} - {% endif %} -{{sig(api.name, api.inputs, api_outputs, api.attrs, inplace_map)}} { +{{sig(api.name, api.inputs, api_outputs, api.attrs)}} { {% filter indent(2, True) %} {{body(api.name, api.inputs, api_outputs, api.attrs)}} {% endfilter %} } - {% if api.attrs is exist_mutable_attribute %} -{{sig(api.name, api.inputs, api_outputs, api.attrs, inplace_map, True)}} { + {% if api.attrs is exist_mutable_attribute %} +{{sig(api.name, api.inputs, api_outputs, api.attrs, True)}} { {% filter indent(2, True) %} {{body(api.name, api.inputs, api_outputs, api.attrs, True)}} {% endfilter %} } + {% endif %} {% endif %} +{% else %}{#- render nothing -#} {% endif %} {% endfor %} diff --git a/paddle/fluid/primitive/codegen/templates/common.j2 b/paddle/fluid/primitive/codegen/templates/common.j2 index 6ac639e8ceeaef..5f7148017ab23b 100644 --- a/paddle/fluid/primitive/codegen/templates/common.j2 +++ b/paddle/fluid/primitive/codegen/templates/common.j2 @@ -1,6 +1,6 @@ -{%- macro sig(name, inputs, outputs, attrs, inplace_map, mutable_attribute_as_inputs=False, default=False) -%} +{%- macro sig(name, inputs, outputs, attrs, mutable_attribute_as_inputs=False, default=False) -%} template -{{ret(outputs, inplace_map)}} {{name}}({{params(inputs, attrs, mutable_attribute_as_inputs, default)}}) +{{ret(outputs)}} {{name}}({{params(inputs, attrs, mutable_attribute_as_inputs, default)}}) {%- endmacro %} @@ -40,9 +40,9 @@ template {%- endmacro -%} -{%- macro ret(outputs, inplace_map) -%} +{%- macro ret(outputs) -%} {%- set names = [] -%} - {%- for i in outputs -%} {%- do names.append(i.typename|to_paddle_output_type(i.name in inplace_map and i.optional)) -%} {%- endfor -%} + {%- for i in outputs -%} {%- do names.append(i.typename|to_paddle_output_type(i.optional)) -%} {%- endfor -%} {%- if names|length > 1 -%} std::tuple<{{sequence('', '', ', ', names)}}> {%- else -%} @@ -73,5 +73,9 @@ std::tuple<{{sequence('', '', ', ', names)}}> {%- macro scalar2ir(name, data_type) -%} + {%- if data_type == 
'std::vector' -%} +{{name}} + {%- else -%} {{name}}.to<{{data_type}}>() + {%- endif -%} {%- endmacro -%} diff --git a/paddle/fluid/primitive/codegen/templates/primitive/primitive.h.j2 b/paddle/fluid/primitive/codegen/templates/primitive/primitive.h.j2 index 5cf6807470f2bf..90c8d4ce5d89fa 100644 --- a/paddle/fluid/primitive/codegen/templates/primitive/primitive.h.j2 +++ b/paddle/fluid/primitive/codegen/templates/primitive/primitive.h.j2 @@ -13,18 +13,12 @@ using Tensor = paddle::Tensor; using IntArray = paddle::experimental::IntArray; {% for api in apis %} -{%- if api.is_prim and api.name in backend_white_list and api.name[-1] != '_' -%} +{%- if api.is_prim and api.name not in backend_black_list and api.name[-1] != '_' -%} {%- set input_names = [] -%} {%- for i in api.inputs -%} {%- do input_names.append(i.name) -%} {%- endfor -%} {%- set attr_names = [] -%} {%- for i in api.attrs -%} {%- do attr_names.append(i.name) -%} {% endfor %} - {% set inplace_map = {} %} - {% if 'inplace' in api and api.inplace != None %} - {% for source, target in api.inplace.items() %} - {% do inplace_map.update({source: target}) %} - {% endfor %} - {% endif %} -{{common.sig(api.name, api.inputs, api.outputs | trip_intermediate, api.attrs, inplace_map, False, True)}} { +{{common.sig(api.name, api.inputs, api.outputs | trip_intermediate, api.attrs, False, True)}} { return backend::{{api.name}}({{common.args(input_names, attr_names)}}); } diff --git a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 index 50a0c5d86fc318..02e6c58f97af63 100644 --- a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 +++ b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 @@ -120,8 +120,10 @@ details::{{api.composite.func_name}}({{api.composite.func_args}}); {%- set api_map = {} -%} {%- for api in apis -%} {%- do api_map.update({api.name: api}) -%} {%- endfor -%} {%- for api in apis %} - {%- if api.backward and api.backward in api_map and api.backward in vjp_white_list -%} + {%- if api.backward and api.backward in api_map and api.backward not in vjp_black_list -%} {%- set backward_api = api_map[api.backward] %} + {%- if backward_api is only_composite_op -%}{#- render nothing -#} + {%- else -%} {{sig(api.name, backward_api.name, backward_api.inputs, backward_api.attrs, backward_api.outputs)}} { {% filter indent(2, True) %} {{body(backward_api)}} @@ -129,6 +131,7 @@ details::{{api.composite.func_name}}({{api.composite.func_args}}); } {% endif %} + {% endif %} {% endfor %} diff --git a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.h.j2 b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.h.j2 index 7f403661fea05e..a4209fb5e81748 100644 --- a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.h.j2 +++ b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.h.j2 @@ -20,11 +20,14 @@ std::vector> {{fwd_name}}_vjp({{common.params(inputs {%- set api_map = {} -%} {%- for api in apis -%} {%- do api_map.update({api.name: api}) -%} {%- endfor -%} {% for api in apis %} - {%- if api.backward and api.backward in api_map and api.backward in vjp_white_list -%} + {%- if api.backward and api.backward in api_map and api.backward not in vjp_black_list -%} {%- set backward_api = api_map[api.backward] -%} + {%- if backward_api is only_composite_op -%}{#- render nothing -#} + {%- else -%} 
{{sig(api.name, backward_api.name, backward_api.inputs, backward_api.attrs, backward_api.outputs)}} {% endif %} + {% endif %} {% endfor %} } // namespace primitive } // namespace paddle diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h index 7ac642573ca798..e0da626ef4c938 100644 --- a/paddle/fluid/primitive/composite/composite.h +++ b/paddle/fluid/primitive/composite/composite.h @@ -14,11 +14,55 @@ #pragma once -namespace paddle { +#include "paddle/fluid/primitive/primitive/primitive.h" +#include "paddle/fluid/primitive/type/lazy_tensor.h" +#include "paddle/fluid/primitive/utils/utils.h" +namespace paddle { namespace primitive { +namespace details { + +template +Tensor mean_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { + auto org_dtype = x.dtype(); + auto x_tmp = x; + bool need_cast = org_dtype == phi::DataType::FLOAT16 || + org_dtype == phi::DataType::BFLOAT16; + if (need_cast) { + x_tmp = cast(x, phi::DataType::FLOAT32); + } + std::vector x_dim = phi::vectorize(x_tmp.dims()); + int64_t axis_size = axis.size(); + int64_t x_dim_size = x_dim.size(); + auto axis_ = std::vector(); + if (axis_size == 0) { + for (int64_t i = 0; i < x_dim_size; i++) { + axis_.push_back(i); + } + } else { + axis_ = axis.GetData(); + for (int64_t i = 0; i < axis_size; i++) { + if (axis[i] < 0) { + axis_[i] = axis[i] + x_dim_size; + } + } + } + + int64_t value = 1; + for (size_t i = 0; i < axis_.size(); i++) { + value *= x_dim[axis_[i]]; + } + auto sum_x = sum(x_tmp, IntArray(axis_), x_tmp.dtype(), keepdim); + auto res = divide( + sum_x, full(phi::vectorize(sum_x.dims()), value, sum_x.dtype())); + if (need_cast) { + return cast(res, org_dtype); + } else { + return res; + } +} -namespace experimental {} +} // namespace details } // namespace primitive } // namespace paddle diff --git a/paddle/fluid/primitive/primitive.yaml b/paddle/fluid/primitive/primitive.yaml index ccf9673bafba07..85ffc28a20d20e 100644 --- a/paddle/fluid/primitive/primitive.yaml +++ b/paddle/fluid/primitive/primitive.yaml @@ -50,3 +50,5 @@ - tanh - full - cast +- sign +- slice diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h index 4e2d7d4732b89a..5e8863027a78d1 100644 --- a/paddle/fluid/primitive/rule/vjp/details.h +++ b/paddle/fluid/primitive/rule/vjp/details.h @@ -536,6 +536,304 @@ void dropout_grad(const Tensor& mask, } } +template +void erf_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) { + if (x_grad) { + auto m_2_sqrt_pi = full(phi::vectorize(x.dims()), M_2_SQRTPI, x.dtype()); + auto neg_one = full(phi::vectorize(x.dims()), -1.0, x.dtype()); + auto neg_tmp = neg_one * x * x; + auto mul_tmp = m_2_sqrt_pi * exp(neg_tmp); + set_output(out_grad * mul_tmp, x_grad); + } +} + +template +void expand_grad(const Tensor& x, + const Tensor& out_grad, + const IntArray& shape, + Tensor* x_grad) { + if (x_grad) { + auto out_dims = phi::make_ddim(shape.GetData()); + if (out_dims != x.dims()) { + auto axes = get_reduce_dims(x.dims(), out_dims); + if (!axes.size()) { + by_pass(out_grad, x_grad); + } else { + auto reduced = out_grad.sum(phi::vectorize(axes), x.dtype(), false); + if (reduced.dims().size() != x.dims().size()) { + reduced = reshape(reduced, x.shape()); + } + set_output(reduced, x_grad); + } + } else { + by_pass(out_grad, x_grad); + } + } +} + +template +void log_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) { + if (x_grad) { + // dx = dout / x + set_output(out_grad / x, x_grad); + } +} + 
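For reference, these are the closed forms behind the composite rules defined above: `M_2_SQRTPI` is the C constant for 2/sqrt(pi), matching `erf_grad`; `log_grad` implements the reciprocal rule; and `mean_decomp` divides the axis-wise sum by the product of the reduced extents. In LaTeX, with A the set of reduced axes and d_a the size of axis a:

```latex
% Gradients implemented by erf_grad and log_grad, and the mean decomposition:
\frac{d}{dx}\,\operatorname{erf}(x) = \frac{2}{\sqrt{\pi}}\, e^{-x^{2}},
\qquad
\frac{d}{dx}\,\log x = \frac{1}{x},
\qquad
\operatorname{mean}_{A}(x) = \frac{\operatorname{sum}_{A}(x)}{\prod_{a \in A} d_{a}}
```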
+template +void exp_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) { + if (x_grad) { + if (out.dtype() == phi::DataType::FLOAT16 || + out.dtype() == phi::DataType::BFLOAT16) { + Tensor out_promote = cast(out, phi::DataType::FLOAT32); + Tensor out_grad_promote = cast(out_grad, phi::DataType::FLOAT32); + set_output(cast(out_promote * out_grad_promote, out.dtype()), + x_grad); + } else { + set_output(out_grad * out, x_grad); + } + } +} + +template +void sqrt_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) { + if (x_grad) { + // This calculation is important for resnet. + auto x_grad_tmp = (0.5 / out) * out_grad; + set_output(x_grad_tmp, x_grad); + } +} + +template +void silu_grad(const Tensor& x, + const Tensor& out, + const Tensor& out_grad, + Tensor* x_grad) { + if (x_grad) { + auto org_dtype = x.dtype(); + bool need_cast = org_dtype == phi::DataType::FLOAT16 || + org_dtype == phi::DataType::BFLOAT16; + if (need_cast) { + auto x_cast = cast(x, phi::DataType::FLOAT32); + auto out_cast = cast(out, phi::DataType::FLOAT32); + auto out_grad_cast = cast(out_grad, phi::DataType::FLOAT32); + auto sigmoid = 1.0 / (1.0 + exp(-x_cast)); + auto res = out_grad_cast * sigmoid * (1.0 + x_cast - out_cast); + set_output(cast(res, org_dtype), x_grad); + } else { + auto sigmoid = 1.0 / (1.0 + exp(-x)); + auto res = out_grad * sigmoid * (1.0 + x - out); + set_output(res, x_grad); + } + } +} + +template +void softmax_grad(const Tensor& out, + const Tensor& out_grad, + int axis, + Tensor* x_grad) { + if (x_grad) { + if (out_grad.dims().size() > 0) { + if (axis >= 0) { + auto new_out_grad = out_grad * out; + auto tmp_x_grad = new_out_grad - + out * sum(new_out_grad, {axis}, out.dtype(), true); + set_output(tmp_x_grad, x_grad); + } else { + auto new_out_grad = out_grad * out; + auto tmp_x_grad = + new_out_grad - out * sum(new_out_grad, + {out.dims().size() + axis}, + out.dtype(), + true); + set_output(tmp_x_grad, x_grad); + } + } else { + set_output(out_grad * 0.0, x_grad); + } + } +} + +template +void gather_nd_grad(const Tensor& x, + const Tensor& index, + const Tensor& out_grad, + Tensor* x_grad) { + if (x_grad) { + auto zero_tensor = full(phi::vectorize(x.dims()), 0.0, x.dtype()); + auto x_grad_tmp = scatter_nd_add(zero_tensor, index, out_grad); + set_output(x_grad_tmp, x_grad); + } +} + +template +void pad_grad(const Tensor& input, + const Tensor& out_grad, + const std::vector& paddings, + const Scalar& pad_value, + Tensor* input_grad) { + if (input_grad) { + size_t rank = input.dims().size(); + auto out_dims = out_grad.dims(); + + std::vector starts(rank, 0); + std::vector ends(rank, 0); + std::vector axes(rank, 0); + std::vector infer_flags(rank, 1); + std::vector decrease_axis({}); + for (size_t i = 0; i < rank; ++i) { + starts[i] = static_cast(paddings[2 * i]); + ends[i] = static_cast(out_dims[i] - paddings[2 * i + 1]); + axes[i] = i; + } + auto out_tmp = + slice(out_grad, axes, starts, ends, infer_flags, decrease_axis); + set_output(out_tmp, input_grad); + } +} + +template +void max_grad(const Tensor& x, + const Tensor& out, + const Tensor& out_grad, + const IntArray& axis, + bool keepdim, + bool reduce_all, + Tensor* x_grad) { + if (!x_grad) { + return; + } + auto zero_tensor = full(phi::vectorize(x.dims()), 0.0, x.dtype()); + std::vector x_dim = phi::vectorize(x.dims()); + int64_t axis_size = axis.size(); + int64_t x_dim_size = x_dim.size(); + reduce_all = false; + if (reduce_all || axis_size == 0 || axis_size == x_dim_size) { + reduce_all = true; + } else { + 
reduce_all = false; + } + auto x_grad_tmp = Tensor(); + if (x_dim_size == 0 || x_dim_size == 1 || keepdim) { + auto out_grad_tmp = out_grad.expand(IntArray(x_dim)); + auto out_tmp = out.expand(IntArray(x_dim)); + auto mask = equal(x, out_tmp); + x_grad_tmp = where(mask, out_grad_tmp, zero_tensor); + } else { + auto axis_ = std::vector(); + if (reduce_all) { + for (int64_t i = 0; i < x_dim_size; i++) { + axis_.push_back(i); + } + } else { + axis_ = axis.GetData(); + for (int64_t i = 0; i < axis_size; i++) { + if (axis[i] < 0) { + axis_[i] = axis[i] + x_dim_size; + } + } + } + auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_); + auto out_grad_ = reshape(out_grad, out_grad_shape); + auto out_ = reshape(out, out_grad_shape); + auto out_grad_tmp = out_grad_.expand(IntArray(x_dim)); + auto out_tmp = out_.expand(IntArray(x_dim)); + auto mask = equal(x, out_tmp); + x_grad_tmp = where(mask, out_grad_tmp, zero_tensor); + } + set_output(x_grad_tmp, x_grad); +} + +template +void slice_grad(const Tensor& input, + const Tensor& out_grad, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const std::vector& infer_flags, + const std::vector& decrease_axis, + Tensor* input_grad) { + if (input_grad) { + size_t rank = input.dims().size(); + auto out_dims = out_grad.dims(); + std::vector origin_out_shape; + auto in_dims = input.dims(); + + auto decrease_size = decrease_axis.size(); + if (decrease_size > 0) { + if (decrease_size == static_cast(in_dims.size())) { + // all dims decrease + out_dims = phi::make_ddim(std::vector(decrease_size, 1)); + } else { + origin_out_shape.resize(out_dims.size() + decrease_size, -1); + for (size_t i = 0; i < decrease_size; ++i) { + origin_out_shape[decrease_axis[i]] = 1; + } + + int index = 0; + for (size_t i = 0; i < origin_out_shape.size(); ++i) { + if (origin_out_shape[i] == -1) { + origin_out_shape[i] = out_dims[index]; + ++index; + } + } + out_dims = phi::make_ddim(origin_out_shape); + } + } + + std::vector offsets(rank, 0); + std::vector extents(rank, 0); + for (size_t i = 0; i < rank; ++i) { + offsets[i] = 0; + extents[i] = out_dims[i]; + } + for (size_t i = 0; i < axes.size(); ++i) { + int axis = axes[i]; + int64_t start = starts[i] < 0 ? 
(starts[i] + in_dims[axis]) : starts[i]; + start = std::max(start, static_cast(0)); + offsets[axis] = start; + } + + std::vector paddings; + for (size_t i = 0; i < rank; ++i) { + paddings.push_back(offsets[i]); + paddings.push_back((in_dims[i] - out_dims[i]) - offsets[i]); + } + if (decrease_size > 0 && + (decrease_size != static_cast(in_dims.size()))) { + auto out_tmp = + pad(reshape(out_grad, origin_out_shape), paddings, 0.0); + set_output(out_tmp, input_grad); + } else { + auto out_tmp = pad(out_grad, paddings, 0.0); + set_output(out_tmp, input_grad); + } + } +} + +template +void tile_grad(const Tensor& x, + const Tensor& out_grad, + const IntArray& repeat_times, + Tensor* x_grad) { + if (x_grad) { + auto repeat_times_data = repeat_times.GetData(); + auto out_grad_shape = phi::vectorize(out_grad.dims()); + auto result = out_grad; + for (int i = 0; i < static_cast(repeat_times_data.size()); i++) { + int size = out_grad_shape[i] / repeat_times_data[i]; + std::vector sections(repeat_times_data[i], size); + auto split_arr = split(result, IntArray(sections), i); + result = full(phi::vectorize(split_arr[0].dims()), 0.0, x.dtype()); + for (int j = 0; j < static_cast(split_arr.size()); j++) { + result = split_arr[j] + result; + } + } + result = reshape(result, x.shape()); + set_output(result, x_grad); + } +} + } // namespace details } // namespace primitive } // namespace paddle diff --git a/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc b/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc index 838b83d5d533b5..6b3b1050448ef7 100644 --- a/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc +++ b/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc @@ -55,6 +55,7 @@ std::vector> reshape_vjp( if (paddle::prim::StaticCompositeContext::Instance().IsBwdPrimEnabled() && !need_skip) { FLAGS_tensor_operants_mode = "static"; + VLOG(4) << "Call PIR Decomposed backward op reshape_grad"; paddle::Tensor* x_grad = !stop_gradients[0][0] ? 
&vjp_res[0][0] : nullptr; details::reshape_grad(xshape, out_grad, x_grad); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 46bfb0ee005a45..2dfeb89bef5c42 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -277,7 +277,9 @@ if(WITH_PYTHON) eager_legacy_op_function_generator.cc) set(GENERATOR_DEPS ${PYBIND_DEPS}) list(REMOVE_DUPLICATES GENERATOR_DEPS) - list(REMOVE_ITEM GENERATOR_DEPS python) + if(NOT WITH_ARM) + list(REMOVE_ITEM GENERATOR_DEPS python) + endif() target_link_libraries(eager_legacy_op_function_generator ${GENERATOR_DEPS}) if(NOT WIN32) add_executable(kernel_signature_generator kernel_signature_generator.cc) diff --git a/paddle/fluid/pybind/auto_parallel_py.cc b/paddle/fluid/pybind/auto_parallel_py.cc index 62b595a13f9602..09d76e33d69c1e 100644 --- a/paddle/fluid/pybind/auto_parallel_py.cc +++ b/paddle/fluid/pybind/auto_parallel_py.cc @@ -645,16 +645,20 @@ static void parse_attrs(PyObject *obj, phi::distributed::InferSpmdContext *ctx, const size_t arg_pos) { if (PyBool_Check(first_item)) { - auto attrs = CastPyArg2Booleans(obj, infer_spmd_string, arg_pos); + auto attrs = CastPyArg2Booleans( + obj, infer_spmd_string, static_cast(arg_pos)); ctx->EmplaceBackAttr(attrs); } else if (PyCheckInteger(first_item)) { - auto attrs = CastPyArg2Ints(obj, infer_spmd_string, arg_pos); + auto attrs = + CastPyArg2Ints(obj, infer_spmd_string, static_cast(arg_pos)); ctx->EmplaceBackAttr(attrs); } else if (PyLong_Check(first_item)) { - auto attrs = CastPyArg2Longs(obj, infer_spmd_string, arg_pos); + auto attrs = + CastPyArg2Longs(obj, infer_spmd_string, static_cast(arg_pos)); ctx->EmplaceBackAttr(attrs); } else if (PyFloat_Check(first_item)) { - auto attrs = CastPyArg2Floats(obj, infer_spmd_string, arg_pos); + auto attrs = + CastPyArg2Floats(obj, infer_spmd_string, static_cast(arg_pos)); ctx->EmplaceBackAttr(attrs); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -671,16 +675,20 @@ static void parse_attr(PyObject *obj, phi::distributed::InferSpmdContext *ctx, const size_t arg_pos) { if (PyBool_Check(obj)) { - auto attr = CastPyArg2Boolean(obj, infer_spmd_string, arg_pos); + auto attr = CastPyArg2Boolean( + obj, infer_spmd_string, static_cast(arg_pos)); ctx->EmplaceBackAttr(attr); } else if (PyCheckInteger(obj)) { - auto attr = CastPyArg2Int(obj, infer_spmd_string, arg_pos); + auto attr = + CastPyArg2Int(obj, infer_spmd_string, static_cast(arg_pos)); ctx->EmplaceBackAttr(attr); } else if (PyLong_Check(obj)) { - auto attr = CastPyArg2Long(obj, infer_spmd_string, arg_pos); + auto attr = + CastPyArg2Long(obj, infer_spmd_string, static_cast(arg_pos)); ctx->EmplaceBackAttr(attr); } else if (PyFloat_Check(obj)) { - auto attr = CastPyArg2Float(obj, infer_spmd_string, arg_pos); + auto attr = + CastPyArg2Float(obj, infer_spmd_string, static_cast(arg_pos)); ctx->EmplaceBackAttr(attr); } else { // TODO(ljz) support other types PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 69b2e4b3c9786d..36beb74c7c05bd 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -155,7 +155,7 @@ py::dtype DistModelTypeToNumpyDType(DistModelDataType dtype) { py::array DistModelTensorGetData(DistModelTensor& tensor) { // NOLINT py::dtype dt = DistModelTypeToNumpyDType(tensor.dtype); - return py::array(std::move(dt), {tensor.shape}, tensor.data.data()); + return py::array(dt, 
{tensor.shape}, tensor.data.data()); } void BindFleetExecutor(py::module* m) { diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index 9291338d70b656..bc18f368234c54 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -146,7 +146,7 @@ class IterableDatasetWrapper { if (tensors_[i][j]->place() == places_[read_num]) { result[read_num].emplace(slots_[j], std::move(*tensors_[i][j])); } else { - framework::TensorCopy(std::move(*tensors_[i][j]), + framework::TensorCopy(*tensors_[i][j], places_[read_num], &result[read_num][slots_[j]]); } diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 716d207d3b1960..a30f01084a060f 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -189,8 +189,8 @@ void CreateDistTensorWithNumpyValue(TensorObject* self, "CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/CustomPlace")); } - auto dist_tensor = - std::make_shared(dense_tensor, dist_attr); + auto dist_tensor = std::make_shared( + std::make_shared(dense_tensor), dist_attr); self->tensor.set_impl(dist_tensor); if (!autograd_meta->GetMutableGradNode()) { @@ -280,13 +280,13 @@ void InitDistTensorWithTensor(TensorObject* self, if (place == src.place()) { std::shared_ptr tensor = std::static_pointer_cast(src.impl()); - self->tensor.set_impl(std::make_shared(*tensor, dist_attr)); + self->tensor.set_impl(std::make_shared(tensor, dist_attr)); VLOG(4) << "Same place, do ShareDataWith for DistTensor."; } else { std::shared_ptr tensor = std::static_pointer_cast( src.copy_to(place, true).impl()); - self->tensor.set_impl(std::make_shared(*tensor, dist_attr)); + self->tensor.set_impl(std::make_shared(tensor, dist_attr)); VLOG(4) << "Different place, do TensorCopy for DistTensor."; } if (src.get_autograd_meta()) { diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 1e1f40bf8e3d41..df3e62b3bae476 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -660,7 +660,7 @@ static PyObject* eager_api_run_custom_op(PyObject* self, VLOG(7) << "Custom operator add output " << output << " to CustomOpKernelContext. 
Add vector size = " << empty_tensors.size(); - ctx.EmplaceBackOutputs(std::move(empty_tensors)); + ctx.EmplaceBackOutputs(empty_tensors); continue; } } diff --git a/paddle/fluid/pybind/eager_generator.h b/paddle/fluid/pybind/eager_generator.h index 03b8690569c226..0d495966793836 100644 --- a/paddle/fluid/pybind/eager_generator.h +++ b/paddle/fluid/pybind/eager_generator.h @@ -118,7 +118,6 @@ std::map> op_ins_map = { {"fake_quantize_dequantize_moving_average_abs_max", {"X", "InScale", "InAccum", "InState"}}, {"nll_loss", {"X", "Label", "Weight"}}, - {"smooth_l1_loss", {"X", "Y", "InsideWeight", "OutsideWeight"}}, {"bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}}, {"gather", {"X", "Index", "Axis"}}, {"repeat_interleave", {"X", "RepeatsTensor"}}, @@ -249,8 +248,6 @@ std::map> op_ins_map = { {"crop", {"X", "Y", "Offsets"}}, {"batch_norm", {"X", "Scale", "Bias", "Mean", "Variance", "MomentumTensor"}}, - {"inplace_abn", - {"X", "Scale", "Bias", "Mean", "Variance", "MomentumTensor"}}, {"linear_interp", {"X", "OutSize"}}, {"bilinear_interp", {"X", "OutSize"}}, {"trilinear_interp", {"X", "OutSize"}}, @@ -297,13 +294,6 @@ std::map> op_outs_map = { "SavedVariance", "ReserveSpace"}}, {"lstsq", {"Solution", "Residuals", "Rank", "SingularValues"}}, - {"inplace_abn", - {"Y", - "MeanOut", - "VarianceOut", - "SavedMean", - "SavedVariance", - "ReserveSpace"}}, {"fused_attention", {"LnMean", "LnVariance", "LnOut", "QKVOut", "QKVBiasOut", "TransposeOut2", @@ -485,7 +475,6 @@ std::map> op_passing_outs_map = { {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"batch_norm", {"MeanOut", "VarianceOut"}}, - {"inplace_abn", {"MeanOut", "VarianceOut"}}, {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, {"accuracy", {"Correct", "Total"}}, {"fill_constant", {"Out"}}, diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc index ecae39fb43a49f..21578110323abc 100644 --- a/paddle/fluid/pybind/eager_math_op_patch.cc +++ b/paddle/fluid/pybind/eager_math_op_patch.cc @@ -579,6 +579,11 @@ static PyObject* tensor__mul__method(TensorObject* self, } } + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor(&mesh, self_tensor, other_tensor)) { + ConvertAllInputsToDistTensor(mesh, self_tensor, other_tensor); + } + // 4. calculation VLOG(6) << "Calling multiply_ad_func in tensor__mul__method"; { diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index e72f5dc77f99cb..199d05d2c98007 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -135,15 +135,15 @@ Returns a numpy array shows the value of current Tensor. same as current Tensor. Examples: + .. code-block:: python - import paddle + >>> import paddle - data = paddle.uniform([30, 10, 32], dtype="float32", min=-1, max=1) - linear = paddle.nn.Linear(32, 64) - data = paddle.to_tensor(data) - x = linear(data) - print(x.numpy()) + >>> data = paddle.uniform([30, 10, 32], dtype="float32", min=-1, max=1) + >>> linear = paddle.nn.Linear(32, 64) + >>> data = paddle.to_tensor(data) + >>> x = linear(data) )DOC"); static PyObject* tensor_method_numpy(TensorObject* self, @@ -629,16 +629,17 @@ Reconstruct the self with other Tensor. It is a deep copy of 'self = other'. None. Examples: - .. code-block:: python - import paddle + .. 
code-block:: python - t1 = paddle.to_tensor([1.0], stop_gradient=False) - t2 = paddle.to_tensor([2.0], stop_gradient=True) + >>> import paddle - t1.reconstruct_from_(t2) + >>> t1 = paddle.to_tensor([1.0], stop_gradient=False) + >>> t2 = paddle.to_tensor([2.0], stop_gradient=True) - print(t1) + >>> t1.reconstruct_from_(t2) + >>> print(t1) + Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, [2.]) )DOC"); static PyObject* tensor_method_reconstruct_from_(TensorObject* self, @@ -706,28 +707,38 @@ Tn addition, the cloned Tensor provides gradient propagation. Tensor, The cloned Tensor. Examples: + .. code-block:: python - import paddle - - x = paddle.to_tensor(1.0, stop_gradient=False) - clone_x = x.clone() - y = clone_x**2 - y.backward() - print(clone_x.stop_gradient) # False - print(clone_x.grad) # [2.0], support gradient propagation - print(x.stop_gradient) # False - print(x.grad) # [2.0], clone_x support gradient propagation for x - - x = paddle.to_tensor(1.0) - clone_x = x.clone() - clone_x.stop_gradient = False - z = clone_x**3 - z.backward() - print(clone_x.stop_gradient) # False - print(clone_x.grad) # [3.0], support gradient propagation - print(x.stop_gradient) # True - print(x.grad) # None + >>> import paddle + + >>> x = paddle.to_tensor(1.0, stop_gradient=False) + >>> clone_x = x.clone() + >>> clone_x.retain_grads() + >>> y = clone_x**2 + >>> y.backward() + >>> print(clone_x.stop_gradient) + False + >>> print(clone_x.grad) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=False, 2.) + >>> print(x.stop_gradient) + False + >>> print(x.grad) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=False, 2.) + + >>> x = paddle.to_tensor(1.0) + >>> clone_x = x.clone() + >>> clone_x.stop_gradient = False + >>> z = clone_x**3 + >>> z.backward() + >>> print(clone_x.stop_gradient) + False + >>> print(clone_x.grad) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=False, 3.) + >>> print(x.stop_gradient) + True + >>> print(x.grad) + None )DOC"); static PyObject* tensor_method_clone(TensorObject* self, @@ -760,27 +771,32 @@ Enables this Tensor to have their grad populated during backward(). It is a no-o None. Examples: - .. code-block:: python - import paddle - - x = paddle.to_tensor([1.0, 2.0, 3.0]) - x.stop_gradient = False - y = x + x - y.retain_grads() - loss = y.sum() - loss.backward() - - print(y.grad) # [1., 1., 1.] - - x = paddle.to_tensor([1.0, 2.0, 3.0]) - x.stop_gradient = False - y = x + x - # y.retain_grads() - loss = y.sum() - loss.backward() + .. code-block:: python - print(y.grad) # None + >>> import paddle + + >>> x = paddle.to_tensor([1.0, 2.0, 3.0]) + >>> x.stop_gradient = False + >>> y = x + x + >>> y.retain_grads() + >>> loss = y.sum() + >>> loss.backward() + + >>> print(y.grad) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=False, + [1., 1., 1.]) + + >>> x = paddle.to_tensor([1.0, 2.0, 3.0]) + >>> x.stop_gradient = False + >>> y = x + x + >>> y.retain_grads() + >>> loss = y.sum() + >>> loss.backward() + + >>> print(y.grad) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=False, + [1., 1., 1.]) )DOC"); static PyObject* tensor_retain_grads(TensorObject* self, @@ -820,16 +836,26 @@ The Gradient of current Tensor will be set to ``0`` elementwise or ``None``. None. Examples: + .. 
code-block:: python - import paddle - input = paddle.uniform([10, 2]) - linear = paddle.nn.Linear(2, 3) - out = linear(input) - out.backward() - print("Before clear_gradient, linear.weight.grad: {}".format(linear.weight.grad)) - linear.weight.clear_gradient() - print("After clear_gradient, linear.weight.grad: {}".format(linear.weight.grad)) + >>> import paddle + >>> input = paddle.uniform([10, 2]) + >>> linear = paddle.nn.Linear(2, 3) + >>> out = linear(input) + >>> out.backward() + >>> print("Before clear_gradient, linear.weight.grad: {}".format(linear.weight.grad)) + >>> # doctest: +SKIP("Random output") + Before clear_gradient, linear.weight.grad: Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=False, + [[-0.03178465, -0.03178465, -0.03178465], + [-0.98546225, -0.98546225, -0.98546225]]) + >>> # doctest: -SKIP + >>> linear.weight.clear_gradient() + >>> print("After clear_gradient, linear.weight.grad: {}".format(linear.weight.grad)) + After clear_gradient, linear.weight.grad: Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=False, + [[0., 0., 0.], + [0., 0., 0.]]) + )DOC"); static PyObject* tensor_clear_gradient(TensorObject* self, @@ -844,7 +870,7 @@ static PyObject* tensor_clear_gradient(TensorObject* self, set_to_zero = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0); } - paddle::Tensor* grad; + paddle::Tensor* grad = nullptr; bool is_leaf = egr::EagerUtils::IsLeafTensor(self->tensor); if (is_leaf) { grad = egr::EagerUtils::mutable_grad(self->tensor); @@ -1037,33 +1063,41 @@ In addition, the detached Tensor doesn't provide gradient propagation. Tensor, The detached Tensor. Examples: + .. code-block:: python - import paddle - - x = paddle.to_tensor([1.0], stop_gradient=False) - detach_x = x.detach() - detach_x[0] = 10.0 - print(x) # Tensor(shape=[1], dtype=float32, place=CPUPlace, stop_gradient=False, - # [10.]) - y = x**2 - y.backward() - print(x.grad) # [20.0] - print(detach_x.grad) # None, 'stop_gradient=True' by default - - detach_x.stop_gradient = False # Set stop_gradient to be False, supported auto-grad - z = detach_x**3 - z.backward() - - print(x.grad) # [20.0], detach_x is detached from x's graph, not affect each other - print(detach_x.grad) # [300.0], detach_x has its own graph - - # Due to sharing of data with origin Tensor, There are some unsafe operations: - # y = 2 * x - # detach_x[:] = 5.0 - # y.backward() - # It will raise Error: - # one of the variables needed for gradient computation has been modified by an inplace operation. 
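This hunk swaps the comment-style detach example above for the doctest-style lines that follow: each statement gets a >>> prompt and the expected output is written literally beneath it, so the docs build can execute the example. For reference, a minimal self-contained C++ sketch of how such a docstring is declared in this file; the name tensor_method_example__doc__ and the example body are hypothetical, not lines from this patch:

    #include <Python.h>

    // Hypothetical docstring in the style these hunks produce: a raw-string
    // literal holds a Sphinx code-block whose lines carry ">>>" prompts, so
    // the same text renders in the docs and can be run as a doctest.
    PyDoc_STRVAR(tensor_method_example__doc__,
                 R"DOC(
    Returns a detached copy of this Tensor.

    Examples:

        .. code-block:: python

            >>> import paddle
            >>> x = paddle.to_tensor([1.0], stop_gradient=False)
            >>> detach_x = x.detach()
            >>> print(detach_x.stop_gradient)
            True
    )DOC");
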
+ >>> import paddle + + >>> x = paddle.to_tensor([1.0], stop_gradient=False) + >>> detach_x = x.detach() + >>> detach_x[0] = 10.0 + >>> print(x) + Tensor(shape=[1], dtype=float32, place=CPUPlace, stop_gradient=False, [10.]) + + >>> y = x**2 + >>> y.backward() + >>> print(x.grad) + Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=False, [20.]) + + >>> print(detach_x.grad) # None, 'stop_gradient=True' by default + None + + >>> detach_x.stop_gradient = False # Set stop_gradient to be False, supported auto-grad + >>> z = detach_x**3 + >>> z.backward() + + >>> print(x.grad) + Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=False, [20.]) + + >>> print(detach_x.grad) + Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=False, [300.]) + + >>> # Due to sharing of data with origin Tensor, There are some unsafe operations: + >>> # y = 2 * x + >>> # detach_x[:] = 5.0 + >>> # y.backward() + >>> # It will raise Error: + >>> # one of the variables needed for gradient computation has been modified by an inplace operation. )DOC"); static PyObject* tensor_method_detach(TensorObject* self, @@ -1132,13 +1166,19 @@ Returns the underline tensor in the origin Tensor. Underline tensor. Examples: + .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([1.0], stop_gradient=False) - underline_x = x.get_tensor() - print(underline_x) # a Dense Tensor info + >>> x = paddle.to_tensor([1.0], stop_gradient=False) + >>> underline_x = x.get_tensor() + >>> print(underline_x) + - place: Place(cpu) + - shape: [1] + - layout: NCHW + - dtype: float32 + - data: [1] )DOC"); static PyObject* tensor_method_get_underline_tensor(TensorObject* self, @@ -1729,7 +1769,7 @@ static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY - int64_t hook_id; + int64_t hook_id = 0; if (egr::EagerUtils::IsLeafTensor(self->tensor)) { VLOG(6) << "Register hook for leaf tensor: " << self->tensor.name(); @@ -2022,16 +2062,17 @@ Returns the total number of non zero elements in input SparseCooTensor/SparseCsr int Examples: + .. code-block:: python - import paddle + >>> import paddle - indices = [[0, 1, 2], [1, 2, 0]] - values = [1.0, 2.0, 3.0] - dense_shape = [3, 3] - coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) - coo.nnz() - # 3 + >>> indices = [[0, 1, 2], [1, 2, 0]] + >>> values = [1.0, 2.0, 3.0] + >>> dense_shape = [3, 3] + >>> coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) + >>> coo.nnz() + 3 )DOC"); @@ -2069,18 +2110,19 @@ Returns the indices of non zero elements in input SparseCooTensor. DenseTesnor Examples: + .. code-block:: python - import paddle + >>> import paddle - indices = [[0, 1, 2], [1, 2, 0]] - values = [1.0, 2.0, 3.0] - dense_shape = [3, 3] - coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) - coo.indices() - # Tensor(shape=[2, 3], dtype=int64, place=Place(gpu:0), stop_gradient=True, - # [[0, 1, 2], - # [1, 2, 0]]) + >>> indices = [[0, 1, 2], [1, 2, 0]] + >>> values = [1.0, 2.0, 3.0] + >>> dense_shape = [3, 3] + >>> coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) + >>> coo.indices() + Tensor(shape=[2, 3], dtype=int64, place=Place(gpu:0), stop_gradient=True, + [[0, 1, 2], + [1, 2, 0]]) )DOC"); @@ -2112,17 +2154,18 @@ Returns the values of non zero elements in input SparseCooTensor. DenseTesnor Examples: + .. 
code-block:: python - import paddle + >>> import paddle - indices = [[0, 1, 2], [1, 2, 0]] - values = [1.0, 2.0, 3.0] - dense_shape = [3, 3] - coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) - coo.values() - # Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [1., 2., 3.]) + >>> indices = [[0, 1, 2], [1, 2, 0]] + >>> values = [1.0, 2.0, 3.0] + >>> dense_shape = [3, 3] + >>> coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) + >>> coo.values() + Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + [1., 2., 3.]) )DOC"); @@ -2164,18 +2207,19 @@ Returns the compressed row index of non zero elements in input SparseCsrTensor. DenseTesnor Examples: + .. code-block:: python - import paddle + >>> import paddle - crows = [0, 2, 3, 5] - cols = [1, 3, 2, 0, 1] - values = [1, 2, 3, 4, 5] - dense_shape = [3, 4] - csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) - csr.crows() - # Tensor(shape=[4], dtype=int64, place=Place(gpu:0), stop_gradient=True, - # [0, 2, 3, 5]) + >>> crows = [0, 2, 3, 5] + >>> cols = [1, 3, 2, 0, 1] + >>> values = [1, 2, 3, 4, 5] + >>> dense_shape = [3, 4] + >>> csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) + >>> csr.crows() + Tensor(shape=[4], dtype=int64, place=Place(gpu:0), stop_gradient=True, + [0, 2, 3, 5]) )DOC"); @@ -2207,18 +2251,19 @@ Returns the column index of non zero elements in input SparseCsrTensor. DenseTesnor Examples: + .. code-block:: python - import paddle + >>> import paddle - crows = [0, 2, 3, 5] - cols = [1, 3, 2, 0, 1] - values = [1, 2, 3, 4, 5] - dense_shape = [3, 4] - csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) - csr.cols() - # Tensor(shape=[5], dtype=int64, place=Place(gpu:0), stop_gradient=True, - # [1, 3, 2, 0, 1]) + >>> crows = [0, 2, 3, 5] + >>> cols = [1, 3, 2, 0, 1] + >>> values = [1, 2, 3, 4, 5] + >>> dense_shape = [3, 4] + >>> csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) + >>> csr.cols() + Tensor(shape=[5], dtype=int64, place=Place(gpu:0), stop_gradient=True, + [1, 3, 2, 0, 1]) )DOC"); @@ -2246,12 +2291,14 @@ Whether the Tensor is a Dense Tensor. Whether the Tensor is a Dense Tensor. Examples: + .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([1.0], stop_gradient=False) - print(x.is_dense()) + >>> x = paddle.to_tensor([1.0], stop_gradient=False) + >>> print(x.is_dense()) + True )DOC"); static PyObject* tensor_method_is_dense(TensorObject* self, @@ -2274,12 +2321,14 @@ Whether the Tensor is a Distributed Tensor. Whether the Tensor is a Distributed Tensor. Examples: + .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([1.0], stop_gradient=False) - print(x.is_dist()) # False + >>> x = paddle.to_tensor([1.0], stop_gradient=False) + >>> print(x.is_dist()) + False )DOC"); static PyObject* tensor_method_is_dist(TensorObject* self, @@ -2305,16 +2354,17 @@ When input is SparseCooTensor/SparseCsrTensor, will return True. When input is D bool Examples: + .. 
code-block:: python - import paddle + >>> import paddle - indices = [[0, 1, 2], [1, 2, 0]] - values = [1.0, 2.0, 3.0] - dense_shape = [3, 3] - coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) - coo.is_sparse() - # True + >>> indices = [[0, 1, 2], [1, 2, 0]] + >>> values = [1.0, 2.0, 3.0] + >>> dense_shape = [3, 3] + >>> coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) + >>> coo.is_sparse() + True )DOC"); static PyObject* tensor_method_is_sparse(TensorObject* self, @@ -2341,16 +2391,17 @@ When input is SparseCooTensor, will return True. When input is DenseTensor/Spars bool Examples: + .. code-block:: python - import paddle + >>> import paddle - indices = [[0, 1, 2], [1, 2, 0]] - values = [1.0, 2.0, 3.0] - dense_shape = [3, 3] - coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) - coo.is_sparse_coo() - # True + >>> indices = [[0, 1, 2], [1, 2, 0]] + >>> values = [1.0, 2.0, 3.0] + >>> dense_shape = [3, 3] + >>> coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) + >>> coo.is_sparse_coo() + True )DOC"); @@ -2377,17 +2428,18 @@ When input is SparseCsrTensor, will return True. When input is DenseTensor/Spars bool Examples: + .. code-block:: python - import paddle + >>> import paddle - crows = [0, 2, 3, 5] - cols = [1, 3, 2, 0, 1] - values = [1, 2, 3, 4, 5] - dense_shape = [3, 4] - csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) - csr.is_sparse_csr() - # True + >>> crows = [0, 2, 3, 5] + >>> cols = [1, 3, 2, 0, 1] + >>> values = [1, 2, 3, 4, 5] + >>> dense_shape = [3, 4] + >>> csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) + >>> csr.is_sparse_csr() + True )DOC"); @@ -2417,19 +2469,20 @@ When input is SparseCooTensor, will convert `COO` to `CSR` . When input is Dense SparseCsrTensor Examples: + .. code-block:: python - import paddle + >>> import paddle - indices = [[0, 1, 2], [1, 2, 0]] - values = [1.0, 2.0, 3.0] - dense_shape = [3, 3] - coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) - coo.to_sparse_csr() - # Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 1, 2, 3], - # cols=[1, 2, 0], - # values=[1., 2., 3.]) + >>> indices = [[0, 1, 2], [1, 2, 0]] + >>> values = [1.0, 2.0, 3.0] + >>> dense_shape = [3, 3] + >>> coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) + >>> coo.to_sparse_csr() + Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + crows=[0, 1, 2, 3], + cols=[1, 2, 0], + values=[1., 2., 3.]) )DOC"); @@ -2466,17 +2519,17 @@ Any two type Tensor among DenseTensor/SparseCooTensor/SparseCsrTensor are suppor .. code-block:: python - import paddle + >>> import paddle - x = paddle.rand([2, 3, 8]) - y = paddle.rand([2, 3, 8]) - y = y.to_sparse_csr() - z = paddle.rand([2, 5]) + >>> x = paddle.rand([2, 3, 8]) + >>> y = paddle.rand([2, 3, 8]) + >>> y = y.to_sparse_csr() + >>> z = paddle.rand([2, 5]) - x.is_same_shape(y) - # True - x.is_same_shape(z) - # False + >>> x.is_same_shape(y) + True + >>> x.is_same_shape(z) + False )DOC"); @@ -2509,24 +2562,30 @@ Returns the size in bytes of an element in the Tensor. int, The size in bytes of an element in the Tensor. Examples: + .. 
code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor(1, dtype='bool') - x.element_size() # 1 + >>> x = paddle.to_tensor(1, dtype='bool') + >>> x.element_size() + 1 - x = paddle.to_tensor(1, dtype='float16') - x.element_size() # 2 + >>> x = paddle.to_tensor(1, dtype='float16') + >>> x.element_size() + 2 - x = paddle.to_tensor(1, dtype='float32') - x.element_size() # 4 + >>> x = paddle.to_tensor(1, dtype='float32') + >>> x.element_size() + 4 - x = paddle.to_tensor(1, dtype='float64') - x.element_size() # 8 + >>> x = paddle.to_tensor(1, dtype='float64') + >>> x.element_size() + 8 - x = paddle.to_tensor(1, dtype='complex128') - x.element_size() # 16 + >>> x = paddle.to_tensor(1, dtype='complex128') + >>> x.element_size() + 16 )DOC"); static PyObject* tensor_method_element_size(TensorObject* self, @@ -2753,12 +2812,16 @@ Returns the address of the first element of current Tensor. int, The address of the first element of current Tensor. Examples: + .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([1, 2, 3]) - print(x.data_ptr()) + >>> x = paddle.to_tensor([1, 2, 3]) + >>> print(x.data_ptr()) + >>> # doctest: +SKIP('return the address') + 93220864 + >>> # doctest: -SKIP )DOC"); static PyObject* tensor_data_ptr(TensorObject* self, @@ -2800,13 +2863,15 @@ Returns the strides of current Tensor. List, the strides of current Tensor. Examples: + .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([1, 2, 3]) - y = x[1] - print(y.get_strides()) + >>> x = paddle.to_tensor([1, 2, 3]) + >>> y = x[1] + >>> print(y.get_strides()) + [] )DOC"); static PyObject* tensor_method_strides(TensorObject* self, @@ -2838,14 +2903,16 @@ If self tensor is already contiguous, this function returns the current Tensor. Tensor, The contiguous Tensor. Examples: + .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([1, 2, 3]) - y = x[1] - y = y.contiguous() - print(y) + >>> x = paddle.to_tensor([1, 2, 3]) + >>> y = x[1] + >>> y = y.contiguous() + >>> print(y) + Tensor(shape=[], dtype=int64, place=Place(cpu), stop_gradient=True, 2) )DOC"); static PyObject* tensor_contiguous(TensorObject* self, @@ -2883,13 +2950,14 @@ Whether the Tensor is contiguous. Bool, Whether the Tensor is contiguous. Examples: + .. 
code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([1, 2, 3]) - y = x[1] - print(y.is_contiguous()) + >>> x = paddle.to_tensor([1, 2, 3]) + >>> y = x[1] + >>> print(y.is_contiguous()) )DOC"); static PyObject* tensor_is_contiguous(TensorObject* self, PyObject* args, diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 7d70ed174a4c81..b6a0ed95377559 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -237,8 +237,8 @@ float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos) { std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos) { if (PyObject_CheckStr(obj)) { - Py_ssize_t size; - const char* data; + Py_ssize_t size = 0; + const char* data = nullptr; data = PyUnicode_AsUTF8AndSize(obj, &size); return std::string(data, static_cast(size)); } else { @@ -1583,8 +1583,6 @@ std::vector CastPyArg2VectorOfValue(PyObject* obj, ->tp_name)); // NOLINT } } - } else if (PyObject_TypeCheck(obj, g_ir_opresult_pytype)) { - return {::pybind11::handle(obj).cast()}; } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " @@ -1842,7 +1840,7 @@ paddle::Tensor PyTensorHook::operator()(const paddle::Tensor& var) { res = PyObject_CallFunctionObjArgs(py_func_, p_tmp_var, nullptr); Py_DECREF(p_tmp_var); } catch (platform::EnforceNotMet& e) { - throw std::move(e); + throw e; } catch (std::exception& e) { PADDLE_THROW(platform::errors::Unavailable( "Hook function of Tensor raises an exception: %s.", e.what())); @@ -1869,7 +1867,7 @@ void PyVoidHook::operator()() { try { PyObject_CallFunctionObjArgs(py_func_, nullptr); } catch (platform::EnforceNotMet& e) { - throw std::move(e); + throw e; } catch (std::exception& e) { PADDLE_THROW(platform::errors::Unavailable( "Hook function of Tensor raises an exception: %s.", e.what())); @@ -2079,9 +2077,9 @@ void DistTensorConverter::convert(Tensor* x) { phi::distributed::TensorDistAttr dist_attr( phi::vectorize(x->impl()->dims())); dist_attr.set_process_mesh(*mesh); - auto dense_t = static_cast(x->impl().get()); + auto dense_t = std::static_pointer_cast(x->impl()); x->set_impl( - std::make_shared(*dense_t, dist_attr)); + std::make_shared(dense_t, dist_attr)); } } diff --git a/paddle/fluid/pybind/eval_frame.c b/paddle/fluid/pybind/eval_frame.c index a07f2033e4b4e8..5b4f216be24dc7 100644 --- a/paddle/fluid/pybind/eval_frame.c +++ b/paddle/fluid/pybind/eval_frame.c @@ -184,8 +184,13 @@ int Internal_PyFrame_FastToLocalsWithError(_PyInterpreterFrame *frame) { if (lasti < 0 && _Py_OPCODE(_PyCode_CODE(co)[0]) == COPY_FREE_VARS) { /* Free vars have not been initialized -- Do that */ PyCodeObject *co = frame->f_code; +#if PY_VERSION_HEX >= 0x030c0000 + PyObject *closure = ((PyFunctionObject *)frame->f_funcobj)->func_closure; + int offset = co->co_nlocals + co->co_ncellvars; +#else PyObject *closure = frame->f_func->func_closure; int offset = co->co_nlocals + co->co_nplaincellvars; +#endif for (int i = 0; i < co->co_nfreevars; ++i) { PyObject *o = PyTuple_GET_ITEM(closure, i); Py_INCREF(o); @@ -269,6 +274,8 @@ PyFrameObject *Internal_PyFrame_New_NoTrack(PyCodeObject *code) { return f; } +#if PY_VERSION_HEX < 0x030c0000 + PyFrameObject *Internal_PyFrame_MakeAndSetFrameObject( _PyInterpreterFrame *frame) { assert(frame->frame_obj == NULL); @@ -387,6 +394,8 @@ void Internal_PyFrame_Clear(_PyInterpreterFrame *frame) { Py_DECREF(frame->f_code); } +#endif + #else typedef PyFrameObject FrameObject; #endif @@ -449,9 +458,11 @@ inline static 
PyObject *eval_custom_code_py311_plus(PyThreadState *tstate, // Create a new function object from code object. Refer to MAKE_FUNCTION. PyFunctionObject *func = (PyFunctionObject *)PyFunction_New((PyObject *)code, frame->f_globals); +#if PY_VERSION_HEX < 0x030c0000 Py_XINCREF(frame->f_func->func_closure); func->func_closure = frame->f_func->func_closure; _PyFrame_InitializeSpecials(shadow, func, NULL, code->co_nlocalsplus); +#endif PyObject **fastlocals_old = frame->localsplus; PyObject **fastlocals_new = shadow->localsplus; @@ -483,7 +494,9 @@ inline static PyObject *eval_custom_code_py311_plus(PyThreadState *tstate, } PyObject *result = eval_frame_default(tstate, shadow, throw_flag); +#if PY_VERSION_HEX < 0x030c0000 Internal_PyFrame_Clear(shadow); +#endif free(shadow); Py_DECREF(func); Py_DECREF(namemap); @@ -558,7 +571,11 @@ static PyObject *_custom_eval_frame(PyThreadState *tstate, // original frame. So we pass a PyInterpreterFrame to // _PyFrame_FastToLocalsWithError directly. But this is an internal API, so we // copy many code from CPython project into our project. +#if PY_VERSION_HEX >= 0x030c0000 + if (true) { +#else if (Internal_PyFrame_FastToLocalsWithError(frame) < 0) { +#endif #else if (PyFrame_FastToLocalsWithError(frame) < 0) { #endif @@ -605,7 +622,7 @@ static PyObject *_custom_eval_frame(PyThreadState *tstate, PyCodeObject *code = (PyCodeObject *)PyObject_GetAttrString(result, "code"); PyObject *disable_eval_frame = PyObject_GetAttrString(result, "disable_eval_frame"); - PyObject *out; + PyObject *out = NULL; // VLOG(7) << "Start eval new frame and code."; if (disable_eval_frame != Py_True) { // Re-enable custom behavior diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 80e6de07919611..55efda46c86b07 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -108,7 +108,7 @@ class PyVariableWrapperHook : public imperative::VariableWrapperHook { res = PyObject_CallFunctionObjArgs( py_func_, py::cast(tmp_varbase).ptr(), nullptr); } catch (platform::EnforceNotMet &e) { - throw std::move(e); + throw e; } catch (std::exception &e) { PADDLE_THROW(platform::errors::Unavailable( "Hook function of Tensor raises an exception: %s.", e.what())); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index bd569f328b1158..019b5098feb75f 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -97,6 +97,7 @@ using paddle::PaddlePredictor; using paddle::PaddleTensor; using paddle::PassStrategy; using paddle::ZeroCopyTensor; +using paddle_infer::experimental::InternalUtils; namespace { void BindPaddleDType(py::module *m); @@ -116,6 +117,7 @@ void BindPaddlePassBuilder(py::module *m); void BindPaddleInferPredictor(py::module *m); void BindPaddleInferTensor(py::module *m); void BindPredictorPool(py::module *m); +void BindInternalUtils(py::module *m); #ifdef PADDLE_WITH_DNNL void BindMkldnnQuantizerConfig(py::module *m); @@ -204,7 +206,7 @@ py::dtype PaddleDTypeToNumpyDType(PaddleDType dtype) { py::array PaddleTensorGetData(PaddleTensor &tensor) { // NOLINT py::dtype dt = PaddleDTypeToNumpyDType(tensor.dtype); - return py::array(std::move(dt), {tensor.shape}, tensor.data.data()); + return py::array(dt, {tensor.shape}, tensor.data.data()); } template @@ -212,7 +214,7 @@ void ZeroCopyTensorCreate(ZeroCopyTensor &tensor, // NOLINT py::array_t data) { std::vector shape; std::copy_n(data.shape(), data.ndim(), std::back_inserter(shape)); - 
tensor.Reshape(std::move(shape)); + tensor.Reshape(shape); tensor.copy_from_cpu(static_cast(data.data())); } @@ -233,7 +235,7 @@ void PaddleInferTensorCreate(paddle_infer::Tensor &tensor, // NOLINT py::array_t data) { std::vector shape; std::copy_n(data.shape(), data.ndim(), std::back_inserter(shape)); - tensor.Reshape(std::move(shape)); + tensor.Reshape(shape); tensor.CopyFromCpu(static_cast(data.data())); } @@ -509,6 +511,7 @@ void BindInferenceApi(py::module *m) { BindPaddleInferTensor(m); BindPaddlePassBuilder(m); BindPredictorPool(m); + BindInternalUtils(m); #ifdef PADDLE_WITH_DNNL BindMkldnnQuantizerConfig(m); #endif @@ -1262,8 +1265,8 @@ void BindPaddlePassBuilder(py::module *m) { .def("set_passes", [](PaddlePassBuilder &self, const std::vector &passes) { self.ClearPasses(); - for (auto pass : passes) { - self.AppendPass(std::move(pass)); + for (auto const &pass : passes) { + self.AppendPass(pass); } }) .def("append_pass", &PaddlePassBuilder::AppendPass) @@ -1304,6 +1307,24 @@ void BindPaddlePassBuilder(py::module *m) { .def("enable_mkldnn_quantizer", &GpuPassStrategy::EnableMkldnnQuantizer) .def("enable_mkldnn_bfloat16", &GpuPassStrategy::EnableMkldnnBfloat16); } + +void BindInternalUtils(py::module *m) { + py::class_ internal_utils(*m, "InternalUtils"); + internal_utils + .def_static("set_transformer_posid", + [](paddle_infer::Config &config, std::string tensor_name) { + InternalUtils::SetTransformerPosid(&config, tensor_name); + }) + .def_static("set_transformer_maskid", + [](paddle_infer::Config &config, std::string tensor_name) { + InternalUtils::SetTransformerMaskid(&config, tensor_name); + }) + .def_static("disable_tensorrt_half_ops", + [](paddle_infer::Config &config, + const std::unordered_set &ops) { + InternalUtils::DisableTensorRtHalfOps(&config, ops); + }); +} } // namespace } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index d28dc9bec40088..489b25f35867c8 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -289,8 +289,8 @@ std::string CastPyArg2String(PyObject* obj, const std::string& op_type, ssize_t arg_pos) { if (PyObject_CheckString(obj)) { - Py_ssize_t size; - const char* data; + Py_ssize_t size = 0; + const char* data = nullptr; data = PyUnicode_AsUTF8AndSize(obj, &size); return std::string(data, (size_t)size); // NOLINT } else { @@ -696,8 +696,8 @@ std::vector CastPyArg2Strings(PyObject* obj, for (Py_ssize_t i = 0; i < len; i++) { item = PyList_GetItem(obj, i); if (PyObject_CheckString(item)) { - Py_ssize_t size; - const char* data; + Py_ssize_t size = 0; + const char* data = nullptr; data = PyUnicode_AsUTF8AndSize(item, &size); value.emplace_back(std::string(data, (size_t)size)); // NOLINT } else { @@ -716,8 +716,8 @@ std::vector CastPyArg2Strings(PyObject* obj, for (Py_ssize_t i = 0; i < len; i++) { item = PyTuple_GetItem(obj, i); if (PyObject_CheckString(item)) { - Py_ssize_t size; - const char* data; + Py_ssize_t size = 0; + const char* data = nullptr; data = PyUnicode_AsUTF8AndSize(item, &size); value.emplace_back(std::string(data, (size_t)size)); // NOLINT } else { @@ -896,8 +896,8 @@ void ConstructAttrMapFromPyArgs( PyObject* obj = nullptr; for (ssize_t arg_pos = attr_start; arg_pos < attr_end; arg_pos += 2) { VLOG(1) << "Start Process " << arg_pos; - Py_ssize_t key_len; - const char* key_ptr; + Py_ssize_t key_len = 0; + const char* key_ptr = nullptr; obj = PyTuple_GET_ITEM(args, arg_pos); if 
(PyObject_CheckString(obj)) { key_ptr = PyUnicode_AsUTF8AndSize(obj, &key_len); @@ -988,8 +988,8 @@ void ConstructAttrMapForRunProgram( PyObject* obj = nullptr; for (ssize_t arg_pos = attr_start; arg_pos < attr_end; arg_pos += 2) { VLOG(1) << "Start Process " << arg_pos; - Py_ssize_t key_len; - const char* key_ptr; + Py_ssize_t key_len = 0; + const char* key_ptr = nullptr; obj = PyTuple_GET_ITEM(args, arg_pos); if (PyObject_CheckString(obj)) { key_ptr = PyUnicode_AsUTF8AndSize(obj, &key_len); diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc index 9ba115381a2c00..5b8d169d91f746 100644 --- a/paddle/fluid/pybind/parallel_executor.cc +++ b/paddle/fluid/pybind/parallel_executor.cc @@ -264,21 +264,21 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - data = static.data(name="x", shape=[None, 1], dtype="float32") - hidden = static.nn.fc(data, size=10) - loss = paddle.mean(hidden) - paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) + >>> data = static.data(name="x", shape=[None, 1], dtype="float32") + >>> hidden = static.nn.fc(data, size=10) + >>> loss = paddle.mean(hidden) + >>> paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) - build_strategy = static.BuildStrategy() - build_strategy.enable_inplace = True - build_strategy.memory_optimize = True - build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce - program = static.CompiledProgram(static.default_main_program(), build_strategy=build_strategy) + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.enable_inplace = True + >>> build_strategy.memory_optimize = True + >>> build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce + >>> program = static.CompiledProgram(static.default_main_program(), build_strategy=build_strategy) )DOC"); py::enum_(build_strategy, "ReduceStrategy") @@ -316,14 +316,14 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce - )DOC") + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce + )DOC") .def_property( "gradient_scale_strategy", [](const BuildStrategy &self) { return self.gradient_scale_; }, @@ -345,38 +345,38 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. 
code-block:: python - import numpy - import paddle - import paddle.static as static - - paddle.enable_static() - - use_cuda = True - place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() - exe = static.Executor(place) - - data = static.data(name='X', shape=[None, 1], dtype='float32') - hidden = static.nn.fc(data, size=10) - loss = paddle.mean(hidden) - paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) - - exe.run(static.default_startup_program()) - - build_strategy = static.BuildStrategy() - build_strategy.gradient_scale_strategy = \ - static.BuildStrategy.GradientScaleStrategy.Customized - compiled_prog = static.CompiledProgram( - static.default_main_program(), - build_strategy=build_strategy, - ) - - x = numpy.random.random(size=(10, 1)).astype('float32') - loss_grad = numpy.ones((1)).astype("float32") * 0.01 - loss_grad_name = loss.name+"@GRAD" - loss_data = exe.run(compiled_prog, - feed={"X": x, loss_grad_name : loss_grad}, - fetch_list=[loss.name, loss_grad_name]) - )DOC") + >>> import numpy + >>> import paddle + >>> import paddle.static as static + + >>> paddle.enable_static() + + >>> use_cuda = paddle.device.is_compiled_with_cuda() + >>> place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + >>> exe = static.Executor(place) + + >>> data = static.data(name='X', shape=[None, 1], dtype='float32') + >>> hidden = static.nn.fc(data, size=10) + >>> loss = paddle.mean(hidden) + >>> paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) + + >>> exe.run(static.default_startup_program()) + + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.gradient_scale_strategy = \ + ... static.BuildStrategy.GradientScaleStrategy.Customized + >>> compiled_prog = static.CompiledProgram( + ... static.default_main_program(), + ... build_strategy=build_strategy, + ... ) + + >>> x = numpy.random.random(size=(10, 1)).astype('float32') + >>> loss_grad = numpy.ones((1)).astype("float32") * 0.01 + >>> loss_grad_name = loss.name+"@GRAD" + >>> loss_data = exe.run(compiled_prog, + ... feed={"X": x, loss_grad_name : loss_grad}, + ... fetch_list=[loss.name, loss_grad_name]) + )DOC") .def_property( "debug_graphviz_path", [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, @@ -395,14 +395,14 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.debug_graphviz_path = "./graph" - )DOC") + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.debug_graphviz_path = "./graph" + )DOC") .def_property( "enable_sequential_execution", [](const BuildStrategy &self) { @@ -422,13 +422,13 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.enable_sequential_execution = True + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.enable_sequential_execution = True )DOC") .def_property( "remove_unnecessary_lock", @@ -449,13 +449,13 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. 
code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.remove_unnecessary_lock = True + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.remove_unnecessary_lock = True )DOC") .def_property( "num_trainers", @@ -525,16 +525,14 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Default False. Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() + .. code-block:: python - build_strategy = static.BuildStrategy() - build_strategy.build_cinn_pass = True - )DOC") + >>> import paddle + >>> import paddle.static as static + >>> paddle.enable_static() + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.build_cinn_pass = True + )DOC") .def_property( "fuse_elewise_add_act_ops", [](const BuildStrategy &self) { @@ -555,14 +553,14 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.fuse_elewise_add_act_ops = True - )DOC") + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.fuse_elewise_add_act_ops = True + )DOC") .def_property( "fuse_gemm_epilogue", [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; }, @@ -581,14 +579,14 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.fuse_gemm_epilogue = True - )DOC") + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.fuse_gemm_epilogue = True + )DOC") .def_property( "fuse_adamw", [](const BuildStrategy &self) { return self.fuse_adamw_; }, @@ -605,12 +603,13 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT it may make the execution faster. Default is False. Examples: .. code-block:: python - import paddle - import paddle.static as static - paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.fuse_adamw = True - )DOC") + + >>> import paddle + >>> import paddle.static as static + >>> paddle.enable_static() + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.fuse_adamw = True + )DOC") .def_property( "fused_attention", [](const BuildStrategy &self) { return self.fused_attention_; }, @@ -629,14 +628,14 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.fused_attention = True - )DOC") + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.fused_attention = True + )DOC") .def_property( "fused_feedforward", [](const BuildStrategy &self) { return self.fused_feedforward_; }, @@ -655,14 +654,14 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. 
code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.fused_feedforward = True - )DOC") + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.fused_feedforward = True + )DOC") .def_property( "sequential_run", [](const BuildStrategy &self) { return self.sequential_run_; }, @@ -680,14 +679,14 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.sequential_run = True - )DOC") + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.sequential_run = True + )DOC") .def_property( "fuse_bn_act_ops", [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; }, @@ -706,14 +705,14 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.fuse_bn_act_ops = True - )DOC") + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.fuse_bn_act_ops = True + )DOC") .def_property( "fuse_bn_add_act_ops", [](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; }, @@ -732,14 +731,14 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.fuse_bn_add_act_ops = True - )DOC") + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.fuse_bn_add_act_ops = True + )DOC") .def_property( "enable_auto_fusion", [](const BuildStrategy &self) { return self.enable_auto_fusion_; }, @@ -759,14 +758,14 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.enable_auto_fusion = True - )DOC") + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.enable_auto_fusion = True + )DOC") .def_property( "fuse_relu_depthwise_conv", [](const BuildStrategy &self) { @@ -789,13 +788,13 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.fuse_relu_depthwise_conv = True + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.fuse_relu_depthwise_conv = True )DOC") .def_property( "fuse_broadcast_ops", @@ -819,16 +818,15 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT for NCCLReduce operations for a period of time. Default False. Examples: - .. code-block:: python - - import paddle - import paddle.static as static + .. 
code-block:: python - paddle.enable_static() + >>> import paddle + >>> import paddle.static as static + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.fuse_broadcast_ops = True - )DOC") + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.fuse_broadcast_ops = True + )DOC") .def_property( "fuse_all_optimizer_ops", [](const BuildStrategy &self) { @@ -864,14 +862,14 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.sync_batch_norm = True - )DOC") + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.sync_batch_norm = True + )DOC") .def_property( "memory_optimize", [](const BuildStrategy &self) -> py::object { @@ -904,15 +902,15 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Examples: .. code-block:: python - import paddle - import paddle.static as static + >>> import paddle + >>> import paddle.static as static - paddle.enable_static() + >>> paddle.enable_static() - build_strategy = static.BuildStrategy() - build_strategy.memory_optimize = True + >>> build_strategy = static.BuildStrategy() + >>> build_strategy.memory_optimize = True - )DOC") + )DOC") .def_property( "is_distribution", [](const BuildStrategy &self) { return self.is_distribution_; }, diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index f32fe6f592218d..25f6936ab1c386 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -25,6 +25,7 @@ #include "paddle/pir/core/builtin_op.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/ir_adaptor/translator/program_translator.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" #include "paddle/fluid/ir_adaptor/translator/utils.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" @@ -46,6 +47,7 @@ #include "paddle/pir/pass/pass_manager.h" #include "paddle/pir/pass/pass_registry.h" #include "paddle/pir/transforms/dead_code_elimination_pass.h" +#include "paddle/utils/flags.h" #include "pybind11/stl.h" namespace py = pybind11; @@ -63,8 +65,10 @@ using pir::Type; using pir::Value; using pybind11::return_value_policy; -USE_PASS(dead_code_elimination); -USE_PASS(inplace); +USE_PASS(dead_code_elimination_pass); +USE_PASS(inplace_pass); + +PHI_DECLARE_bool(print_ir); namespace paddle { namespace pybind { @@ -230,6 +234,24 @@ void BindBlock(py::module *m) { None )DOC") + .def( + "move_op", + [](Block &self, Operation *op, uint32_t offset) { + Block::Iterator position = self.begin(); + std::advance(position, offset); + op->MoveTo(&self, position); + }, + R"DOC( + Move an op to a specific position (block.begin() + offset). + + Args: + op (pir.Operation): the operator to be moved. 
+ offset (uint32_t) : offset relative to the begin of the block + + Returns: + None + + )DOC") .def("all_parameters", [](Block &self) -> py::list { py::list param_list; for (auto iter = self.begin(); iter != self.end(); iter++) { @@ -340,6 +362,38 @@ void BindOperation(py::module *m) { }); } +py::str Value2String(const Value &self) { + std::ostringstream print_stream; + print_stream << "Value("; + print_stream << GetValueInfo(self); + print_stream << ")"; + return print_stream.str(); +} + +phi::DataType GetValueDtype(const Value &value) { + if (value.type().isa()) { + return paddle::dialect::TransToPhiDataType( + value.type().dyn_cast().dtype()); + } else if (value.type().isa()) { + return paddle::dialect::TransToPhiDataType( + value.type().dyn_cast().dtype()); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Currently, we can only get phi::DataType from DenseTensorType and " + "SelectedRowsType.")); + } +} + +phi::DDim GetValueDims(const Value &value) { + if (value.type().isa()) { + return value.type().dyn_cast().dims(); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Currently, we can only get shape for dense " + "tensor.")); + } +} + void BindValue(py::module *m) { py::class_ value(*m, "Value", R"DOC( Value class represents the SSA value in the IR system. It is a directed edge @@ -363,6 +417,10 @@ void BindValue(py::module *m) { .def("first_use", &Value::first_use, return_value_policy::reference) .def("has_one_use", &Value::HasOneUse) .def("use_empty", &Value::use_empty) + .def("replace_all_uses_with", + [](Value &self, Value &op_value) { + self.ReplaceAllUsesWith(op_value); + }) .def("__eq__", &Value::operator==) .def("__eq__", [](Value &self, OpResult &other) { @@ -370,13 +428,22 @@ void BindValue(py::module *m) { }) .def("__hash__", [](const Value &self) { return std::hash{}(self); }) - .def("__str__", [](const Value &self) -> py::str { - std::ostringstream print_stream; - print_stream << "Value("; - print_stream << GetValueInfo(self); - print_stream << ")"; - return print_stream.str(); - }); + .def("__str__", &Value2String) + .def("__repr__", &Value2String) + .def_property( + "shape", + [](Value &self) { return phi::vectorize(GetValueDims(self)); }, + [](Value &self, const std::vector &shape) { + PADDLE_THROW(phi::errors::InvalidArgument( + "can't set shape when building static graph")); + }) + .def_property( + "dtype", + [](Value &self) { return GetValueDtype(self); }, + [](Value &self, phi::DataType dtype) { + PADDLE_THROW(phi::errors::InvalidArgument( + "can't set dtype when building static graph")); + }); } void BindOpOperand(py::module *m) { @@ -457,6 +524,16 @@ phi::DataType GetOpResultDtype(const OpResult &result) { } } +const phi::DDim &GetOpResultDims(const OpResult &result) { + if (result.type().isa()) { + return result.type().dyn_cast().dims(); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Currently, we can only get shape for dense " + "tensor.")); + } +} + #define OVERRIDE_OPERATOR(operator, api, other_type) \ op_result.def(#operator, [](OpResult &self, other_type other) { \ return paddle::dialect::api(self, other); \ @@ -610,6 +687,12 @@ void BindOpResult(py::module *m) { return false; } }) + .def("numel", + [](OpResult &self) { return phi::product(GetOpResultDims(self)); }) + .def("replace_all_uses_with", + [](OpResult &self, OpResult &op_result) { + self.ReplaceAllUsesWith(op_result); + }) .def_property( "stop_gradient", [](OpResult &self) { @@ -638,16 +721,7 @@ void BindOpResult(py::module *m) { }) .def_property( "shape", - 
[](OpResult &self) { - if (self.type().isa()) { - return phi::vectorize( - self.type().dyn_cast().dims()); - } else { - PADDLE_THROW(phi::errors::InvalidArgument( - "Currently, we can only get shape for dense " - "tensor.")); - } - }, + [](OpResult &self) { return phi::vectorize(GetOpResultDims(self)); }, [](OpResult &self, const std::vector &shape) { PADDLE_THROW(phi::errors::InvalidArgument( "can't set shape when building static graph")); @@ -1038,12 +1112,15 @@ SplitedResult ForwardBackwardSplit( VLOG(4) << "forward_value_map.size() is " << forward_value_map.size(); VLOG(4) << "backward_value_map.size() is " << backward_value_map.size(); - std::ostringstream print_stream; - print_stream << "ForwardProgram is :\n"; - forward_program->Print(print_stream); - print_stream << "BackwardProgram is:\n"; - backward_program->Print(print_stream); - VLOG(4) << "Splited Program (fwd | bwd): \n" << print_stream.str(); + if (FLAGS_print_ir) { + std::ostringstream print_stream; + print_stream << "ForwardProgram is :\n"; + forward_program->Print(print_stream); + print_stream << "BackwardProgram is:\n"; + backward_program->Print(print_stream); + std::cout << "Splited Program (fwd | bwd): \n" + << print_stream.str() << std::endl; + } // construct all attributes we needed. @@ -1138,7 +1215,7 @@ void BindUtils(pybind11::module *m) { y_s = paddle.matmul(x_s, x_s) z_s = paddle.add(y_s, y_s) k_s = paddle.tanh(z_s) - newir_program = ir.translate_to_new_ir(main_program.desc) + newir_program = pir.translate_to_new_ir(main_program.desc) print(newir_program) @@ -1158,6 +1235,53 @@ void BindUtils(pybind11::module *m) { Returns: list[str] : List of unregistered operators in paddle dialect, the name is expressed by origin op name. )DOC"); + m->def( + "translate_to_new_ir_with_param_map", + [](const framework::ProgramDesc &legacy_program) { + auto ir_ctx = pir::IrContext::Instance(); + auto program = std::make_shared(ir_ctx); + translator::ProgramTranslator program_translator(&legacy_program, + program.get()); + program_translator.Translate(); + return std::make_pair(program, program_translator.VarDesc2Value()); + }, + R"DOC( + Convert Fluid Program to New IR Program and get the mappings of VarDesc -> pir::Value. + + Args: + + legacy_program (ProgramDesc): The Fluid Program that will be converted. + + Returns: + Program: The New IR Program + dict[str, pir::Value]: Mapping between VarDesc(by name) and pir::Value. + + Raises: + PreconditionNotMet: If legacy_program has multi block will raise error. + + Examples: + .. 
code-block:: python + + import paddle + from paddle import pir + paddle.enable_static() + + x = paddle.randn([4, 4]) + main_program, start_program = ( + paddle.static.Program(), + paddle.static.Program(), + ) + with paddle.static.program_guard(main_program, start_program): + x_s = paddle.static.data('x', [4, 4], x.dtype) + x_s.stop_gradient = False + y_s = paddle.matmul(x_s, x_s) + z_s = paddle.add(y_s, y_s) + k_s = paddle.tanh(z_s) + newir_program, mappings = pir.translate_to_new_ir_with_param_map(main_program.desc) + + print(newir_program) + print(mappings) + )DOC"); } void BindIrPass(pybind11::module *m) { diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index 1b0101a85537ad..1c4315e8ee1851 100644 --- a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -459,6 +459,7 @@ void BindPlace(pybind11::module &m) { // NOLINT py::enum_(m, "XPUVersion", py::arithmetic()) .value("XPU1", phi::backends::xpu::XPUVersion::XPU1) .value("XPU2", phi::backends::xpu::XPUVersion::XPU2) + .value("XPU3", phi::backends::xpu::XPUVersion::XPU3) .export_values(); m.def("get_xpu_device_count", platform::GetXPUDeviceCount); m.def("get_xpu_device_version", diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 19e813cc25e7a7..dcae0104f35598 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -195,6 +195,7 @@ limitations under the License. */ #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/fluid/imperative/layout_autotune.h" +#include "paddle/fluid/pir/dialect/operator/interface/decomp.h" #include "paddle/fluid/pir/dialect/operator/interface/vjp.h" #include "paddle/fluid/pir/dialect/operator/trait/custom_vjp.h" #include "paddle/fluid/prim/utils/eager/eager_tensor_operants.h" @@ -766,6 +767,42 @@ void BindVjp(pybind11::module *m) { out (bool): True means that the op has custom vjp rules, False means it does not. )DOC"); } + +void BindDecomp(pybind11::module *m) { + m->def("call_decomp", [](pir::Operation &fwd_op) { + py::list res; + paddle::dialect::DecompInterface decomp_interface = + fwd_op.dyn_cast(); + PADDLE_ENFORCE( + decomp_interface, + phi::errors::InvalidArgument( + "The decomp function is not registered in %s op ", fwd_op.name())); + std::vector> decomp_res = + decomp_interface.Decomp(&fwd_op); + for (size_t i = 0; i < decomp_res.size(); ++i) { + py::list sub_res; + for (size_t j = 0; j < decomp_res[i].size(); ++j) { + if (!decomp_res[i][j]) { + sub_res.append(nullptr); + } else { + sub_res.append(decomp_res[i][j]); + } + } + res.append(sub_res); + } + return res; + }); + + m->def("has_decomp", [](pir::Operation &fwd_op) { + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::OpInfo fwd_op_info = ctx->GetRegisteredOpInfo(fwd_op.name()); + auto decomp_interface_impl = + fwd_op_info.GetInterfaceImpl(); + if (decomp_interface_impl == nullptr) return false; + return true; + }); +} + PYBIND11_MODULE(libpaddle, m) { BindImperative(&m); BindEager(&m); @@ -852,7 +889,7 @@ PYBIND11_MODULE(libpaddle, m) { m.def("clear_gradients", [](std::vector> param_list, bool set_to_zero) { - for (auto param : param_list) { + for (auto const ¶m : param_list) { param->ClearGradient(set_to_zero); } }); @@ -2940,6 +2977,7 @@ All parameter, weight, gradient are variables in Paddle. 
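The clear_gradients hunk just above, like the set_passes hunk in inference_api.cc earlier in this patch, switches a range-for from iterating by value to iterating by const reference. A short self-contained C++ sketch of why that matters when the elements are std::shared_ptr; the names here are hypothetical:

    #include <memory>
    #include <vector>

    // Sketch only (hypothetical names): iterating by const reference reuses
    // the stored shared_ptr; iterating by value would copy it on every step,
    // incrementing and decrementing the atomic refcount once per element.
    void ClearAll(const std::vector<std::shared_ptr<int>>& params) {
      for (auto const& param : params) {  // was: for (auto param : params)
        if (param) {
          *param = 0;
        }
      }
    }
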
BindPIR(&m); BindVjp(&m); + BindDecomp(&m); } } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 5b6efa9e1dba9b..7205333bb688c7 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -1038,7 +1038,13 @@ void BindTensor(pybind11::module &m) { // NOLINT [](DistTensor &self) { return self.value(); }, py::return_value_policy::reference) .def("numel", - [](DistTensor &self) -> int64_t { return self.value().numel(); }); + [](DistTensor &self) -> int64_t { return self.value().numel(); }) + .def("_share_data_with", [](DistTensor &self, const DistTensor &src) { + self.unsafe_set_dims(src.dims()); + self.unsafe_set_dist_attr(src.dist_attr()); + self.unsafe_mutable_value()->ShareDataWith(src.value()); + return self; + }); #endif py::class_(m, "SelectedRows") diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index 3098145b801c72..71257dc588dac1 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -566,11 +566,7 @@ std::vector SetKernelDistOutput( if (tmp) { // TODO(GhostScreaming): now all dist case are nullptr if (tmp->impl() == nullptr) { - phi::DenseTensor dense_t; - // TODO(GhostScreaming): polish code, dist_attr is null now - phi::distributed::TensorDistAttr dist_attr; - auto dist_t = - std::make_shared(dense_t, dist_attr); + auto dist_t = std::make_shared(); tmp->set_impl(dist_t); } result.emplace_back( @@ -587,11 +583,7 @@ std::vector SetKernelDistOutput( out->reserve(out_size); std::vector results(out_size); for (size_t i = 0; i < out_size; ++i) { - phi::DenseTensor dense_t; - // TODO(GhostScreaming): polish code, dist_attr is null now - phi::distributed::TensorDistAttr dist_attr; - auto dist_t = - std::make_shared(dense_t, dist_attr); + auto dist_t = std::make_shared(); results[i] = dist_t.get(); out->emplace_back(); out->back().set_impl(dist_t); diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 8c9a57f264db42..8ba76b64f5f7af 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -211,7 +211,7 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, // But the embarrassment is that this solution this solution makes training // slower. 
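The data_transform.cc hunk that continues below applies the same hardening seen earlier for hook_id, size, data, and out: a local that is only assigned inside branches now starts from a sentinel value. A minimal C++ sketch of the pattern, with hypothetical names:

    // Sketch with hypothetical names. Leaving the pointer uninitialized is
    // well-defined only while every path assigns it before use; starting
    // from nullptr keeps each path well-defined (and -Wmaybe-uninitialized
    // quiet) even if a branch is later added or reordered.
    struct DeviceContext {};

    DeviceContext* PickContext(bool on_device,
                               DeviceContext* device_ctx,
                               DeviceContext* cpu_ctx) {
      DeviceContext* ctx = nullptr;  // was: DeviceContext* ctx;
      if (on_device) {
        ctx = device_ctx;
      } else {
        ctx = cpu_ctx;
      }
      return ctx;
    }
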
phi::DenseTensor out; - phi::DeviceContext* dev_ctx; + phi::DeviceContext* dev_ctx = nullptr; if (dst_place.GetType() != AllocationType::CPU) { dev_ctx = pool.Get(dst_place); } else { @@ -652,18 +652,9 @@ ReshardApiInputToReplicatedKernelInput( if (ReshardIsNeeded(dist_tensor->dist_attr(), dist_attr)) { VLOG(6) << "ApiIn to Replicated KernelIn - " << ReshardDebugInfo(*dist_tensor, dist_attr); - if (dist_tensor->initialized()) { - auto* func = phi::distributed::ChooseProperReshardFunction(*dist_tensor, - dist_attr); - return func->Eval(dev_ctx, *dist_tensor, dist_attr); - } else { - // when no tensor data need to be reshard, we still need to set correct - // replicated dist attr and local dims for output - dist_tensor->unsafe_set_dist_attr(dist_attr); - auto dense_tensor_meta = dist_tensor->value().meta(); - dense_tensor_meta.dims = dist_tensor->dims(); - dist_tensor->unsafe_mutable_value()->set_meta(dense_tensor_meta); - } + auto* func = phi::distributed::ChooseProperReshardFunction(*dist_tensor, + dist_attr); + return func->Eval(dev_ctx, *dist_tensor, dist_attr); } return std::static_pointer_cast(tensor_in); } @@ -794,7 +785,8 @@ PrepareDataForDistTensor(const std::vector& input, // change(NCHW->NHWC), so the new DistTensor's meta maybe not unified. VLOG(6) << "PrepareDataForDistTensor return transformed dist tensor"; out.push_back(std::make_shared( - trans_in_tensor, dist_tensor->dist_attr())); + std::make_shared(trans_in_tensor), + dist_tensor->dist_attr())); } } else { out.push_back(nullptr); diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index a69afdfdfe2d8c..4984974b338ef1 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -11,9 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/phi/api/lib/kernel_dispatch.h" - +#include #ifdef _MSC_VER #include #endif @@ -68,6 +67,7 @@ BackendSet GetTensorBackendSet(const phi::TensorBase& t) { #endif phi::Backend backend_key = phi::TransToPhiBackend(t.place()); BackendSet backend_set(backend_key); + VLOG(10) << "update BackendSet by tensor: add [" << backend_key << "]"; if (backend_key == Backend::GPU && phi::DenseTensor::classof(&t) && static_cast(t).meta().use_gpudnn) { backend_set = backend_set | BackendSet(Backend::GPUDNN); diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index 847c2a7d14756e..7bd3524ed795c3 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -14,10 +14,10 @@ limitations under the License. 
*/ #pragma once +#include #include #include #include - #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/backend_set.h" #include "paddle/phi/api/lib/data_type_set.h" @@ -99,11 +99,13 @@ struct KernelKeyParser : ArgsIterator { inline void AssignKernelKeySet(const phi::TensorBase& tensor) { // assign Backend BackendSet tensor_backend_set = detail::GetTensorBackendSet(tensor); + VLOG(8) << "Get BackendSet from tensor"; key_set.backend_set = key_set.backend_set | tensor_backend_set; // tensor's attribute use_gpudnn=False, explicitly disable gpudnn kernel if (tensor_backend_set == BackendSet(Backend::GPU) || disable_gpudnn) { disable_gpudnn = true; key_set.backend_set = key_set.backend_set - BackendSet(Backend::GPUDNN); + VLOG(8) << "Disable kernel backend: GPUDNN"; } // assign DataLayout phi::DataLayout tensor_layout = tensor.layout(); @@ -115,6 +117,7 @@ struct KernelKeyParser : ArgsIterator { auto promote_result = PromoteTypes(dtype_set); if (promote_result != DataType::UNDEFINED) { key_set.dtype = promote_result; + VLOG(8) << "promote kernel DataType:" << promote_result; } } diff --git a/paddle/phi/api/lib/op_meta_info.cc b/paddle/phi/api/lib/op_meta_info.cc index 5d0e2f139c2137..da8b9125a71ddd 100644 --- a/paddle/phi/api/lib/op_meta_info.cc +++ b/paddle/phi/api/lib/op_meta_info.cc @@ -121,7 +121,7 @@ void CustomOpKernelContext::EmplaceBackAttr(paddle::any attr) { void CustomOpKernelContext::EmplaceBackAttrs( const std::vector& attrs) { - attrs_ = std::move(attrs); + attrs_ = attrs; } const Tensor& CustomOpKernelContext::InputAt(size_t idx) const { diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 1a57a578c78972..f50347fd6678aa 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -95,7 +95,7 @@ Tensor::Tensor(const Place &place, const std::vector &shape) { Tensor::Tensor(std::shared_ptr tensor_impl, const std::string &name) - : impl_(std::move(tensor_impl)), name_(std::move(name)) {} + : impl_(std::move(tensor_impl)), name_(name) {} /* Part 2: Dimension, DataType and DataLayout methods */ diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 7be497318443a7..5e39b764fa96d7 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -1817,6 +1817,7 @@ infer_meta : func : UnchangedInferMeta param : [out] + spmd_rule : ElementwiseUnaryGradInferSpmd kernel : func : relu_grad backward: relu_double_grad @@ -2234,6 +2235,7 @@ infer_meta : func : UnchangedInferMeta param : [x] + spmd_rule : ElementwiseUnaryGradInferSpmd kernel : func : square_grad backward : square_double_grad diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml index 9f19dcf31d728c..b54307861b3674 100644 --- a/paddle/phi/api/yaml/fused_ops.yaml +++ b/paddle/phi/api/yaml/fused_ops.yaml @@ -145,6 +145,25 @@ backward : fused_dropout_add_grad support_dygraph_mode : true +- op : fused_embedding_eltwise_layernorm + args : (Tensor[] ids, Tensor[] embs, Tensor bias, Tensor scale, float epsilon = 0.00001f) + output : Tensor(out) + infer_meta : + func : FusedEmbeddingEltWiseLayerNormInferMeta + kernel : + func : fused_embedding_eltwise_layernorm + data_type : embs + +- op : fused_fc_elementwise_layernorm + args : (Tensor x, Tensor w, Tensor y, Tensor bias0, Tensor scale, Tensor bias1, int x_num_col_dims = 1, str activation_type = "", float epsilon = 0.00001f, int begin_norm_axis = 1) + output : Tensor(out), Tensor(mean), Tensor(variance) + infer_meta : + func : 
FusedFCElementwiseLayerNormInferMeta + kernel : + func : fused_fc_elementwise_layernorm + data_type : x + optional : bias0, scale, bias1, mean, variance + - op : fused_linear_param_grad_add args : (Tensor x, Tensor dout, Tensor dweight, Tensor dbias, bool multi_precision = true, bool has_bias = true) output : Tensor(dweight_out), Tensor(dbias_out) @@ -188,6 +207,15 @@ func : fused_scale_bias_relu_conv_bnstats data_type : x +- op : fusion_transpose_flatten_concat + args : (Tensor[] x, int[] trans_axis, int flatten_axis, int concat_axis) + output : Tensor(out) + infer_meta : + func : FusionTransposeFlattenConcatInferMeta + kernel : + func : fusion_transpose_flatten_concat + data_type : x + - op : generate_sequence_xpu args : (Tensor x, DataType dtype) output : Tensor diff --git a/paddle/phi/api/yaml/generator/api_gen.py b/paddle/phi/api/yaml/generator/api_gen.py index fcfcd179227593..7a71555c1156fd 100644 --- a/paddle/phi/api/yaml/generator/api_gen.py +++ b/paddle/phi/api/yaml/generator/api_gen.py @@ -379,6 +379,7 @@ def source_include(header_file_path): #ifdef PADDLE_WITH_DISTRIBUTE #include "paddle/phi/infermeta/spmd_rules/rules.h" +#include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h" #endif PD_DECLARE_bool(conv2d_disable_cudnn); diff --git a/paddle/phi/api/yaml/generator/dist_api_gen.py b/paddle/phi/api/yaml/generator/dist_api_gen.py index 00189d880e67fb..2bf886ab7fa5ef 100644 --- a/paddle/phi/api/yaml/generator/dist_api_gen.py +++ b/paddle/phi/api/yaml/generator/dist_api_gen.py @@ -13,6 +13,7 @@ # limitations under the License. import argparse +import re import yaml from api_base import PREFIX_TENSOR_NAME @@ -43,23 +44,38 @@ # TODO(chenweihang): add view support later MAIN_DIST_BRANCH_TEMPLATE = """ // Auto Parallel condition - if ({}) {{ + if (use_dist_branch) {{ // 1. InferSpmd (Infer DistAttr of Inputs&Outputs){} // 2. Create API Output & Prepare Dist and Dense Output{} // 3. Infer DistTensor's Global Shape{}\n - // 4. Select Kernel{} - // 5. Reshard Input{}\n - // 6. PrepareData (DataTransform & Prepare Dense Input){} - // 7. Infer Local DenseTensor Meta{} - // 8. DenseTensor Kernel Call{} - // 9. Reshard Partial Output to Replicated (Temporary){}\n + if (!computation_clip_for_pp){{ + // 4. Select Kernel{} + // 5. Reshard Input{}\n + // 6. PrepareData (DataTransform & Prepare Dense Input){} + // 7. Infer Local DenseTensor Meta{} + // 8. DenseTensor Kernel Call{} + // 9. Reshard Partial Output to Replicated (Temporary){}\n + }} // 10. Return {} }} """ +# TODO(GhostScreaming): Support no-input operators. +# 1. Non computation rank clip +GET_MESH_TEMPLATE = """ + auto mesh = std::static_pointer_cast({}impl())->dist_attr().process_mesh(); + computation_clip_for_pp = !phi::distributed::IsCurRankInMesh(mesh);""" + # Auto Parallel condition -AUTO_PARALLEL_COND_TEMPLATE = """AllInputsAreDistTensor({})""" +AUTO_PARALLEL_COND_TEMPLATE = """ + bool use_dist_branch = AllInputsAreDistTensor({input_args}); + bool computation_clip_for_pp = false; + if (use_dist_branch) {{{mesh} + }} + if (!computation_clip_for_pp) {{{kernel_code} + }} +""" # 1. 
InferSPMD SINGLE_DIST_META_IN_TEMPLATE = """ @@ -84,24 +100,49 @@ SINGLE_OUT_CREATION_TEMPLATE_NO_SPMD = """ auto dist_out = SetKernelDistOutput(&api_output); auto dense_out = dist_out->unsafe_mutable_value(); + if (computation_clip_for_pp) {{ + *dense_out = phi::DenseTensor( + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} """ MULTI_SINGLE_OUT_CREATION_TEMPLATE_NO_SPMD = """ auto dist_out_{idx} = SetKernelDistOutput({out}); auto dense_out_{idx} = dist_out_{idx}->unsafe_mutable_value(); + if (computation_clip_for_pp) {{ + *dense_out_{idx} = phi::DenseTensor( + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} """ SINGLE_OUT_CREATION_TEMPLATE = """ auto dist_out = SetKernelDistOutput(&api_output, spmd_info.second[0]); auto dense_out = dist_out->unsafe_mutable_value(); + if (computation_clip_for_pp) {{ + *dense_out = phi::DenseTensor( + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} """ MULTI_SINGLE_OUT_CREATION_TEMPLATE = """ auto dist_out_{idx} = SetKernelDistOutput({out}, spmd_info.second[{idx}]); auto dense_out_{idx} = dist_out_{idx}->unsafe_mutable_value(); + if (computation_clip_for_pp) {{ + *dense_out_{idx} = phi::DenseTensor( + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} """ VECTOR_OUT_CREATION_TEMPLATE = """ auto dist_out = SetKernelDistOutput({}, &api_output); std::vector dense_out(dist_out.size()); for (size_t i = 0; i < dist_out.size(); ++i) {{ - dense_out[i] = const_cast(&dist_out[i]->value()); + dense_out[i] = const_cast(&dist_out[i]->value()); + if (computation_clip_for_pp) {{ + *dense_out[i] = phi::DenseTensor( + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} }} """ MULTI_VECTOR_OUT_CREATION_TEMPLATE = """ @@ -109,6 +150,11 @@ std::vector dense_out_{out_name}(dist_out_{out_name}.size()); for (size_t i = 0; i < dist_out_{out_name}.size(); ++i) {{ dense_out_{out_name}[i] = const_cast(&dist_out_{out_name}[i]->value()); + if (computation_clip_for_pp) {{ + *dense_out_{out_name}[i] = phi::DenseTensor( + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} }} """ MULTI_VECTOR_INPLACE_AND_OPTIONAL_OUT_CREATION_TEMPLATE = """ @@ -171,57 +217,63 @@ # Dist Branch will not generated in the API that doesn't have input tensor. SET_SINGLE_OUT_REPLICATED_DIST_ATTR = """ SetReplicatedDistAttrForOutput({}, spmd_info.first[0].process_mesh());""" +SET_VECTOR_OUT_REPLICATED_DIST_ATTR = """ + auto current_process_mesh = spmd_info.first[0].process_mesh(); + for (size_t i = 0; i < dist_out.size(); ++i) {{ + SetReplicatedDistAttrForOutput(dist_out[i], current_process_mesh); + }} +""" # 4. Select Kernel KERNEL_SELECTION_TEMPLATE = """ - VLOG(6) << "{} API dist branch: kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; - auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "{}", {{kernel_backend, kernel_layout, kernel_data_type}}); - const auto& kernel = kernel_result.kernel; - VLOG(6) << "{} kernel: " << kernel; - auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? 
Backend::CPU : kernel_backend); + VLOG(6) << "{} API dist branch: kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; + auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "{}", {{kernel_backend, kernel_layout, kernel_data_type}}); + const auto& kernel = kernel_result.kernel; + VLOG(6) << "{} kernel: " << kernel; + auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend); """ # 5. Reshard Input SINGLE_INPUT_RESHARD_TEMPLATE = """ - auto dist_input_{arg} = ReshardApiInputToKernelInput(dev_ctx, {arg}, spmd_info.first[{idx}]);""" + auto dist_input_{arg} = ReshardApiInputToKernelInput(dev_ctx, {arg}, spmd_info.first[{idx}]);""" SINGLE_GENERAL_INPUT_RESHARD_TEMPLATE = """ - auto dist_input_{arg} = ReshardApiInputToReplicatedKernelInput(dev_ctx, {arg}, spmd_info.first[{idx}]);""" + auto dist_input_{arg} = ReshardApiInputToReplicatedKernelInput(dev_ctx, {arg}, spmd_info.first[{idx}]);""" UNSUPPORTED_RESHARD_INPUT_COMMENT_TEMPLATE = """ - // API `{}` does not need to support ReshardInput at this time + // API `{}` does not need to support ReshardInput at this time """ # 6. PrepareData SINGLE_PREPARE_DATA_TEMPLATE = """ - dist_input_{arg} = PrepareDataForDistTensor(dist_input_{arg}, GetKernelInputArgDef(kernel.InputAt({idx}), kernel_backend), {flag}, kernel_result.is_stride_kernel); - auto input_{arg} = &dist_input_{arg}->value(); + dist_input_{arg} = PrepareDataForDistTensor(dist_input_{arg}, GetKernelInputArgDef(kernel.InputAt({idx}), kernel_backend), {flag}, kernel_result.is_stride_kernel); + auto input_{arg} = &dist_input_{arg}->value(); """ SINGLE_PREPARE_DATA_TEMPLATE_NO_RESHARD = """ - auto dist_input_{arg} = PrepareDataForDistTensor({arg}, GetKernelInputArgDef(kernel.InputAt({idx}), kernel_backend), {flag}, kernel_result.is_stride_kernel); - auto input_{arg} = &dist_input_{arg}->value(); + auto dist_input_{arg} = PrepareDataForDistTensor({arg}, GetKernelInputArgDef(kernel.InputAt({idx}), kernel_backend), {flag}, kernel_result.is_stride_kernel); + auto input_{arg} = &dist_input_{arg}->value(); """ VECTOR_PREPARE_DATA_TEMPLATE = """ - auto dist_input_{name}_vec = PrepareDataForDistTensor({name}, GetKernelInputArgDef(kernel.InputAt({index}), kernel_backend), {trans_flag}, kernel_result.is_stride_kernel); - std::vector dense_input_{name}_vec; - for (auto tmp : dist_input_{name}_vec) {{ - dense_input_{name}_vec.emplace_back(&tmp->value()); - }} - std::vector dense_input_{name}_meta_vec = MakeMetaTensor(dense_input_{name}_vec); - std::vector dense_input_{name}_meta_ptr_vec(dense_input_{name}_meta_vec.size()); - for (size_t i = 0; i < dense_input_{name}_meta_ptr_vec.size(); ++i) {{ - dense_input_{name}_meta_ptr_vec[i] = &dense_input_{name}_meta_vec[i]; - }} + auto dist_input_{name}_vec = PrepareDataForDistTensor({name}, GetKernelInputArgDef(kernel.InputAt({index}), kernel_backend), {trans_flag}, kernel_result.is_stride_kernel); + std::vector dense_input_{name}_vec; + for (auto tmp : dist_input_{name}_vec) {{ + dense_input_{name}_vec.emplace_back(&tmp->value()); + }} + std::vector dense_input_{name}_meta_vec = MakeMetaTensor(dense_input_{name}_vec); + std::vector dense_input_{name}_meta_ptr_vec(dense_input_{name}_meta_vec.size()); + for (size_t i = 0; i < dense_input_{name}_meta_ptr_vec.size(); ++i) {{ + dense_input_{name}_meta_ptr_vec[i] = &dense_input_{name}_meta_vec[i]; + }} """ OPTIONAL_SINGLE_PREPARE_DATA_TEMPLATE = """ - auto dist_input_{name} = 
PrepareDataForDistTensor({name}, GetKernelInputArgDef(kernel.InputAt({index}), kernel_backend), {trans_flag}, kernel_result.is_stride_kernel); - paddle::optional input_{name} = dist_input_{name} ? paddle::make_optional(dist_input_{name}->value()) : paddle::none; + auto dist_input_{name} = PrepareDataForDistTensor({name}, GetKernelInputArgDef(kernel.InputAt({index}), kernel_backend), {trans_flag}, kernel_result.is_stride_kernel); + paddle::optional input_{name} = dist_input_{name} ? paddle::make_optional(dist_input_{name}->value()) : paddle::none; """ OPTIONAL_VECTOR_PREPARE_DATA_TEMPLATE = """ - auto dist_input_{name}_vec = PrepareDataForDistTensor({name}, GetKernelInputArgDef(kernel.InputAt({index}), kernel_backend), {trans_flag}, kernel_result.is_stride_kernel); - std::vector dense_input_{name}_vec; - if ({name}) {{ - for (auto tmp : *dist_input_{name}_vec) {{ - dense_input_{name}_vec.emplace_back(&tmp->value()); + auto dist_input_{name}_vec = PrepareDataForDistTensor({name}, GetKernelInputArgDef(kernel.InputAt({index}), kernel_backend), {trans_flag}, kernel_result.is_stride_kernel); + std::vector dense_input_{name}_vec; + if ({name}) {{ + for (auto tmp : *dist_input_{name}_vec) {{ + dense_input_{name}_vec.emplace_back(&tmp->value()); }} }} paddle::optional> input_{name}(dense_input_{name}_vec); @@ -251,16 +303,16 @@ OPTIONAL_SINGLE_META_IN_TEMPLATE = """MakeMetaTensor(input_{}), """ OPTIONAL_VECTOR_META_IN_TEMPLATE = """dense_input_{}_meta_ptr_vec, """ SINGLE_META_OUT_DECL_TEMPLATE = """ - phi::MetaTensor meta_{}({});""" + phi::MetaTensor meta_{}({});""" VECTOR_META_OUT_DECL_TEMPLATE = """ - std::vector {name}_meta_vec = MakeMetaTensor({name}); - std::vector {name}_meta_ptr_vec({name}_meta_vec.size()); - for (size_t i = 0; i < {name}_meta_vec.size(); ++i) {{ - {name}_meta_ptr_vec[i] = &{name}_meta_vec[i]; - }} + std::vector {name}_meta_vec = MakeMetaTensor({name}); + std::vector {name}_meta_ptr_vec({name}_meta_vec.size()); + for (size_t i = 0; i < {name}_meta_vec.size(); ++i) {{ + {name}_meta_ptr_vec[i] = &{name}_meta_vec[i]; + }} """ INFER_META_TEMPLATE = """ - phi::{}({}{}); + phi::{}({}{}); """ # 8. DenseTensor Kernel Call @@ -272,10 +324,11 @@ TUPLE_OUTPUT_NAME_TEMPLATE = """ """ KERNEL_CALL_TEMPLATE = """ - using kernel_signature = {}; - auto* kernel_fn = kernel.GetVariadicKernelFn(); - (*kernel_fn)({}, {}); + using kernel_signature = {}; + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)({}, {}); """ + # TODO(GhostScreaming): Some operators generate shape info in runtime, # bincount. As a result, dist_output's global shape is set uncorrectly, # because it's generated in InferMeta function. A temporally solution is @@ -297,11 +350,11 @@ # 9. Reshard Partial Output to Replicated RESHARD_P2R_SINGLE_OUTPUT_TEMPLATE = """ - ReshardOutputPartialAxisToReplicated(dev_ctx, dist_out);""" + ReshardOutputPartialAxisToReplicated(dev_ctx, dist_out);""" RESHARD_P2R_MULTI_SINGLE_OUTPUT_TEMPLATE = """ - ReshardOutputPartialAxisToReplicated(dev_ctx, dist_out_{});""" + ReshardOutputPartialAxisToReplicated(dev_ctx, dist_out_{});""" UNSUPPORTED_RESHARD_OUTPUT_COMMENT_TEMPLATE = """ - // API `{}` does not need to support ReshardOutput now + // API `{}` does not need to support ReshardOutput now """ # BaseAPI members: @@ -393,13 +446,224 @@ def vector_output_size_assertion_check(self): self.outputs['out_size_expr'] is not None ), f"{self.api}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." 
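[Editor's note] The templates above implement the PR's pipeline-parallel "computation clip": if the current rank is not in the input's process mesh, the generated API skips kernel selection and execution and gives each output an empty (zero-size) local DenseTensor. A minimal standalone sketch of that control flow follows; Mesh, DistTensor, and IsCurRankInMesh here are toy stand-ins for the real phi types, not the generated code itself.

// Sketch of the generated branch: out-of-mesh ranks return empty local values.
#include <algorithm>
#include <cstdio>
#include <vector>

struct Mesh { std::vector<int> ranks; };
struct DistTensor { Mesh mesh; std::vector<float> local_value; };

bool IsCurRankInMesh(const Mesh& mesh, int cur_rank) {
  return std::find(mesh.ranks.begin(), mesh.ranks.end(), cur_rank) !=
         mesh.ranks.end();
}

DistTensor relu(const DistTensor& x, int cur_rank) {
  // 1. Non computation rank clip: ranks outside the input's process mesh
  //    produce a 0-size local tensor and skip the kernel entirely.
  bool computation_clip_for_pp = !IsCurRankInMesh(x.mesh, cur_rank);
  DistTensor out{x.mesh, {}};
  if (computation_clip_for_pp) {
    return out;  // mirrors the zero-size Allocation set in the templates
  }
  // 2. Only in-mesh ranks select a kernel and run it on the local value.
  out.local_value = x.local_value;
  for (auto& v : out.local_value) v = v > 0 ? v : 0;
  return out;
}

int main() {
  DistTensor x{{{0, 1}}, {-1.f, 2.f}};
  printf("rank 0 out size: %zu\n", relu(x, 0).local_value.size());  // 2
  printf("rank 5 out size: %zu\n", relu(x, 5).local_value.size());  // 0 (clipped)
}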
- def generate_if_condition_code(self) -> str: + def generate_non_computation_rank_clip_code(self) -> str: + if len(self.inputs['names']) > 0: + mesh = "" + # All inputs have same mesh + if ( + self.inputs['input_info'][self.inputs['names'][0]] + == "const Tensor&" + ): + mesh = GET_MESH_TEMPLATE.format( + "{}.".format(self.inputs['names'][0]) + ) + elif ( + self.inputs['input_info'][self.inputs['names'][0]] + == "const paddle::optional&" + ): + mesh = GET_MESH_TEMPLATE.format( + "{}->".format(self.inputs['names'][0]) + ) + elif ( + self.inputs['input_info'][self.inputs['names'][0]] + == "const std::vector&" + ): + mesh = GET_MESH_TEMPLATE.format( + "{}[0].".format(self.inputs['names'][0]) + ) + elif ( + self.inputs['input_info'][self.inputs['names'][0]] + == "const paddle::optional>&" + ): + mesh = GET_MESH_TEMPLATE.format( + "{}->at(0).".format(self.inputs['names'][0]) + ) + return mesh + else: + return "" + + # Backward API Override this method + def gene_kernel_backend_select(self): + backend_select_code = "" + if self.kernel['backend'] is not None: + if '>' in self.kernel['backend']: + vars_list = self.kernel['backend'].split('>') + assert ( + len(vars_list) == 2 + ), f"{self.api} api: The number of params to set backend with '>' only allows 2, but received {len(vars_list)}." + assert (vars_list[0].strip() in self.attrs['names']) and ( + self.attrs['attr_info'][vars_list[0].strip()][0] + == 'const Place&' + ), f"{self.api} api: When use '>' to set kernel backend, the first param should be a attribute with Place type." + backend_select_code = f""" + kernel_backend = ParseBackendWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()}); +""" + + else: + backend_args = [ + ele.strip() for ele in self.kernel['backend'].split(',') + ] + backend_select_code = f""" + kernel_backend = ParseBackend({", ".join(backend_args)}); +""" + + return backend_select_code + + # Overload api_base.py gene_kernel_select function. + def gene_kernel_select(self) -> str: + api = self.api + input_names = self.inputs['names'] + attrs = self.attrs + kernel = self.kernel + + kernel_key_item_init = """ + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; +""" + + # Check the tensor options + attr_backend_count = 0 + attr_layout_count = 0 + attr_data_type_count = 0 + for attr_name in attrs['names']: + if attrs['attr_info'][attr_name][0] == 'const Place&': + assert ( + kernel['backend'] is not None + ), f"{api} api: When there is a parameter with 'Place' type in attributes, you must set backend of kernel manually." + attr_backend_count = attr_backend_count + 1 + if attrs['attr_info'][attr_name][0] == 'DataLayout': + assert ( + kernel['layout'] is not None + ), f"{api} api: When there is a parameter with 'DataLayout' type in attributes, you must set layout of kernel manually." + attr_layout_count = attr_layout_count + 1 + if attrs['attr_info'][attr_name][0] == 'DataType': + assert ( + kernel['data_type'] is not None + ), f"{api} api: When there is a parameter with 'DataType' type in attributes, you must set data_type of kernel manually." 
+ attr_data_type_count = attr_data_type_count + 1 + + # preprocess kernel configures + kernel_select_code = self.gene_kernel_backend_select() + + if kernel['layout'] is not None: + if '>' in kernel['layout']: + vars_list = kernel['layout'].split('>') + assert ( + len(vars_list) == 2 + ), f"{api} api: The number of params to set layout with '>' only allows 2, but received {len(vars_list)}." + assert ( + vars_list[0].strip() in attrs['names'] + and attrs['attr_info'][vars_list[0].strip()][0] + == 'DataLayout' + ), f"{api} api: When use '>' to set kernel layout, the first param should be a attribute with DataLayout type." + kernel_select_code = ( + kernel_select_code + + f""" + kernel_layout = ParseLayoutWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()}); +""" + ) + + else: + vars_list = kernel['layout'].split(',') + assert ( + len(vars_list) == 1 + ), f"{api} api: The number of params to set layout must be 1, but received {len(vars_list)}." + kernel_select_code = ( + kernel_select_code + + f""" + kernel_layout = ParseLayout({vars_list[0].strip()}); +""" + ) + + if kernel['data_type'] is not None: + + def process_data_type_args(args_item): + args_item = args_item.strip() + complex_match_result = re.match( + r"complex\((?P\w+)\)", args_item + ) + if complex_match_result: + return f"phi::dtype::ToComplex(ParseDataType({complex_match_result.group('param_name')}))" + else: + return f"ParseDataType({args_item})" + + if '>' in kernel['data_type']: + vars_list = kernel['data_type'].split('>') + assert ( + len(vars_list) == 2 + ), f"{api} api: The number of params to set data_type with '>' only allows 2, but received {len(vars_list)}." + assert ( + vars_list[0].strip() in attrs['names'] + and attrs['attr_info'][vars_list[0].strip()][0] + == 'DataType' + ), f"{api} api: When use '>' to set kernel data_type, the first param should be a attribute with DataType type." + kernel_select_code = ( + kernel_select_code + + f""" + kernel_data_type = ParseDataTypeWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()}); +""" + ) + + else: + vars_list = kernel['data_type'].split(',') + assert ( + len(vars_list) == 1 + ), f"{api} api: The number of params to set data_type only allows 1, but received {len(vars_list)}." + kernel_select_code = ( + kernel_select_code + + f""" + kernel_data_type = {process_data_type_args(vars_list[0])}; +""" + ) + + if len(input_names) == 0: + assert ( + attr_backend_count > 0 and attr_data_type_count > 0 + ), f"{api} api: When there is no input tensor, the args must have 'Place' and 'DataType'." 
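[Editor's note] The gene_kernel_select code above resolves the kernel key in two stages: backend/layout/data_type pinned by the op's yaml config win, and any slot still UNDEFINED is filled from the highest-priority key parsed from the input tensors. A hedged sketch of that fallback, with toy enums rather than the real phi Backend/DataType/KernelKey types:

// Sketch: explicit config first, then fill UNDEFINED slots from inputs.
#include <cstdio>

enum class Backend { UNDEFINED, CPU, GPU };
enum class DataType { UNDEFINED, FLOAT32, FLOAT16 };
struct KernelKey { Backend backend; DataType dtype; };

KernelKey ResolveKernelKey(KernelKey configured, KernelKey from_inputs) {
  if (configured.backend == Backend::UNDEFINED) configured.backend = from_inputs.backend;
  if (configured.dtype == DataType::UNDEFINED) configured.dtype = from_inputs.dtype;
  return configured;
}

int main() {
  // dtype pinned by the op entry, backend inferred from the inputs.
  KernelKey k = ResolveKernelKey({Backend::UNDEFINED, DataType::FLOAT32},
                                 {Backend::GPU, DataType::FLOAT16});
  printf("backend=GPU? %d dtype=FP32? %d\n",
         k.backend == Backend::GPU, k.dtype == DataType::FLOAT32);
}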
+ + kernel_select_args = "" + for input_name in input_names: + kernel_select_args = kernel_select_args + input_name + ", " + + if len(kernel_select_args) > 2: + kernel_select_args = kernel_select_args[:-2] + + # kernel_select_code = kernel_key_item_init + kernel_select_code + + if len(input_names) > 0: + kernel_select_code = ( + kernel_select_code + + f""" + if (kernel_backend == Backend::UNDEFINED + || kernel_layout == DataLayout::UNDEFINED + || kernel_data_type == DataType::UNDEFINED ) {{ + auto kernel_key_set = ParseKernelKeyByInputArgs({kernel_select_args}); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) {{ + kernel_backend = kernel_key.backend(); + }} + if (kernel_layout == DataLayout::UNDEFINED) {{ + kernel_layout = kernel_key.layout(); + }} + if (kernel_data_type == DataType::UNDEFINED) {{ + kernel_data_type = kernel_key.dtype(); + }} + }}""" + ) + input_args = "" for input_name in self.inputs['names']: input_args = input_args + input_name + ", " if len(input_args) > 2: input_args = input_args[:-2] - return AUTO_PARALLEL_COND_TEMPLATE.format(input_args) + mesh = self.generate_non_computation_rank_clip_code() + + if_condition_code = AUTO_PARALLEL_COND_TEMPLATE.format( + input_args=input_args, mesh=mesh, kernel_code=kernel_select_code + ) + + return kernel_key_item_init + if_condition_code def generate_specialized_infer_spmd_code(self) -> str: input_names = self.inputs['names'] @@ -680,6 +944,10 @@ def generate_infer_global_shape_code(self) -> str: name=out_name ) output_args_code += f"{out_name}_meta_ptr_vec, " + if self.generate_general_infer_spmd is True: + set_out_dist_attr_code += ( + SET_VECTOR_OUT_REPLICATED_DIST_ATTR + ) else: output_decl_code += SINGLE_GLOBAL_META_OUT_DECL_TEMPLATE.format( out_name, out_name @@ -1088,7 +1356,6 @@ def generate_auto_paralel_branch(self) -> str: if len(self.inputs['names']) == 0: return "" return MAIN_DIST_BRANCH_TEMPLATE.format( - self.generate_if_condition_code(), self.generate_infer_spmd_code(), self.generate_output_creation_code(), self.generate_infer_global_shape_code(), diff --git a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py index b29e186f06d381..9368d6908b33cc 100644 --- a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py @@ -24,16 +24,18 @@ MAIN_DIST_BRANCH_TEMPLATE = """ // Auto Parallel condition - if ({}) {{ + if (use_dist_branch) {{ // 1. InferSpmd (Infer DistAttr of Inputs&Outputs){} // 2. Create Temporary Output & Prepare Dist and Dense Output{} // 3. Infer DistTensor's Global Shape{}\n - // 4. Select Kernel{} - // 5. Reshard Input{}\n - // 6. PrepareData (DataTransform & Prepare Dense Input){} - // 7. Infer Local DenseTensor Meta{} - // 8. DenseTensor Kernel Call{} - // 9. Reshard Output{}\n + if (!computation_clip_for_pp){{ + // 4. Select Kernel{} + // 5. Reshard Input{}\n + // 6. PrepareData (DataTransform & Prepare Dense Input){} + // 7. Infer Local DenseTensor Meta{} + // 8. DenseTensor Kernel Call{} + // 9. Reshard Partial Output to Replicated (Temporary){}\n + }} // 10. Return {} }} @@ -99,9 +101,9 @@ # 9. 
Reshard Output RESHARD_SINGLE_OUTPUT_TEMPLATE = """ - ReshardKernelOutputToApiOutput(dev_ctx, shared_dist_out, {});""" + ReshardKernelOutputToApiOutput(dev_ctx, shared_dist_out, {});""" RESHARD_MULTI_SINGLE_OUTPUT_TEMPLATE = """ - ReshardKernelOutputToApiOutput(dev_ctx, shared_dist_out_{}, {});""" + ReshardKernelOutputToApiOutput(dev_ctx, shared_dist_out_{}, {});""" class DistBackwardAPI(DistForwardAPI, BackwardAPI): @@ -260,7 +262,6 @@ def generate_auto_paralel_branch(self) -> str: if len(self.inputs['names']) == 0: return "" return MAIN_DIST_BRANCH_TEMPLATE.format( - self.generate_if_condition_code(), self.generate_infer_spmd_code(), self.generate_output_creation_code(), self.generate_infer_global_shape_code(), @@ -308,6 +309,7 @@ def source_include(header_file_path, fw_header_file_path): #ifdef PADDLE_WITH_DISTRIBUTE #include "paddle/phi/infermeta/spmd_rules/rules.h" +#include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h" #endif PD_DECLARE_bool(conv2d_disable_cudnn); diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 17d53342ba2776..47eda81f5d0ca0 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -19,6 +19,7 @@ infer_meta : func : GeneralBinaryGradInferMeta param : [x, y] + spmd_rule : ElementwiseBinaryGradInferSpmd kernel : func : add_grad no_need_buffer : x, y @@ -406,8 +407,8 @@ composite : minimum_grad(x, y, out_grad, axis, x_grad, y_grad) - backward_op : mish_grad - forward : mish (Tensor x, float threshold) -> Tensor(out) - args : (Tensor x, Tensor out_grad, float threshold) + forward : mish (Tensor x, float lambda) -> Tensor(out) + args : (Tensor x, Tensor out_grad, float lambda) output : Tensor(x_grad) infer_meta : func : UnchangedInferMeta @@ -680,6 +681,7 @@ infer_meta : func : GeneralBinaryGradInferMeta param : [x, y] + spmd_rule : ElementwiseBinaryGradInferSpmd kernel : func : subtract_grad no_need_buffer : x, y @@ -745,7 +747,7 @@ kernel : func : tile_grad no_need_buffer : x - composite : tile_grad(x, outgrad, repeat_times, x_grad) + composite : tile_grad(x, out_grad, repeat_times, x_grad) backward : tile_double_grad - backward_op : trans_layout_grad diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 14daf99fd7f13a..01acb338c987bd 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -17,6 +17,7 @@ output : Tensor(out) infer_meta : func : ElementwiseInferMeta + spmd_rule : ElementwiseBinaryInferSpmd kernel : func : add inplace : (x -> out) @@ -1003,6 +1004,7 @@ output : Tensor(out) infer_meta : func : ElementwiseInferMeta + spmd_rule : ElementwiseBinaryInferSpmd kernel : func : subtract inplace : (x -> out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index f74df02af26d2f..f7d3878e44847f 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1255,6 +1255,35 @@ attrs : [bool use_cudnn = false, float fuse_alpha = 0.0f, float fuse_beta = 0.0f, float Scale_in = 1.0f, float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}'] +- op : fused_embedding_eltwise_layernorm + inputs : + ids : Ids + embs : Embs + bias : Bias + scale : Scale + outputs : + out : Out + attrs : + epsilon : epsilon + +- op : fused_fc_elementwise_layernorm + inputs : + x : X + w : W + y : Y + bias0 : Bias0 + scale : Scale + bias1 : Bias1 + outputs : + out : Out + mean : Mean + variance : Variance + attrs 
: + x_num_col_dims : x_num_col_dims + activation_type : activation_type + epsilon : epsilon + begin_norm_axis : begin_norm_axis + - op : fused_feedforward backward: fused_feedforward_grad inputs: @@ -1287,10 +1316,32 @@ dropout1_out: Dropout1Out dropout2_out: Dropout2Out +- op : fused_gemm_epilogue + inputs: + {x : X, y : Y, bias : Bias} + outputs : + {out : Out, reserve_space: ReserveSpace} + +- op : fused_gemm_epilogue_grad + inputs: + {x : X, y : Y, reserve_space: ReserveSpace, out_grad : DOut} + outputs : + {x_grad : DX, y_grad : DY, bias_grad : DBias} + - op : fused_transpose extra : attrs : [str data_format = "AnyLayout"] +- op : fusion_transpose_flatten_concat + inputs : + x : X + outputs : + out : Out + attrs : + trans_axis : trans_axis + flatten_axis : flatten_axis + concat_axis : concat_axis + - op : gather backward : gather_grad inputs : @@ -1490,11 +1541,6 @@ attrs : axis : dim -- op : inplace_abn - backward : inplace_abn_grad - extra : - attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] - - op : instance_norm inputs : x : X @@ -2139,6 +2185,17 @@ outputs : out : Out +- op : pad + backward : pad_grad, pad_double_grad + inputs : + x : X + outputs : + out : Out + scalar: + pad_value: + data_type : float + support_tensor : true + - op : pad2d backward : pad2d_grad extra : @@ -3181,6 +3238,12 @@ outputs: {out: Out} +- op: dpsgd + inputs: + {param: Param,grad: Grad,learning_rate: LearningRate} + outputs: + param_out : ParamOut + - op: fetch (fetch_v2) inputs: {x: X} outputs: {out: Out} diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index fff70e820e575a..aaf6c4e1445ef4 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -134,7 +134,7 @@ backward : angle_grad - op : argmax - args : (Tensor x, Scalar(int64_t) axis, bool keepdims = false, bool flatten = false, int dtype = 3) + args : (Tensor x, Scalar(int64_t) axis, bool keepdims = false, bool flatten = false, DataType dtype = DataType::INT64) output : Tensor(out) infer_meta : func : ArgMinMaxInferMeta @@ -143,7 +143,7 @@ data_type : x - op : argmin - args : (Tensor x, Scalar(int64_t) axis, bool keepdims = false, bool flatten = false, int dtype = 3) + args : (Tensor x, Scalar(int64_t) axis, bool keepdims = false, bool flatten = false, DataType dtype = DataType::INT64) output : Tensor(out) infer_meta : func : ArgMinMaxInferMeta @@ -2073,6 +2073,7 @@ output : Tensor(out) infer_meta : func : UnchangedInferMeta + spmd_rule : ElementwiseUnaryInferSpmd kernel : func : relu inplace : (x -> out) @@ -2458,6 +2459,7 @@ output : Tensor infer_meta : func : UnchangedInferMeta + spmd_rule : ElementwiseUnaryInferSpmd kernel : func : square {dense -> dense}, square_sr {selected_rows -> selected_rows} @@ -2786,6 +2788,15 @@ intermediate: warprnntgrad backward : warprnnt_grad +- op : weight_dequantize + args : (Tensor x, Tensor scale, str algo="weight_only_int8", DataType out_dtype=DataType::FLOAT16) + output : Tensor(out) + infer_meta : + func : WeightDequantizeInferMeta + kernel : + func : weight_dequantize + data_type : out_dtype + - op : weight_only_linear args : (Tensor x, Tensor weight, Tensor bias, Tensor weight_scale, str weight_dtype) output : Tensor(out) @@ -2798,7 +2809,7 @@ backward: weight_only_linear_grad - op : weight_quantize - args : (Tensor x, str algo = "weight_only_int8") + args : (Tensor x, str algo="weight_only_int8") output : Tensor(out), Tensor(scale) infer_meta : func : WeightQuantizeInferMeta diff --git a/paddle/phi/backends/CMakeLists.txt 
b/paddle/phi/backends/CMakeLists.txt index 1c916682cf7b1c..55e629de34e7e2 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -20,8 +20,14 @@ endif() if(WITH_XPU) list(APPEND BACKENDS_SRCS xpu/xpu_context.cc xpu/xpu_info.cc) - list(APPEND BACKENDS_SRCS xpu/xpu_op_list.cc xpu/xpu1_op_list.cc - xpu/xpu2_op_list.cc xpu/xpu_l3_strategy.cc) + list( + APPEND + BACKENDS_SRCS + xpu/xpu_op_list.cc + xpu/xpu1_op_list.cc + xpu/xpu2_op_list.cc + xpu/xpu3_op_list.cc + xpu/xpu_l3_strategy.cc) list(APPEND BACKENDS_DEPS phi_dynload_xpti) endif() diff --git a/paddle/phi/backends/context_pool.cc b/paddle/phi/backends/context_pool.cc index 619db6f83fc240..7824fc3b160b10 100644 --- a/paddle/phi/backends/context_pool.cc +++ b/paddle/phi/backends/context_pool.cc @@ -61,7 +61,7 @@ thread_local const std::map>>* - ptr; + ptr = nullptr; if (external_device_contexts_ && external_device_contexts_->count(place)) { ptr = external_device_contexts_; } else { diff --git a/paddle/phi/backends/device_guard.h b/paddle/phi/backends/device_guard.h index a2e44c30d62d56..d6ab326faacade 100644 --- a/paddle/phi/backends/device_guard.h +++ b/paddle/phi/backends/device_guard.h @@ -20,20 +20,15 @@ namespace phi { class DeviceGuard { public: explicit inline DeviceGuard(const Place& place) - : dev_type_(place.GetDeviceType()) { - prev_id = DeviceManager::GetDevice(dev_type_); - cur_id = place.GetDeviceId(); - - if (cur_id != prev_id) { - DeviceManager::SetDevice(dev_type_, cur_id); - } + : cur_id(place.GetDeviceId()), dev_type_(place.GetDeviceType()) { + DeviceManager::SetDevice(dev_type_, cur_id); } DeviceGuard(const DeviceGuard& o) = delete; DeviceGuard& operator=(const DeviceGuard& o) = delete; private: - size_t prev_id, cur_id; + size_t cur_id; std::string dev_type_; }; diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index 24ad5087769de5..748c80c0859c5e 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -494,7 +494,7 @@ std::vector DeviceManager::GetSelectedDeviceList( auto FLAGS_selected_devices = getenv(FLAGS.c_str()); if (FLAGS_selected_devices) { auto devices_str = paddle::string::Split(FLAGS_selected_devices, ','); - for (auto id : devices_str) { + for (auto const& id : devices_str) { device_list.push_back(atoi(id.c_str())); } } else { @@ -697,8 +697,8 @@ DeviceManager& DeviceManager::Instance() { } void DeviceManager::Release() { - stream::Stream::ReleaseAll(); event::Event::ReleaseAll(); + stream::Stream::ReleaseAll(); Instance().device_map_.clear(); Instance().device_impl_map_.clear(); } diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 6989f32b18e9e0..bdb9e120d2884b 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -185,9 +185,9 @@ static inline std::string join(const std::string& part1, static inline std::vector split( const std::string& str, const std::string separator = " ") { std::vector str_list; - std::string::size_type firstPos; + std::string::size_type firstPos = 0; firstPos = str.find_first_not_of(separator, 0); - std::string::size_type lastPos; + std::string::size_type lastPos = 0; lastPos = str.find_first_of(separator, firstPos); while (std::string::npos != firstPos && std::string::npos != lastPos) { str_list.push_back(str.substr(firstPos, lastPos - firstPos)); @@ -263,7 +263,7 @@ static inline void* GetDsoHandleFromSearchPath( #endif // !_WIN32 std::vector 
dso_names = split(dso_name, ";"); void* dso_handle = nullptr; - for (auto dso : dso_names) { + for (auto const& dso : dso_names) { // 1. search in user config path by FLAGS dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags); // 2. search in system default path @@ -272,7 +272,7 @@ static inline void* GetDsoHandleFromSearchPath( } // 3. search in extra paths if (nullptr == dso_handle) { - for (auto path : extra_paths) { + for (auto const& path : extra_paths) { VLOG(3) << "extra_paths: " << path; dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags); } diff --git a/paddle/phi/backends/event.cc b/paddle/phi/backends/event.cc index 1c620afbad558d..c08b4b269b2d2e 100644 --- a/paddle/phi/backends/event.cc +++ b/paddle/phi/backends/event.cc @@ -46,6 +46,7 @@ Event::Event(const Place& place, event_t event) own_data_(false) {} Event::~Event() { + Synchronize(); Destroy(); std::unique_lock lock(g_events_mutex); g_events.remove(this); @@ -77,14 +78,35 @@ void Event::Destroy() { own_data_ = false; event_ = nullptr; device_ = nullptr; + is_recorded_ = false; } } -void Event::Record(const stream::Stream* stream) { stream->RecordEvent(this); } +void Event::Record(const stream::Stream* stream) { + if (device_) { + is_recorded_ = true; // synchronize the event during detroy + stream->RecordEvent(this); + } +} -bool Event::Query() const { return device_->QueryEvent(this); } +bool Event::Query() const { + if (device_ && is_recorded_) { + bool ret = device_->QueryEvent(this); + if (ret) { + is_recorded_ = + false; // event completed, do not need to synchronize the event. + } + return ret; + } else { + return true; + } +} -void Event::Synchronize() const { device_->SynchronizeEvent(this); } +void Event::Synchronize() const { + if (device_ && is_recorded_) { + device_->SynchronizeEvent(this); + } +} const Place& Event::GetPlace() const { return place_; } diff --git a/paddle/phi/backends/event.h b/paddle/phi/backends/event.h index 1dac619c2abf96..21dc9f47d7b89e 100644 --- a/paddle/phi/backends/event.h +++ b/paddle/phi/backends/event.h @@ -57,6 +57,7 @@ class Event { Device* device_; event_t event_; bool own_data_ = true; + mutable bool is_recorded_ = false; }; } // namespace event diff --git a/paddle/phi/backends/gpu/cuda/cudnn_helper.h b/paddle/phi/backends/gpu/cuda/cudnn_helper.h index 651a4247a12df0..74db3fc75bcd10 100644 --- a/paddle/phi/backends/gpu/cuda/cudnn_helper.h +++ b/paddle/phi/backends/gpu/cuda/cudnn_helper.h @@ -33,8 +33,12 @@ namespace phi { namespace backends { namespace gpu { +#define CUDNN_VERSION_COMPUTE(major, minor, patch) \ + ((major) <= 8 ? (major)*1000 + (minor)*100 + (patch) \ + : (major)*10000 + (minor)*100 + (patch)) + #define CUDNN_VERSION_MIN(major, minor, patch) \ - (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) + (CUDNN_VERSION >= CUDNN_VERSION_COMPUTE(major, minor, patch)) enum class DataLayout { // Not use kNHWC, diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index d6ce3e750f65ff..7905320728bda5 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -686,7 +686,7 @@ struct GPUContext::Impl { void AddStreamCallback(const std::function& callback) const { // NOTE(zhiqiu): better use threadpool here, otherwise "std::async" may // launch too many threads and result in thread oversubscription. 
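[Editor's note] The event.cc hunk above adds an is_recorded_ flag so that Query, Synchronize, and the destructor are safe no-ops for events that were never recorded, and a completed Query clears the flag so no later synchronization is needed. A minimal standalone sketch of that guard; DeviceEvent is a toy stand-in, not the real phi::event::Event:

// Sketch: guard event queries/syncs behind a "was recorded" flag.
#include <cstdio>

class DeviceEvent {
 public:
  ~DeviceEvent() { Synchronize(); }  // safe even if never recorded
  void Record() { is_recorded_ = true; }
  bool Query() const {
    if (!is_recorded_) return true;  // nothing pending counts as complete
    bool done = QueryBackend();
    if (done) is_recorded_ = false;  // completed: no later Synchronize needed
    return done;
  }
  void Synchronize() const {
    // in the real patch Destroy() resets the flag; the toy folds it in here
    if (is_recorded_) { WaitBackend(); is_recorded_ = false; }
  }
 private:
  bool QueryBackend() const { return true; }  // pretend the work finished
  void WaitBackend() const {}
  mutable bool is_recorded_ = false;
};

int main() {
  DeviceEvent e;
  printf("before record: %d\n", e.Query());  // 1: trivially complete
  e.Record();
  printf("after record: %d\n", e.Query());   // backend answer
}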
- auto* callback_func = new std::function(std::move(callback)); + auto* callback_func = new std::function(callback); auto* func = new std::function([this, callback_func] { std::lock_guard lock(stream_call_back_mtx_); VLOG(4) << "Stream callback"; diff --git a/paddle/phi/backends/gpu/gpu_info.cc b/paddle/phi/backends/gpu/gpu_info.cc index f6ca9d4168b2c8..1849faa4520774 100644 --- a/paddle/phi/backends/gpu/gpu_info.cc +++ b/paddle/phi/backends/gpu/gpu_info.cc @@ -47,7 +47,7 @@ std::vector GetSelectedDevices() { std::vector devices; if (!FLAGS_selected_gpus.empty()) { auto devices_str = Split(FLAGS_selected_gpus, ','); - for (auto id : devices_str) { + for (auto const& id : devices_str) { devices.push_back(atoi(id.c_str())); } } else { diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index a447df94cb4dc1..a29b5e110922a4 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -146,19 +146,40 @@ void InitGpuProperties(Place place, } #else size_t cudnn_dso_ver = dynload::cudnnGetVersion(); + auto get_cudnn_major = [](auto version) { + if (version < 9000) { + return version / 1000; + } + // CUDNN changes the CUDNN_VERSION rules after 9.0 + return version / 10000; + }; + auto get_cudnn_minor = [](auto version) { + if (version < 9000) { + return (version % 1000) / 100; + } + // CUDNN changes the CUDNN_VERSION rules after 9.0 + return (version % 10000) / 100; + }; + LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) - << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." - << (cudnn_dso_ver % 1000) / 100 << "."; + << ", cuDNN Version: " + << get_cudnn_major(cudnn_dso_ver) << "." + << get_cudnn_minor(cudnn_dso_ver) << "."; // Check CUDA/CUDNN version compatiblity auto local_cuda_version = (*driver_version / 1000) * 10 + (*driver_version % 100) / 10; auto compile_cuda_version = (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; + + // Compute cuDNN major + auto local_cudnn_major = get_cudnn_major(cudnn_dso_ver); + size_t compile_cudnn_major = CUDNN_MAJOR; + #if defined(__linux__) PADDLE_ENFORCE_EQ( (local_cuda_version / 10 < compile_cuda_version / 10) && - (cudnn_dso_ver / 1000 < CUDNN_VERSION / 1000), + (local_cudnn_major < compile_cudnn_major), false, phi::errors::InvalidArgument( "The installed Paddle is compiled with CUDA%d/cuDNN%d," @@ -167,9 +188,9 @@ void InitGpuProperties(Place place, "Please recompile or reinstall Paddle with compatible CUDA/cuDNN " "version.", compile_cuda_version / 10, - CUDNN_VERSION / 1000, + compile_cudnn_major, local_cuda_version / 10, - cudnn_dso_ver / 1000)); + local_cudnn_major)); #endif if (local_cuda_version < compile_cuda_version) { LOG_FIRST_N(WARNING, 1) @@ -269,15 +290,17 @@ void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(handle)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetStream(*handle, stream)); #else - auto local_cudnn_version = phi::dynload::cudnnGetVersion() / 100; - auto compile_cudnn_version = CUDNN_VERSION / 100; - if (local_cudnn_version < static_cast(compile_cudnn_version)) { + auto version = phi::dynload::cudnnGetVersion(); + auto local_cudnn_major = + (version < 9000) ? version / 1000 : version / 10000; + auto local_cudnn_minor = + (version < 9000) ? (version % 1000) / 100 : (version % 10000) / 100; + if (version < static_cast(CUDNN_VERSION)) { LOG_FIRST_N(WARNING, 1) << "WARNING: device: " << place.device - << ". 
The installed Paddle is compiled with CUDNN " - << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10 - << ", but CUDNN version in your machine is " - << local_cudnn_version / 10 << "." << local_cudnn_version % 10 + << ". The installed Paddle is compiled with CUDNN " << CUDNN_MAJOR + << "." << CUDNN_MINOR << ", but CUDNN version in your machine is " + << local_cudnn_major << "." << local_cudnn_minor << ", which may cause serious incompatible bug. " << "Please recompile or reinstall Paddle with compatible CUDNN " "version."; diff --git a/paddle/phi/backends/gpu/rocm/miopen_helper.h b/paddle/phi/backends/gpu/rocm/miopen_helper.h index b8ce6e22e939be..f7815e2ed851e0 100644 --- a/paddle/phi/backends/gpu/rocm/miopen_helper.h +++ b/paddle/phi/backends/gpu/rocm/miopen_helper.h @@ -61,8 +61,12 @@ inline const char* miopenGetErrorString(miopenStatus_t status) { } // no use, but will have compiling error if not defined +#define CUDNN_VERSION_COMPUTE(major, minor, patch) \ + ((major) <= 8 ? (major)*1000 + (minor)*100 + (patch) \ + : (major)*10000 + (minor)*100 + (patch)) + #define CUDNN_VERSION_MIN(major, minor, patch) \ - (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) + (CUDNN_VERSION >= CUDNN_VERSION_COMPUTE(major, minor, patch)) enum class DataLayout { // Not use kNHWC, diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 74a8cf0bc1150e..cdc6d895b84be0 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -797,6 +797,8 @@ XPUOpMap& get_kl2_ops() { phi::DataType::BOOL, phi::DataType::INT8, phi::DataType::UINT8, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16, phi::DataType::FLOAT32})}, {"squeeze2", XPUKernelSet({phi::DataType::FLOAT64, @@ -806,6 +808,7 @@ XPUOpMap& get_kl2_ops() { phi::DataType::INT8, phi::DataType::UINT8, phi::DataType::FLOAT16, + phi::DataType::BFLOAT16, phi::DataType::FLOAT32})}, {"squeeze", XPUKernelSet({phi::DataType::FLOAT64, @@ -814,6 +817,8 @@ XPUOpMap& get_kl2_ops() { phi::DataType::BOOL, phi::DataType::INT8, phi::DataType::UINT8, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16, phi::DataType::FLOAT32})}, {"squeeze_grad", XPUKernelSet({phi::DataType::FLOAT64, @@ -822,6 +827,8 @@ XPUOpMap& get_kl2_ops() { phi::DataType::BOOL, phi::DataType::INT8, phi::DataType::UINT8, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16, phi::DataType::FLOAT32})}, {"stack", XPUKernelSet({phi::DataType::FLOAT32, @@ -935,7 +942,8 @@ XPUOpMap& get_kl2_ops() { phi::DataType::INT8, phi::DataType::UINT8, phi::DataType::FLOAT32, - phi::DataType::FLOAT16})}, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, {"unsqueeze2", XPUKernelSet({phi::DataType::FLOAT64, phi::DataType::INT64, @@ -944,7 +952,8 @@ XPUOpMap& get_kl2_ops() { phi::DataType::INT8, phi::DataType::UINT8, phi::DataType::FLOAT32, - phi::DataType::FLOAT16})}, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, {"unsqueeze_grad", XPUKernelSet({phi::DataType::FLOAT64, phi::DataType::INT64, @@ -952,7 +961,9 @@ XPUOpMap& get_kl2_ops() { phi::DataType::BOOL, phi::DataType::INT8, phi::DataType::UINT8, - phi::DataType::FLOAT32})}, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, {"unsqueeze", XPUKernelSet({phi::DataType::FLOAT64, phi::DataType::INT64, @@ -960,8 +971,9 @@ XPUOpMap& get_kl2_ops() { phi::DataType::BOOL, phi::DataType::INT8, phi::DataType::UINT8, + phi::DataType::FLOAT32, phi::DataType::FLOAT16, - phi::DataType::FLOAT32})}, + phi::DataType::BFLOAT16})}, {"unstack", 
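[Editor's note] The version arithmetic above exists because cuDNN changed its CUDNN_VERSION encoding at 9.0: versions below 9 encode major*1000 + minor*100 + patch, versions from 9.0 on encode major*10000 + minor*100 + patch. A small self-checking sketch of the same rules as CUDNN_VERSION_COMPUTE and the get_cudnn_major/get_cudnn_minor lambdas:

// Sketch: encode/decode across the cuDNN 9.0 version-format change.
constexpr long VersionCompute(long major, long minor, long patch) {
  return major <= 8 ? major * 1000 + minor * 100 + patch
                    : major * 10000 + minor * 100 + patch;
}
constexpr long Major(long v) { return v < 9000 ? v / 1000 : v / 10000; }
constexpr long Minor(long v) { return v < 9000 ? (v % 1000) / 100 : (v % 10000) / 100; }

int main() {
  static_assert(VersionCompute(8, 9, 7) == 8907, "old encoding");
  static_assert(VersionCompute(9, 1, 0) == 90100, "new encoding");
  static_assert(Major(8907) == 8 && Minor(8907) == 9, "decode old");
  static_assert(Major(90100) == 9 && Minor(90100) == 1, "decode new");
  // Raw comparisons across the 9.0 boundary still order correctly because
  // every new-format value (>= 90000) exceeds every old-format one (< 9000).
}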
XPUKernelSet({phi::DataType::INT64, phi::DataType::INT32, diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc new file mode 100644 index 00000000000000..6174f13cd30b26 --- /dev/null +++ b/paddle/phi/backends/xpu/xpu3_op_list.cc @@ -0,0 +1,1040 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU +#include +#include +#include +#include "paddle/phi/backends/xpu/xpu_op_list.h" + +namespace phi { +namespace backends { +namespace xpu { + +XPUOpMap& get_kl3_ops() { + // KL3支持的op,通过op_name, data_type, place来索引 + static XPUOpMap s_xpu3_kernels{ + {"add_act_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"add_layernorm_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"abs", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"abs_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"accuracy", XPUKernelSet({phi::DataType::FLOAT32})}, + {"adadelta", XPUKernelSet({phi::DataType::FLOAT32})}, + {"adamw", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"adam", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"adam_dense_param_sparse_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"adagrad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"addcmul_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"arg_max", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16})}, + {"argsort_grad", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT32})}, + {"argsort", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"assign", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + phi::DataType::INT32, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::BOOL})}, + {"assign_value", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::BOOL})}, + {"atan", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"atan_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"batch_norm_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"batch_norm", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"bn_act_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"bmm", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"bmm_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"bce_loss_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"beam_search", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"beam_search_decode", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + phi::DataType::FLOAT16, + 
phi::DataType::INT32, + phi::DataType::INT64})}, + {"bilinear_interp_v2", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"bilinear_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"bitwise_not", XPUKernelSet({phi::DataType::BOOL})}, + {"bitwise_and", XPUKernelSet({phi::DataType::BOOL})}, + {"broadcast", XPUKernelSet({phi::DataType::FLOAT32})}, + {"c_allgather", + XPUKernelSet({phi::DataType::FLOAT16, + phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"c_allreduce_max", + XPUKernelSet({phi::DataType::FLOAT16, + phi::DataType::FLOAT32, + phi::DataType::INT32})}, + {"c_allreduce_sum", + XPUKernelSet({phi::DataType::FLOAT16, + phi::DataType::FLOAT32, + phi::DataType::INT32})}, + {"c_broadcast", + XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, + {"c_concat", + XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, + {"c_embedding", XPUKernelSet({phi::DataType::FLOAT32})}, + {"c_embedding_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"c_identity", + XPUKernelSet({phi::DataType::FLOAT16, + phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"c_softmax_with_cross_entropy", XPUKernelSet({phi::DataType::FLOAT32})}, + {"c_softmax_with_cross_entropy_grad", + XPUKernelSet({phi::DataType::FLOAT32})}, + {"c_reduce_sum", + XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, + {"c_split", + XPUKernelSet({phi::DataType::FLOAT16, + phi::DataType::FLOAT32, + phi::DataType::INT32})}, + {"c_sync_calc_stream", + XPUKernelSet({phi::DataType::FLOAT16, + phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"c_sync_comm_stream", XPUKernelSet({phi::DataType::FLOAT32})}, + {"cast", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT64, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::INT64, + phi::DataType::INT32})}, + {"check_finite_and_unscale", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"clip", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32})}, + {"clip_by_norm", XPUKernelSet({phi::DataType::FLOAT32})}, + {"clip_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT64, + phi::DataType::INT32})}, + {"coalesce_tensor", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"concat_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"concat", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT64, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::INT64, + phi::DataType::INT32})}, + {"conv2d_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"conv2d", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"conv1d_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"conv2d_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"conv3d_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"conv3d", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"conv2d_transpose_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"conv2d_transpose", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"conv2d_transpose_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + 
{"cumsum", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"cumsum_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"cumprod", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"deformable_conv_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"deformable_conv_v1_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"deformable_conv_v1", XPUKernelSet({phi::DataType::FLOAT32})}, + {"depthwise_conv2d_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"depthwise_conv2d", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"depthwise_conv2d_transpose_grad", + XPUKernelSet({phi::DataType::FLOAT32})}, + {"depthwise_conv2d_transpose", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"diag_v2", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"diagonal", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"dropout_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"dropout", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"einsum", XPUKernelSet({phi::DataType::FLOAT32})}, + {"einsum_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"elementwise_add_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_add", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32})}, + {"elementwise_div_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_div", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32})}, + {"elementwise_floordiv", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_max_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_max", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_min_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_min", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_mul_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_mul", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"elementwise_pow", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_sub_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_sub", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"elementwise_mod", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32})}, + {"embedding_with_eltwise_add_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"empty", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT16, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::BOOL, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32, + phi::DataType::FLOAT64})}, + {"embedding_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"embedding_sparse_grad", 
XPUKernelSet({phi::DataType::FLOAT32})}, + {"equal", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32, + phi::DataType::BOOL})}, + {"exp_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"exp", XPUKernelSet({phi::DataType::FLOAT32})}, + {"expand_as_v2", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::BOOL, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"expand_v2", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::BOOL, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"fast_where_xpu", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16})}, + {"layer_norm_act_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"fast_layernorm_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"fc_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"fill", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT64, + phi::DataType::FLOAT32})}, + {"fill_any", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT64, + phi::DataType::FLOAT32})}, + {"fill_any_like", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"fill_diagonal_tensor", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"fill_constant", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT16, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::BOOL, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16})}, + {"flatten2_grad", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT8, + phi::DataType::FLOAT32})}, + {"flatten2", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT8, + phi::DataType::FLOAT32})}, + {"flatten_contiguous_range_grad", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT8, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"flatten_contiguous_range", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT8, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"flatten_grad", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT8, + phi::DataType::FLOAT32})}, + {"flatten", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT8, + phi::DataType::FLOAT32})}, + {"flip", XPUKernelSet({phi::DataType::FLOAT32})}, + {"full_batch_size_like", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16})}, + {"fill_constant_batch_size_like", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16})}, + {"fused_multi_transformer_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"unfold", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"unfold_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"floor", XPUKernelSet({phi::DataType::FLOAT32})}, + {"gather_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"gather_nd_grad", + XPUKernelSet({phi::DataType::INT32, + 
phi::DataType::INT64, + phi::DataType::FLOAT32})}, + {"gather_nd", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16})}, + {"gather", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::BOOL})}, + {"gaussian_random", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"gelu_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"gelu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"generate_proposals_v2", XPUKernelSet({phi::DataType::FLOAT32})}, + {"generate_sequence_xpu", + XPUKernelSet({ + phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64, + })}, + {"grad_add", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"greater_equal", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"greater_than", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"grid_sampler_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"hard_sigmoid_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"hard_sigmoid", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"hard_swish_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"hard_swish", XPUKernelSet({phi::DataType::FLOAT32})}, + {"huber_loss_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"huber_loss", XPUKernelSet({phi::DataType::FLOAT32})}, + {"kldiv_loss", XPUKernelSet({phi::DataType::FLOAT32})}, + {"kldiv_loss_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"increment", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"index_put", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"index_sample_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"index_sample", + XPUKernelSet({phi::DataType::INT8, + phi::DataType::INT16, + phi::DataType::INT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32, + phi::DataType::BOOL})}, + {"index_select_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"index_select", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"instance_norm", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"instance_norm_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"inverse", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT64})}, + {"label_smooth", XPUKernelSet({phi::DataType::FLOAT32})}, + {"lars_momentum", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"layer_norm_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"layer_norm", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"leaky_relu_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"leaky_relu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"less_equal", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"less_than", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"load", XPUKernelSet({phi::DataType::FLOAT32})}, + {"load_combine", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + phi::DataType::INT8, + phi::DataType::INT32, + phi::DataType::INT64})}, 
+ {"log", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"log_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"log_softmax", XPUKernelSet({phi::DataType::FLOAT32})}, + {"log_softmax_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"logical_and", XPUKernelSet({phi::DataType::BOOL})}, + {"logical_not", XPUKernelSet({phi::DataType::BOOL})}, + {"logical_or", XPUKernelSet({phi::DataType::BOOL})}, + {"logical_xor", XPUKernelSet({phi::DataType::BOOL})}, + {"lookup_table_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"lookup_table_v2", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"masked_select", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"masked_select_grad", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::BOOL, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"max_pool2d_with_index", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"max_pool2d_with_index_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"matmul_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"matmul_v2_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"matmul_v2", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"matmul", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"mean_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"mean", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"merged_momentum", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"mish_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"mish", XPUKernelSet({phi::DataType::FLOAT32})}, + {"momentum", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"mul", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"mul_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"multiply", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"multi_encoder_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"multiclass_nms3", XPUKernelSet({phi::DataType::FLOAT32})}, + {"nearest_interp_v2", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT64})}, + {"nearest_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"nll_loss", XPUKernelSet({phi::DataType::FLOAT32})}, + {"nll_loss_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"norm", XPUKernelSet({phi::DataType::FLOAT32})}, + {"not_equal", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"one_hot", XPUKernelSet({phi::DataType::INT32, phi::DataType::INT64})}, + {"one_hot_v2", + XPUKernelSet({phi::DataType::INT32, phi::DataType::INT64})}, + {"p_norm", XPUKernelSet({phi::DataType::FLOAT32})}, + {"p_norm_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"pad3d_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"pad3d", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"pad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT16, + phi::DataType::FLOAT16})}, + {"pad_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT16, + phi::DataType::FLOAT16})}, + {"pixel_shuffle", 
XPUKernelSet({phi::DataType::FLOAT32})}, + {"pixel_shuffle_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"pool2d_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"pool2d", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"pool3d_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"pool3d", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"pow", XPUKernelSet({phi::DataType::FLOAT32})}, + {"pow_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"pow2_decay_with_linear_warmup", XPUKernelSet({phi::DataType::FLOAT32})}, + {"prior_box", XPUKernelSet({phi::DataType::FLOAT32})}, + {"prelu_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"prod_raw", XPUKernelSet({phi::DataType::FLOAT32})}, + {"range", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT64, + phi::DataType::INT32})}, + {"randperm", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT32, + phi::DataType::FLOAT64})}, + {"reciprocal", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reciprocal_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"reduce_any", XPUKernelSet({phi::DataType::BOOL})}, + {"reduce_max_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_max", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"reduce_mean_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_mean", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"reduce_min_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_min", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_prod", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_sum_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_sum", + XPUKernelSet({phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT8, + phi::DataType::FLOAT32})}, + {"relu6", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"relu6_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"relu_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"relu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"reshape2_grad", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::FLOAT32})}, + {"reshape2", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::FLOAT32})}, + {"reshape", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::FLOAT32})}, + {"resnet_unit", + XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, + {"resnet_unit_grad", + XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, + {"rnn_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"roi_align", XPUKernelSet({phi::DataType::FLOAT32})}, + {"roi_align_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"roll", XPUKernelSet({phi::DataType::FLOAT32})}, + {"roll_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"scale", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32})}, + {"scatter", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT32})}, + {"scatter_grad", + 
XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, + {"scatter_nd_add", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"sampling_id", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT64})}, + {"set_value", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT16, + phi::DataType::BOOL})}, + {"set_value_with_tensor", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::BOOL})}, + {"set_value_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT16})}, + {"sgd_dense_param_sparse_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"silu_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"silu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"size", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT16, + phi::DataType::BOOL, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"sigmoid_cross_entropy_with_logits_grad", + XPUKernelSet({phi::DataType::FLOAT32})}, + {"sigmoid_cross_entropy_with_logits", + XPUKernelSet({phi::DataType::FLOAT32})}, + {"shape", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT16})}, + {"sigmoid", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"sigmoid_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"sign", XPUKernelSet({phi::DataType::FLOAT32})}, + {"slice_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT32})}, + {"slice", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"softmax", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"softmax_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"fused_softmax_mask_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"softmax_with_cross_entropy_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"softmax_with_cross_entropy", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"softplus", XPUKernelSet({phi::DataType::FLOAT32})}, + {"softplus_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"sparse_coo_tensor", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::UINT8, + phi::DataType::INT16, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16})}, + {"split", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"split_with_num", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"sqrt", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"sqrt_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"square_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"square", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"squeeze2_grad", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16, + phi::DataType::FLOAT32})}, + {"squeeze2", + XPUKernelSet({phi::DataType::FLOAT64, + 
phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16, + phi::DataType::FLOAT32})}, + {"squeeze", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16, + phi::DataType::FLOAT32})}, + {"squeeze_grad", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16, + phi::DataType::FLOAT32})}, + {"stack", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT16})}, + {"stack_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32})}, + {"strided_slice", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT16, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"strided_slice_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT16, + phi::DataType::INT32})}, + {"sum", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"swish", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"swish_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"take_along_axis", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"tanh_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"tanh", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"temporal_shift", XPUKernelSet({phi::DataType::FLOAT32})}, + {"temporal_shift_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"transfer_dtype", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT64, + phi::DataType::BOOL, + phi::DataType::UINT8, + phi::DataType::INT8, + phi::DataType::INT64, + phi::DataType::INT32})}, + {"tril_triu", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::FLOAT16})}, + {"tril", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::FLOAT16})}, + {"triu", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::FLOAT16})}, + {"tril_triu_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::FLOAT16})}, + {"tril_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::FLOAT16})}, + {"triu_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::FLOAT16})}, + {"tile", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::BOOL, + phi::DataType::FLOAT64, + phi::DataType::FLOAT32})}, + {"tile_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"transpose2_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL})}, + {"transpose2", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL})}, + {"transpose_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL})}, + {"transpose", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL})}, + 
{"truncated_gaussian_random", XPUKernelSet({phi::DataType::FLOAT32})}, + {"top_k", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"top_k_v2", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"update_loss_scaling", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"unbind", XPUKernelSet({phi::DataType::FLOAT32})}, + {"uniform_random", XPUKernelSet({phi::DataType::FLOAT32})}, + {"unique", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"unsqueeze2_grad", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, + {"unsqueeze2", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, + {"unsqueeze_grad", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, + {"unsqueeze", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, + {"unstack", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"unstack_grad", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"warpctc_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"warpctc", XPUKernelSet({phi::DataType::FLOAT32})}, + {"where_index", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::FLOAT32})}, + {"where_grad", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"where", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16})}, + {"sin", XPUKernelSet({phi::DataType::FLOAT32})}, + {"sin_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"cos", XPUKernelSet({phi::DataType::FLOAT32})}, + {"cos_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"linspace", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"randint", XPUKernelSet({phi::DataType::INT32, phi::DataType::INT64})}, + {"group_norm", XPUKernelSet({phi::DataType::FLOAT32})}, + {"group_norm_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"meshgrid", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"expand_v2_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32})}, + {"isnan_v2", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"yolo_box_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + + // AddMore + {"sequence_conv", XPUKernelSet({phi::DataType::FLOAT32})}, + {"sequence_conv_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"sequence_unpad", XPUKernelSet({phi::DataType::FLOAT32})}, + // Fused op + {"resnet_basic_block_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"resnet_basic_block", 
XPUKernelSet({phi::DataType::FLOAT32})}, + {"fused_gemm_epilogue", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"fused_gemm_epilogue_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"fused_attention", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"fused_attention_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"fused_feedforward", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"fused_feedforward_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"lod_reset", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT64, + phi::DataType::INT32, + phi::DataType::INT64})}, + }; + + return s_xpu3_kernels; +} + +} // namespace xpu +} // namespace backends +} // namespace phi +#endif diff --git a/paddle/phi/backends/xpu/xpu_info.cc b/paddle/phi/backends/xpu/xpu_info.cc index ee51b19f482bff..96ff4cc2c81abb 100644 --- a/paddle/phi/backends/xpu/xpu_info.cc +++ b/paddle/phi/backends/xpu/xpu_info.cc @@ -195,9 +195,12 @@ XPUVersion get_xpu_version(int dev_id) { if (v == K100 || v == K200) { VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n"; return XPU1; - } else { + } else if (v < KL3_BEGIN) { VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n"; return XPU2; + } else { + VLOG(1) << "KUNLUN device " << dev_id << " is XPU3\n"; + return XPU3; } } @@ -211,6 +214,9 @@ int get_xpu_max_ptr_size(int dev_id) { case XPUVersion::XPU2: max_ptr_size = 6; break; + case XPUVersion::XPU3: + max_ptr_size = 12; + break; default: PADDLE_THROW(phi::errors::InvalidArgument( "Only support get max ptr size of XPU1 or XPU2.")); diff --git a/paddle/phi/backends/xpu/xpu_info.h b/paddle/phi/backends/xpu/xpu_info.h index b4fbdec7a93613..ad5a0b9745832d 100644 --- a/paddle/phi/backends/xpu/xpu_info.h +++ b/paddle/phi/backends/xpu/xpu_info.h @@ -92,7 +92,7 @@ class XPUDeviceGuard { int prev_id_{-1}; }; -enum XPUVersion { XPU1, XPU2 }; +enum XPUVersion { XPU1, XPU2, XPU3 }; XPUVersion get_xpu_version(int dev_id); int get_xpu_max_ptr_size(int dev_id); diff --git a/paddle/phi/backends/xpu/xpu_op_list.h b/paddle/phi/backends/xpu/xpu_op_list.h index 975a5d02b16b2b..1635ed2e6e8660 100644 --- a/paddle/phi/backends/xpu/xpu_op_list.h +++ b/paddle/phi/backends/xpu/xpu_op_list.h @@ -25,6 +25,7 @@ using XPUOpMap = std::unordered_map; XPUOpMap& get_kl1_ops(); XPUOpMap& get_kl2_ops(); +XPUOpMap& get_kl3_ops(); #ifdef PADDLE_WITH_XPU_KP bool is_xpu_kp_support_op(const std::string& fluid_op_name, diff --git a/paddle/phi/core/ddim.h b/paddle/phi/core/ddim.h index 57ad4d09ef463d..be11b4c9596cd9 100644 --- a/paddle/phi/core/ddim.h +++ b/paddle/phi/core/ddim.h @@ -227,7 +227,7 @@ std::vector vectorize(const DDim& ddim) { return result; } -int64_t product(const DDim& ddim); +TEST_API int64_t product(const DDim& ddim); bool contain_unknown_dim(const DDim& ddim); diff --git a/paddle/phi/core/distributed/auto_parallel/dist_attr.cc b/paddle/phi/core/distributed/auto_parallel/dist_attr.cc index 46e58cc9b373e2..3c95f2c3ff66f1 100644 --- a/paddle/phi/core/distributed/auto_parallel/dist_attr.cc +++ b/paddle/phi/core/distributed/auto_parallel/dist_attr.cc @@ -349,7 +349,8 @@ std::string TensorDistAttr::partial_status_string() const { } bool TensorDistAttr::empty() const { - return process_mesh_.empty() || dims_mapping_.empty(); + // dims_mapping is empty when the tensor is 0-dim, but it is still valid.
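+ // (a 0-dim tensor has no axes to map, so only the process mesh decides + // whether the dist attr is empty)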
+ return process_mesh_.empty(); } std::vector> TensorDistAttr::to_placement() diff --git a/paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.cc b/paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.cc index dc5d6c20e62b33..1e3164de818659 100644 --- a/paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.cc +++ b/paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.cc @@ -22,11 +22,11 @@ namespace distributed { phi::DDim DistMetaTensor::dims() const { // member values in tensor_ have higher priority than those in DistMetaTensor if (tensor_ != nullptr) { - PADDLE_ENFORCE_EQ(this->is_dist(), - true, - phi::errors::InvalidArgument( - "The current MetaTensor doesn't contains " - "DistTensor when call `dist_attr` method.")); + PADDLE_ENFORCE_EQ( + this->is_dist(), + true, + phi::errors::InvalidArgument("The current MetaTensor doesn't contain " + "DistTensor when calling `dims` method.")); return MetaTensor::dims(); } else { return dims_; diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc index 94d611e8043aa0..8e3e6405f4d29a 100644 --- a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc +++ b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc @@ -33,35 +33,45 @@ inline void check_defined(const DistTensor& dist_tensor, method_hint)); } -DistTensor::DistTensor(const phi::DenseTensor& global_value, +DistTensor::DistTensor() : value_(std::make_shared()) {} + +DistTensor::DistTensor(const std::shared_ptr& global_value, const TensorDistAttr& dist_attr) - : dims_(global_value.dims()), dist_attr_(dist_attr), value_(global_value) { - // TODO(liyurui): This is a temporary solution. We need to support only infer - // meta when the input dense_tensor is empty. - // Support the value in DistTensor only has DenseTensor meta - // but without actual data. So we can visit its meta attr even if it is - // undefined. + : dims_(global_value->dims()), + dist_attr_(dist_attr), + value_(std::make_shared()) { + // If the current rank is not in process_mesh, we should create an + // uninitialized tensor only with tensor_meta. if (IsCurRankInMesh(dist_attr.process_mesh())) { - if (value_.initialized() && !dist_attr.is_replicated()) { + if (!dist_attr.is_replicated()) { // 1. create replicated global tensor - int64_t dims_size = global_value.dims().size(); - std::vector dims_mapping(dims_size, -1); - dist_attr_.set_dims_mapping(dims_mapping); - if (dist_attr_.is_partial()) { - dist_attr_.clean_partial_status(); - } - dist_attr_.set_dims_mapping(dims_mapping); + TensorDistAttr replicated_dist_attr(vectorize(global_value->dims())); + replicated_dist_attr.set_process_mesh(dist_attr.process_mesh()); + DistTensor replicated_tensor(global_value, replicated_dist_attr); // 2. reshard from replicated to other state - auto* func = ChooseProperReshardFunction(*this, dist_attr); - auto* dev_ctx = DeviceContextPool::Instance().Get(global_value.place()); - func->Eval(dev_ctx, *this, dist_attr, this); + auto* func = ChooseProperReshardFunction(replicated_tensor, dist_attr); + auto* dev_ctx = DeviceContextPool::Instance().Get(global_value->place()); + func->Eval(dev_ctx, replicated_tensor, dist_attr, this); + } else { + value_ = global_value; + } + } else { + // TODO(liyurui): The following logic is illegal, and should be removed + // later. It exists temporarily because the basic execution procedure is not + // ready; sometimes we even try to construct a DistTensor with an empty + // DistAttr.
Here we just warn when the DistAttr is empty, for debug use. + if (dist_attr.empty()) { + LOG(WARNING) << "Try to construct a dist tensor with empty dist attr."; } + value_ = global_value; } } DistTensor::DistTensor(const DDim& dims, const TensorDistAttr& dist_attr) - : dims_(dims), dist_attr_(dist_attr) {} + : dims_(dims), + dist_attr_(dist_attr), + value_(std::make_shared()) {} void DistTensor::unsafe_set_dims(const DDim& dims) { if (this->initialized()) { @@ -80,39 +90,42 @@ void DistTensor::unsafe_set_dist_attr(const TensorDistAttr& dist_attr) { } int64_t DistTensor::numel() const { - check_defined(*this, "numel"); - return value_.numel(); + // DistTensor with uninitialized local tensor can + // also have numel. + return product(dims_); } const DDim& DistTensor::local_dims() const { check_defined(*this, "local_dims"); - return value_.dims(); + return value_->dims(); } bool DistTensor::valid() const { check_defined(*this, "valid"); - return value_.valid(); + return value_->valid(); } -bool DistTensor::defined() const { return value_.holder_ != nullptr; } +bool DistTensor::defined() const { return value_->holder_ != nullptr; } bool DistTensor::initialized() const { - return value_.holder_ != nullptr && value_.holder_->ptr(); + return value_->holder_ != nullptr && value_->holder_->ptr(); } DataType DistTensor::dtype() const { - check_defined(*this, "dtype"); - return value_.dtype(); + // DistTensor with uninitialized local tensor can + // also have dtype. + return value_->dtype(); } DataLayout DistTensor::layout() const { - check_defined(*this, "layout"); - return value_.layout(); + // DistTensor with uninitialized local tensor can + // also have layout. + return value_->layout(); } const Place& DistTensor::place() const { check_defined(*this, "place"); - return value_.holder_->place(); + return value_->holder_->place(); } void* DistTensor::AllocateFrom(Allocator* allocator, diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.h b/paddle/phi/core/distributed/auto_parallel/dist_tensor.h index c965733a7e0e8e..9e93ccf70c70b8 100644 --- a/paddle/phi/core/distributed/auto_parallel/dist_tensor.h +++ b/paddle/phi/core/distributed/auto_parallel/dist_tensor.h @@ -30,12 +30,12 @@ class DistTensor final /// \brief Be careful when creating a dist tensor with the default constructor. /// This should only be used in reshard for now, and the dist properties /// will be set by reshard later. - DistTensor() = default; + DistTensor(); /// \brief Construct a dist tensor based on a dense tensor. /// \param global_value The global dense tensor of the current tensor. /// \param dist_attr The distributed attributes of the current tensor. - DistTensor(const phi::DenseTensor& global_value, + DistTensor(const std::shared_ptr& global_value, const TensorDistAttr& dist_attr); /// \brief Construct an empty dist tensor (for infer spmd) @@ -68,7 +68,7 @@ class DistTensor final /// \brief Returns the dense tensor value's const reference in dist tensor. /// \return The DenseTensor value's const reference - const DenseTensor& value() const { return value_; } + const DenseTensor& value() const { return *value_; } /// \brief Returns the mutable dense tensor value in dist tensor. /// \note If DenseTensor value is modified externally, the corresponding /// dist attr or global dims may become inconsistent with it, /// so you need to make sure to consider it thoroughly when using /// this method.
/// \return The mutable pointer of DenseTensor value - DenseTensor* unsafe_mutable_value() { return &value_; } + DenseTensor* unsafe_mutable_value() { return value_.get(); } /// \brief Returns the global dims of the dist tensor. /// \return The global dims of the dist tensor. @@ -126,7 +126,7 @@ class DistTensor final // The distributed attributes TensorDistAttr dist_attr_; // The local DenseTensor value - DenseTensor value_; + std::shared_ptr value_; }; } // namespace distributed diff --git a/paddle/phi/core/distributed/auto_parallel/nd_mesh_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/nd_mesh_reshard_function.cc index fbeddae16dc408..9d5d8f43f76708 100644 --- a/paddle/phi/core/distributed/auto_parallel/nd_mesh_reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/nd_mesh_reshard_function.cc @@ -38,10 +38,10 @@ ProcessMesh GetSubProcessMesh(const ProcessMesh& mesh, int64_t axis) { for (int64_t i = 0; i < shape_of_axis; ++i) { coord[axis] = i; int64_t rank = coord.back(); - for (int64_t j = coord.size() - 2; j >= 0; --j) { + for (int64_t j = static_cast(coord.size() - 2); j >= 0; --j) { rank += coord[j] * mesh.dim_size(j + 1); } - process_ids.emplace_back(rank); + process_ids.emplace_back(mesh.process_ids()[rank]); } ProcessMesh out_mesh(shape, process_ids, dim_names); @@ -58,7 +58,8 @@ int64_t FindFirstDiffShardAxis(const TensorDistAttr& in_dist_attr, const auto& out_dims_mapping = out_dist_attr.dims_mapping(); int64_t axis = -1; - for (int64_t i = in_dims_mapping.size() - 1; i >= 0; --i) { + for (int64_t i = static_cast(in_dims_mapping.size() - 1); i >= 0; + --i) { if (in_dims_mapping[i] != out_dims_mapping[i]) { axis = i; break; @@ -87,18 +88,24 @@ void SameNdMeshReshardFunction::Eval(phi::DeviceContext* dev_ctx, const DistTensor& in, const TensorDistAttr& out_dist_attr, DistTensor* out) { + VLOG(3) << "Call SameNdMeshReshardFunction Eval"; const auto& in_dist_attr = in.dist_attr(); const auto& process_mesh = out_dist_attr.process_mesh(); int64_t first_diff_axis = FindFirstDiffShardAxis(in_dist_attr, out_dist_attr); + // Back up out_dist_attr to avoid overwriting out's dist attr + auto out_dist_attr_orig = out_dist_attr; + SetValue(out, in.value()); SetDistProps(out, in.dims(), in_dist_attr); // 1. change all the partial status to replicated status if needed if (in_dist_attr.is_partial()) { - const auto& in_partial_status = in_dist_attr.partial_status(); - const auto& out_partial_status = out_dist_attr.partial_status(); + // Copy in_dist_attr.partial_status to avoid overwriting the input's + // value when the output and the input are the same tensor + const auto in_partial_status = in_dist_attr.partial_status(); + const auto& out_partial_status = out_dist_attr_orig.partial_status(); for (const auto& kv : in_partial_status) { if (out_partial_status.count(kv.first) != 0) { continue; @@ -173,9 +180,9 @@ void SameNdMeshReshardFunction::Eval(phi::DeviceContext* dev_ctx, } // 3. Change replicated to partial - if (out_dist_attr.is_partial()) { + if (out_dist_attr_orig.is_partial()) { const auto& in_partial_status = out->dist_attr().partial_status(); - const auto& out_partial_status = out_dist_attr.partial_status(); + const auto& out_partial_status = out_dist_attr_orig.partial_status(); for (const auto& kv : out_partial_status) { if (in_partial_status.count(kv.first) != 0) { continue; @@ -211,7 +218,7 @@ void SameNdMeshReshardFunction::Eval(phi::DeviceContext* dev_ctx, // 4.
Change replicated to shard for (int64_t i = first_diff_axis; i >= 0; --i) { - int64_t out_mesh_axis = out_dist_attr.dims_mapping()[i]; + int64_t out_mesh_axis = out_dist_attr_orig.dims_mapping()[i]; if (out_mesh_axis != -1) { VLOG(3) << "Step4: out_mesh axis " << out_mesh_axis; // 4.1 Calculate the dist_attr after this transform diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard_function.cc index b8e355e689caea..01824dd93bca19 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard_function.cc @@ -30,7 +30,7 @@ std::shared_ptr ReshardFunction::Eval( } void ReshardFunction::SetValue(DistTensor* tensor, const DenseTensor& value) { - tensor->value_ = value; + tensor->value_ = std::make_shared(value); } void ReshardFunction::SetDistProps(DistTensor* tensor, @@ -56,7 +56,7 @@ void ReshardFunction::SetDistProps(DistTensor* tensor, } DenseTensor* ReshardFunction::GetMutableTensor(DistTensor* tensor) { - return &tensor->value_; + return tensor->value_.get(); } ReshardFunction* ChooseProperReshardFunction( diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc b/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc index 60c9cbdda3b676..ce52f0a203680f 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc @@ -95,11 +95,7 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx, } else if (phi::CustomContext::classof(&dev_ctx)) { #ifdef PADDLE_WITH_CUSTOM_DEVICE CommContextManager::CreateXCCLCommContext( - store, - unique_comm_key, - dev_ctx.GetPlace().GetDeviceType(), - rank, - world_size); + store, unique_comm_key, dev_ctx.GetPlace(), rank, world_size); #endif } else { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -146,5 +142,14 @@ bool IsCurRankInMesh(const ProcessMesh& process_mesh) { process_ids.end()); } +Place GetDefaultPlace() { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (phi::backends::gpu::GetGPUDeviceCount() > 0) { + return paddle::DefaultGPUPlace(); + } +#endif + return paddle::CPUPlace(); +} + } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h index 652840976194f8..a40b62c182f318 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h +++ b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h @@ -32,6 +32,8 @@ class ProcessMesh; bool IsCurRankInMesh(const ProcessMesh& process_mesh); +Place GetDefaultPlace(); + int64_t GetLocalRankInParticipate(const std::vector& process_ids, int64_t global_rank = -1); diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc index db8a26088ae457..361c83d64a007f 100644 --- a/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc @@ -30,7 +30,8 @@ bool SToRReshardFunction::IsSuitable(const DistTensor& in, const TensorDistAttr& out_dist_attr) { bool flag = true; const auto& in_dist_attr = in.dist_attr(); - const auto& in_dims_mapping = in_dist_attr.dims_mapping(); + // TODO(GhostScreaming): Fix problems of using uninitialized DistTensor's + // local_dims const auto& in_dims_mapping = in_dist_attr.dims_mapping(); flag &=
in_dist_attr.is_shard(); flag &= out_dist_attr.is_replicated(); @@ -42,12 +43,13 @@ bool SToRReshardFunction::IsSuitable(const DistTensor& in, flag &= (out_process_mesh.ndim() == 1); flag &= (in_process_mesh == out_process_mesh); - // Ensure the tensor is balanced split, or we need send/recv rather than - // all_gather - int split_axis = GetSplitAxisWithDimsMapping(in_dims_mapping).begin()->first; - int64_t num_of_process = in_process_mesh.size(); - flag &= (in.local_dims()[static_cast(split_axis)] * num_of_process == - in.dims()[static_cast(split_axis)]); + // TODO(GhostScreaming): Fix problems of using uninitialized DistTensor's + // local_dims Ensure the tensor is balanced split, or we need send/recv rather + // than all_gather int split_axis = + // GetSplitAxisWithDimsMapping(in_dims_mapping).begin()->first; int64_t + // num_of_process = in_process_mesh.size(); flag &= + // (in.local_dims()[static_cast(split_axis)] * num_of_process == + // in.dims()[static_cast(split_axis)]); return flag; } diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc index 29aa1256e01937..3aafe1dc7fbeea 100644 --- a/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc @@ -53,7 +53,7 @@ void SToSReshardFunction::Eval(phi::DeviceContext* dev_ctx, const auto& in_process_ids = in_process_mesh.process_ids(); auto dtype = in.dtype(); const auto& logical_ddim = in.dims(); - int64_t nranks = in_process_ids.size(); + int64_t nranks = static_cast(in_process_ids.size()); int in_split_axis = GetSplitAxisWithDimsMapping(in.dist_attr().dims_mapping()).begin()->first; int out_split_axis = diff --git a/paddle/phi/core/distributed/auto_parallel/same_status_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/same_status_reshard_function.cc index a6f49268c5612f..e7aed9ae788b04 100644 --- a/paddle/phi/core/distributed/auto_parallel/same_status_reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/same_status_reshard_function.cc @@ -64,6 +64,7 @@ void SameStatusReshardFunction::Eval(phi::DeviceContext* dev_ctx, const DistTensor& in, const TensorDistAttr& out_dist_attr, DistTensor* out) { + VLOG(3) << "Call SameStatusReshardFunction Eval"; const auto& in_dist_attr = in.dist_attr(); const auto& in_process_mesh = in_dist_attr.process_mesh(); const auto& in_process_ids = in_process_mesh.process_ids(); @@ -79,6 +80,19 @@ void SameStatusReshardFunction::Eval(phi::DeviceContext* dev_ctx, // kernel execution. bool dynamic_shape = true; + // TODO(GhostScreaming): After cross-mesh reshard, the current device may + need to execute the next layer. When it constructs the next layer's backward + graph, out->place() will be called, such as in the SetGradOutMeta method. As + a result, out can't be undefined. Try to allocate a zero-memory value + for out; the following send/recv will then overwrite this empty + DenseTensor.
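+ // Note: the placeholder below pairs a null, zero-byte Allocation with the + // input tensor's meta, so meta queries (dims, dtype, place) succeed without + // touching device memory; a recv on this rank later fills in the real data.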
+ VLOG(3) << "Same_status_reshard_function creates an empty DenseTensor for " "cross-mesh DistTensor."; + *(out->unsafe_mutable_value()) = + phi::DenseTensor(std::make_shared( + nullptr, 0, phi::distributed::GetDefaultPlace()), + in.value().meta()); + std::vector> p2p_pair; for (size_t i = 0; i < out_process_ids.size(); ++i) { p2p_pair.emplace_back( @@ -89,8 +103,8 @@ for (const auto& iter : p2p_pair) { int64_t src = iter.first; int64_t dst = iter.second; - VLOG(3) << "Send/Recv from src " << src << " to dst " << dst; if (src == cur_global_rank) { + VLOG(3) << "Send from src " << src << " to dst " << dst; int64_t dst_local_rank = GetLocalRankInParticipate(all_process_ids, dst); // Since the send kernel only has an input, we don't actually need to run // infermeta. For this reason, just use the kernel directly. @@ -102,6 +116,7 @@ void SameStatusReshardFunction::Eval(phi::DeviceContext* dev_ctx, dst_local_rank, dynamic_shape); } else if (dst == cur_global_rank) { + VLOG(3) << "Recv from src " << src << " to dst " << dst; int64_t src_local_rank = GetLocalRankInParticipate(all_process_ids, src); RESHARD_FUNCTOR_WITH_COMM(dev_ctx, PRecv, diff --git a/paddle/phi/core/distributed/comm_context_manager.cc b/paddle/phi/core/distributed/comm_context_manager.cc index 342a86313bf3fb..338ee4b4bad177 100644 --- a/paddle/phi/core/distributed/comm_context_manager.cc +++ b/paddle/phi/core/distributed/comm_context_manager.cc @@ -130,15 +130,19 @@ void CommContextManager::CreateGlooCommContext( void CommContextManager::CreateXCCLCommContext( const std::shared_ptr& store, const std::string& unique_comm_key, - const std::string& device_type, + const phi::Place& place, int rank, - int size) { + int size, + const std::string& hash_key) { phi::ccl::CCLRootId xccl_root_id; if (rank == 0) { - phi::DeviceManager::CCLGetUniqueId(device_type, &xccl_root_id); + phi::DeviceManager::CCLGetUniqueId(place.GetDeviceType(), &xccl_root_id); } std::string unique_key = "XCCLCommContext/" + unique_comm_key; + if (!hash_key.empty()) { + unique_key += "/" + hash_key; + } if (rank == 0) { store->set(unique_key, xccl_root_id); } else { @@ -148,7 +152,7 @@ void CommContextManager::CreateXCCLCommContext( << ", unique_comm_key: " << unique_comm_key << ", xccl uniqueid: " << phi::ccl::SerializeXCCLUniqueId(xccl_root_id); auto xccl_comm_context = - std::make_unique(device_type, rank, size, xccl_root_id); + std::make_unique(place, rank, size, xccl_root_id); auto& comm_context_manager = CommContextManager::GetInstance(); comm_context_manager.SetStore(store); comm_context_manager.Emplace(unique_comm_key, std::move(xccl_comm_context)); diff --git a/paddle/phi/core/distributed/comm_context_manager.h b/paddle/phi/core/distributed/comm_context_manager.h index dcbfaab55af903..69e58a96e18e1a 100644 --- a/paddle/phi/core/distributed/comm_context_manager.h +++ b/paddle/phi/core/distributed/comm_context_manager.h @@ -19,6 +19,7 @@ #include #include +#include "paddle/phi/common/place.h" #include "paddle/phi/core/distributed/comm_context.h" #include "paddle/phi/core/macros.h" @@ -74,9 +75,10 @@ class CommContextManager { #ifdef PADDLE_WITH_CUSTOM_DEVICE static void CreateXCCLCommContext(const std::shared_ptr& store, const std::string& unique_comm_key, - const std::string& device_type, + const phi::Place& place, int rank, - int size); + int size, + const std::string& hash_key = ""); #endif private: diff --git a/paddle/phi/core/distributed/store/tcp_store.cc
b/paddle/phi/core/distributed/store/tcp_store.cc index 9650d051f98fbe..6fbe2aa6761e2c 100644 --- a/paddle/phi/core/distributed/store/tcp_store.cc +++ b/paddle/phi/core/distributed/store/tcp_store.cc @@ -421,7 +421,7 @@ std::vector TCPStore::get(const std::string& key) { } void TCPStore::wait(const std::string& key) { - ReplyType reply; + ReplyType reply; // NOLINT VLOG(7) << "TCPStore wait."; _client->send_command_for_key(Command::WAIT, _key_prefix + key); reply = _client->receive_value(); diff --git a/paddle/phi/core/distributed/store/tcp_utils.cc b/paddle/phi/core/distributed/store/tcp_utils.cc index aaf00cb8000853..64c5424928b9ff 100644 --- a/paddle/phi/core/distributed/store/tcp_utils.cc +++ b/paddle/phi/core/distributed/store/tcp_utils.cc @@ -44,7 +44,7 @@ ::addrinfo* get_addr_info(const std::string host, const std::string port, int ai_flags, int family) { - ::addrinfo hints{}, *res; + ::addrinfo hints{}, *res = nullptr; hints.ai_flags = ai_flags; hints.ai_family = family; hints.ai_socktype = SOCK_STREAM; @@ -52,7 +52,7 @@ ::addrinfo* get_addr_info(const std::string host, const char* node = host.empty() ? nullptr : host.c_str(); const char* port_cstr = port.empty() ? nullptr : port.c_str(); - int n; + int n = 0; n = ::getaddrinfo(node, port_cstr, &hints, &res); const char* gai_err = ::gai_strerror(n); const char* proto = (family == AF_INET ? "IPv4" @@ -216,7 +216,7 @@ void send_string(SocketType socket, const std::string& s) { } std::string receive_string(SocketType socket) { - std::string::size_type size; + std::string::size_type size = 0; receive_bytes(socket, &size, 1); std::vector v(size); receive_bytes(socket, v.data(), size); diff --git a/paddle/phi/core/distributed/xccl_comm_context.cc b/paddle/phi/core/distributed/xccl_comm_context.cc index 6342ff27a48722..5c82e7baf0e82f 100644 --- a/paddle/phi/core/distributed/xccl_comm_context.cc +++ b/paddle/phi/core/distributed/xccl_comm_context.cc @@ -25,17 +25,19 @@ namespace phi { namespace distributed { -XCCLCommContext::XCCLCommContext(const std::string& device_type, +XCCLCommContext::XCCLCommContext(const phi::Place& place, int rank, int size, const ccl::CCLRootId& xccl_id) : CommContext(rank, size) { - device_type_ = device_type; - phi::DeviceManager::CCLCommInitRank(device_type, + place_ = place; + phi::DeviceManager::CCLCommInitRank(place_.GetDeviceType(), size_, const_cast(&xccl_id), rank, &xccl_comm_); + stream_ = std::make_shared(); + stream_->Init(place_); } void XCCLCommContext::Broadcast(phi::DenseTensor* out_tensor, @@ -49,7 +51,7 @@ void XCCLCommContext::Broadcast(phi::DenseTensor* out_tensor, size_, phi::AllocationType::CUSTOM); if (rank_ == root) { - phi::DeviceManager::CCLBroadcast(device_type_, + phi::DeviceManager::CCLBroadcast(place_.GetDeviceType(), const_cast(in_tensor.data()), in_tensor.numel(), phi::ccl::ToCCLDataType(in_tensor.dtype()), @@ -57,7 +59,7 @@ void XCCLCommContext::Broadcast(phi::DenseTensor* out_tensor, xccl_comm_, stream); } else { - phi::DeviceManager::CCLBroadcast(device_type_, + phi::DeviceManager::CCLBroadcast(place_.GetDeviceType(), out_tensor->data(), out_tensor->numel(), phi::ccl::ToCCLDataType(in_tensor.dtype()), @@ -77,7 +79,7 @@ void XCCLCommContext::AllGather(phi::DenseTensor* out_tensor, /*cur_rank*/ rank_, size_, phi::AllocationType::CUSTOM); - phi::DeviceManager::CCLAllGather(device_type_, + phi::DeviceManager::CCLAllGather(place_.GetDeviceType(), const_cast(in_tensor.data()), out_tensor->data(), in_tensor.numel(), @@ -97,7 +99,7 @@ void XCCLCommContext::ReduceScatter(phi::DenseTensor* 
out_tensor, size_, phi::AllocationType::CUSTOM); phi::DeviceManager::CCLReduceScatter( - device_type_, + place_.GetDeviceType(), const_cast(in_tensor.data()), out_tensor->data(), out_tensor->numel(), @@ -113,7 +115,7 @@ void XCCLCommContext::Send(const phi::DenseTensor& in_tensor, const phi::stream::Stream& stream) const { phi::distributed::CommStaticCheck::CheckShape( in_tensor, rank_, size_, phi::AllocationType::CUSTOM); - phi::DeviceManager::CCLSend(device_type_, + phi::DeviceManager::CCLSend(place_.GetDeviceType(), const_cast(in_tensor.data()), count, phi::ccl::ToCCLDataType(in_tensor.type()), @@ -130,7 +132,7 @@ void XCCLCommContext::Recv(phi::DenseTensor* out_tensor, const phi::stream::Stream& stream) const { phi::distributed::CommStaticCheck::CheckShape( *out_tensor, rank_, size_, phi::AllocationType::CUSTOM); - phi::DeviceManager::CCLRecv(device_type_, + phi::DeviceManager::CCLRecv(place_.GetDeviceType(), out_tensor->data(), count, phi::ccl::ToCCLDataType(out_tensor->type()), @@ -151,7 +153,7 @@ void XCCLCommContext::AllReduce(phi::DenseTensor* out_tensor, /*cur_rank*/ rank_, size_, phi::AllocationType::CUSTOM); - phi::DeviceManager::CCLAllReduce(device_type_, + phi::DeviceManager::CCLAllReduce(place_.GetDeviceType(), const_cast(in_tensor.data()), out_tensor->data(), in_tensor.numel(), @@ -172,7 +174,7 @@ void XCCLCommContext::Reduce(phi::DenseTensor* out_tensor, /*cur_rank*/ rank_, size_, phi::AllocationType::CUSTOM); - phi::DeviceManager::CCLReduce(device_type_, + phi::DeviceManager::CCLReduce(place_.GetDeviceType(), const_cast(in_tensor.data()), out_tensor->data(), in_tensor.numel(), @@ -184,10 +186,10 @@ void XCCLCommContext::Reduce(phi::DenseTensor* out_tensor, } void XCCLCommContext::GroupStart() const { - phi::DeviceManager::CCLGroupStart(device_type_); + phi::DeviceManager::CCLGroupStart(place_.GetDeviceType()); } void XCCLCommContext::GroupEnd() const { - phi::DeviceManager::CCLGroupEnd(device_type_); + phi::DeviceManager::CCLGroupEnd(place_.GetDeviceType()); } } // namespace distributed diff --git a/paddle/phi/core/distributed/xccl_comm_context.h b/paddle/phi/core/distributed/xccl_comm_context.h index f5a51ab332640f..86f8dfc76a1eb3 100644 --- a/paddle/phi/core/distributed/xccl_comm_context.h +++ b/paddle/phi/core/distributed/xccl_comm_context.h @@ -24,14 +24,16 @@ namespace distributed { class XCCLCommContext final : public CommContext { public: - XCCLCommContext(const std::string& device_type, + XCCLCommContext(const phi::Place& place, int rank, int size, const ccl::CCLRootId& xccl_id); ccl::CCLComm GetXcclComm() const { return xccl_comm_; } - const std::string& GetDeviceType() const { return device_type_; } + std::shared_ptr GetStream() const { return stream_; } + + std::string GetDeviceType() const { return place_.GetDeviceType(); } void Broadcast(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, @@ -75,8 +77,9 @@ class XCCLCommContext final : public CommContext { private: DISABLE_COPY_AND_ASSIGN(XCCLCommContext); - std::string device_type_; + phi::Place place_; ccl::CCLComm xccl_comm_; + std::shared_ptr stream_; }; } // namespace distributed diff --git a/paddle/phi/core/extended_tensor.h b/paddle/phi/core/extended_tensor.h index d02dbabde179fe..73cae43c0b54c0 100644 --- a/paddle/phi/core/extended_tensor.h +++ b/paddle/phi/core/extended_tensor.h @@ -18,12 +18,14 @@ limitations under the License. 
*/ #include "paddle/phi/core/allocator.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/utils/test_macros.h" + namespace phi { /// \brief The ExtendedTensor is an interface for custom designed class. /// If you want to pass some self-designed data as input/output to kernels, /// you can inherit from this class to store your self-designed data. -class ExtendedTensor : public TensorBase { +class TEST_API ExtendedTensor : public TensorBase { public: ExtendedTensor() = default; virtual ~ExtendedTensor() = default; diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 19e707c40bc551..c7a0a81c7fb4f4 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1333,18 +1333,20 @@ PHI_DEFINE_EXPORTED_int32( "Multiple of the CUPTI device buffer size. If the timestamps have " "been dropped when you are profiling, try increasing this value."); +PHI_DEFINE_EXPORTED_bool(print_ir, false, "Whether to print IR debug string."); + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) /** * Communication library related FLAG * Name: FLAGS_dynamic_static_unified_comm * Since Version: 2.5 - * Value Range: bool, default=false + * Value Range: bool, default=true * Example: * Note: Whether to use new communication library in auto parallel and static * mode. If true, it will use unified CommContextManager for communication. */ PHI_DEFINE_EXPORTED_bool(dynamic_static_unified_comm, - false, + true, "Whether to use new communication library in auto " "parallel and static mode."); #endif // FLAGS_dynamic_static_unified_comm diff --git a/paddle/phi/core/generator.cc b/paddle/phi/core/generator.cc index 8cdbb290ea34f8..4541b81de4630a 100644 --- a/paddle/phi/core/generator.cc +++ b/paddle/phi/core/generator.cc @@ -242,7 +242,7 @@ uint64_t Generator::GetCurrentSeed() { uint64_t Generator::Seed() { std::lock_guard lock(this->mu_); - uint64_t seed; + uint64_t seed = 0; std::random_device de; seed = ((((uint64_t)de()) << 32) + de()) & 0x1FFFFFFFFFFFFF; this->state_.current_seed = seed; diff --git a/paddle/phi/core/infermeta_utils.cc b/paddle/phi/core/infermeta_utils.cc index adbcb8574518ba..18f3042bbf9c28 100644 --- a/paddle/phi/core/infermeta_utils.cc +++ b/paddle/phi/core/infermeta_utils.cc @@ -16,9 +16,7 @@ limitations under the License. */ namespace phi { -void InferMetaContext::SetMetaConfig(MetaConfig config) { - config_ = std::move(config); -} +void InferMetaContext::SetMetaConfig(MetaConfig config) { config_ = config; } void InferMetaContext::EmplaceBackInput(MetaTensor input) { int index = static_cast(inputs_.size()); @@ -96,7 +94,7 @@ InferMetaContext::OptionalInputsBetween(size_t start, size_t end) const { result.emplace_back(in.initialized() ?
&in : nullptr); } - return paddle::optional>(std::move(result)); + return paddle::optional>(result); } return paddle::none; } diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index f9c1dca46b2fb5..69c7900def16ba 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -526,7 +526,7 @@ std::string KernelSelectionErrorMessage(const std::string& kernel_name, std::unordered_set dtype_set; // Record all kernel information of kernel_name - for (auto iter : KernelFactory::Instance().kernels()[kernel_name]) { + for (auto const& iter : KernelFactory::Instance().kernels()[kernel_name]) { KernelKey kernel_key = iter.first; if (kernel_key.backend() == target_key.backend()) { support_backend = true; diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index 7f156463ca17bd..8f63dc5d4d56cf 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -124,7 +124,13 @@ void MetaTensor::set_dtype(DataType dtype) { DenseTensorUtils::GetMutableMeta(static_cast(tensor_)) ->dtype = dtype; } else if (phi::distributed::DistTensor::classof(tensor_)) { - // skip, DistTensor no need to set dtype + // For pipeline parallelism, DistTensor holds an uninitialized DenseTensor, + // But kernel launch needs to get it's placement, dtype and layout. + VLOG(3) << "DistTensor set dtype: " << dtype; + DenseTensorUtils::GetMutableMeta( + static_cast(tensor_) + ->unsafe_mutable_value()) + ->dtype = dtype; } else { PADDLE_THROW(phi::errors::Unimplemented( "Unsupported settting dtype for `%s`.", tensor_->type_info().name())); @@ -158,7 +164,11 @@ void MetaTensor::set_layout(DataLayout layout) { DenseTensorUtils::GetMutableMeta(static_cast(tensor_)) ->layout = layout; } else if (phi::distributed::DistTensor::classof(tensor_)) { - // skip, DistTensor no need to set dtype + VLOG(3) << "DistTensor set layout: " << layout; + DenseTensorUtils::GetMutableMeta( + static_cast(tensor_) + ->unsafe_mutable_value()) + ->layout = layout; } else { PADDLE_THROW(phi::errors::Unimplemented( "Unsupported settting layout for `%s`.", tensor_->type_info().name())); diff --git a/paddle/phi/core/tensor_array.cc b/paddle/phi/core/tensor_array.cc index 8c717e151a1299..11a240596be73c 100644 --- a/paddle/phi/core/tensor_array.cc +++ b/paddle/phi/core/tensor_array.cc @@ -27,7 +27,7 @@ bool TensorArray::initialized() const { return false; } - for (auto tensor : tensors_) { + for (auto const& tensor : tensors_) { if (!tensor.initialized()) { return false; } diff --git a/paddle/phi/core/utils/type_info.h b/paddle/phi/core/utils/type_info.h index b4d908e2c1d9c0..9e31343ed04a42 100644 --- a/paddle/phi/core/utils/type_info.h +++ b/paddle/phi/core/utils/type_info.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include "paddle/utils/test_macros.h" namespace phi { @@ -40,7 +41,7 @@ class TypeInfo { }; template -class TypeInfoTraits { +class TEST_API TypeInfoTraits { public: static const TypeInfo kType; TypeInfoTraits(); diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 2fd87760378fcf..2aa8543eb82c32 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -1170,7 +1170,7 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { void ElementwiseInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { - return ElementwiseRawInferMeta(x, y, -1, std::move(out)); + return ElementwiseRawInferMeta(x, y, -1, out); } void ElementwiseRawInferMeta(const MetaTensor& x, @@ -1435,7 +1435,7 @@ void FusedMatmulInferMeta(const MetaTensor& x, y_broadcasted = true; } - size_t M, N; + size_t M = 0, N = 0; if (transpose_x) { M = dims_x[ndims_x - 1]; } else { @@ -2136,7 +2136,7 @@ void MatmulInferMeta(const MetaTensor& x, y_broadcasted = true; } - size_t M, N; + size_t M = 0, N = 0; if (trans_x) { M = dims_x[ndims_x - 1]; } else { @@ -3028,7 +3028,7 @@ void YoloBoxInferMeta(const MetaTensor& x, "But received class_num (%s)", class_num)); - int box_num; + int box_num = 0; if ((dim_x[2] > 0 && dim_x[3] > 0) || config.is_runtime) { box_num = static_cast(dim_x[2] * dim_x[3] * anchor_num); } else { @@ -3103,7 +3103,7 @@ void SolveInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { y_broadcasted = true; } - size_t M, N; + size_t M = 0, N = 0; if (trans_x) { M = x_dims_vec[x_dims_n - 1]; } else { @@ -3227,6 +3227,35 @@ void Unpool3dInferMeta(const MetaTensor& x, } } +void WeightDequantizeInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const std::string& algo, + DataType out_dtype, + MetaTensor* out) { + PADDLE_ENFORCE_EQ(x.dims().size(), + 2UL, + phi::errors::InvalidArgument( + "The x tensor of dequantize op must be 2D, but got[%d]", + x.dims().size())); + PADDLE_ENFORCE_EQ( + scale.dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The scale tensor of dequantize op must be 1D, but got[%d]", + scale.dims().size())); + PADDLE_ENFORCE_EQ(scale.dims()[0], + x.dims()[0], + phi::errors::InvalidArgument( + "The scale tensor's shape must be equal to the x " + "tensor's shape, but got [%d] not equal to [%d]", + scale.dims()[0], + x.dims()[0])); + int n = x.dims()[1]; + int k = x.dims()[0]; + out->set_dims(phi::make_ddim({n, k})); + out->set_dtype(out_dtype); +} + } // namespace phi PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 94d8bb606ea5d2..153a8d553ceb59 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -493,4 +493,10 @@ void Unpool3dInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void WeightDequantizeInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const std::string& algo, + DataType out_dtype, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index 6846b5928c1163..0aca25103f80a7 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -485,10 +485,10 @@ void FusedAttentionInferMeta(const MetaTensor& x, "(dim_embed, 3 * dim_embed).")); } else { // compute the mp nranks - nranks = (y_dim[0] * 3) / y_dim[1]; + nranks = static_cast((y_dim[0] * 3) / y_dim[1]); } - dim_head = y_dim[0] / (num_heads * 
nranks); - hidden_size = y_dim[0]; + dim_head = static_cast<int>(y_dim[0] / (num_heads * nranks)); + hidden_size = static_cast<int>(y_dim[0]); } else { PADDLE_ENFORCE_EQ(y_dim.size(), 4, @@ -512,9 +512,9 @@ void FusedAttentionInferMeta(const MetaTensor& x, "and must satisfy the limitations: " "(num_head * dim_head == dim_embed)")); } - num_heads = y_dim[1]; - dim_head = y_dim[2]; - hidden_size = y_dim[3]; + num_heads = static_cast<int>(y_dim[1]); + dim_head = static_cast<int>(y_dim[2]); + hidden_size = static_cast<int>(y_dim[3]); } PADDLE_ENFORCE_EQ( @@ -1050,8 +1050,8 @@ void FusedGemmEpilogueInferMeta(const MetaTensor& x, auto x_mat_dims = phi::flatten_to_2d(x_dims, trans_x ? 1 : x_dims.size() - 1); - int K_from_x = trans_x ? x_mat_dims[0] : x_mat_dims[1]; - int K_from_y = trans_y ? y_dims[1] : y_dims[0]; + int K_from_x = static_cast<int>(trans_x ? x_mat_dims[0] : x_mat_dims[1]); + int K_from_y = static_cast<int>(trans_y ? y_dims[1] : y_dims[0]); PADDLE_ENFORCE_EQ( K_from_x, @@ -1086,7 +1086,7 @@ void FusedGemmEpilogueInferMeta(const MetaTensor& x, "The ReserveSpace would not be used when activation = \"none\"")); } else { int min_size_of_n = activation == "relu" ? 128 : 8; - int N_size = trans_y ? y_dims[0] : y_dims[1]; + int N_size = static_cast<int>(trans_y ? y_dims[0] : y_dims[1]); PADDLE_ENFORCE_EQ(N_size % min_size_of_n, 0, phi::errors::InvalidArgument( @@ -1842,4 +1842,329 @@ void SqueezeExcitationInferMeta(const MetaTensor& x, out->set_dims(DDim(out_shape.data(), static_cast<int>(out_shape.size()))); } +void FusedEmbeddingEltWiseLayerNormInferMeta( + const std::vector<const MetaTensor*>& ids, + const std::vector<const MetaTensor*>& embs, + const MetaTensor& bias, + const MetaTensor& scale, + const float epsilon, + MetaTensor* out) { + PADDLE_ENFORCE_EQ( + ids.size(), + embs.size(), + phi::errors::InvalidArgument( + "Two inputs of EmbeddingEltWiseLayerNormOp should be " + "the same size, but received the size of input Ids = %d," + " the size of input Embs = %d", + ids.size(), + embs.size())); + PADDLE_ENFORCE_GE(embs.size(), + 2UL, + phi::errors::InvalidArgument( + "Input Embs of EmbeddingEltWiseLayerNormOp should " + "have at least 2 tensors")); + PADDLE_ENFORCE_GE(ids.size(), + 2UL, + phi::errors::InvalidArgument( + "Input Ids of EmbeddingEltWiseLayerNormOp should " + "have at least 2 tensors")); + + // batch * seq_len * 1 + std::vector<phi::DDim> ids_dims, embs_dims; + ids_dims.reserve(ids.size()); + std::transform(ids.begin(), + ids.end(), + std::back_inserter(ids_dims), + [](const MetaTensor* var) { return var->dims(); }); + // word_num * hidden + embs_dims.reserve(embs.size()); + std::transform(embs.begin(), + embs.end(), + std::back_inserter(embs_dims), + [](const MetaTensor* var) { return var->dims(); }); + // hidden + DDim dims_bias = bias.dims(); + + int batch = ids_dims[0][0]; + int seq_len = ids_dims[0][1]; + int hidden = embs_dims[0][1]; + for (auto& embs_dim : embs_dims) { + PADDLE_ENFORCE_EQ( + embs_dim.size(), + 2, + phi::errors::InvalidArgument( + "The Emb dim's size should be 2, but found %d.", embs_dim.size())); + PADDLE_ENFORCE_EQ( + embs_dim[1], + dims_bias[0], + phi::errors::InvalidArgument( + "The second dims (%d) of the Embedding should be equal " + "to the Bias's size(%d).", + embs_dim[1], + dims_bias[0])); + PADDLE_ENFORCE_EQ( + embs_dim[1], + hidden, + phi::errors::InvalidArgument( + "The second dimension size(%d) of the Embedding should be " + "equal to the hidden's size(%d)", + embs_dim[1], + hidden)); + } + + auto dim_output = phi::make_ddim({batch, seq_len, hidden}); + out->set_dims(dim_output); + // out->share_lod(ids); + // context->ShareLoD("Ids", /*->*/ "Out"); +}
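For orientation, the shape contract of FusedEmbeddingEltWiseLayerNormInferMeta works out as below; this is a minimal sketch with made-up sizes, not values from any real model:

// Illustrative only (hypothetical sizes):
//   each ids[i]:  [batch, seq_len, 1]
//   each embs[i]: [word_num, hidden], with embs[i][1] == bias[0]
phi::DDim ids_dim = phi::make_ddim({8, 128, 1});    // batch = 8, seq_len = 128
phi::DDim emb_dim = phi::make_ddim({30522, 768});   // hidden = 768
int64_t batch = ids_dim[0], seq_len = ids_dim[1], hidden = emb_dim[1];
// After the checks pass, the op emits a single fused output:
phi::DDim out_dim = phi::make_ddim({batch, seq_len, hidden});  // {8, 128, 768}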
context->ShareLoD("Ids", /*->*/ "Out"); +} + +void FusionTransposeFlattenConcatInferMeta( + const std::vector& x, + const std::vector& trans_axis, + const int flatten_axis, + const int concat_axis, + MetaTensor* out) { + PADDLE_ENFORCE_GE( + x.size(), + 1UL, + phi::errors::InvalidArgument( + "Inputs(X) of TransposeFlattenConcat op should not be empty.")); + + std::vector ins; + ins.reserve(x.size()); + std::transform( + x.begin(), x.end(), std::back_inserter(ins), [](const MetaTensor* var) { + return var->dims(); + }); + const size_t n = ins.size(); + PADDLE_ENFORCE_GT(n, + 0, + phi::errors::InvalidArgument( + "The size of Inputs(X)'s dimension should be greater " + " than 0, but received %d.", + n)); + + size_t x_rank = ins[0].size(); + size_t trans_axis_size = trans_axis.size(); + PADDLE_ENFORCE_EQ(x_rank, + trans_axis_size, + phi::errors::InvalidArgument( + "The input tensor's rank(%d) " + "should be equal to the permutation axis's size(%d)", + x_rank, + trans_axis_size)); + + auto dims0 = phi::funcs::GetFlattenShape( + flatten_axis, phi::funcs::GetPermuteShape(trans_axis, ins[0])); + std::vector out_dims(dims0); + for (size_t i = 1; i < n; i++) { + auto dimsi = phi::funcs::GetFlattenShape( + flatten_axis, phi::funcs::GetPermuteShape(trans_axis, ins[i])); + for (int j = 0; j < static_cast(dims0.size()); j++) { + if (j == concat_axis) { + out_dims[concat_axis] += dimsi[j]; + } else { + PADDLE_ENFORCE_EQ(out_dims[j], + dimsi[j], + phi::errors::InvalidArgument( + "After flatting, the %d-th dim should be save " + "except the specify axis.", + j)); + } + } + } + if (out_dims[concat_axis] < 0) { + out_dims[concat_axis] = -1; + } + out->set_dims(phi::make_ddim(out_dims)); +} + +void FusedFCElementwiseLayerNormInferMeta(const MetaTensor& x, + const MetaTensor& w, + const MetaTensor& y, + const MetaTensor& bias0, + const MetaTensor& scale, + const MetaTensor& bias1, + const int x_num_col_dims, + const std::string& activation_type, + const float epsilon, + const int begin_norm_axis, + MetaTensor* out, + MetaTensor* mean, + MetaTensor* variance, + MetaConfig config) { + DDim w_dims = w.dims(); + PADDLE_ENFORCE_EQ( + w_dims.size(), + 2, + phi::errors::InvalidArgument( + "The input Weight of fc is expected to be a 2-D tensor. " + "But received the number of Weight's dimensions is %d, ", + "Weight's shape is %s.", + w_dims.size(), + w_dims)); + + if (bias0) { + DDim bias0_dims = bias0.dims(); + + PADDLE_ENFORCE_LE(bias0_dims.size(), + 2, + phi::errors::InvalidArgument( + "The input Bias of fc is expected to be an 1-D or " + "2-D tensor. But received the number of Bias's " + "dimensions is %d, Bias's shape is %s.", + bias0_dims.size(), + bias0_dims)); + + PADDLE_ENFORCE_EQ( + bias0_dims[bias0_dims.size() - 1], + w_dims[1], + phi::errors::InvalidArgument( + "The last dimension of input Bias is expected be equal " + "to the actual width of input Weight. 
But received the last " + "dimension of Bias is %d, Bias's shape is %s; " + "the actual width of Weight is %d, Weight's shape is %s.", + bias0_dims[bias0_dims.size() - 1], + bias0_dims, + w_dims[1], + w_dims)); + + if (bias0_dims.size() == 2) { + PADDLE_ENFORCE_EQ( + bias0_dims[0], + 1, + phi::errors::InvalidArgument( + "The first dimension of input Bias is expected to be 1, " + "but received %d, Bias's shape is %s.", + bias0_dims[0], + bias0_dims)); + } + } + + DDim x_dims = x.dims(); + PADDLE_ENFORCE_LT( + x_num_col_dims, + x_dims.size(), + phi::errors::InvalidArgument( + "The attribute x_num_col_dims used to flatten input X to " + "a 2-D tensor, is expected to be less than the number of " + "input X's dimensions. But received x_num_col_dims is %d, " + "the number of input X's dimensions is %d, input X's shape is %s.", + x_num_col_dims, + x_dims.size(), + x_dims)); + + auto x_mat_dims = phi::flatten_to_2d(x_dims, x_num_col_dims); + PADDLE_ENFORCE_EQ( + x_mat_dims[1], + w_dims[0], + phi::errors::InvalidArgument( + "The input's second dimension and weight's first dimension is " + "expected to be the same. But received input's second dimension is " + "%d, input's shape is %s; weight's first dimension is %d, weight's " + "shape is %s.", + x_mat_dims[1], + x_mat_dims, + w_dims[0], + w_dims)); + + std::vector fc_out_dims; + for (int i = 0; i < x_num_col_dims; ++i) { + fc_out_dims.push_back(x_dims[i]); + } + fc_out_dims.push_back(w_dims[1]); + + DDim y_dims = y.dims(); + PADDLE_ENFORCE_EQ(phi::make_ddim(fc_out_dims), + y_dims, + phi::errors::InvalidArgument( + "The output's shape of fc is expected to be equal to " + "that of input Y. But received output's shape of fc " + "is %s, input Y's shape is %s.", + phi::make_ddim(fc_out_dims), + y_dims)); + + PADDLE_ENFORCE_LT( + begin_norm_axis, + y_dims.size(), + phi::errors::InvalidArgument( + "The attribute begin_norm_axis used to flatten input Y to a 2-D " + "tensor, is expected to be less than the number of input Y's " + "dimensions. But received begin_norm_axis is %d, the number of " + "input Y's dimensions is %d, input Y's shape is %s.", + begin_norm_axis, + y_dims.size(), + y_dims)); + + auto y_mat_dim = phi::flatten_to_2d(y_dims, begin_norm_axis); + int64_t dim_0 = y_mat_dim[0]; + int64_t dim_1 = y_mat_dim[1]; + if (scale) { + DDim scale_dims = scale.dims(); + PADDLE_ENFORCE_EQ(scale_dims.size(), + 1, + phi::errors::InvalidArgument( + "The input Scale is expected to be an 1-D tensor. " + "But received the number of input Scale's " + "dimensions is %d, input Scale's shape is %s.", + scale_dims.size(), + scale_dims)); + + if (config.is_runtime) { + PADDLE_ENFORCE_EQ( + scale_dims[0], + dim_1, + phi::errors::InvalidArgument( + "The first dimension of input Scale is expected to be equal to " + "the second dimension of input Y after flattened. " + "But received the first dimension of input Scale is %d, input " + "Scale's shape is %s; the second dimension of flattened input " + "Y is %d, input Y's shape is %s, flattened axis is %d.", + scale_dims[0], + scale_dims, + dim_1, + y_dims, + begin_norm_axis)); + } + } + if (bias1) { + DDim bias1_dims = bias1.dims(); + PADDLE_ENFORCE_EQ( + bias1_dims.size(), + 1, + phi::errors::InvalidArgument( + "The input Bias1 is expected to be an 1-D tensor. 
" + "But received the number of input Bias1's dimension is %d, " + "input Bias1's shape is %s.", + bias1_dims.size(), + bias1_dims)); + + if (config.is_runtime) { + PADDLE_ENFORCE_EQ( + bias1_dims[0], + dim_1, + phi::errors::InvalidArgument( + "The first dimension of input Bias1 is expected to be equal to " + "the second dimension of input Y after flattened. " + "But received the first dimension of input Bias1 is %d, input " + "Bias1's shape is %s; the second dimension of flatten input " + "Y is %d, input Y's shape is %s, flattened axis is %d.", + bias1_dims[0], + bias1_dims, + dim_1, + y_dims, + begin_norm_axis)); + } + } + + out->set_dims(y_dims); + if (mean) { + mean->set_dims({dim_0}); + } + if (variance) { + variance->set_dims({dim_0}); + } + out->share_lod(x); +} + } // namespace phi diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h index fe3ebe989cdc38..c022a4257e4dc3 100644 --- a/paddle/phi/infermeta/fusion.h +++ b/paddle/phi/infermeta/fusion.h @@ -485,4 +485,34 @@ void SqueezeExcitationInferMeta(const MetaTensor& x, const std::vector& filter_dims, MetaTensor* out); +void FusedEmbeddingEltWiseLayerNormInferMeta( + const std::vector& ids, + const std::vector& embs, + const MetaTensor& bias, + const MetaTensor& scale, + const float epsilon, + MetaTensor* out); + +void FusionTransposeFlattenConcatInferMeta( + const std::vector& x, + const std::vector& trans_axis, + const int flatten_axis, + const int concat_axis, + MetaTensor* out); + +void FusedFCElementwiseLayerNormInferMeta(const MetaTensor& x, + const MetaTensor& w, + const MetaTensor& y, + const MetaTensor& bias0, + const MetaTensor& scale, + const MetaTensor& bias1, + const int x_num_col_dims, + const std::string& activation_type, + const float epsilon, + const int begin_norm_axis, + MetaTensor* out, + MetaTensor* mean, + MetaTensor* variance, + MetaConfig config = MetaConfig()); + } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 8f78755486a7d2..0cd5534a9c44ab 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1997,7 +1997,7 @@ static void Interpolate1DInferShapeCheck( return; } - int out_w_tmp; + int out_w_tmp = 0; if (scale_tensor) { auto scale_tensor_dim = scale_tensor.dims(); PADDLE_ENFORCE_EQ( @@ -2130,7 +2130,7 @@ static void Interpolate2DInferShapeCheck( return; } - int out_h_tmp, out_w_tmp; + int out_h_tmp = 0, out_w_tmp = 0; if (scale_tensor) { auto scale_tensor_dim = scale_tensor.dims(); @@ -2282,7 +2282,7 @@ static void Interpolate3DInferShapeCheck( return; } - int out_d_tmp, out_h_tmp, out_w_tmp; + int out_d_tmp = 0, out_h_tmp = 0, out_w_tmp = 0; if (scale_tensor) { auto scale_tensor_dim = scale_tensor.dims(); PADDLE_ENFORCE_EQ( diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc index 1c57e2fae92ac0..0e3ac3fb5ca2c8 100644 --- a/paddle/phi/infermeta/nullary.cc +++ b/paddle/phi/infermeta/nullary.cc @@ -74,7 +74,7 @@ void EyeInferMeta(const Scalar& num_rows, DataType dtype, MetaTensor* out, MetaConfig config) { - int64_t rows, columns; + int64_t rows = 0, columns = 0; if (!config.is_runtime && num_rows.FromTensor()) { rows = -1; } else { diff --git a/paddle/phi/infermeta/spmd_rules/elementwise.cc b/paddle/phi/infermeta/spmd_rules/elementwise.cc index 411c43de8cc41f..3a9e422320210f 100644 --- a/paddle/phi/infermeta/spmd_rules/elementwise.cc +++ b/paddle/phi/infermeta/spmd_rules/elementwise.cc @@ -309,5 +309,77 @@ SpmdInfo ElementwiseBinaryInferSpmdReverse(const 
DistMetaTensor& x, return {{x_dist_attr_dst, y_dist_attr_dst}, {out_dist_attr}}; } +SpmdInfo ElementwiseUnaryGradInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& out_grad) { + return {{out_grad.dist_attr(), out_grad.dist_attr()}, {out_grad.dist_attr()}}; +} + +SpmdInfo ElementwiseBinaryGradInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& y, + const DistMetaTensor& out_grad, + int64_t axis) { + TensorDistAttr x_dist_attr = out_grad.dist_attr(); + TensorDistAttr y_dist_attr = out_grad.dist_attr(); + TensorDistAttr x_grad_dist_attr = out_grad.dist_attr(); + TensorDistAttr y_grad_dist_attr = out_grad.dist_attr(); + + PADDLE_ENFORCE_GE( + out_grad.dims().size(), + x.dims().size(), + phi::errors::InvalidArgument("If being broadcast, the dims of out_grad " + "must larger or equal to the inputs." + "But we get the rank of output as [%d] and " + "the rank of input as [%d].", + out_grad.dims().size(), + x.dims().size())); + + PADDLE_ENFORCE_GE( + out_grad.dims().size(), + y.dims().size(), + phi::errors::InvalidArgument("If being broadcast, the dims of out_grad " + "must larger or equal to the inputs." + "But we get the rank of output as [%d] and " + "the rank of input as [%d].", + out_grad.dims().size(), + y.dims().size())); + + // The backward rule of elementwise follows the princple: the dist_attr + // of input should equal to out_grad. + // Caution the special case when the inputs calculate together with different + // shape it means one of the input is broadcast to same shape with the other + // first. When doing backward the input_grad with broadcast input is in + // partial status, which need to do communicate and get the right result. + if (x.dims() != out_grad.dims()) { + int64_t diff = out_grad.dims().size() - x.dims().size(); + auto dims_mapping = x_dist_attr.dims_mapping(); + dims_mapping.erase(dims_mapping.begin(), dims_mapping.begin() + diff); + x_dist_attr.set_dims_mapping(dims_mapping); + x_grad_dist_attr.set_dims_mapping(dims_mapping); + for (int64_t i = 0; i < diff; ++i) { + if (out_grad.dist_attr().dims_mapping()[i] != -1) { + x_grad_dist_attr.set_partial_status( + std::vector{out_grad.dist_attr().dims_mapping()[i]}); + } + } + } + + if (y.dims() != out_grad.dims()) { + int64_t diff = out_grad.dims().size() - y.dims().size(); + auto dims_mapping = y_dist_attr.dims_mapping(); + dims_mapping.erase(dims_mapping.begin(), dims_mapping.begin() + diff); + y_dist_attr.set_dims_mapping(dims_mapping); + y_grad_dist_attr.set_dims_mapping(dims_mapping); + for (int64_t i = 0; i < diff; ++i) { + if (out_grad.dist_attr().dims_mapping()[i] != -1) { + y_grad_dist_attr.set_partial_status( + std::vector{out_grad.dist_attr().dims_mapping()[i]}); + } + } + } + + return {{x_dist_attr, y_dist_attr, out_grad.dist_attr()}, + {x_grad_dist_attr, y_grad_dist_attr}}; +} + } // namespace distributed } // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/elementwise.h b/paddle/phi/infermeta/spmd_rules/elementwise.h index 319d3ccbbdac14..188e557e6737b0 100644 --- a/paddle/phi/infermeta/spmd_rules/elementwise.h +++ b/paddle/phi/infermeta/spmd_rules/elementwise.h @@ -27,6 +27,9 @@ SpmdInfo ElementwiseUnaryInferSpmd(const DistMetaTensor& x); SpmdInfo ElementwiseUnaryInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& out); +SpmdInfo ElementwiseUnaryGradInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& out_grad); + SpmdInfo ElementwiseBinaryInferSpmd(const DistMetaTensor& x, const DistMetaTensor& y); @@ -34,5 +37,10 @@ SpmdInfo ElementwiseBinaryInferSpmdReverse(const 
DistMetaTensor& x, const DistMetaTensor& y, const DistMetaTensor& out); +SpmdInfo ElementwiseBinaryGradInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& y, + const DistMetaTensor& out_grad, + int64_t axis); + } // namespace distributed } // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/flatten.cc b/paddle/phi/infermeta/spmd_rules/flatten.cc new file mode 100644 index 00000000000000..0a9c4111d8e7fa --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/flatten.cc @@ -0,0 +1,203 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/infermeta/spmd_rules/flatten.h" +#include + +#include "glog/logging.h" + +#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" +#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" +#include "paddle/phi/core/distributed/auto_parallel/utils.h" +#include "paddle/phi/infermeta/spmd_rules/dim_trans.h" +#include "paddle/phi/infermeta/spmd_rules/utils.h" + +namespace phi { +namespace distributed { + +using phi::distributed::auto_parallel::str_join; + +int PreprocessAxis(int axis, int ndim) { + if (axis < 0) { + axis += ndim; + } + + PADDLE_ENFORCE_LT( + axis, + ndim, + phi::errors::InvalidArgument("The Start_axis or Stop_axis [%d] is not " + "less than the Tensor X's rank [%d].", + axis, + ndim)); + + return axis; +} + +std::vector MakeFlattenDimTrans( + const std::vector& src_shape, int start_axis, int stop_axis) { + std::vector ret; + + std::vector input_dims; + for (int64_t i = 0; i < static_cast(src_shape.size()); i++) { + if (i < start_axis || i > stop_axis) { + ret.emplace_back(new InputDim(i)); + } else { + input_dims.emplace_back(new InputDim(i)); + } + + if (i == stop_axis) { + ret.emplace_back(make_flatten(input_dims)); + } + } + + return ret; +} + +std::vector MakeFlattenDimTransReverse( + const std::vector& src_shape, int start_axis, int stop_axis) { + std::vector ret; + + std::vector tgt_splitted_shape; + for (int i = start_axis; i <= stop_axis; i++) { + tgt_splitted_shape.emplace_back(src_shape[i]); + } + + for (int64_t i = 0; i < static_cast(src_shape.size()); i++) { + if (i < start_axis) { + ret.emplace_back(new InputDim(i)); + } else if (i > stop_axis) { + ret.emplace_back(new InputDim(i - (stop_axis - start_axis))); + } else { + ret.emplace_back(make_split( + new InputDim(start_axis), tgt_splitted_shape, i - start_axis)); + } + } + + return ret; +} + +SpmdInfo FlattenInferSpmd(const DistMetaTensor& x, + int start_axis, + int stop_axis) { + // Step0: Verify input args based on flatten logic + auto src_shape = phi::vectorize(x.dims()); + int x_ndim = static_cast(src_shape.size()); + auto x_dist_attr_src = x.dist_attr(); + std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); + PADDLE_ENFORCE_EQ( + x_ndim, + x_dims_mapping.size(), + phi::errors::InvalidArgument("The Tensor X's rank [%d] and X's " + "dims_mapping size [%d] are not matched.", + x_ndim, + x_dims_mapping.size())); + + // Step1: Build the transformation from + // the 
original shape to the target shape + + start_axis = PreprocessAxis(start_axis, x_ndim); + stop_axis = PreprocessAxis(stop_axis, x_ndim); + std::vector<DimTrans*> trans = + MakeFlattenDimTrans(src_shape, start_axis, stop_axis); + + // Step2: Infer the dims mapping of input (if reshard is + // needed) and output from the dimension transformation. + std::vector<std::vector<int64_t>> dims_mapping_vec = + InferFromDimTrans(x, trans); + + // Step3: Update the dist attributes of input + // and output with the inferred dims mapping. + TensorDistAttr x_dist_attr_dst(x_dist_attr_src); + x_dist_attr_dst.set_dims_mapping(dims_mapping_vec[0]); + TensorDistAttr out_dist_attr(x_dist_attr_src); + out_dist_attr.set_dims_mapping(dims_mapping_vec[1]); + + VLOG(4) << "FlattenInferSpmd: X shape: [" << str_join(src_shape) << "]"; + VLOG(4) << "Start_axis: " << start_axis; + VLOG(4) << "Stop_axis: " << stop_axis; + VLOG(4) << "Transformation from input to output:"; + for (int64_t i = 0, n = static_cast<int64_t>(trans.size()); i < n; i++) { + DimTrans* t = trans[i]; + VLOG(4) << "\tOut axis[" << i << "]: " << t->to_string(); + } + VLOG(4) << "X dims_mapping_src: [" << str_join(x_dims_mapping) + << "] dims_mapping_dst: [" << str_join(dims_mapping_vec[0]) << "]"; + VLOG(4) << "Out dims_mapping: [" << str_join(dims_mapping_vec[1]) << "]\n\n"; + + CleanUp(); + + return {{x_dist_attr_dst}, {out_dist_attr}}; +}
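A quick trace makes the rule concrete. The helper below is a toy re-implementation of just the shape side of flatten (assumed axes and sizes; the dims_mapping behavior is noted in the trailing comment):

#include <cstdint>
#include <vector>

// Fuse axes [start, stop] of `shape` into one axis, as flatten does.
std::vector<int64_t> FlattenShape(const std::vector<int64_t>& shape,
                                  int start, int stop) {
  std::vector<int64_t> out;
  int64_t fused = 1;
  for (int i = 0; i < static_cast<int>(shape.size()); ++i) {
    if (i < start || i > stop) {
      out.push_back(shape[i]);
    } else {
      fused *= shape[i];
      if (i == stop) out.push_back(fused);  // emit the fused axis once
    }
  }
  return out;
}
// FlattenShape({4, 16, 16}, 1, 2) == {4, 256}. With x dims_mapping [0, -1, -1],
// the rule above yields out dims_mapping [0, -1]: the fused block keeps
// whatever split its leading axis had (here none), and axis 0 stays split.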
+ +SpmdInfo FlattenInferSpmdReverse(const DistMetaTensor& x, + const DistMetaTensor& out, + int start_axis, + int stop_axis) { + // Step0: Verify input args based on flatten logic + auto x_shape = phi::vectorize(x.dims()); + auto x_ndim = x_shape.size(); + auto out_shape = phi::vectorize(out.dims()); + int out_ndim = out_shape.size(); + auto out_dist_attr_src = out.dist_attr(); + std::vector<int64_t> out_dims_mapping = out_dist_attr_src.dims_mapping(); + PADDLE_ENFORCE_EQ( + out_ndim, + out_dims_mapping.size(), + phi::errors::InvalidArgument("The Tensor Out's rank [%d] and Out's " + "dims_mapping size [%d] are not matched.", + out_ndim, + out_dims_mapping.size())); + + // Step1: Build the transformation from the output shape + // to original shape. This function infers the dims mapping + // from output to input, we first get the transformation + // from output to input so that we can infer the dims mapping + // with the map from output axes to input axes. + + start_axis = PreprocessAxis(start_axis, x_ndim); + stop_axis = PreprocessAxis(stop_axis, x_ndim); + + std::vector<DimTrans*> trans = + MakeFlattenDimTransReverse(x_shape, start_axis, stop_axis); + + // Step2: Infer the dims mapping of input with + // output's dims_mapping and the transformation. + std::vector<std::vector<int64_t>> dims_mapping_vec = + InferFromDimTrans(out, trans); + + // Step3: Update the dist attributes of input + // and output with the inferred dims mapping + TensorDistAttr out_dist_attr_dst(out_dist_attr_src); + out_dist_attr_dst.set_dims_mapping(dims_mapping_vec[0]); + TensorDistAttr x_dist_attr(x.dist_attr()); + x_dist_attr.set_dims_mapping(dims_mapping_vec[1]); + + VLOG(4) << "FlattenInferSpmdReverse: Out shape: [" << str_join(out_shape) + << "] X shape: [" << str_join(x_shape) << "]"; + VLOG(4) << "Transformation from output to input:"; + for (int64_t i = 0, n = trans.size(); i < n; i++) { + DimTrans* t = trans[i]; + VLOG(4) << "\tX axis[" << i << "]: " << t->to_string(); + } + VLOG(4) << "Out dims_mapping_src: [" << str_join(out_dims_mapping) << "] " + << "dims_mapping_dst: [" << str_join(dims_mapping_vec[0]) << "]"; + VLOG(4) << "X dims_mapping: [" << str_join(dims_mapping_vec[1]) << "]\n\n"; + + CleanUp(); + + return {{x_dist_attr}, {out_dist_attr_dst}}; +} + +} // namespace distributed +} // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/flatten.h b/paddle/phi/infermeta/spmd_rules/flatten.h new file mode 100644 index 00000000000000..bb62d8c0d7b0a2 --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/flatten.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#pragma once + +#include + +#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h" +#include "paddle/phi/core/distributed/type_defs.h" + +namespace phi { +namespace distributed { + +SpmdInfo FlattenInferSpmd(const DistMetaTensor& x, + int start_axis, + int stop_axis); + +SpmdInfo FlattenInferSpmdReverse(const DistMetaTensor& x, + const DistMetaTensor& out, + int start_axis, + int stop_axis); +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/matmul.cc b/paddle/phi/infermeta/spmd_rules/matmul.cc index 98c2ebd7493b91..4893c7071f19e4 100644 --- a/paddle/phi/infermeta/spmd_rules/matmul.cc +++ b/paddle/phi/infermeta/spmd_rules/matmul.cc @@ -278,6 +278,14 @@ SpmdInfo MatmulInferSpmdReverse(const DistMetaTensor& x, return {{x_dist_attr_dst, y_dist_attr_dst}, {out_dist_attr_src}}; } +static bool DistAttrsAreBasicallyEqual( + const phi::distributed::TensorDistAttr& in_dist_attr, + const phi::distributed::TensorDistAttr& out_dist_attr) { + return (in_dist_attr.process_mesh() == out_dist_attr.process_mesh() && + in_dist_attr.dims_mapping() == out_dist_attr.dims_mapping() && + in_dist_attr.partial_status() == out_dist_attr.partial_status()); +} + SpmdInfo MatmulGradInferSpmd(const DistMetaTensor& x, const DistMetaTensor& y, const DistMetaTensor& out_grad, @@ -287,8 +295,8 @@ SpmdInfo MatmulGradInferSpmd(const DistMetaTensor& x, const DistMetaTensor& y, const char* debug_msg) { PADDLE_ENFORCE_EQ( - x_dist_attr, - y.dist_attr(), + DistAttrsAreBasicallyEqual(x_dist_attr, y.dist_attr()), + true, phi::errors::Unavailable("The matmul grad infer spmd `%s` verify " "error: left dist attr is %s, " "right dist attr is %s.", diff --git a/paddle/phi/infermeta/spmd_rules/rules.h b/paddle/phi/infermeta/spmd_rules/rules.h index 4e037a8336d98e..eda61be1f22846 100644 --- a/paddle/phi/infermeta/spmd_rules/rules.h +++ b/paddle/phi/infermeta/spmd_rules/rules.h @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include "paddle/phi/infermeta/spmd_rules/default_data_parallel.h" #include "paddle/phi/infermeta/spmd_rules/elementwise.h" #include "paddle/phi/infermeta/spmd_rules/embedding.h" +#include "paddle/phi/infermeta/spmd_rules/flatten.h" #include "paddle/phi/infermeta/spmd_rules/layer_norm.h" #include "paddle/phi/infermeta/spmd_rules/matmul.h" #include "paddle/phi/infermeta/spmd_rules/reduction.h" @@ -52,6 +53,9 @@ namespace distributed { PD_REGISTER_SPMD_RULE(matmul, PD_INFER_SPMD(phi::distributed::MatmulInferSpmd), PD_INFER_SPMD(phi::distributed::MatmulInferSpmdReverse)); +PD_REGISTER_SPMD_RULE(matmul_v2, // static mode + PD_INFER_SPMD(phi::distributed::MatmulInferSpmd), + PD_INFER_SPMD(phi::distributed::MatmulInferSpmdReverse)); PD_REGISTER_SPMD_RULE( elementwise_unary, @@ -68,6 +72,10 @@ PD_REGISTER_SPMD_RULE( unsqueeze, PD_INFER_SPMD(phi::distributed::DefaultDataParallelInferSpmd), PD_INFER_SPMD(phi::distributed::DefaultDataParallelInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + default_, + PD_INFER_SPMD(phi::distributed::DefaultDataParallelInferSpmd), + PD_INFER_SPMD(phi::distributed::DefaultDataParallelInferSpmdReverse)); // replicated rule /* for unittest */ PD_REGISTER_SPMD_RULE( @@ -466,6 +474,10 @@ PD_REGISTER_SPMD_RULE( sum, PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + reduce_sum, // static + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); // layer_norm PD_REGISTER_SPMD_RULE( @@ -477,6 +489,14 @@ PD_REGISTER_SPMD_RULE( PD_REGISTER_SPMD_RULE(reshape, PD_INFER_SPMD(phi::distributed::ReshapeInferSpmd), PD_INFER_SPMD(phi::distributed::ReshapeInferSpmdReverse)); +PD_REGISTER_SPMD_RULE(reshape2, + PD_INFER_SPMD(phi::distributed::ReshapeInferSpmd), + PD_INFER_SPMD(phi::distributed::ReshapeInferSpmdReverse)); + +// flatten rule +PD_REGISTER_SPMD_RULE(flatten, + PD_INFER_SPMD(phi::distributed::FlattenInferSpmd), + PD_INFER_SPMD(phi::distributed::FlattenInferSpmdReverse)); // embedding rule PD_REGISTER_SPMD_RULE( @@ -502,6 +522,10 @@ PD_REGISTER_SPMD_RULE( transpose, PD_INFER_SPMD(phi::distributed::TransposeInferSpmd), PD_INFER_SPMD(phi::distributed::TransposeInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + transpose2, + PD_INFER_SPMD(phi::distributed::TransposeInferSpmd), + PD_INFER_SPMD(phi::distributed::TransposeInferSpmdReverse)); // softmax rule PD_REGISTER_SPMD_RULE(softmax, diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index b18ecf48363f04..d97a16e57fa614 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -255,6 +255,32 @@ void BoxCoderInferMeta(const MetaTensor& prior_box, output_box->set_dtype(target_box.dtype()); } +void DpsgdInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& learning_rate, + float clip, + float batch_size, + float sigma, + int size, + MetaTensor* param_out) { + auto lr_dims = learning_rate.dims(); + PADDLE_ENFORCE_EQ(phi::product(lr_dims), + 1, + phi::errors::InvalidArgument( + "Learning rate should have 1 dimension. But Received " + "LearningRate's dims [%s].", + phi::product(lr_dims))); + auto param_dims = param.dims(); + PADDLE_ENFORCE_EQ( + param_dims, + grad.dims(), + phi::errors::InvalidArgument( + "Param and Grad input of DpsgdOp should have same dimension. 
But " + "received Para's dim [%s] and Grad's dim [%s].", + param_dims, + grad.dims())); + param_out->set_dims(param_dims); +} void FlashAttnInferMeta(const MetaTensor& q, const MetaTensor& k, const MetaTensor& v, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 47c4b9826da4a8..797835a1abd511 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -63,6 +63,15 @@ void BoxCoderInferMeta(const MetaTensor& prior_box, MetaTensor* output_box, MetaConfig config = MetaConfig()); +void DpsgdInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& learning_rate, + float clip, + float batch_size, + float sigma, + int size, + MetaTensor* param_out); + void FlashAttnInferMeta(const MetaTensor& q, const MetaTensor& k, const MetaTensor& v, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 6eaff66c583898..243f0b232395e4 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -148,18 +148,19 @@ void ArgMinMaxInferMeta(const MetaTensor& x, const Scalar& axis, bool keepdims, bool flatten, - int dtype, + DataType dtype, MetaTensor* out, MetaConfig config) { PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == 2 || dtype == 3), + (dtype == DataType::UNDEFINED || dtype == DataType::INT32 || + dtype == DataType::INT64), true, phi::errors::InvalidArgument( "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " "received [%s]", DataTypeToString(DataType::INT32), DataTypeToString(DataType::INT64), - DataTypeToString(phi::TransToPhiDataType(dtype)))); + DataTypeToString(dtype))); if (!config.is_runtime && axis.FromTensor()) { std::vector vec; @@ -177,10 +178,8 @@ void ArgMinMaxInferMeta(const MetaTensor& x, } } out->set_dims(phi::make_ddim(vec)); - if (dtype == 2) { - out->set_dtype(DataType::INT32); - } else if (dtype == 3) { - out->set_dtype(DataType::INT64); + if (dtype == DataType::INT32 || dtype == DataType::INT64) { + out->set_dtype(dtype); } return; } @@ -216,7 +215,7 @@ void ArgMinMaxInferMeta(const MetaTensor& x, if (int_axis < 0) int_axis += x_rank; if (config.is_runtime) { - if (dtype == phi::TransToProtoVarType(DataType::INT32)) { + if (dtype == DataType::INT32) { int64_t all_element_num = 0; if (flatten) { all_element_num = phi::product(x_dims); @@ -253,10 +252,8 @@ void ArgMinMaxInferMeta(const MetaTensor& x, } out->set_dims(phi::make_ddim(vec)); - if (dtype == 2) { - out->set_dtype(DataType::INT32); - } else if (dtype == 3) { - out->set_dtype(DataType::INT64); + if (dtype == DataType::INT32 || dtype == DataType::INT64) { + out->set_dtype(dtype); } } @@ -427,6 +424,14 @@ void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out) { out->set_dtype(x.dtype()); } +void CINNBroadcastInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::vector& out_shape, + MetaTensor* out) { + out->set_dims(phi::make_ddim(out_shape)); + out->set_dtype(x.dtype()); +} + void ClassCenterSampleInferMeta(const MetaTensor& label, int num_classes, int num_samples, @@ -555,7 +560,7 @@ void CumWithIndicesInferMeta(const MetaTensor& x, phi::errors::InvalidArgument("dtype of indices must be int32 or int64")); if (indices_type == DataType::INT32) { - int _axis; + int _axis = 0; if (axis < 0) { _axis = axis + x_dims.size(); } else { @@ -1682,11 +1687,11 @@ void FrameInferMeta(const MetaTensor& x, "Attribute(axis) of FrameOp should 0 or -1, but got %s.", axis)); std::vector output_shape; - int seq_length; - int n_frames; + int seq_length = 0; + int n_frames = 0; 
 void FlashAttnInferMeta(const MetaTensor& q, const MetaTensor& k, const MetaTensor& v,
diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 47c4b9826da4a8..797835a1abd511 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -63,6 +63,15 @@ void BoxCoderInferMeta(const MetaTensor& prior_box, MetaTensor* output_box, MetaConfig config = MetaConfig()); +void DpsgdInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& learning_rate, + float clip, + float batch_size, + float sigma, + int size, + MetaTensor* param_out); + void FlashAttnInferMeta(const MetaTensor& q, const MetaTensor& k, const MetaTensor& v,
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 6eaff66c583898..243f0b232395e4 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -148,18 +148,19 @@ void ArgMinMaxInferMeta(const MetaTensor& x, const Scalar& axis, bool keepdims, bool flatten, - int dtype, + DataType dtype, MetaTensor* out, MetaConfig config) { PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == 2 || dtype == 3), + (dtype == DataType::UNDEFINED || dtype == DataType::INT32 || + dtype == DataType::INT64), true, phi::errors::InvalidArgument( "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " "received [%s]", DataTypeToString(DataType::INT32), DataTypeToString(DataType::INT64), - DataTypeToString(phi::TransToPhiDataType(dtype)))); + DataTypeToString(dtype))); if (!config.is_runtime && axis.FromTensor()) { std::vector<int64_t> vec; @@ -177,10 +178,8 @@ void ArgMinMaxInferMeta(const MetaTensor& x, } } out->set_dims(phi::make_ddim(vec)); - if (dtype == 2) { - out->set_dtype(DataType::INT32); - } else if (dtype == 3) { - out->set_dtype(DataType::INT64); + if (dtype == DataType::INT32 || dtype == DataType::INT64) { + out->set_dtype(dtype); } return; } @@ -216,7 +215,7 @@ void ArgMinMaxInferMeta(const MetaTensor& x, if (int_axis < 0) int_axis += x_rank; if (config.is_runtime) { - if (dtype == phi::TransToProtoVarType(DataType::INT32)) { + if (dtype == DataType::INT32) { int64_t all_element_num = 0; if (flatten) { all_element_num = phi::product(x_dims); @@ -253,10 +252,8 @@ void ArgMinMaxInferMeta(const MetaTensor& x, } out->set_dims(phi::make_ddim(vec)); - if (dtype == 2) { - out->set_dtype(DataType::INT32); - } else if (dtype == 3) { - out->set_dtype(DataType::INT64); + if (dtype == DataType::INT32 || dtype == DataType::INT64) { + out->set_dtype(dtype); } } @@ -427,6 +424,14 @@ void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out) { out->set_dtype(x.dtype()); } +void CINNBroadcastInferMeta(const MetaTensor& x, + const std::vector<int64_t>& axes, + const std::vector<int64_t>& out_shape, + MetaTensor* out) { + out->set_dims(phi::make_ddim(out_shape)); + out->set_dtype(x.dtype()); +} + void ClassCenterSampleInferMeta(const MetaTensor& label, int num_classes, int num_samples, @@ -555,7 +560,7 @@ void CumWithIndicesInferMeta(const MetaTensor& x, phi::errors::InvalidArgument("dtype of indices must be int32 or int64")); if (indices_type == DataType::INT32) { - int _axis; + int _axis = 0; if (axis < 0) { _axis = axis + x_dims.size(); } else { @@ -1682,11 +1687,11 @@ void FrameInferMeta(const MetaTensor& x, "Attribute(axis) of FrameOp should be 0 or -1, but got %s.", axis)); std::vector<int64_t> output_shape; - int seq_length; - int n_frames; + int seq_length = 0; + int n_frames = 0; - int start_axis; - int end_axis; + int start_axis = 0; + int end_axis = 0; if (axis == 0) { seq_length = static_cast<int>(x_dims[0]); @@ -2566,12 +2571,12 @@ void OverlapAddInferMeta(const MetaTensor& x, "Attribute(axis) of OverlapAddOp should be 0 or -1, but got %s.", axis)); std::vector<int64_t> output_shape; - int n_frames; - int frame_length; - int seq_length; + int n_frames = 0; + int frame_length = 0; + int seq_length = 0; - int start_axis; - int end_axis; + int start_axis = 0; + int end_axis = 0; if (axis == 0) { n_frames = static_cast<int>(x_dims[0]); frame_length = static_cast<int>(x_dims[1]); @@ -3143,8 +3148,8 @@ void QrInferMeta(const MetaTensor& x, x_dims.size(), 2, phi::errors::InvalidArgument("the rank of input must be greater than 2")); - bool compute_q; - bool reduced_mode; + bool compute_q = false; + bool reduced_mode = false; int m = static_cast<int>(x_dims[x_rank - 2]); int n = static_cast<int>(x_dims[x_rank - 1]); int min_mn = std::min(m, n); @@ -3964,8 +3969,8 @@ void SqueezeWithXShapeInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* xshape, MetaConfig config) { - SqueezeInferMeta(x, axes, out, config); const auto& x_dims = x.dims(); + SqueezeInferMeta(x, axes, out, config); std::vector<int64_t> xshape_dims(x_dims.size() + 1); xshape_dims[0] = 0; for (int i = 0; i < x_dims.size(); ++i) {
diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index a3b7e87d86d0bf..d79b53a71097e4 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -49,7 +49,7 @@ void ArgMinMaxInferMeta(const MetaTensor& x, const Scalar& axis, bool keepdims, bool flatten, - int dtype, + DataType dtype, MetaTensor* out, MetaConfig config = MetaConfig()); @@ -89,6 +89,11 @@ void CheckNumericsInferMeta(const MetaTensor& tensor, void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out); +void CINNBroadcastInferMeta(const MetaTensor& x, + const std::vector<int64_t>& axes, + const std::vector<int64_t>& out_shape, + MetaTensor* output); + void ClassCenterSampleInferMeta(const MetaTensor& label, int num_classes, int num_samples,
diff --git a/paddle/phi/kernels/all_gather_kernel.h b/paddle/phi/kernels/all_gather_kernel.h index cc19f88202d2a8..1bde193a7b5cd1 100644 --- a/paddle/phi/kernels/all_gather_kernel.h +++ b/paddle/phi/kernels/all_gather_kernel.h @@ -34,7 +34,9 @@ void AllGather(const Context& dev_ctx, MetaTensor* out_meta_ptr = &out_meta; AllGatherInferMeta(phi::MetaTensor(x), nranks, out_meta_ptr); - AllGatherKernel<T, Context>(dev_ctx, x, nranks, out); + if (x.initialized()) { + AllGatherKernel<T, Context>(dev_ctx, x, nranks, out); + } } } // namespace phi
diff --git a/paddle/phi/kernels/all_reduce_kernel.h b/paddle/phi/kernels/all_reduce_kernel.h index 3583bde3416b37..2ec072bfd3ff27 100644 --- a/paddle/phi/kernels/all_reduce_kernel.h +++ b/paddle/phi/kernels/all_reduce_kernel.h @@ -35,7 +35,9 @@ void AllReduce(const Context& dev_ctx, MetaTensor* out_meta_ptr = &out_meta; AllReduceInferMeta(phi::MetaTensor(x), out_meta_ptr); - AllReduceKernel<T, Context>(dev_ctx, x, reduce_type, out); + if (x.initialized()) { + AllReduceKernel<T, Context>(dev_ctx, x, reduce_type, out); + } } } // namespace phi
diff --git a/paddle/phi/kernels/all_to_all_kernel.h b/paddle/phi/kernels/all_to_all_kernel.h index 5444960b1f69e3..5ac951deba5fb6 100644 --- a/paddle/phi/kernels/all_to_all_kernel.h +++ b/paddle/phi/kernels/all_to_all_kernel.h @@ -30,7 +30,9 @@ void AllToAll(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { MetaTensor* out_meta_ptr = &out_meta; AllToAllInferMeta(phi::MetaTensor(x), out_meta_ptr); - AllToAllKernel<T, Context>(dev_ctx, x, out); + if
(x.initialized()) { + AllToAllKernel(dev_ctx, x, out); + } } } // namespace phi diff --git a/paddle/phi/kernels/arg_min_max_kernel.h b/paddle/phi/kernels/arg_min_max_kernel.h index 258c8f21e0540b..5f1b4fc934fec2 100644 --- a/paddle/phi/kernels/arg_min_max_kernel.h +++ b/paddle/phi/kernels/arg_min_max_kernel.h @@ -25,7 +25,7 @@ void ArgMinKernel(const Context& dev_ctx, const Scalar& axis, bool keepdims, bool flatten, - int dtype, + DataType dtype, DenseTensor* out); template @@ -34,7 +34,7 @@ void ArgMaxKernel(const Context& dev_ctx, const Scalar& axis, bool keepdims, bool flatten, - int dtype, + DataType dtype, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/concat_kernel.h b/paddle/phi/kernels/concat_kernel.h index d3b99449a06f2d..e4f2d25f098334 100644 --- a/paddle/phi/kernels/concat_kernel.h +++ b/paddle/phi/kernels/concat_kernel.h @@ -41,7 +41,10 @@ void Concat(const Context& dev_ctx, MetaTensor meta_out(dense_out); ConcatInferMeta(meta_x_ptr, axis.to(), &meta_out); - ConcatKernel(dev_ctx, x, axis, dense_out); + + if (x[0]->initialized()) { + ConcatKernel(dev_ctx, x, axis, dense_out); + } } template diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index 6816a353ce5042..65bde5601128f8 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -307,7 +307,8 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(stanh_grad, STanhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_grad, SoftplusGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(softplus_grad, + SoftplusGradKernel) PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad, ReluDoubleGradKernel) @@ -320,8 +321,8 @@ PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(sqrt_double_grad, SqrtDoubleGradKernel) PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(rsqrt_double_grad, RsqrtDoubleGradKernel) -PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(softplus_double_grad, - SoftplusDoubleGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL_WITH_COMPLEX(softplus_double_grad, + SoftplusDoubleGradKernel) PD_REGISTER_KERNEL(tanh_triple_grad, CPU, diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 813a7ffc7ba422..a8169df1021d2b 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -201,7 +201,7 @@ PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(stanh, STanhKernel) PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel) PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel) PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel) -PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(softplus, SoftplusKernel) PD_REGISTER_KERNEL(exp, CPU, diff --git a/paddle/phi/kernels/cpu/allclose_kernel.cc b/paddle/phi/kernels/cpu/allclose_kernel.cc index c6a512aa95cb18..fd6cf3aebc2687 100644 --- a/paddle/phi/kernels/cpu/allclose_kernel.cc +++ b/paddle/phi/kernels/cpu/allclose_kernel.cc @@ -30,7 +30,7 @@ void AllCloseKernel(const Context& dev_ctx, const Scalar& atol, bool equal_nan, DenseTensor* out) { - double rtol_v, atol_v; + double rtol_v = NAN, atol_v = NAN; if (rtol.dtype() == DataType::FLOAT64) { rtol_v = rtol.to(); } else if (rtol.dtype() == DataType::FLOAT32) { @@ -58,7 
+58,7 @@ void AllCloseKernel(const Context& dev_ctx, auto num = x.numel(); for (int64_t i = 0; i < num; ++i) { const T a = in_a[i], b = in_b[i]; - bool val; + bool val = false; if (std::isnan(a) || std::isnan(b)) { val = equal_nan && std::isnan(a) == std::isnan(b); } else { diff --git a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc index 20dfd2faff8a42..ce00926101f2cc 100644 --- a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc @@ -151,9 +151,9 @@ void ArgMinMaxKernel(const Context& dev_ctx, const Scalar& axis, bool keepdims, bool flatten, - int dtype, + DataType dtype, DenseTensor* out) { - if (dtype < 0) { + if (dtype == DataType::UNDEFINED) { phi::VisitDataTypeTiny( phi::DataType::INT64, VisitDataArgMinMaxFunctor( @@ -161,7 +161,7 @@ void ArgMinMaxKernel(const Context& dev_ctx, return; } phi::VisitDataTypeTiny( - phi::TransToPhiDataType(dtype), + dtype, VisitDataArgMinMaxFunctor( dev_ctx, x, axis.to(), keepdims, flatten, out)); } @@ -172,7 +172,7 @@ void ArgMinKernel(const Context& dev_ctx, const Scalar& axis, bool keepdims, bool flatten, - int dtype, + DataType dtype, DenseTensor* out) { ArgMinMaxKernel( dev_ctx, x, axis, keepdims, flatten, dtype, out); @@ -184,7 +184,7 @@ void ArgMaxKernel(const Context& dev_ctx, const Scalar& axis, bool keepdims, bool flatten, - int dtype, + DataType dtype, DenseTensor* out) { ArgMinMaxKernel( dev_ctx, x, axis, keepdims, flatten, dtype, out); diff --git a/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc index 7a95e47047a103..071140a2a54200 100644 --- a/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc @@ -51,8 +51,8 @@ void CumprodGradKernel(const Context& dev_ctx, size_t numel = outer_dim * mid_dim * inner_dim; // deal with complex - const T* x_data_deal; - const T* out_data_deal; + const T* x_data_deal = nullptr; + const T* out_data_deal = nullptr; Allocator::AllocationPtr x_conj; Allocator::AllocationPtr out_conj; if (phi::IsComplexType(x.dtype())) { diff --git a/paddle/phi/kernels/cpu/diag_kernel.cc b/paddle/phi/kernels/cpu/diag_kernel.cc index 1576d80b15206b..fb15fcbe61f7e6 100644 --- a/paddle/phi/kernels/cpu/diag_kernel.cc +++ b/paddle/phi/kernels/cpu/diag_kernel.cc @@ -32,7 +32,7 @@ void DiagKernel(const Context& dev_ctx, T* out_data = dev_ctx.template Alloc(out); auto out_dims = out->dims(); - int64_t i; + int64_t i = 0; if (x_dims.size() <= 1) { phi::funcs::SetConstant set_padding_value; set_padding_value(dev_ctx, out, static_cast(padding_value)); diff --git a/paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc b/paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc index b8156459f2a923..aabca4c852e04b 100644 --- a/paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc +++ b/paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc @@ -46,7 +46,7 @@ void DistributeFpnProposalsKernel( } std::vector fpn_rois_lod; - int fpn_rois_num; + int fpn_rois_num = 0; if (rois_num.get_ptr()) { fpn_rois_lod = funcs::GetLodFromRoisNum(dev_ctx, rois_num.get_ptr()); } else { diff --git a/paddle/phi/kernels/cpu/eigvals_kernel.cc b/paddle/phi/kernels/cpu/eigvals_kernel.cc index b0fc48db5739c2..cd4aaca2ecf83f 100644 --- a/paddle/phi/kernels/cpu/eigvals_kernel.cc +++ b/paddle/phi/kernels/cpu/eigvals_kernel.cc @@ -216,7 +216,7 @@ void EigvalsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { // query workspace size T qwork; - int info; + int info = 
0; funcs::lapackEig>('N', 'N', static_cast(n_dim), diff --git a/paddle/phi/kernels/cpu/group_norm_kernel.cc b/paddle/phi/kernels/cpu/group_norm_kernel.cc index a041c855346756..35975018dca1cc 100644 --- a/paddle/phi/kernels/cpu/group_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/group_norm_kernel.cc @@ -91,7 +91,7 @@ void GroupNormKernel(const Context& dev_ctx, if (data_layout == DataLayout::kNCHW) { for (int cid = 0; cid < number; cid++) { - int imid; + int imid = 0; for (imid = 0; imid < imsize - (imsize % M); imid += M, iter_x_data += M) { // TODO(gaoxiang): Because AVX/AVX2/AVX512 can not directly used @@ -128,7 +128,7 @@ void GroupNormKernel(const Context& dev_ctx, } else { for (int cid = 0; cid < number; cid++) { iter_x_data = tmp_x + cid; - int imid; + int imid = 0; for (imid = 0; imid < imsize - (imsize % M); imid += M, iter_x_data += M * C) { // TODO(gaoxiang): Because AVX/AVX2/AVX512 can not directly used diff --git a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc index 7d5f60731f13dc..14937ea613936b 100644 --- a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc @@ -170,7 +170,7 @@ void InstanceNormDoubleGradKernel(const Context& dev_ctx, const auto* ddBias = ddbias.get_ptr(); phi::funcs::SetConstant set_constant; const auto& x_dims = x.dims(); - int N, C, H, W, D; + int N = 0, C = 0, H = 0, W = 0, D = 0; funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); const int sample_size = static_cast(x.numel() / N / C); const int NxC = N * C; diff --git a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc index f1478d5e3b3e7e..e32738b4588c83 100644 --- a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc @@ -407,7 +407,7 @@ static void Interpolate1DCPUBwd( int align_mode, DenseTensor* input_grad) { const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; + int n = 0, c = 0, in_d = 0, in_h = 0, in_w = 0; funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); float scale_w = -1.0; @@ -508,7 +508,7 @@ static void Interpolate2DCPUBwd( int align_mode, DenseTensor* input_grad) { const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; + int n = 0, c = 0, in_d = 0, in_h = 0, in_w = 0; funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); float scale_h = -1; @@ -674,7 +674,7 @@ static void Interpolate3DCPUBwd( int align_mode, DenseTensor* input_grad) { const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; + int n = 0, c = 0, in_d = 0, in_h = 0, in_w = 0; funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); float scale_d = -1; diff --git a/paddle/phi/kernels/cpu/interpolate_kernel.cc b/paddle/phi/kernels/cpu/interpolate_kernel.cc index 198cba7d1e9488..7c957657ceb39e 100644 --- a/paddle/phi/kernels/cpu/interpolate_kernel.cc +++ b/paddle/phi/kernels/cpu/interpolate_kernel.cc @@ -561,7 +561,7 @@ static void Interpolate1DCPUFwd( int align_mode, DenseTensor* output) { const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; + int n = 0, c = 0, in_d = 0, in_h = 0, in_w = 0; funcs::ExtractNCDWH(x.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); float scale_w = -1.; @@ -662,7 +662,7 @@ static void Interpolate2DCPUFwd( 
int align_mode, DenseTensor* output) { const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; + int n = 0, c = 0, in_d = 0, in_h = 0, in_w = 0; funcs::ExtractNCDWH(x.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); float scale_h = -1; @@ -833,7 +833,7 @@ static void Interpolate3DCPUFwd( int align_mode, DenseTensor* output) { const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; + int n = 0, c = 0, in_d = 0, in_h = 0, in_w = 0; funcs::ExtractNCDWH(x.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); float scale_d = -1; diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index 3d21c49ee1e2bc..0713725127190a 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -39,7 +39,7 @@ void LapackSVD(const T* x_data, T* eigenvalues_data, int rows, int cols) { int lwork = 3 * mn + std::max(mx, 7 * mn); std::vector work(lwork); std::vector iwork(8 * mn); - int info; + int info = 0; phi::funcs::lapackSvd(jobz, rows, diff --git a/paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc b/paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc index 336af33d8679b6..aa04288124a9b7 100644 --- a/paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc +++ b/paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc @@ -381,7 +381,7 @@ void MultiClassNMS(const Context& ctx, *num_nmsed_out = num_det; const T* scores_data = scores.data(); if (keep_top_k > -1 && num_det > keep_top_k) { - const T* sdata; + const T* sdata = nullptr; std::vector>> score_index_pairs; for (const auto& it : *indices) { int label = it.first; @@ -441,7 +441,7 @@ void MultiClassOutput(const Context& ctx, auto* scores_data = scores.data(); auto* bboxes_data = bboxes.data(); auto* odata = out->data(); - const T* sdata; + const T* sdata = nullptr; DenseTensor bbox; bbox.Resize({scores.dims()[0], box_size}); int count = 0; @@ -456,7 +456,7 @@ void MultiClassOutput(const Context& ctx, for (auto idx : indices) { odata[count * out_dim] = label; // label - const T* bdata; + const T* bdata = nullptr; if (scores_size == 3) { bdata = bboxes_data + idx * box_size; odata[count * out_dim + 1] = sdata[idx]; // score diff --git a/paddle/phi/kernels/cpu/norm_grad_kernel.cc b/paddle/phi/kernels/cpu/norm_grad_kernel.cc index 6d51a64c76bb1c..8bc46fa6cdffc6 100644 --- a/paddle/phi/kernels/cpu/norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/norm_grad_kernel.cc @@ -40,7 +40,7 @@ void NormGradKernel(const Context& ctx, auto xdim = in_x->dims(); if (axis < 0) axis = xdim.size() + axis; - int pre, n, post; + int pre = 0, n = 0, post = 0; funcs::GetPrePostNumel(xdim, axis, &pre, &n, &post); auto* place = ctx.eigen_device(); diff --git a/paddle/phi/kernels/cpu/norm_kernel.cc b/paddle/phi/kernels/cpu/norm_kernel.cc index 21af086515d71c..73540f83605920 100644 --- a/paddle/phi/kernels/cpu/norm_kernel.cc +++ b/paddle/phi/kernels/cpu/norm_kernel.cc @@ -33,10 +33,10 @@ void NormKernel(const Context& ctx, auto xdim = x.dims(); T eps = epsilon; if (axis < 0) axis = xdim.size() + axis; - int pre, n, post; + int pre = 0, n = 0, post = 0; funcs::GetPrePostNumel(xdim, axis, &pre, &n, &post); - DenseTensor* out_norm; + DenseTensor* out_norm = nullptr; DenseTensor out_norm_tmp; if (is_test) { auto out_dim = x.dims(); diff --git a/paddle/phi/kernels/cpu/p_norm_kernel.cc b/paddle/phi/kernels/cpu/p_norm_kernel.cc index 7a683438176bb9..3a837c96ec58a9 100644 --- 
a/paddle/phi/kernels/cpu/p_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/p_norm_kernel.cc @@ -58,7 +58,7 @@ void PNormKernel(const Context& dev_ctx, auto xdim = in_x->dims(); if (axis < 0) axis = xdim.size() + axis; - int pre, n, post; + int pre = 0, n = 0, post = 0; GetDims(xdim, axis, &pre, &n, &post, asvector); for (int i = 0; i < xdim.size(); i++) { diff --git a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc index 17539957a0d443..3a517cfa1fb612 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc @@ -43,7 +43,7 @@ void PsroiPoolGradKernel(const Context& ctx, DenseTensor rois_batch_id_list; rois_batch_id_list.Resize({rois_num_t}); int* rois_batch_id_data = ctx.template Alloc(&rois_batch_id_list); - int rois_batch_size; + int rois_batch_size = 0; if (rois_num.get_ptr()) { rois_batch_size = static_cast(rois_num->numel()); auto* rois_num_t_data = rois_num->data(); diff --git a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc index fe48ee9e7e88e3..3b15135133049f 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc @@ -53,7 +53,7 @@ void PsroiPoolKernel(const Context& ctx, rois_batch_id_list.Resize({rois_num_t}); int* rois_batch_id_data = ctx.template Alloc(&rois_batch_id_list); - int rois_batch_size; + int rois_batch_size = 0; if (rois_num.get_ptr()) { rois_batch_size = static_cast(rois_num->numel()); auto* rois_num_data = rois_num->data(); diff --git a/paddle/phi/kernels/cpu/qr_kernel.cc b/paddle/phi/kernels/cpu/qr_kernel.cc index ac61e8e172ae6e..194906ae1dc346 100644 --- a/paddle/phi/kernels/cpu/qr_kernel.cc +++ b/paddle/phi/kernels/cpu/qr_kernel.cc @@ -29,8 +29,8 @@ void QrKernel(const Context& ctx, const std::string& mode, DenseTensor* q, DenseTensor* r) { - bool compute_q; - bool reduced_mode; + bool compute_q = false; + bool reduced_mode = false; std::tie(compute_q, reduced_mode) = phi::funcs::ParseQrMode(mode); auto numel = x.numel(); PADDLE_ENFORCE_GT( diff --git a/paddle/phi/kernels/cpu/quantize_linear_kernel.cc b/paddle/phi/kernels/cpu/quantize_linear_kernel.cc new file mode 100644 index 00000000000000..a7f3954407a526 --- /dev/null +++ b/paddle/phi/kernels/cpu/quantize_linear_kernel.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
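The string of one-line hunks above all applies the same hardening pattern: locals that are filled only through out-parameters (funcs::ExtractNCDWH, funcs::GetPrePostNumel, GetDims) or assigned on just one branch are now explicitly initialized (`int n = 0`, `const T* sdata = nullptr`), which satisfies clang-tidy's cppcoreguidelines-init-variables and turns any missed assignment into a deterministic zero/nullptr instead of a read of indeterminate memory. A minimal sketch of the pattern; `ExtractDims` below is a hypothetical stand-in for the real out-parameter helpers:

```cpp
#include <cstdint>
#include <iostream>

// Hypothetical stand-in for out-parameter helpers such as
// funcs::ExtractNCDWH / funcs::GetPrePostNumel: results come back
// only through the pointers.
void ExtractDims(int64_t numel, int* n, int* c) {
  *c = 4;
  *n = static_cast<int>(numel / *c);
}

int main() {
  // Before the patch: `int n, c;` is indeterminate if the helper ever
  // grows an early-return path. After: reads are deterministic zeros.
  int n = 0, c = 0;
  ExtractDims(64, &n, &c);
  std::cout << n << " x " << c << '\n';  // prints "16 x 4"
  return 0;
}
```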
+ +#include + +#include "paddle/phi/kernels/quantize_linear_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/impl/quantize_linear_impl.h" + +namespace phi { + +template +struct DequantizeFunctor { + void operator()(const phi::CPUContext& dev_ctx, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, + T max_range, + phi::DenseTensor* out) { + auto in_e = phi::EigenVector::Flatten(*in); + const T* scale_factor = scale->data(); + auto out_e = phi::EigenVector::Flatten(*out); + + auto& dev = *dev_ctx.eigen_device(); + out_e.device(dev) = in_e * scale_factor[0] / max_range; + } +}; + +template +struct ChannelDequantizeFunctorV2 { + void operator()(const phi::CPUContext& dev_ctx, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, + T max_range, + const int quant_axis, + phi::DenseTensor* out) { + // Dequant op is before quantized op + // Dequantize the weight of quantized op + auto in_dims = in->dims(); + const int64_t channel = in_dims[quant_axis]; + const T* scale_factor = scale->data(); + if (quant_axis == 0) { + for (int64_t i = 0; i < channel; i++) { + T s = scale_factor[i]; + phi::DenseTensor one_channel_in = in->Slice(i, i + 1); + phi::DenseTensor one_channel_out = out->Slice(i, i + 1); + auto in_e = phi::EigenVector::Flatten(one_channel_in); + auto out_e = phi::EigenVector::Flatten(one_channel_out); + auto& dev = *dev_ctx.eigen_device(); + out_e.device(dev) = in_e * s / max_range; + } + } else if (quant_axis == 1) { + int64_t out_iter = 1; + for (int i = 0; i < quant_axis; i++) { + out_iter *= in_dims[i]; + } + int64_t step_i = in->numel() / out_iter; + int64_t step_j = in->numel() / (out_iter * channel); + auto* in_data = in->data(); + auto* out_data = dev_ctx.Alloc(out, out->numel() * sizeof(T)); + for (int64_t i = 0; i < out_iter; i++) { + for (int64_t j = 0; j < channel; j++) { + auto* cur_in = in_data + i * step_i + j * step_j; + auto* cur_out = out_data + i * step_i + j * step_j; + T s = scale_factor[j]; + for (int64_t k = 0; k < step_j; k++) { + *cur_out = (*cur_in) * s / max_range; + ++cur_in; + ++cur_out; + } + } + } + } + } +}; + +template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; + +} // namespace phi + +PD_REGISTER_KERNEL(dequantize_linear, + CPU, + ALL_LAYOUT, + phi::DeQuantizeLinearKernel, + float, + int8_t, + double) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/cpu/reduce_mean_kernel.cc b/paddle/phi/kernels/cpu/reduce_mean_kernel.cc index a8d6723cce6d10..ea098d09a5d562 100644 --- a/paddle/phi/kernels/cpu/reduce_mean_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_mean_kernel.cc @@ -43,5 +43,7 @@ PD_REGISTER_KERNEL(mean_raw, float, double, bool, + int, + int64_t, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc index 81868afc46318a..119f4ea1b0ac40 100644 --- a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc @@ -29,7 +29,7 @@ void bilinear_interpolate_gradient(const int height, const T out_grad_this_bin, const T count, T* batch_grad_data) { - int x_low, y_low, x_high, y_high; 
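In the new quantize_linear_kernel.cc above, ChannelDequantizeFunctorV2 computes `out = in * scale[c] / max_range` with the scale indexed along `quant_axis`, walking the tensor as outer x channel x inner blocks when `quant_axis > 0` (its `step_i`/`step_j` are exactly the channel and inner strides). A self-contained sketch of that indexing, with plain std::vector stand-ins rather than the DenseTensor API:

```cpp
#include <cstdint>
#include <vector>

// Per-channel dequantization over a row-major tensor, mirroring the
// outer x channel x inner walk of ChannelDequantizeFunctorV2.
std::vector<float> Dequantize(const std::vector<float>& in,
                              const std::vector<int64_t>& dims,
                              const std::vector<float>& scale,
                              int quant_axis,
                              float max_range) {
  int64_t outer = 1, inner = 1;
  const int64_t channel = dims[quant_axis];
  for (int i = 0; i < quant_axis; ++i) outer *= dims[i];
  for (size_t i = static_cast<size_t>(quant_axis) + 1; i < dims.size(); ++i)
    inner *= dims[i];

  std::vector<float> out(in.size());
  for (int64_t o = 0; o < outer; ++o) {
    for (int64_t c = 0; c < channel; ++c) {
      const float s = scale[c];
      for (int64_t k = 0; k < inner; ++k) {
        const int64_t idx = (o * channel + c) * inner + k;
        out[idx] = in[idx] * s / max_range;  // out = in * scale[c] / max_range
      }
    }
  }
  return out;
}
```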
+ int x_low = 0, y_low = 0, x_high = 0, y_high = 0; T w1, w2, w3, w4; if (y < -1.0 || y > height || x < -1.0 || x > width) { w1 = w2 = w3 = w4 = 0; @@ -94,7 +94,7 @@ void RoiAlignGradKernel(const Context& dev_ctx, DenseTensor roi_batch_id_list = Empty(dev_ctx, {rois_num}); int* box_batch_id_data = roi_batch_id_list.data(); - int boxes_batch_size; + int boxes_batch_size = 0; if (boxes_num) { boxes_batch_size = static_cast(boxes_num->numel()); auto* boxes_num_data = boxes_num->data(); diff --git a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc index 704a2b4b610fcc..e25a581cbd9dd9 100644 --- a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc @@ -37,7 +37,7 @@ void RoiPoolGradKernel(const Context& dev_ctx, DenseTensor box_batch_id_list = Empty(dev_ctx, {rois_num}); int* box_batch_id_data = box_batch_id_list.data(); - int boxes_batch_size; + int boxes_batch_size = 0; if (boxes_num) { boxes_batch_size = static_cast(boxes_num->numel()); auto* boxes_num_data = boxes_num->data(); diff --git a/paddle/phi/kernels/cpu/sign_kernel.cc b/paddle/phi/kernels/cpu/sign_kernel.cc index 9ded252c5c5920..f03f39f80dcbed 100644 --- a/paddle/phi/kernels/cpu/sign_kernel.cc +++ b/paddle/phi/kernels/cpu/sign_kernel.cc @@ -21,4 +21,13 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? ] #include "paddle/phi/common/bfloat16.h" -PD_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, phi::SignKernel, float, double) {} +PD_REGISTER_KERNEL(sign, + CPU, + ALL_LAYOUT, + phi::SignKernel, + int8_t, + int16_t, + int32_t, + int64_t, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/svd_kernel.cc b/paddle/phi/kernels/cpu/svd_kernel.cc index 136835876249d2..a3f6f38fe47802 100644 --- a/paddle/phi/kernels/cpu/svd_kernel.cc +++ b/paddle/phi/kernels/cpu/svd_kernel.cc @@ -35,7 +35,7 @@ void LapackSvd( int lwork = full ? (4 * mn * mn + 6 * mn + mx) : (4 * mn * mn + 7 * mn); std::vector work(lwork); std::vector iwork(8 * mn); - int info; + int info = 0; phi::funcs::lapackSvd(jobz, rows, cols, @@ -98,7 +98,6 @@ void SvdKernel(const Context& dev_ctx, /*Create Tensors and output, set the dim ...*/ auto numel = X.numel(); DenseTensor trans_x = ::phi::TransposeLast2Dim(dev_ctx, X); - auto* x_data = trans_x.data(); auto x_dims = X.dims(); int rows = static_cast(x_dims[x_dims.size() - 2]); int cols = static_cast(x_dims[x_dims.size() - 1]); @@ -113,6 +112,7 @@ void SvdKernel(const Context& dev_ctx, 0, cols, errors::InvalidArgument("The col of Input(X) should be greater than 0.")); + auto* x_data = trans_x.data(); int batches = static_cast(numel / (rows * cols)); auto* U_out = dev_ctx.template Alloc>(U); auto* VH_out = dev_ctx.template Alloc>(VH); diff --git a/paddle/phi/kernels/cpu/weight_quantize_kernel.cc b/paddle/phi/kernels/cpu/weight_quantize_kernel.cc index 2539f37d121976..8db05de3110822 100644 --- a/paddle/phi/kernels/cpu/weight_quantize_kernel.cc +++ b/paddle/phi/kernels/cpu/weight_quantize_kernel.cc @@ -1,16 +1,16 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/phi/kernels/weight_quantize_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" diff --git a/paddle/phi/kernels/cpu/yolo_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/yolo_loss_grad_kernel.cc index 75fcf48cd4acf8..c876718d8a8b1c 100644 --- a/paddle/phi/kernels/cpu/yolo_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/yolo_loss_grad_kernel.cc @@ -169,7 +169,7 @@ void YoloLossGradKernel(const Context& dev_ctx, T* input_grad_data = dev_ctx.template Alloc(input_grad); memset(input_grad_data, 0, input_grad->numel() * sizeof(T)); - const T* gt_score_data; + const T* gt_score_data = nullptr; DenseTensor gtscore; if (!(gt_score.is_initialized())) { gtscore.Resize({n, b}); diff --git a/paddle/phi/kernels/cpu/yolo_loss_kernel.cc b/paddle/phi/kernels/cpu/yolo_loss_kernel.cc index 275e83cc9b40fa..280ac791d049bb 100644 --- a/paddle/phi/kernels/cpu/yolo_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/yolo_loss_kernel.cc @@ -229,7 +229,7 @@ void YoloLossKernel(const Context& dev_ctx, gt_match_mask->Resize({n, b}); int* gt_match_mask_data = dev_ctx.template Alloc(gt_match_mask); - const T* gt_score_data; + const T* gt_score_data = nullptr; DenseTensor gtscore; if (!(gt_score.is_initialized())) { gtscore.Resize({n, b}); diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index c50194bfaf009a..b2c2d493c48ad3 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -799,6 +799,31 @@ struct SoftplusGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct SoftplusGradFunctor> + : public BaseActivationFunctor> { + float beta; + float threshold; + typename BaseActivationFunctor>::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { + auto x_beta = static_cast>(beta) * x; // NOLINT + dx.device(d) = + (x_beta > static_cast>(threshold)) + .select(dout, + dout / (static_cast>(1) + (-x_beta).exp()) + .unaryExpr(Conj())); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + template struct SoftplusDoubleGradFunctor : public BaseActivationFunctor { float beta; @@ -3681,7 +3706,7 @@ struct CudaSoftplusFunctor : public BaseActivationFunctor { MPType x = static_cast(arg_x); MPType b = static_cast(beta); MPType t = static_cast(threshold); - MPType 
x_beta = x * beta; + MPType x_beta = x * static_cast(beta); return static_cast(x_beta > t ? x : log(one + exp(x_beta)) / b); } }; @@ -3711,6 +3736,34 @@ struct CudaSoftplusGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaSoftplusGradFunctor> + : public BaseActivationFunctor> { + using MPType = typename phi::dtype::MPTypeTrait>::Type; + MPType one = static_cast(1.0f); + float beta; + float threshold; + + typename BaseActivationFunctor>::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } + + // dx = x * beta > threshold ? dout : dout / (1 + exp(-beta * x)) + __device__ __forceinline__ ComplexType operator()( + const ComplexType arg_dout, const ComplexType arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType b = static_cast(beta); + MPType t = static_cast(threshold); + MPType x_beta = x * static_cast(beta); + return x_beta > t + ? dout + : static_cast>(dout / conj(one + exp(-x_beta))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + template struct CudaAtanhGradFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h index 558e7dc999cf8e..753aa44b0aa3ae 100644 --- a/paddle/phi/kernels/funcs/aligned_vector.h +++ b/paddle/phi/kernels/funcs/aligned_vector.h @@ -30,6 +30,11 @@ struct NeedVectorized { static constexpr bool value = sizeof(T) <= sizeof(float); }; +template +struct MaxWithOne { + static constexpr auto kValue = (N >= 1 ? N : 1); +}; + // Aligned vector generates vectorized load/store on CUDA. template struct alignas(sizeof(T) * Size) AlignedVector { diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 2ba3271d2c7df6..a1f9c1eb4346cb 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -511,318 +511,6 @@ void LaunchBroadcastKernel( #endif } -#ifndef PADDLE_WITH_XPU_KP -HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx( - int64_t src_idx, - const phi::Array &src_strides, - const phi::Array &dst_strides, - int rank) { - int64_t dst_idx = 0; - int64_t old_src_idx = src_idx; - for (int k = 0; k < rank; ++k) { - auto local_idx = src_idx / src_strides[k + 1]; - src_idx -= local_idx * src_strides[k + 1]; - - if (dst_strides[k] != dst_strides[k + 1]) { - dst_idx += local_idx * dst_strides[k + 1]; - } - } - return dst_idx; -} - -template -struct MaxWithOne { - static constexpr auto kValue = (N >= 1 ? 
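The two specializations just added extend the softplus backward pass to complex types: below the threshold the gradient is `dout / conj(1 + exp(-beta * x))` (the conjugate following the Wirtinger-style convention the other complex functors use), and above it the op is effectively linear, so `dout` passes through. A scalar model with std::complex, assuming the threshold test looks at the real part of `beta * x`:

```cpp
#include <complex>
#include <iostream>

// Scalar model of the complex softplus backward rule above:
// dx = x * beta > threshold ? dout : dout / conj(1 + exp(-beta * x)).
std::complex<float> SoftplusGrad(std::complex<float> dout,
                                 std::complex<float> x,
                                 float beta = 1.0f,
                                 float threshold = 20.0f) {
  const std::complex<float> x_beta = beta * x;
  if (x_beta.real() > threshold) return dout;  // linear regime: d/dx == 1
  const std::complex<float> one(1.0f, 0.0f);
  return dout / std::conj(one + std::exp(-x_beta));
}

int main() {
  const std::complex<float> g = SoftplusGrad({1.0f, 0.0f}, {0.5f, 0.25f});
  std::cout << g << '\n';
  return 0;
}
```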
N : 1); -}; - -template -struct ReadVecDataWithInt64Index { - template - static __device__ __forceinline__ void Apply( - const Array1 &in, - ArgsT *args, - int64_t idx, - const Array2 &need_broadcast, - const phi::Array &src_strides, - const Array3 &dst_strides, - int rank, - bool is_boundary) { - using Type = std::tuple_element_t; - if (is_boundary) { -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - std::get(args[i]) = in[Index][ConvertSrcIdxToDstIdx( - idx + i, src_strides, dst_strides[Index], rank)]; - } - } else { - if (!need_broadcast[Index]) { - kps::ReadData( - args, reinterpret_cast(in[Index]) + idx, 1); - } else { -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - std::get(args[i]) = in[Index][ConvertSrcIdxToDstIdx( - idx + i, src_strides, dst_strides[Index], rank)]; - } - } - } - } -}; - -template -__global__ void BroadcastKernelWithInt64Index( - const phi::Array::kValue> - &ins, - OutT *out, - phi::Array, - MaxWithOne::kValue> ins_strides, - phi::Array out_strides, - phi::Array::kValue> need_broadcasts, - int rank, - Functor functor) { - int64_t numel = out_strides[0]; - int64_t idx = - (static_cast(blockIdx.x) * blockDim.x + threadIdx.x) * VecSize; - int64_t stride = static_cast(blockDim.x) * gridDim.x * VecSize; - int64_t limit = numel - VecSize; - - using Traits = phi::funcs::FunctionTraits; - using ArgsT = typename Traits::ArgsTuple; - - ArgsT args[VecSize]; - phi::AlignedVector out_vec; - for (; idx <= limit; idx += stride) { - Unroller::step( - ins, args, idx, need_broadcasts, out_strides, ins_strides, rank, false); - -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - out_vec[i] = static_cast(Apply(functor, args[i])); - } - phi::Store(out_vec, out + idx); - } - - if (idx < numel) { - int remain = numel - idx; // remain is always less than VecSize, therefore - // `int` is enough here - Unroller::step( - ins, args, idx, need_broadcasts, out_strides, ins_strides, rank, true); - for (int i = 0; i < remain; ++i) { - out_vec[idx + i] = static_cast(Apply(functor, args[i])); - } - } -} - -template -struct LaunchBroadcastKernelWithInt64IndexHelper { - static void Run(const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - int axis, - Functor functor) { - PADDLE_THROW(phi::errors::PermissionDenied( - "Unreachable code branch. 
This may be a bug.")); - } -}; - -template -struct LaunchBroadcastKernelWithInt64IndexHelper { - static void Run(const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - int axis, - Functor functor) { - using Traits = phi::funcs::FunctionTraits; - using ArgsT = typename Traits::ArgsTuple; - ArgsT arg; - phi::Array::kValue> - ins_ptrs; - UnrollerWithoutVecSize::step(ins, arg, &ins_ptrs); - - auto *out_tensor = (*outs)[0]; - auto *out_ptr = ctx.Alloc(out_tensor); - - phi::Array, - MaxWithOne::kValue> - ins_expand_dims; - phi::Array broadcast_out_dims; - int rank; - if (Arity == 1) { - rank = ins[0]->dims().size(); - for (int i = 0; i < rank; ++i) { - broadcast_out_dims[i] = ins[0]->dims()[i]; - } - ins_expand_dims[0] = broadcast_out_dims; - } else if (Arity >= 2) { - CalculateBroadcastDims(ins[0]->dims().Get(), - ins[1]->dims().Get(), - ins[0]->dims().size(), - ins[1]->dims().size(), - axis, - ins_expand_dims[0].GetMutable(), - ins_expand_dims[1].GetMutable(), - broadcast_out_dims.GetMutable(), - &rank); - for (int i = 2; i < Arity; ++i) { - auto tmp_dims = broadcast_out_dims; - phi::Array tmp_expand_dims; - int tmp_rank; - PADDLE_ENFORCE_GE(rank, - ins[i]->dims().size(), - phi::errors::InvalidArgument( - "Unsupported reverse broadcast when the input " - "tensor number is larger than 2.")); - CalculateBroadcastDims(tmp_dims.Get(), - ins[i]->dims().Get(), - rank, - ins[i]->dims().size(), - axis, - tmp_expand_dims.GetMutable(), - ins_expand_dims[i].GetMutable(), - broadcast_out_dims.GetMutable(), - &tmp_rank); - PADDLE_ENFORCE_EQ(rank, - tmp_rank, - phi::errors::InvalidArgument( - "Wrong broadcast algorithm. This may be a bug.")); - } - } - - phi::Array, - MaxWithOne::kValue> - ins_strides; - phi::Array::kValue> need_broadcasts; - phi::Array out_strides; - const auto &out_dims = out_tensor->dims(); - if (rank <= out_dims.size()) { - out_strides = ShapeToStride(out_dims.Get(), rank); - } else { - out_strides = ShapeToStride(broadcast_out_dims.Get(), rank); - } - - for (int i = 0; i < Arity; ++i) { - ins_strides[i] = ShapeToStride(ins_expand_dims[i].Get(), rank); - need_broadcasts[i] = - !IsSameShape(out_strides.Get(), ins_strides[i].Get(), rank + 1); - } - - int64_t numel = out_strides[0]; - auto gpu_config = - phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize); - - BroadcastKernelWithInt64Index - <<>>(ins_ptrs, - out_ptr, - ins_strides, - out_strides, - need_broadcasts, - rank, - functor); - } - - private: - static void CalculateBroadcastDims(const int64_t *x_dims, - const int64_t *y_dims, - int nx, - int ny, - int axis, - int64_t *x_out_dims, - int64_t *y_out_dims, - int64_t *broadcast_out_dims, - int *length) { - PADDLE_ENFORCE_GE( - axis, 0, phi::errors::InvalidArgument("Invalid axis value: %d", axis)); - if (nx == ny) { - *length = nx; - for (int i = 0; i < nx; ++i) { - if (x_dims[i] != y_dims[i]) { - PADDLE_ENFORCE_EQ( - x_dims[i] == 1 || y_dims[i] == 1, - true, - phi::errors::InvalidArgument("Cannot broadcast input shape where " - "x_dims[%d] = %d, y_dims[%d] = %d.", - i, - x_dims[i], - i, - y_dims[i])); - } - broadcast_out_dims[i] = std::max(x_dims[i], y_dims[i]); - x_out_dims[i] = x_dims[i]; - y_out_dims[i] = y_dims[i]; - } - } else if (nx > ny) { - *length = nx; - for (int i = nx - axis; i < ny; ++i) { - PADDLE_ENFORCE_EQ( - y_dims[i], - 1, - phi::errors::InvalidArgument( - "The trailing Y.shape[%d] should be 1 but got %d.", - i, - y_dims[i])); - } - - for (int i = 0; i < nx; ++i) { - if (i >= axis && i - axis < ny) { - if (x_dims[i] != y_dims[i - axis]) { - 
PADDLE_ENFORCE_EQ(x_dims[i] == 1 || y_dims[i - axis] == 1, - true, - phi::errors::InvalidArgument( - "Cannot broadcast input shape where " - "x_dims[%d] = %d, y_dims[%d] = %d.", - i, - x_dims[i], - i - axis, - y_dims[i - axis])); - } - broadcast_out_dims[i] = std::max(x_dims[i], y_dims[i - axis]); - x_out_dims[i] = x_dims[i]; - y_out_dims[i] = y_dims[i - axis]; - } else { - broadcast_out_dims[i] = x_dims[i]; - x_out_dims[i] = x_dims[i]; - y_out_dims[i] = 1; - } - } - } else { - CalculateBroadcastDims(y_dims, - x_dims, - ny, - nx, - axis, - y_out_dims, - x_out_dims, - broadcast_out_dims, - length); - } - } - - static bool IsSameShape(const int64_t *x, const int64_t *y, int rank) { - for (int i = 0; i < rank; ++i) { - if (x[i] != y[i]) return false; - } - return true; - } - - static phi::Array ShapeToStride( - const int64_t *arr, int rank) { - phi::Array strides; - strides[rank] = 1; - for (int i = rank - 1; i >= 0; --i) { - strides[i] = strides[i + 1] * arr[i]; - } - return strides; - } -}; -#endif - template typename std::enable_if::value, void>::type BroadcastKernelForDifferentVecSize(const KPDevice &ctx, @@ -830,25 +518,6 @@ BroadcastKernelForDifferentVecSize(const KPDevice &ctx, std::vector *outs, int axis, Functor func) { -#ifndef PADDLE_WITH_XPU_KP - constexpr bool kEnabledInt64IndexKernel = (NumOuts == 1 && Arity <= 3); - bool use_int64_index_kernel = - kEnabledInt64IndexKernel && - (*outs)[0]->numel() >= std::numeric_limits::max(); - if (use_int64_index_kernel) { - LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, - ins, - outs, - axis, - func); - return; - } -#endif - auto classifier = BroadcastTypeClassifier(ins, outs, axis); LaunchBroadcastKernel( @@ -871,59 +540,6 @@ BroadcastKernelForDifferentVecSize(const KPDevice &ctx, int vec_size = GetVectorizedSizeForTensors(ins, *outs); #endif -#ifndef PADDLE_WITH_XPU_KP - constexpr bool kEnabledInt64IndexKernel = (NumOuts == 1 && Arity <= 3); - bool use_int64_index_kernel = - kEnabledInt64IndexKernel && - (*outs)[0]->numel() >= std::numeric_limits::max(); - if (use_int64_index_kernel) { - switch (vec_size) { - case VecSizeL: { - LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, - ins, - outs, - axis, - func); - break; - } - case VecSizeM: { - LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, - ins, - outs, - axis, - func); - break; - } - case VecSizeS: { - LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, - ins, - outs, - axis, - func); - break; - } - default: { - PADDLE_THROW(phi::errors::Unimplemented( - "Unsupported vectorized size: %d!", vec_size)); - break; - } - } - return; - } -#endif - auto classifier = BroadcastTypeClassifier(ins, outs, axis); switch (vec_size) { @@ -950,6 +566,195 @@ BroadcastKernelForDifferentVecSize(const KPDevice &ctx, } } +static void updateStridesDims(std::vector *strides, + std::vector *dims) { + for (int i = 1; i < strides->size(); i++) { + (*strides)[i] = (*strides)[i - 1] * (*dims)[i - 1]; + } + // reverse origin_in_dim and origin_in_stride if in's dim_size > 0 + std::reverse(strides->begin(), strides->end()); + std::reverse(dims->begin(), dims->end()); +} + +static void SliceTensor(DenseTensor *x, + const DenseTensor *share, + const std::vector &out_compute_dims, + int64_t offset) { + auto new_dim = make_ddim(out_compute_dims); + DenseTensorMeta meta(share->dtype(), + new_dim, + share->layout(), + offset * SizeOf(share->dtype())); + x->set_meta(meta); + x->ShareBufferWith(*(share), true); + x->Resize(new_dim); +} + +template +void BroadcastKernelSplit(const KPDevice &ctx, + const std::vector 
&ins, + std::vector *outs, + int axis, + Functor func, + const int64_t compute_size) { + const auto dims_simplifier = + BroadcastDimsSimplifier(ins, (*outs)[0]->dims(), axis); + if (VLOG_IS_ON(6)) { + DimsSimplifiedLogger::Log( + ins, outs, dims_simplifier, "GPU Broadcast"); + } + + int all_rank = dims_simplifier.rank; + std::vector origin_out_strides(all_rank, 1); + auto origin_in_dims = dims_simplifier.in_dims; + auto origin_out_dims = dims_simplifier.out_dims; + auto origin_in_strides = dims_simplifier.in_dims; + + // for split + std::vector loop_num_out(all_rank, 1); + std::vector loop_num_out_stride(all_rank, 1); + + // for input's offset + std::vector ins_offset(kArity, 0); + std::vector ins_scale_for_dim(kArity, 0); + + // init offset and check in's dim + for (int k = 0; k < kArity; k++) { + ins_scale_for_dim[k] = ins[k]->dims().size() == 0 ? 0 : 1; + if (ins_scale_for_dim[k]) { + origin_in_strides[k][0] = 1; + } + } + + updateStridesDims(&origin_out_strides, &origin_out_dims); + for (int k = 0; k < kArity; k++) { + if (ins_scale_for_dim[k]) { + updateStridesDims(&origin_in_strides[k], &origin_in_dims[k]); + } + } + + // init out_split_dim and in_split_dims + auto out_split_dim = origin_out_dims; + auto in_split_dims = origin_in_dims; + + // init + int64_t loop_num = 1; + int64_t split_idx = 0; + + for (int r = 0; r < all_rank; r++) { + // if compute_size is too small, split_size will be 0, but the + // dim size must be at least 1 + int64_t split_size = compute_size / origin_out_strides[r]; + out_split_dim[r] = std::max(split_size, static_cast(1)); + loop_num_out[r] = + (origin_out_dims[r] + out_split_dim[r] - 1) / out_split_dim[r]; + loop_num *= loop_num_out[r]; + + for (int k = 0; k < kArity; k++) { + if (ins_scale_for_dim[k]) { + in_split_dims[k][r] = std::min(origin_in_dims[k][r], out_split_dim[r]); + } + } + + // split_idx is the index of the last split dim + if (split_size != 0) { + split_idx = r; + break; + } + } + + loop_num_out_stride[all_rank - 1] = 1; + for (int r = all_rank - 2; r >= 0; r--) { + loop_num_out_stride[r] = loop_num_out_stride[r + 1] * loop_num_out[r + 1]; + } + + // compute + + for (int iter = 0; iter < loop_num; iter++) { + std::vector new_ins = {}; + std::vector new_outs = {}; + phi::DenseTensor tmp_in[kArity]; + DenseTensor tmp_out[NumOuts]; + + int64_t tmp_size = iter; + int64_t out_offset = 0; + // compute the offset before last split dim + for (int i = 0; i < split_idx; i++) { + auto repeat_times = tmp_size / loop_num_out_stride[i]; + out_offset += repeat_times * origin_out_strides[i]; + for (int k = 0; k < kArity; k++) { + if (ins_scale_for_dim[k]) { + ins_offset[k] += + (repeat_times % origin_in_dims[k][i]) * origin_in_strides[k][i]; + } + } + tmp_size = tmp_size % loop_num_out_stride[i]; + } + // tmp_size is the repeat index of the last split dim + auto pre_deal_size = tmp_size * out_split_dim[split_idx]; + out_offset += pre_deal_size * origin_out_strides[split_idx]; + // compute_size + auto remainder_size = origin_out_dims[split_idx] - pre_deal_size; + + // get current compute size + auto out_compute_dims = out_split_dim; + out_compute_dims[split_idx] = + std::min(out_split_dim[split_idx], remainder_size); + + // in + compute_size + auto in_compute_dims = in_split_dims; + for (int k = 0; k < kArity; k++) { + if (ins_scale_for_dim[k]) { + auto split_repeat = + origin_in_dims[k][split_idx] == origin_out_dims[split_idx] + ? 
tmp_size + : 0; + ins_offset[k] += split_repeat * in_split_dims[k][split_idx] * + origin_in_strides[k][split_idx]; + in_compute_dims[k][split_idx] = + std::min(in_split_dims[k][split_idx], out_compute_dims[split_idx]); + } + SliceTensor(&tmp_in[k], + ins[k], + in_compute_dims[k], + ins_scale_for_dim[k] * ins_offset[k]); + new_ins.emplace_back(&tmp_in[k]); + ins_offset[k] = 0; + } + + for (int n = 0; n < NumOuts; n++) { + SliceTensor(&tmp_out[n], (*outs)[n], out_compute_dims, out_offset); + new_outs.emplace_back(&tmp_out[n]); + } + + BroadcastKernelForDifferentVecSize( + ctx, new_ins, &new_outs, axis, func); + } +} + +template +void BroadcastKernelApply(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { +#ifndef PADDLE_WITH_XPU_KP + constexpr bool kEnabledInt64IndexKernel = (NumOuts == 1 && kArity <= 3); + // check whether need broadcast + auto compute_size = std::numeric_limits::max(); + bool use_int64_index_kernel = + kEnabledInt64IndexKernel && (*outs)[0]->numel() >= compute_size; + + if (use_int64_index_kernel) { // use_int64_index_kernel + BroadcastKernelSplit( + ctx, ins, outs, axis, func, compute_size); + return; + } +#endif + BroadcastKernelForDifferentVecSize( + ctx, ins, outs, axis, func); +} + template void BroadcastKernel(const KPDevice &ctx, const std::vector &ins, @@ -1014,7 +819,7 @@ void BroadcastKernel(const KPDevice &ctx, max_rank = std::max(max_rank, (*outs)[0]->dims().size()); } axis = axis == -1 ? max_rank - min_rank : axis; - BroadcastKernelForDifferentVecSize( + BroadcastKernelApply( ctx, ins, outs, axis, func); } diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h index 8db9a92f47d5aa..d186bda9ceb959 100644 --- a/paddle/phi/kernels/funcs/common_shape.h +++ b/paddle/phi/kernels/funcs/common_shape.h @@ -244,5 +244,30 @@ inline int64_t CalStride(phi::DDim dim) { return strides; } +inline std::vector GetPermuteShape(const std::vector &axis, + const DDim &in_dims) { + std::vector out_dims(in_dims.size()); + for (size_t i = 0; i < axis.size(); i++) { + out_dims[i] = in_dims[axis[i]]; + } + return out_dims; +} + +inline std::vector GetFlattenShape(const int axis, + const std::vector &in_dims) { + int64_t outer = 1, inner = 1; + for (int i = 0; i < static_cast(in_dims.size()); ++i) { + if (i < axis) { + outer *= in_dims[i]; + } else { + inner *= in_dims[i]; + } + } + std::vector out_shape(2); + out_shape[0] = outer; + out_shape[1] = inner; + return out_shape; +} + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/eigen/sign.cc b/paddle/phi/kernels/funcs/eigen/sign.cc index 450df3c764c12d..e71257f3f74aae 100644 --- a/paddle/phi/kernels/funcs/eigen/sign.cc +++ b/paddle/phi/kernels/funcs/eigen/sign.cc @@ -29,6 +29,10 @@ struct EigenSign { } }; +template struct EigenSign; +template struct EigenSign; +template struct EigenSign; +template struct EigenSign; template struct EigenSign; template struct EigenSign; diff --git a/paddle/phi/kernels/funcs/eigen/sign.cu b/paddle/phi/kernels/funcs/eigen/sign.cu index b630ba7bb6c408..58a4fe36232b6f 100644 --- a/paddle/phi/kernels/funcs/eigen/sign.cu +++ b/paddle/phi/kernels/funcs/eigen/sign.cu @@ -29,6 +29,10 @@ struct EigenSign { } }; +template struct EigenSign; +template struct EigenSign; +template struct EigenSign; +template struct EigenSign; template struct EigenSign; template struct EigenSign; template struct EigenSign; diff --git a/paddle/phi/kernels/funcs/emb_eltwise_layer_norm_functor.cu 
b/paddle/phi/kernels/funcs/emb_eltwise_layer_norm_functor.cu new file mode 100644 index 00000000000000..5d4611fa9d09a9 --- /dev/null +++ b/paddle/phi/kernels/funcs/emb_eltwise_layer_norm_functor.cu @@ -0,0 +1,210 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_CUDA +#include +#include + +#include // NOLINT +#endif +#ifdef PADDLE_WITH_HIP +#include + +#include +namespace cub = hipcub; +#endif + +#include "paddle/phi/kernels/funcs/emb_eltwise_layer_norm_functor.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_cuda_utils.h" + +namespace phi { +namespace funcs { + +template +__device__ inline T rsqrt(const T& x); + +template <> +__device__ inline float rsqrt(const float& x) { + return rsqrtf(x); +} + +template +__device__ __forceinline__ T local_rsqrt(T num) { + return rsqrt(static_cast(num)); +} +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) +__device__ __forceinline__ half local_rsqrt(half num) { return hrsqrt(num); } +#endif + +template +__device__ inline void LayerNorm(const phi::funcs::kvp& thread_data, + const int ld, + const int offset, + const T* bias, + const T* scale, + T* output, + T eps) { + using BlockReduce = cub::BlockReduce, TPB>; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ T mu; // mean + __shared__ T rsigma; // 1 / std.dev. 
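+ // thread_data accumulates the pair (sum(x) / ld, sum(x * x) / ld), so the
+ // single block reduce below yields mu = E[x] and E[x^2] together; the
+ // inverse standard deviation is then rsigma = rsqrt(E[x^2] - mu^2 + eps).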
+ + const auto sum_kv = BlockReduce(temp_storage).Reduce(thread_data, cub::Sum()); + + if (threadIdx.x == 0) { + mu = sum_kv.key; + rsigma = local_rsqrt(sum_kv.value - mu * mu + eps); + } + __syncthreads(); + + for (int i = threadIdx.x; i < ld; i += TPB) { + const int idx = offset + i; + const T val = output[idx]; + const T g(scale[i]); + const T b(bias[i]); + output[idx] = g * (val - mu) * rsigma + b; + } +} + +template +__global__ void EmbEltwiseLayernormKernel(int hidden, + const int64_t* ids, + const T* scale, + const T* bias, + const int64_t* embs, + T* output, + T eps, + int input_num) { + cub::Sum pair_sum; + // blockIdx.x: position in the sequence + // blockIdx.y: batch + // gridDim.x: Seq + // gridDim.y: Batch + + extern __shared__ int64_t array_id[]; + + const T rhidden = T(1.f) / T(hidden); + const int64_t seq_pos = blockIdx.y + blockIdx.x * gridDim.y; + if (threadIdx.x == 0) { + for (int i = 0; i < input_num; ++i) { + const int64_t* ids_p = reinterpret_cast(ids[i]); + array_id[i] = ids_p[seq_pos]; + } + } + __syncthreads(); + + const int64_t out_offset = seq_pos * hidden; + + phi::funcs::kvp thread_data(0, 0); + +#pragma unroll + for (int it = threadIdx.x; it < hidden; it += TPB) { + T val = 0; + for (int i = 0; i < input_num; ++i) { + val += reinterpret_cast(embs[i])[array_id[i] * hidden + it]; + } + + output[out_offset + it] = val; + const T rhiddenval = rhidden * val; + thread_data = + pair_sum(thread_data, phi::funcs::kvp(rhiddenval, rhiddenval * val)); + } + LayerNorm(thread_data, hidden, out_offset, bias, scale, output, eps); +} + +// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake +#ifndef __HIPCC__ // @{ Half kernel: EmbEltwiseLayernormKernel +template <> +__global__ void EmbEltwiseLayernormKernel(int hidden, + const int64_t* ids, + const half* scale, + const half* bias, + const int64_t* embs, + half* output, + half eps, + int input_num) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + cub::Sum pair_sum; + // blockIdx.x: position in the sequence + // blockIdx.y: batch + // gridDim.x: Seq + // gridDim.y: Batch + + extern __shared__ int64_t array_id[]; + + const half rhidden = half(1.f) / half(hidden); + const int64_t seq_pos = blockIdx.y + blockIdx.x * gridDim.y; + if (threadIdx.x == 0) { + for (int i = 0; i < input_num; ++i) { + const int64_t* ids_p = reinterpret_cast(ids[i]); + array_id[i] = ids_p[seq_pos]; + } + } + __syncthreads(); + + const int64_t out_offset = seq_pos * hidden; + + phi::funcs::kvp thread_data(0, 0); + +#pragma unroll + for (int it = threadIdx.x; it < hidden; it += 256) { + half val = 0; + for (int i = 0; i < input_num; ++i) { + val += reinterpret_cast(embs[i])[array_id[i] * hidden + it]; + } + + output[out_offset + it] = val; + const half rhiddenval = rhidden * val; + thread_data = pair_sum(thread_data, + phi::funcs::kvp(rhiddenval, rhiddenval * val)); + } + LayerNorm( + thread_data, hidden, out_offset, bias, scale, output, eps); +#endif +} +#endif // @} End Half kernel: EmbEltwiseLayernormKernel + +template +void EmbEltwiseLayerNormFunctor::operator()(int batch, + int seq_len, + int hidden, + const int64_t* ids, + const T* scale, + const T* bias, + const int64_t* embs, + T* output, + float eps, + int input_num, + gpuStream_t stream) { + const unsigned tpb = 256; + const dim3 grid(seq_len, batch, 1); + const dim3 block(tpb, 1, 1); + int shared_bytes = input_num * sizeof(int64_t); + EmbEltwiseLayernormKernel<<>>( + hidden, ids, scale, bias, embs, output, eps, input_num); +} + +template class EmbEltwiseLayerNormFunctor; + +// device function 
'operator()' is not supported until CUDA 10.0 +// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000 +template class EmbEltwiseLayerNormFunctor; +#endif + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/emb_eltwise_layer_norm_functor.h b/paddle/phi/kernels/funcs/emb_eltwise_layer_norm_functor.h new file mode 100644 index 00000000000000..d50224dd5bdaf5 --- /dev/null +++ b/paddle/phi/kernels/funcs/emb_eltwise_layer_norm_functor.h @@ -0,0 +1,51 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/gpu/gpu_context.h" + +namespace phi { +namespace funcs { + +// This functor involves a fusion calculation in Ernie or Bert. +// The fusion mode is as follows: +// +// in_var emb in_var emb +// | | | | +// lookup_table lookup_table +// | | +// lkt_var lkt_var +// \ / +// elementwise_add +// | +// elt_out_var +// +template +class EmbEltwiseLayerNormFunctor { + public: + void operator()(int batch, + int seq_len, + int hidden, + const int64_t* ids, + const T* scale, + const T* bias, + const int64_t* embs, + T* output, + float eps, + int input_num, + gpuStream_t stream); +}; +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.cc b/paddle/phi/kernels/funcs/gather_scatter_functor.cc index 2b667c32d9db3f..597b8f231760bf 100644 --- a/paddle/phi/kernels/funcs/gather_scatter_functor.cc +++ b/paddle/phi/kernels/funcs/gather_scatter_functor.cc @@ -92,7 +92,7 @@ struct cpu_gather_scatter_functor { outer_dim_size *= index_dims[i]; } int64_t index_idx = 0; - int64_t self_idx, src_idx; + int64_t self_idx = 0, src_idx = 0; // N layer loop squeezed into 3 layers loop for (int64_t i = 0; i < inner_dim_size; i++) { diff --git a/paddle/phi/kernels/funcs/gpc.cc b/paddle/phi/kernels/funcs/gpc.cc index b3199d88f5888e..47a3001b4fda2d 100644 --- a/paddle/phi/kernels/funcs/gpc.cc +++ b/paddle/phi/kernels/funcs/gpc.cc @@ -87,7 +87,7 @@ const std::array, 3> next_h_state = { */ static void reset_it(it_node **it) { - it_node *itn; + it_node *itn = nullptr; while (*it) { itn = (*it)->next; @@ -97,7 +97,7 @@ static void reset_it(it_node **it) { } static void reset_lmt(lmt_node **lmt) { - lmt_node *lmtn; + lmt_node *lmtn = nullptr; while (*lmt) { lmtn = (*lmt)->next; @@ -140,7 +140,7 @@ static void insert_bound(edge_node **b, edge_node *e) { } static edge_node **bound_list(lmt_node **lmt, double y) { - lmt_node *existing_node; + lmt_node *existing_node = nullptr; if (!*lmt) { /* Add node onto the tail end of the LMT */ @@ -407,7 +407,7 @@ static void add_edge_to_aet(edge_node **aet, edge_node *edge, edge_node *prev) { static void add_intersection( it_node **it, edge_node *edge0, edge_node *edge1, double x, double y) { - it_node *existing_node; + it_node *existing_node = nullptr; if (!*it) { /* Append a new node to the tail of the list */ @@ -440,7 +440,7 @@ static void 
add_st_edge(st_node **st, it_node **it, edge_node *edge, double dy) { - st_node *existing_node; + st_node *existing_node = nullptr; double den = 0.0; double r = 0.0; double x = 0.0; @@ -486,8 +486,8 @@ static void add_st_edge(st_node **st, } static void build_intersection_table(it_node **it, edge_node *aet, double dy) { - st_node *st; - st_node *stp; + st_node *st = nullptr; + st_node *stp = nullptr; edge_node *edge = nullptr; /* Build intersection table for the current scanbeam */ @@ -706,7 +706,7 @@ static void new_tristrip(polygon_node **tn, } static bbox *create_contour_bboxes(gpc_polygon *p) { - bbox *box; + bbox *box = nullptr; int c = 0; int v = 0; @@ -744,8 +744,8 @@ static bbox *create_contour_bboxes(gpc_polygon *p) { } static void minimax_test(gpc_polygon *subj, gpc_polygon *clip, gpc_op op) { - bbox *s_bbox; - bbox *c_bbox; + bbox *s_bbox = nullptr; + bbox *c_bbox = nullptr; int s = 0; int c = 0; int *o_table = nullptr; diff --git a/paddle/phi/kernels/funcs/im2col.cc b/paddle/phi/kernels/funcs/im2col.cc index e4c470e1a7064f..44dd15ead335be 100644 --- a/paddle/phi/kernels/funcs/im2col.cc +++ b/paddle/phi/kernels/funcs/im2col.cc @@ -137,7 +137,7 @@ class Col2ImFunctor { int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; if ((im_row_idx) >= 0 && (im_row_idx) < im_height && (im_col_idx) >= 0 && (im_col_idx) < im_width) { - int im_offset; + int im_offset = 0; if (data_layout != DataLayout::kNHWC) { im_offset = (c_im * im_height + im_row_idx) * im_width + im_col_idx; diff --git a/paddle/phi/kernels/funcs/jit/benchmark.cc b/paddle/phi/kernels/funcs/jit/benchmark.cc index 83a9a4a45d643f..894a711ddec6d7 100644 --- a/paddle/phi/kernels/funcs/jit/benchmark.cc +++ b/paddle/phi/kernels/funcs/jit/benchmark.cc @@ -113,7 +113,7 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) { BenchFunc benchmark; std::vector> infos; auto funcs = jit::GetAllCandidateFuncsWithTypes(attr); - for (auto f : funcs) { + for (auto const& f : funcs) { infos.push_back(std::make_pair(f.first, benchmark(f.second, args...))); } @@ -128,7 +128,7 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... 
args) { std::ostringstream loginfos; loginfos << "Kernel Type " << jit::to_string(KernelTuple::kernel_type) << ": " << attr << ": "; - for (auto pair : infos) { + for (auto const& pair : infos) { loginfos << pair.first << " takes " << pair.second << " us; "; } LOG(INFO) << loginfos.str(); diff --git a/paddle/phi/kernels/funcs/jit/gen_base.cc b/paddle/phi/kernels/funcs/jit/gen_base.cc index a80f9817c476ac..3758aaf4cace8d 100644 --- a/paddle/phi/kernels/funcs/jit/gen_base.cc +++ b/paddle/phi/kernels/funcs/jit/gen_base.cc @@ -47,7 +47,7 @@ void GenBase::dumpCode(const unsigned char* code) const { } void* GenBase::operator new(size_t size) { - void* ptr; + void* ptr = nullptr; constexpr size_t alignment = 32ul; #ifdef _WIN32 ptr = _aligned_malloc(size, alignment); @@ -71,8 +71,8 @@ void GenBase::operator delete(void* ptr) { } std::vector packed_groups(int n, int k, int* block_out, int* rest_out) { - int block; - int max_num_regs; + int block = 0; + int max_num_regs = 0; if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) { block = ZMM_FLOAT_BLOCK; max_num_regs = 32; diff --git a/paddle/phi/kernels/funcs/jit/helper.cc b/paddle/phi/kernels/funcs/jit/helper.cc index 5c93637649f897..c135d6ee3177dd 100644 --- a/paddle/phi/kernels/funcs/jit/helper.cc +++ b/paddle/phi/kernels/funcs/jit/helper.cc @@ -104,7 +104,7 @@ KernelType to_kerneltype(const std::string& act) { template <> void pack_weights(const float* src, float* dst, int n, int k) { - int block, rest; + int block = 0, rest = 0; const auto groups = packed_groups(n, k, &block, &rest); std::for_each(groups.begin(), groups.end(), [&](int i) { PADDLE_ENFORCE_GT(i, diff --git a/paddle/phi/kernels/funcs/jit/more/intrinsic/layer_norm.cc b/paddle/phi/kernels/funcs/jit/more/intrinsic/layer_norm.cc index d7d62d6815501a..4b50de277a9c28 100644 --- a/paddle/phi/kernels/funcs/jit/more/intrinsic/layer_norm.cc +++ b/paddle/phi/kernels/funcs/jit/more/intrinsic/layer_norm.cc @@ -44,8 +44,8 @@ void LayerNorm(float* x, __m256 mean_vec, var_vec; __m128 hi, lo; __m256 tmp = _mm256_setzero_ps(); - size_t offset; - size_t j; + size_t offset = 0; + size_t j = 0; __m256 reverse_num_vec = _mm256_div_ps( _mm256_set1_ps(1.0), _mm256_set1_ps(static_cast(right))); __m256 epsilon_vec = _mm256_set1_ps(epsilon); diff --git a/paddle/phi/kernels/funcs/maxouting.cc b/paddle/phi/kernels/funcs/maxouting.cc index 40b184865a5202..9c32453511f75d 100644 --- a/paddle/phi/kernels/funcs/maxouting.cc +++ b/paddle/phi/kernels/funcs/maxouting.cc @@ -43,7 +43,7 @@ void MaxOutFunctor::operator()(const DeviceContext& context, int new_cindex = fea_size * c; for (int f = 0; f < fea_size; ++f) { T ele = static_cast(-FLT_MAX); - int input_idx, output_idx; + int input_idx = 0, output_idx = 0; for (int ph = 0; ph < groups; ++ph) { if (axis == 1) { input_idx = (new_bindex + new_cindex) * groups + ph * fea_size + f; @@ -89,7 +89,7 @@ void MaxOutGradFunctor::operator()( for (int c = 0; c < output_channels; ++c) { int clen = fea_size * c; for (int f = 0; f < fea_size; ++f) { - int input_idx0, output_idx; + int input_idx0 = 0, output_idx = 0; bool continue_match = true; if (axis == 1) { input_idx0 = (blen + clen) * groups + f; diff --git a/paddle/phi/kernels/funcs/pooling.cc b/paddle/phi/kernels/funcs/pooling.cc index ae68da49653fff..0573430c2010c5 100644 --- a/paddle/phi/kernels/funcs/pooling.cc +++ b/paddle/phi/kernels/funcs/pooling.cc @@ -1592,8 +1592,8 @@ class MaxPool2dWithIndexFunctor { T1* output_data = context.template Alloc(output); T2* mask_data = context.template Alloc(mask); - int 
hstart, hend; - int wstart, wend; + int hstart = 0, hend = 0; + int wstart = 0, wend = 0; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { @@ -1730,9 +1730,9 @@ class MaxPool3dWithIndexFunctor { T1* output_data = context.template Alloc(output); T2* mask_data = context.template Alloc(mask); - int dstart, dend; - int hstart, hend; - int wstart, wend; + int dstart = 0, dend = 0; + int hstart = 0, hend = 0; + int wstart = 0, wend = 0; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { diff --git a/paddle/phi/kernels/funcs/tensor_formatter.cc b/paddle/phi/kernels/funcs/tensor_formatter.cc index 0b9d4f31d553e3..16d3b38bced7c1 100644 --- a/paddle/phi/kernels/funcs/tensor_formatter.cc +++ b/paddle/phi/kernels/funcs/tensor_formatter.cc @@ -66,7 +66,7 @@ std::string TensorFormatter::Format(const phi::DenseTensor& print_tensor, if (print_tensor_lod_) { log_stream << " - lod: {"; const phi::LoD& lod = print_tensor.lod(); - for (auto level : lod) { + for (auto const& level : lod) { log_stream << "{"; bool is_first = true; for (auto i : level) { diff --git a/paddle/phi/kernels/funcs/vol2col.cc b/paddle/phi/kernels/funcs/vol2col.cc index e505fcb3de3372..b5d6086feda770 100644 --- a/paddle/phi/kernels/funcs/vol2col.cc +++ b/paddle/phi/kernels/funcs/vol2col.cc @@ -123,7 +123,7 @@ class Vol2ColFunctor { int64_t col_idx = ((c * output_depth + d) * output_height + h) * output_width + w; - int64_t vol_idx; + int64_t vol_idx = 0; if (data_layout != DataLayout::kNHWC) { vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) * input_width + @@ -248,7 +248,7 @@ class Col2VolFunctor { if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { - int vol_idx; + int vol_idx = 0; if (data_layout != DataLayout::kNHWC) { vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) * input_width + diff --git a/paddle/phi/kernels/funcs/weight_dequant_functor.h b/paddle/phi/kernels/funcs/weight_dequant_functor.h index dd1631ca722ee3..699ee271281d2d 100644 --- a/paddle/phi/kernels/funcs/weight_dequant_functor.h +++ b/paddle/phi/kernels/funcs/weight_dequant_functor.h @@ -1,32 +1,16 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/phi/kernels/funcs/weight_only_gemv.h b/paddle/phi/kernels/funcs/weight_only_gemv.h index 8f61ab22ba6eac..dcadc5825fe34e 100644 --- a/paddle/phi/kernels/funcs/weight_only_gemv.h +++ b/paddle/phi/kernels/funcs/weight_only_gemv.h @@ -1,16 +1,16 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/phi/kernels/fused_bn_add_activation_grad_kernel.h b/paddle/phi/kernels/fused_bn_add_activation_grad_kernel.h new file mode 100644 index 00000000000000..44c02338f8b543 --- /dev/null +++ b/paddle/phi/kernels/fused_bn_add_activation_grad_kernel.h @@ -0,0 +1,39 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
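Several hunks further up (jit/benchmark.cc, tensor_formatter.cc) also switch range-for loop variables from by-value `auto` to `auto const&`, the usual fix for clang-tidy's performance-for-range-copy: iterating a container of non-trivial elements by value copies every element. A small sketch mirroring the benchmark.cc loop:

```cpp
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
  const std::vector<std::pair<std::string, double>> infos = {
      {"jitcode", 1.25}, {"intrinsic", 2.5}};

  // `for (auto pair : infos)` would copy a pair<string, double> per
  // iteration (one string allocation each); the const-reference form
  // the patch switches to reads each element in place.
  for (auto const& pair : infos) {
    std::cout << pair.first << " takes " << pair.second << " us; ";
  }
  std::cout << '\n';
  return 0;
}
```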
+#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void FusedBatchNormAddActGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &y, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + const DenseTensor &reserve_space, + const DenseTensor &y_grad, + float momentum, + float epsilon, + const std::string &act_type, + DenseTensor *x_grad, + DenseTensor *z_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/fused_bn_add_activation_kernel.h b/paddle/phi/kernels/fused_bn_add_activation_kernel.h new file mode 100644 index 00000000000000..9d4f468a261ee6 --- /dev/null +++ b/paddle/phi/kernels/fused_bn_add_activation_kernel.h @@ -0,0 +1,39 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void FusedBatchNormAddActKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &z, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &mean, + const DenseTensor &variance, + float momentum, + float epsilon, + const std::string &act_type, + DenseTensor *y, + DenseTensor *mean_out, + DenseTensor *variance_out, + DenseTensor *saved_mean, + DenseTensor *saved_variance, + DenseTensor *reserve_space); + +} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py index cbe4571c5d010c..7caf30236bb79e 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py @@ -39,8 +39,8 @@ def find_arch_range(min_arch, max_arch): - assert min_arch >= DEFAULT_ARCH[0] and min_arch < MAX_ARCH - assert max_arch >= DEFAULT_ARCH[0] and max_arch < MAX_ARCH + assert min_arch >= DEFAULT_ARCH[0] and min_arch <= MAX_ARCH + assert max_arch >= DEFAULT_ARCH[0] and max_arch <= MAX_ARCH assert min_arch <= max_arch n = len(DEFAULT_ARCH) diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py index cd21c12a4323a0..8dd51f0c797a43 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py @@ -39,8 +39,8 @@ def find_arch_range(min_arch, max_arch): - assert min_arch >= DEFAULT_ARCH[0] and min_arch < MAX_ARCH - assert max_arch >= DEFAULT_ARCH[0] and max_arch < MAX_ARCH + assert min_arch >= DEFAULT_ARCH[0] 
and min_arch <= MAX_ARCH + assert max_arch >= DEFAULT_ARCH[0] and max_arch <= MAX_ARCH assert min_arch <= max_arch n = len(DEFAULT_ARCH) diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu new file mode 100644 index 00000000000000..3b9618db02db05 --- /dev/null +++ b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu @@ -0,0 +1,223 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/flags.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" +#include "paddle/phi/kernels/fused_bn_add_activation_grad_kernel.h" + +PHI_DECLARE_bool(cudnn_batchnorm_spatial_persistent); + +namespace phi { +namespace fusion { + +template +using CudnnDataType = phi::backends::gpu::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +void FusedBatchNormAddActGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &y, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + const DenseTensor &reserve_space, + const DenseTensor &y_grad, + float momentum, + float epsilon, + const std::string &act_type, + DenseTensor *x_grad, + DenseTensor *z_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad) { +#if CUDNN_VERSION < 7401 + PADDLE_THROW(phi::errors::Unimplemented( + "The fused_bn_add_activation operator is not supported on GPU " + "when CUDNN version < 7.4.1")); +#endif + bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU; + PADDLE_ENFORCE_EQ(is_gpu_place, + true, + phi::errors::PreconditionNotMet("It must use CUDAPlace.")); + double epsilon1 = static_cast(epsilon); + + const auto *x_ptr = &x; + const auto *y_ptr = &y; + const auto *d_y = &y_grad; + const auto *scale_ptr = &scale; + const auto *bias_ptr = &bias; + const auto *reserve_space_ptr = &reserve_space; + + const auto &in_dims = x_ptr->dims(); + + int N, C, H, W, D; + const DataLayout data_layout = DataLayout::kNHWC; + phi::funcs::ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); + + // init output + auto *d_x = x_grad; + auto *d_z = z_grad; + auto *d_scale = scale_grad; + auto *d_bias = bias_grad; + + dev_ctx.template Alloc(d_x); + dev_ctx.template Alloc(d_z); + + PADDLE_ENFORCE_EQ( + d_scale && d_bias, + true, + phi::errors::PreconditionNotMet( + "Both the scale grad and the bias grad must not be null.")); + + dev_ctx.template 
Alloc>(d_scale); + dev_ctx.template Alloc>(d_bias); + + PADDLE_ENFORCE_EQ( + scale_ptr->dims().size(), + 1UL, + phi::errors::PreconditionNotMet("The scale only has one dimension.")); + PADDLE_ENFORCE_EQ( + scale_ptr->dims()[0], + C, + phi::errors::PreconditionNotMet( + "The size of scale is equal to the channel of Input(X).")); + + std::vector dims = {N, C, H, W, D}; + std::vector strides = {H * W * C * D, 1, W * D * C, D * C, C}; + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + if (epsilon1 <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon1 = std::max(epsilon1, CUDNN_BN_MIN_EPSILON); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + in_dims.size() > 3 ? in_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); + + const auto *saved_mean_ptr = &saved_mean; + const auto *saved_var_ptr = &saved_variance; + const auto *saved_mean_data = + saved_mean_ptr->template data>(); + const auto *saved_var_data = + saved_var_ptr->template data>(); + + size_t workspace_size = 0; + void *workspace_ptr = nullptr; + phi::DenseTensor workspace_tensor; + auto reserve_space_size = reserve_space_ptr->memory_size(); + cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; + phi::backends::gpu::ScopedActivationDescriptor scope_act_desc; + cudnnActivationDescriptor_t activation_desc_ = + scope_act_desc.descriptor(act_type); + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( + /*handle=*/dev_ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*xDesc=*/data_desc_, + /*yDesc=*/data_desc_, + /*dyDesc=*/data_desc_, + /*dzDesc=*/data_desc_, + /*dxDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/activation_desc_, + /*sizeInBytes=*/&workspace_size)); + + workspace_tensor.Resize({static_cast(workspace_size)}); + workspace_ptr = dev_ctx.template Alloc(&workspace_tensor); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBatchNormalizationBackwardEx( + /*handle=*/dev_ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*alphaDataDiff=*/CudnnDataType::kOne(), + /*betaDataDiff=*/CudnnDataType::kZero(), + /*alphaParamDiff=*/CudnnDataType::kOne(), + /*betaParamDiff=*/CudnnDataType::kZero(), + /*xDesc=*/data_desc_, + /*xData=*/x_ptr->template data(), + /*yDesc=*/data_desc_, + /*yData=*/y_ptr->template data(), + /*dyDesc=*/data_desc_, + /*dyData=*/d_y->template data(), + /*dzDesc=*/data_desc_, + /*dzData=*/d_z->template data(), + /*dxDesc=*/data_desc_, + /*dxData=*/d_x->template data(), + /*dBnScaleBiasDesc=*/bn_param_desc_, + /*bnScaleData=*/scale_ptr->template data>(), + /*bnBiasData=*/bias_ptr->template data>(), + /*dBnScaleData=*/d_scale->template data>(), + /*dBnBiasData=*/d_bias->template data>(), + /*epsilon=*/epsilon1, + /*savedMean=*/saved_mean_data, + /*savedInvVariance=*/saved_var_data, + 
/*activationDesc=*/activation_desc_, + /*workspace=*/workspace_ptr, + /*workSpaceSizeInBytes=*/workspace_size, + /*reserveSpace=*/const_cast<T *>(reserve_space_ptr->template data<T>()), + /*reserveSpaceSizeInBytes=*/reserve_space_size)); + + // clean when exit. + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); +} + +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_bn_add_activation_grad, + GPU, + ALL_LAYOUT, + phi::fusion::FusedBatchNormAddActGradKernel, + phi::dtype::float16) { + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); +} diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu new file mode 100644 index 00000000000000..7b5b4119cf9705 --- /dev/null +++ b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu @@ -0,0 +1,227 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <cfloat> +#include <cmath> +#include <string> +#include <vector> + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/flags.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" +#include "paddle/phi/kernels/fused_bn_add_activation_kernel.h" + +PHI_DECLARE_bool(cudnn_batchnorm_spatial_persistent); + +namespace phi { +namespace fusion { + +template <typename T> +using CudnnDataType = phi::backends::gpu::CudnnDataType<T>; +template <typename T> +using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType; + +template <typename T, typename Context> +void FusedBatchNormAddActKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &z, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &mean, + const DenseTensor &variance, + float momentum, + float epsilon, + const std::string &act_type, + DenseTensor *y, + DenseTensor *mean_out, + DenseTensor *variance_out, + DenseTensor *saved_mean, + DenseTensor *saved_variance, + DenseTensor *reserve_space) { +#if CUDNN_VERSION < 7401 + PADDLE_THROW(phi::errors::Unimplemented( + "The fused_bn_add_activation operator is not supported on GPU " + "when CUDNN version < 7.4.1")); +#endif + bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU; + PADDLE_ENFORCE_EQ(is_gpu_place, + true, + phi::errors::PreconditionNotMet("It must use CUDAPlace.")); + + double epsilon1 = static_cast<double>(epsilon); + if (epsilon1 <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. 
Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon1 = std::max(static_cast(epsilon1), CUDNN_BN_MIN_EPSILON); + + // Get the size for each dimension. + // NHWC [batch_size, in_height, in_width, in_channels] + const auto &in_dims = x.dims(); + + dev_ctx.template Alloc>( + mean_out, mean_out->numel() * sizeof(BatchNormParamType)); + dev_ctx.template Alloc>( + variance_out, variance_out->numel() * sizeof(BatchNormParamType)); + + dev_ctx.template Alloc>( + saved_mean, saved_mean->numel() * sizeof(BatchNormParamType)); + dev_ctx.template Alloc>( + saved_variance, saved_variance->numel() * sizeof(BatchNormParamType)); + + dev_ctx.template Alloc(y, y->numel() * sizeof(T)); + + int N, C, H, W, D; + const DataLayout data_layout = DataLayout::kNHWC; + phi::funcs::ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); + + // ------------------- cudnn descriptors --------------------- + auto handle = dev_ctx.cudnn_handle(); + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + + std::vector dims = {N, C, H, W, D}; + std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + in_dims.size() > 3 ? in_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); + + double this_factor = 1. - momentum; + cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; + phi::backends::gpu::ScopedActivationDescriptor scope_act_desc; + cudnnActivationDescriptor_t activation_desc_ = + scope_act_desc.descriptor(act_type); + size_t workspace_size = 0; + size_t reserve_space_size = 0; + void *reserve_space_ptr = nullptr; + void *workspace_ptr = nullptr; + phi::DenseTensor workspace_tensor; + // Create reserve space and workspace for batch norm. + // Create tensor for each batchnorm op, it will be used in the + // backward. Thus this tensor shouldn't be temp. 
+ PADDLE_ENFORCE_NOT_NULL( + reserve_space, + phi::errors::NotFound( + "The argument ReserveSpace of batch_norm op is not found.")); + + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*xDesc=*/data_desc_, + /*zDesc=*/data_desc_, + /*yDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/activation_desc_, + /*sizeInBytes=*/&workspace_size)); + + // -------------- cudnn batchnorm reserve space -------------- + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*activationDesc=*/activation_desc_, + /*xDesc=*/data_desc_, + /*sizeInBytes=*/&reserve_space_size)); + + reserve_space->Resize( + {static_cast((reserve_space_size + phi::SizeOf(x.dtype()) - 1) / + phi::SizeOf(x.dtype()))}); + reserve_space_ptr = dev_ctx.template Alloc( + reserve_space, reserve_space->numel() * sizeof(T)); + workspace_tensor.Resize({static_cast( + (workspace_size + phi::SizeOf(x.dtype()) - 1) / phi::SizeOf(x.dtype()))}); + workspace_ptr = dev_ctx.template Alloc( + &workspace_tensor, workspace_tensor.numel() * sizeof(T)); + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnBatchNormalizationForwardTrainingEx( + handle, + mode_, + bnOps_, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x.template data(), + data_desc_, + z.template data(), + data_desc_, + y->template data(), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + this_factor, + dev_ctx.template Alloc>( + mean_out, mean_out->numel() * sizeof(BatchNormParamType)), + dev_ctx.template Alloc>( + variance_out, + variance_out->numel() * sizeof(BatchNormParamType)), + epsilon1, + dev_ctx.template Alloc>( + saved_mean, saved_mean->numel() * sizeof(BatchNormParamType)), + dev_ctx.template Alloc>( + saved_variance, + saved_variance->numel() * sizeof(BatchNormParamType)), + activation_desc_, + workspace_ptr, + workspace_size, + reserve_space_ptr, + reserve_space_size)); + + // clean when exit. + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); +} + +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_bn_add_activation, + GPU, + ALL_LAYOUT, + phi::fusion::FusedBatchNormAddActKernel, + phi::dtype::float16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); +} diff --git a/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu new file mode 100644 index 00000000000000..71e778ca6574e4 --- /dev/null +++ b/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu @@ -0,0 +1,142 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/memory_utils.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/errors.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/emb_eltwise_layer_norm_functor.h" + +namespace phi { +namespace fusion { + +template +void EmbeddingEltWiseLayerNormKernel( + const Context& dev_ctx, + const std::vector& ids, + const std::vector& embs, + const DenseTensor& bias, + const DenseTensor& scale, + const float epsilon, + DenseTensor* out) { + PADDLE_ENFORCE_GE( + epsilon, + 0.0f, + phi::errors::InvalidArgument( + "'epsilon' is %f, but it should be between 0.0 and 0.001", epsilon)); + PADDLE_ENFORCE_LE( + epsilon, + 0.001f, + phi::errors::InvalidArgument( + "'epsilon' is %f, but it should be between 0.0 and 0.001.", epsilon)); + int input_num = static_cast(ids.size()); + + DenseTensor in_ids_(phi::DataType::INT64), in_embs_(phi::DataType::INT64); + DDim in_dim{input_num}; + + in_ids_.Resize(in_dim); + in_embs_.Resize(in_dim); + + int64_t* in_ids_d = dev_ctx.template Alloc( + &in_ids_, in_ids_.numel() * sizeof(int64_t)); + int64_t* in_embs_d = dev_ctx.template Alloc( + &in_embs_, in_embs_.numel() * sizeof(int64_t)); + + std::vector in1s, in2s; + for (int i = 0; i < input_num; ++i) { + in1s.push_back(reinterpret_cast(ids[i]->data())); + in2s.push_back(reinterpret_cast(embs[i]->data())); + } + + phi::memory_utils::Copy(phi::GPUPlace{}, + in_ids_d, + phi::CPUPlace{}, + in1s.data(), + sizeof(int64_t) * input_num, + dev_ctx.stream()); + phi::memory_utils::Copy(phi::GPUPlace{}, + in_embs_d, + phi::CPUPlace{}, + in2s.data(), + sizeof(int64_t) * input_num, + dev_ctx.stream()); + + // should be (B * S * hidden) + auto id0_dims = ids[0]->dims(); + auto emb0_dims = embs[0]->dims(); + + int batch = id0_dims[0]; + int seq_len = id0_dims[1]; + int hidden = emb0_dims[1]; + + auto* bias_d = bias.data(); + auto* scale_d = scale.data(); + auto* output_d = dev_ctx.template Alloc(out, out->numel() * sizeof(T)); + + if (std::is_same::value) { + const half* scale_new = reinterpret_cast(scale_d); + const half* bias_new = reinterpret_cast(bias_d); + half* output_new = reinterpret_cast(output_d); + + phi::funcs::EmbEltwiseLayerNormFunctor emb_eltwise_layernorm_func; + emb_eltwise_layernorm_func(batch, + seq_len, + hidden, + in_ids_d, + scale_new, + bias_new, + in_embs_d, + output_new, + epsilon, + input_num, + dev_ctx.stream()); + } else { + phi::funcs::EmbEltwiseLayerNormFunctor emb_eltwise_layernorm_func; + emb_eltwise_layernorm_func(batch, + seq_len, + hidden, + in_ids_d, + scale_d, + bias_d, + in_embs_d, + output_d, + epsilon, + input_num, + dev_ctx.stream()); + } +} + +} // namespace fusion +} // namespace phi + +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000 +PD_REGISTER_KERNEL(fused_embedding_eltwise_layernorm, + GPU, + ALL_LAYOUT, + phi::fusion::EmbeddingEltWiseLayerNormKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(fused_embedding_eltwise_layernorm, + GPU, + 
ALL_LAYOUT, + phi::fusion::EmbeddingEltWiseLayerNormKernel, + float) {} +#endif diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu similarity index 71% rename from paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu rename to paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu index f4a9f0a77a53b2..f7f8faa329d60f 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu @@ -1,16 +1,19 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
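+// +// This kernel fuses the fc epilogue into a single pass: a GEMM computes out = x * w, then one CUDA kernel (AddReluAddLayerNorm below) applies the bias and residual adds, the optional ReLU, and LayerNorm over the result.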
+ +#include +#include #ifdef __NVCC__ #include @@ -24,13 +27,17 @@ namespace cub = hipcub; #include #endif -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/errors.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -namespace paddle { -namespace operators { +namespace phi { +namespace fusion { using float16 = phi::dtype::float16; @@ -300,8 +307,8 @@ __global__ void InplaceAddReluAddLayerNormKernel(const float16* y_data, } } -template -void AddReluAddLayerNorm(gpuStream_t stream, +template +void AddReluAddLayerNorm(const Context& dev_ctx, bool with_relu, int max_threads, const T* y, @@ -315,30 +322,30 @@ void AddReluAddLayerNorm(gpuStream_t stream, int N, float epsilon) { if (with_relu) { - switch (platform::RoundToPowerOfTwo(N)) { + switch (phi::backends::gpu::RoundToPowerOfTwo(N)) { CUDA_LAUNCH_KERNEL_HELPER( InplaceAddReluAddLayerNormKernel <<>>( + dev_ctx.stream()>>>( y, bias_0, bias_1, scale, out, mean, variance, M, N, epsilon)); } } else { - switch (platform::RoundToPowerOfTwo(N)) { + switch (phi::backends::gpu::RoundToPowerOfTwo(N)) { CUDA_LAUNCH_KERNEL_HELPER( InplaceAddReluAddLayerNormKernel <<>>( + dev_ctx.stream()>>>( y, bias_0, bias_1, scale, out, mean, variance, M, N, epsilon)); } } } -template <> -void AddReluAddLayerNorm(gpuStream_t stream, +template +void AddReluAddLayerNorm(const Context& dev_ctx, bool with_relu, int max_threads, const float16* y, @@ -352,109 +359,122 @@ void AddReluAddLayerNorm(gpuStream_t stream, int N, float epsilon) { if (with_relu) { - switch (platform::RoundToPowerOfTwo(N)) { + switch (phi::backends::gpu::RoundToPowerOfTwo(N)) { CUDA_LAUNCH_KERNEL_HELPER( InplaceAddReluAddLayerNormKernel <<>>( + dev_ctx.stream()>>>( y, bias_0, bias_1, scale, out, mean, variance, M, N, epsilon)); } } else { - switch (platform::RoundToPowerOfTwo(N)) { + switch (phi::backends::gpu::RoundToPowerOfTwo(N)) { CUDA_LAUNCH_KERNEL_HELPER( InplaceAddReluAddLayerNormKernel <<>>( + dev_ctx.stream()>>>( y, bias_0, bias_1, scale, out, mean, variance, M, N, epsilon)); } } } -template -class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* w = ctx.Input("W"); - auto* out = ctx.Output("Out"); - - auto w_dims = w->dims(); - int N = w_dims[1]; - int K = w_dims[0]; - int M = phi::product(x->dims()) / K; - - const T* x_data = x->data(); - const T* w_data = w->data(); - - auto& dev_ctx = ctx.template device_context(); - auto* out_data = dev_ctx.template Alloc(out, out->numel() * sizeof(T)); - - auto blas = phi::funcs::GetBlas(dev_ctx); - blas.GEMM(CblasNoTrans, - CblasNoTrans, - M, - N, - K, - static_cast(1.0), - x_data, - w_data, - static_cast(0.0), - out_data); - auto* y = ctx.Input("Y"); - auto* bias_0 = ctx.Input("Bias0"); - auto* bias_1 = ctx.Input("Bias1"); - auto* scale = ctx.Input("Scale"); - - const T* y_data = y->data(); - const T* bias_0_data = bias_0 ? bias_0->data() : nullptr; - const T* bias_1_data = bias_1 ? bias_1->data() : nullptr; - const T* scale_data = scale ? 
scale->data() : nullptr; - - auto* mean = ctx.Output("Mean"); - auto* variance = ctx.Output("Variance"); - - T* mean_data = - mean ? dev_ctx.template Alloc(mean, mean->numel() * sizeof(T)) - : nullptr; - T* variance_data = variance ? dev_ctx.template Alloc( - variance, variance->numel() * sizeof(T)) - : nullptr; - - bool with_relu = - (ctx.Attr("activation_type") == "relu") ? true : false; - float epsilon = ctx.Attr("epsilon"); - - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - AddReluAddLayerNorm(dev_ctx.stream(), - with_relu, - max_threads, - y_data, - bias_0_data, - bias_1_data, - scale_data, - out_data, - mean_data, - variance_data, - M, - N, - epsilon); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -PD_REGISTER_STRUCT_KERNEL(fused_fc_elementwise_layernorm, - GPU, - ALL_LAYOUT, - ops::FusedFCElementwiseLayerNormOpKernel, - float, - double, - plat::float16) {} +template +void FusedFCElementwiseLayerNormKernel( + const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& y, + const paddle::optional& bias0, + const paddle::optional& scale, + const paddle::optional& bias1, + const int x_num_col_dims, + const std::string& activation_type, + const float epsilon, + const int begin_norm_axis, + DenseTensor* out, + DenseTensor* mean, + DenseTensor* variance) { + PADDLE_ENFORCE_GE( + x_num_col_dims, + 1, + phi::errors::InvalidArgument( + "The x_num_col_dims must be greater than or equal to 1, " + "But received the x_num_col_dims is %d", + x_num_col_dims)); + PADDLE_ENFORCE_GE(epsilon, + 0.0f, + phi::errors::InvalidArgument( + "'epsilon' should be between 0.0 and 0.001.")); + PADDLE_ENFORCE_LE(epsilon, + 0.001f, + phi::errors::InvalidArgument( + "'epsilon' should be between 0.0 and 0.001.")); + PADDLE_ENFORCE_GT(begin_norm_axis, + 0, + phi::errors::InvalidArgument( + "'begin_norm_axis' should be greater than zero.")); + + auto w_dims = w.dims(); + int N = w_dims[1]; + int K = w_dims[0]; + int M = phi::product(x.dims()) / K; + + const T* x_data = x.data(); + const T* w_data = w.data(); + + auto* out_data = dev_ctx.template Alloc(out, out->numel() * sizeof(T)); + + auto blas = phi::funcs::GetBlas(dev_ctx); + blas.GEMM(CblasNoTrans, + CblasNoTrans, + M, + N, + K, + static_cast(1.0), + x_data, + w_data, + static_cast(0.0), + out_data); + + const T* y_data = y.data(); + const T* bias_0_data = bias0 ? bias0->data() : nullptr; + const T* bias_1_data = bias1 ? bias1->data() : nullptr; + const T* scale_data = scale ? scale->data() : nullptr; + + T* mean_data = + mean ? dev_ctx.template Alloc(mean, mean->numel() * sizeof(T)) + : nullptr; + T* variance_data = variance ? dev_ctx.template Alloc( + variance, variance->numel() * sizeof(T)) + : nullptr; + + bool with_relu = (activation_type == "relu") ? 
true : false; + + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + AddReluAddLayerNorm(dev_ctx, + with_relu, + max_threads, + y_data, + bias_0_data, + bias_1_data, + scale_data, + out_data, + mean_data, + variance_data, + M, + N, + epsilon); +} +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_fc_elementwise_layernorm, + GPU, + ALL_LAYOUT, + phi::fusion::FusedFCElementwiseLayerNormKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/fusion/gpu/fusion_transpose_flatten_concat_kernel.cu b/paddle/phi/kernels/fusion/gpu/fusion_transpose_flatten_concat_kernel.cu new file mode 100644 index 00000000000000..954fbd67b96abc --- /dev/null +++ b/paddle/phi/kernels/fusion/gpu/fusion_transpose_flatten_concat_kernel.cu @@ -0,0 +1,127 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/errors.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/common_shape.h" + +namespace phi { +namespace fusion { + +template +using CudnnDataType = phi::backends::gpu::CudnnDataType; + +template +void TransposeFlattenConcatFusionKernel( + const Context& dev_ctx, + const std::vector& x, + const std::vector& trans_axis, + const int flatten_axis, + const int concat_axis, + DenseTensor* out) { + dev_ctx.template Alloc(out, out->numel() * sizeof(T)); + auto odims = out->dims(); + + int rank = x[0]->dims().size(); + // use at least 4D in cudnnTransformTensor + int max_dim = rank < 4 ? 4 : rank; + std::vector stride_x(max_dim, 0); + std::vector stride_y(max_dim, 0); + std::vector dims_y(max_dim, 0); + + cudnnTensorDescriptor_t in_desc; + cudnnTensorDescriptor_t out_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&in_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&out_desc)); + cudnnDataType_t cudnn_dtype = CudnnDataType::type; + + auto handle = dev_ctx.cudnn_handle(); + + T* odata = out->data(); + for (auto& item : x) { + auto perm_shape = phi::funcs::GetPermuteShape(trans_axis, item->dims()); + int osize = 1; + auto idims = item->dims(); + for (int i = 0; i < rank; i++) { + stride_x[i] = 1; + for (int j = trans_axis[i] + 1; j < rank; j++) { + stride_x[i] *= idims[j]; + } + dims_y[i] = perm_shape[i]; + osize *= perm_shape[i]; + } + stride_y[rank - 1] = 1; + for (int i = rank - 2; i >= 0; i--) { + if (((i + 1) == flatten_axis) && (concat_axis == 1)) { + stride_y[i] = odims[1]; + } else { + stride_y[i] = stride_y[i + 1] * perm_shape[i + 1]; + } + } + + // Since concat is after flatten, the output is 2D tensor. + // If concat_axis is 0, each input's permutated tensor is continuous. + // If concat_axis is 1, the stride of 0-th dim of each input's + // permutated tensor is odims()[1]. 
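+ // The loop below pads the remaining dimensions with size 1 and stride 1 so that cudnnTransformTensor always receives at least a 4-D descriptor (see max_dim above).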
+ + for (int i = rank; i < max_dim; i++) { + stride_x[i] = 1; + stride_y[i] = 1; + dims_y[i] = 1; + } + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( + in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data())); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( + out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data())); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnTransformTensor( + handle, + CudnnDataType::kOne(), + in_desc, + static_cast(item->data()), + CudnnDataType::kZero(), + out_desc, + static_cast(odata))); + if (concat_axis == 0) { + odata += osize; + } else { + auto flat_shape = phi::funcs::GetFlattenShape(flatten_axis, perm_shape); + odata += flat_shape[1]; + } + } + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(in_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(out_desc)); +} + +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fusion_transpose_flatten_concat, + GPU, + ALL_LAYOUT, + phi::fusion::TransposeFlattenConcatFusionKernel, + float, + double) {} diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention.cu b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention.cu index 8554378d3d4b11..47ceb7ba1fdbce 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention.cu @@ -609,6 +609,11 @@ __global__ void masked_multihead_attention_kernel( // bi * (params.timestep + 1) + params.timestep]; // qk += static_cast(mask); qk *= params.inv_sqrt_dh; + if (params.attn_mask) { + auto mask_bhi = params.mask_broadcast_num_heads ? bi : bhi; + T mask = params.attn_mask[mask_bhi * params.mask_length + act_time_step]; + qk += static_cast(mask); + } qk_max = qk; qk_smem[act_time_step] = qk; } diff --git a/paddle/phi/kernels/fusion/gpu/mmha_util.cu.h b/paddle/phi/kernels/fusion/gpu/mmha_util.cu.h index ed311e520681f0..12e64caa54b0a6 100644 --- a/paddle/phi/kernels/fusion/gpu/mmha_util.cu.h +++ b/paddle/phi/kernels/fusion/gpu/mmha_util.cu.h @@ -1325,16 +1325,18 @@ inline __device__ void apply_rotary_embedding(uint2& q, // NOLINT k.y = rotary_embedding_transform(k.y, cos.y, sin.x); } -inline __device__ void apply_rotary_embedding(uint2& q, // NOLINT - uint2& k, // NOLINT - float4& cos, // NOLINT - float4& sin) { // NOLINT +inline __device__ void apply_rotary_embedding( + uint2& q, // NOLINT equals 4 half. + uint2& k, // NOLINT + float4& cos, // NOLINT 2 float2 cos. 
+ float4& sin) { // NOLINT Float4_& cos_ = *reinterpret_cast(&cos); Float4_& sin_ = *reinterpret_cast(&sin); + // cos_.x is float2 q.x = rotary_embedding_transform(q.x, cos_.x, sin_.x); k.x = rotary_embedding_transform(k.x, cos_.x, sin_.x); q.y = rotary_embedding_transform(q.y, cos_.y, sin_.y); - k.y = rotary_embedding_transform(k.y, cos_.y, sin_.x); + k.y = rotary_embedding_transform(k.y, cos_.y, sin_.y); } inline __device__ void apply_rotary_embedding(uint4& q, // NOLINT diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index b65eaa5d7757d1..c67864bc13f573 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -381,9 +381,10 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(relu6_grad, Relu6GradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(stanh_grad, STanhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_grad, SoftplusGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_double_grad, - SoftplusDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(softplus_grad, + SoftplusGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(softplus_double_grad, + SoftplusDoubleGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_double_grad, SqrtDoubleGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel) diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index acfe4dd5a2941b..6eeba717ece0dd 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -250,7 +250,7 @@ PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(stanh, StanhKernel) PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel) PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel) PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel) -PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(softplus, SoftplusKernel) PD_REGISTER_KERNEL(exp, GPU, diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu index 41f5f4c3f4d051..caa635255b9878 100644 --- a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu +++ b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu @@ -209,9 +209,9 @@ void ArgMinMaxOpCUDAKernel(const Context& dev_ctx, const Scalar& axis, bool keepdims, bool flatten, - int dtype, + DataType dtype, DenseTensor* out) { - if (dtype < 0) { + if (dtype == DataType::UNDEFINED) { phi::VisitDataTypeTiny( phi::DataType::INT64, VisitDataCudaArgMinMaxFunctor( @@ -219,7 +219,7 @@ void ArgMinMaxOpCUDAKernel(const Context& dev_ctx, return; } phi::VisitDataTypeTiny( - phi::TransToPhiDataType(dtype), + dtype, VisitDataCudaArgMinMaxFunctor( dev_ctx, x, axis.to(), keepdims, flatten, out)); } @@ -230,7 +230,7 @@ void ArgMinKernel(const Context& dev_ctx, const Scalar& axis, bool keepdims, bool flatten, - int dtype, + DataType dtype, DenseTensor* out) { ArgMinMaxOpCUDAKernel( dev_ctx, x, axis, keepdims, flatten, dtype, out); @@ -242,7 +242,7 @@ void ArgMaxKernel(const Context& dev_ctx, const Scalar& axis, bool keepdims, bool flatten, - int dtype, + DataType dtype, DenseTensor* out) { ArgMinMaxOpCUDAKernel( dev_ctx, x, axis, keepdims, flatten, dtype, out); diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu 
b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index ad276ec6f1812b..3b73935699babb 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -1255,6 +1255,9 @@ PD_REGISTER_KERNEL(batch_norm, kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); } +#if CUDNN_VERSION_MIN(7, 4, 1) + kernel->OutputAt(5).SetDataType(phi::DataType::UINT8); +#endif } #else PD_REGISTER_KERNEL(batch_norm, @@ -1274,6 +1277,9 @@ PD_REGISTER_KERNEL(batch_norm, kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); } +#if CUDNN_VERSION_MIN(7, 4, 1) + kernel->OutputAt(5).SetDataType(phi::DataType::UINT8); +#endif } #endif diff --git a/paddle/phi/kernels/gpu/contiguous_kernel.cu b/paddle/phi/kernels/gpu/contiguous_kernel.cu index b8dee10e31cdeb..357e104afb01c8 100644 --- a/paddle/phi/kernels/gpu/contiguous_kernel.cu +++ b/paddle/phi/kernels/gpu/contiguous_kernel.cu @@ -20,26 +20,120 @@ limitations under the License. */ #include "paddle/phi/kernels/transpose_kernel.h" namespace phi { +template +__global__ void ContiguousCaseZeroFunc( + const T* input_data, + T* out_data, + phi::Array input_stride) { + int64_t input_offset = 0; + int64_t output_offset = (blockIdx.z * gridDim.y * gridDim.x + + blockIdx.y * gridDim.x + blockIdx.x) * + blockDim.z * blockDim.y * blockDim.x + + threadIdx.z * blockDim.y * blockDim.x + + threadIdx.y * blockDim.x + threadIdx.x; + float coordinate[6] = {threadIdx.x, + threadIdx.y, + threadIdx.z, + blockIdx.x, + blockIdx.y, + blockIdx.z}; + +#pragma unroll + for (int dim = N - 1; dim >= 0; --dim) { + input_offset += coordinate[N - 1 - dim] * input_stride[dim]; + } + + out_data[output_offset] = input_data[input_offset]; +} template -__global__ void ContiguousFunc( +__global__ void ContiguousCaseOneFunc( const T* input_data, T* out_data, phi::Array input_stride, - phi::Array dims, - const int64_t numel) { - int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; -#pragma unroll - for (int64_t i = gid; i < numel; i += blockDim.x * gridDim.x) { + phi::Array dims, + const int64_t x_max) { + int64_t x = blockIdx.x * blockDim.x + threadIdx.x; + if (x < x_max) { int64_t input_offset = 0; - int64_t index_tmp = i; + int64_t output_offset = (blockIdx.z * gridDim.y + blockIdx.y) * x_max + x; + + int64_t reg_dims[6] = { + dims[0], dims[1], dims[2], dims[3], dims[4], dims[5]}; + int64_t coordinate[phi::DDim::kMaxRank + 1]; + + switch (N) { + case 1: + coordinate[0] = x % reg_dims[0]; + break; + case 2: + coordinate[0] = x % reg_dims[0]; + coordinate[1] = x / reg_dims[0] % reg_dims[1]; + break; + case 3: + coordinate[0] = x % reg_dims[0]; + coordinate[1] = x / reg_dims[0] % reg_dims[1]; + coordinate[2] = x / (reg_dims[0] * reg_dims[1]); + break; + case 4: + coordinate[0] = x % reg_dims[0]; + coordinate[1] = x / reg_dims[0] % reg_dims[1]; + coordinate[2] = x / (reg_dims[0] * reg_dims[1]); + coordinate[3] = blockIdx.y % reg_dims[2]; + break; + case 5: + coordinate[0] = x % reg_dims[0]; + coordinate[1] = x / reg_dims[0] % reg_dims[1]; + coordinate[2] = x / (reg_dims[0] * reg_dims[1]); + coordinate[3] = blockIdx.y % reg_dims[2]; + coordinate[4] = blockIdx.y / reg_dims[2] % reg_dims[3]; + break; + case 6: + coordinate[0] = x % reg_dims[0]; + coordinate[1] = x / reg_dims[0] % reg_dims[1]; + coordinate[2] = x / (reg_dims[0] * reg_dims[1]); + coordinate[3] = blockIdx.y % reg_dims[2]; + coordinate[4] = blockIdx.y / reg_dims[2] % reg_dims[3]; + coordinate[5] 
= blockIdx.y / (reg_dims[2] * reg_dims[3]); + break; + case 7: + coordinate[0] = x % reg_dims[0]; + coordinate[1] = x / reg_dims[0] % reg_dims[1]; + coordinate[2] = x / (reg_dims[0] * reg_dims[1]); + coordinate[3] = blockIdx.y % reg_dims[2]; + coordinate[4] = blockIdx.y / reg_dims[2] % reg_dims[3]; + coordinate[5] = blockIdx.y / (reg_dims[2] * reg_dims[3]); + coordinate[6] = blockIdx.z % reg_dims[4]; + break; + case 8: + coordinate[0] = x % reg_dims[0]; + coordinate[1] = x / reg_dims[0] % reg_dims[1]; + coordinate[2] = x / (reg_dims[0] * reg_dims[1]); + coordinate[3] = blockIdx.y % reg_dims[2]; + coordinate[4] = blockIdx.y / reg_dims[2] % reg_dims[3]; + coordinate[5] = blockIdx.y / (reg_dims[2] * reg_dims[3]); + coordinate[6] = blockIdx.z % reg_dims[4]; + coordinate[7] = blockIdx.z / reg_dims[4] % reg_dims[5]; + break; + case 9: + coordinate[0] = x % reg_dims[0]; + coordinate[1] = x / reg_dims[0] % reg_dims[1]; + coordinate[2] = x / (reg_dims[0] * reg_dims[1]); + coordinate[3] = blockIdx.y % reg_dims[2]; + coordinate[4] = blockIdx.y / reg_dims[2] % reg_dims[3]; + coordinate[5] = blockIdx.y / (reg_dims[2] * reg_dims[3]); + coordinate[6] = blockIdx.z % reg_dims[4]; + coordinate[7] = blockIdx.z / reg_dims[4] % reg_dims[5]; + coordinate[8] = blockIdx.z / (reg_dims[4] * reg_dims[5]); + break; + } + #pragma unroll for (int dim = N - 1; dim >= 0; --dim) { - input_offset += index_tmp % dims[dim] * input_stride[dim]; - index_tmp = index_tmp / dims[dim]; + input_offset += coordinate[N - 1 - dim] * input_stride[dim]; } - out_data[i] = input_data[input_offset]; + out_data[output_offset] = input_data[input_offset]; } } @@ -135,49 +229,214 @@ void ContiguousKernel(const Context& dev_ctx, input_stride[0] = 1; } - int64_t block = 512; - int64_t grid = (numel + block - 1) / block; - - switch (rank) { - case 1: - ContiguousFunc<<>>( - input_data, output_data, input_stride, input_dims, numel); - break; - case 2: - ContiguousFunc<<>>( - input_data, output_data, input_stride, input_dims, numel); - break; - case 3: - ContiguousFunc<<>>( - input_data, output_data, input_stride, input_dims, numel); - break; - case 4: - ContiguousFunc<<>>( - input_data, output_data, input_stride, input_dims, numel); - break; - case 5: - ContiguousFunc<<>>( - input_data, output_data, input_stride, input_dims, numel); - break; - case 6: - ContiguousFunc<<>>( - input_data, output_data, input_stride, input_dims, numel); - break; - case 7: - ContiguousFunc<<>>( - input_data, output_data, input_stride, input_dims, numel); - break; - case 8: - ContiguousFunc<<>>( - input_data, output_data, input_stride, input_dims, numel); - break; - case 9: - ContiguousFunc<<>>( - input_data, output_data, input_stride, input_dims, numel); - break; - default: - PADDLE_THROW(phi::errors::InvalidArgument( - "The rank of input should be less than 9, but received %d.", rank)); + dim3 grid(1, 1, 1), block(1, 1, 1); + + int tmp = 1; + + for (int i = 0; i < 3 && i < rank; i++) { + tmp *= input_dims[rank - 1 - i]; + } + + if (rank <= 6 && tmp <= 1024 && + (input_dims.size() < 3 || input_dims[rank - 3] <= 64)) { + if (rank >= 1) { + block.x = input_dims[rank - 1]; + } + + if (rank >= 2) { + block.y = input_dims[rank - 2]; + } + + if (rank >= 3) { + block.z = input_dims[rank - 3]; + } + + switch (rank) { + case 1: + ContiguousCaseZeroFunc<<>>( + input_data, output_data, input_stride); + break; + case 2: + ContiguousCaseZeroFunc<<>>( + input_data, output_data, input_stride); + break; + case 3: + ContiguousCaseZeroFunc<<>>( + input_data, output_data, input_stride); 
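+ // Cases 1-3 fit entirely within one thread block; cases 4-6 below map the fourth, fifth, and sixth innermost dimensions onto grid.x, grid.y, and grid.z.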
+ break; + case 4: + grid.x = input_dims[rank - 4]; + ContiguousCaseZeroFunc<<>>( + input_data, output_data, input_stride); + break; + case 5: + grid.x = input_dims[rank - 4]; + grid.y = input_dims[rank - 5]; + ContiguousCaseZeroFunc<<>>( + input_data, output_data, input_stride); + break; + case 6: + grid.x = input_dims[rank - 4]; + grid.y = input_dims[rank - 5]; + grid.z = input_dims[rank - 6]; + ContiguousCaseZeroFunc<<>>( + input_data, output_data, input_stride); + break; + } + } else { + phi::Array cur_input_dims; + block.x = 512; + switch (rank) { + case 1: + grid.x = (numel + block.x - 1) / block.x; + cur_input_dims[0] = input_dims[rank - 1]; + ContiguousCaseOneFunc + <<>>(input_data, + output_data, + input_stride, + cur_input_dims, + input_dims[rank - 1]); + break; + case 2: + grid.x = (numel + block.x - 1) / block.x; + cur_input_dims[0] = input_dims[rank - 1]; + cur_input_dims[1] = input_dims[rank - 2]; + ContiguousCaseOneFunc<<>>( + input_data, + output_data, + input_stride, + cur_input_dims, + input_dims[rank - 1] * input_dims[rank - 2]); + break; + case 3: + grid.x = (numel + block.x - 1) / block.x; + cur_input_dims[0] = input_dims[rank - 1]; + cur_input_dims[1] = input_dims[rank - 2]; + ContiguousCaseOneFunc<<>>( + input_data, + output_data, + input_stride, + cur_input_dims, + input_dims[rank - 1] * input_dims[rank - 2] * input_dims[rank - 3]); + break; + case 4: + grid.x = (input_dims[rank - 1] * input_dims[rank - 2] * + input_dims[rank - 3] + + block.x - 1) / + block.x; + grid.y = input_dims[rank - 4]; + cur_input_dims[0] = input_dims[rank - 1]; + cur_input_dims[1] = input_dims[rank - 2]; + cur_input_dims[2] = input_dims[rank - 4]; + ContiguousCaseOneFunc<<>>( + input_data, + output_data, + input_stride, + cur_input_dims, + input_dims[rank - 1] * input_dims[rank - 2] * input_dims[rank - 3]); + break; + case 5: + grid.x = (input_dims[rank - 1] * input_dims[rank - 2] * + input_dims[rank - 3] + + block.x - 1) / + block.x; + grid.y = input_dims[rank - 4] * input_dims[rank - 5]; + cur_input_dims[0] = input_dims[rank - 1]; + cur_input_dims[1] = input_dims[rank - 2]; + cur_input_dims[2] = input_dims[rank - 4]; + cur_input_dims[3] = input_dims[rank - 5]; + ContiguousCaseOneFunc<<>>( + input_data, + output_data, + input_stride, + cur_input_dims, + input_dims[rank - 1] * input_dims[rank - 2] * input_dims[rank - 3]); + break; + case 6: + grid.x = (input_dims[rank - 1] * input_dims[rank - 2] * + input_dims[rank - 3] + + block.x - 1) / + block.x; + grid.y = + input_dims[rank - 4] * input_dims[rank - 5] * input_dims[rank - 6]; + cur_input_dims[0] = input_dims[rank - 1]; + cur_input_dims[1] = input_dims[rank - 2]; + cur_input_dims[2] = input_dims[rank - 4]; + cur_input_dims[3] = input_dims[rank - 5]; + ContiguousCaseOneFunc<<>>( + input_data, + output_data, + input_stride, + cur_input_dims, + input_dims[rank - 1] * input_dims[rank - 2] * input_dims[rank - 3]); + break; + case 7: + grid.x = (input_dims[rank - 1] * input_dims[rank - 2] * + input_dims[rank - 3] + + block.x - 1) / + block.x; + grid.y = + input_dims[rank - 4] * input_dims[rank - 5] * input_dims[rank - 6]; + grid.z = input_dims[rank - 7]; + cur_input_dims[0] = input_dims[rank - 1]; + cur_input_dims[1] = input_dims[rank - 2]; + cur_input_dims[2] = input_dims[rank - 4]; + cur_input_dims[3] = input_dims[rank - 5]; + cur_input_dims[4] = input_dims[rank - 7]; + ContiguousCaseOneFunc<<>>( + input_data, + output_data, + input_stride, + cur_input_dims, + input_dims[rank - 1] * input_dims[rank - 2] * input_dims[rank - 3]); + break; + case 
8: + grid.x = (input_dims[rank - 1] * input_dims[rank - 2] * + input_dims[rank - 3] + + block.x - 1) / + block.x; + grid.y = + input_dims[rank - 4] * input_dims[rank - 5] * input_dims[rank - 6]; + grid.z = input_dims[rank - 7] * input_dims[rank - 8]; + cur_input_dims[0] = input_dims[rank - 1]; + cur_input_dims[1] = input_dims[rank - 2]; + cur_input_dims[2] = input_dims[rank - 4]; + cur_input_dims[3] = input_dims[rank - 5]; + cur_input_dims[4] = input_dims[rank - 7]; + cur_input_dims[5] = input_dims[rank - 8]; + ContiguousCaseOneFunc<<>>( + input_data, + output_data, + input_stride, + cur_input_dims, + input_dims[rank - 1] * input_dims[rank - 2] * input_dims[rank - 3]); + break; + case 9: + grid.x = (input_dims[rank - 1] * input_dims[rank - 2] * + input_dims[rank - 3] + + block.x - 1) / + block.x; + grid.y = + input_dims[rank - 4] * input_dims[rank - 5] * input_dims[rank - 6]; + grid.z = + input_dims[rank - 7] * input_dims[rank - 8] * input_dims[rank - 9]; + cur_input_dims[0] = input_dims[rank - 1]; + cur_input_dims[1] = input_dims[rank - 2]; + cur_input_dims[2] = input_dims[rank - 4]; + cur_input_dims[3] = input_dims[rank - 5]; + cur_input_dims[4] = input_dims[rank - 7]; + cur_input_dims[5] = input_dims[rank - 8]; + ContiguousCaseOneFunc<<>>( + input_data, + output_data, + input_stride, + cur_input_dims, + input_dims[rank - 1] * input_dims[rank - 2] * input_dims[rank - 3]); + break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "The rank of input should be less than 9, but received %d.", rank)); + } } } diff --git a/paddle/phi/kernels/gpu/dist_kernel.cu b/paddle/phi/kernels/gpu/dist_kernel.cu index 9129c87b91434f..e146fb47cf66d4 100644 --- a/paddle/phi/kernels/gpu/dist_kernel.cu +++ b/paddle/phi/kernels/gpu/dist_kernel.cu @@ -21,7 +21,9 @@ #include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" #include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/legacy/reduce_max_kernel.h" #include "paddle/phi/kernels/p_norm_kernel.h" +#include "paddle/phi/kernels/reduce_min_kernel.h" namespace phi { @@ -149,16 +151,16 @@ void DistKernel(const Context& dev_ctx, ReduceMaxWithSubtract <<>>( x_ptr, y_ptr, i_ptr, n); - phi::funcs::ReduceKernel>( - dev_ctx, intermediate, out, kps::IdentityFunctor(), reduce_axis); + phi::MaxRawKernel( + dev_ctx, intermediate, reduce_axis, true, true, out); } else if (p == -INFINITY) { ReduceMinWithSubtract <<>>( x_ptr, y_ptr, i_ptr, n); - phi::funcs::ReduceKernel>( - dev_ctx, intermediate, out, kps::IdentityFunctor(), reduce_axis); + phi::MinRawKernel( + dev_ctx, intermediate, reduce_axis, true, true, out); } else { MT p_order = static_cast(p); diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu index 8829d32596be1e..bd1d7db96cfeca 100644 --- a/paddle/phi/kernels/gpu/full_kernel.cu +++ b/paddle/phi/kernels/gpu/full_kernel.cu @@ -44,6 +44,7 @@ void FullKernel(const Context& dev_ctx, out->Resize(phi::make_ddim(shape.GetData())); int numel = out->numel(); dev_ctx.template Alloc(out); + if (numel > 0) { // in transformer model the numel of outpout will be zero. 
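+ // Skipping the launch for an empty tensor also avoids a zero-sized grid, which is an invalid CUDA launch configuration.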
std::vector inputs = {}; diff --git a/paddle/phi/kernels/gpu/lerp_kernel.cu b/paddle/phi/kernels/gpu/lerp_kernel.cu index 9f059f1b5d6fbf..f9d8514a54ca22 100644 --- a/paddle/phi/kernels/gpu/lerp_kernel.cu +++ b/paddle/phi/kernels/gpu/lerp_kernel.cu @@ -17,15 +17,11 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/expand_kernel.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" namespace phi { -template -struct BroadcastMinElementWiseDirectCUDAFunctor { - HOSTDEVICE inline T operator()(const T min) const { return min; } -}; - template struct LerpElementWiseDirectCUDAFunctor { HOSTDEVICE inline T operator()(const T x, const T y, const T weight) const { @@ -87,36 +83,23 @@ void LerpKernel(const Context &ctx, DenseTensor b_min = phi::EmptyLike(ctx, *out); if (x.dims().size() != y.dims().size() && weight.dims().size() != y.dims().size()) { - std::vector broadcast_min_inputs; - broadcast_min_inputs.reserve(1); - std::vector broadcast_min_outputs = {&b_min}; - auto broadcast_min_functor = - BroadcastMinElementWiseDirectCUDAFunctor(); if (x.dims().size() < y.dims().size() && x.dims().size() < weight.dims().size()) { - broadcast_min_inputs.emplace_back(&x); - phi::funcs::BroadcastKernel(ctx, - broadcast_min_inputs, - &broadcast_min_outputs, - broadcast_min_functor); + // x broadcast to b_min + ExpandKernel(ctx, x, phi::vectorize(b_min.dims()), &b_min); inputs.emplace_back(&b_min); inputs.emplace_back(&y); inputs.emplace_back(&weight); } else if (y.dims().size() < weight.dims().size()) { - broadcast_min_inputs.emplace_back(&y); - phi::funcs::BroadcastKernel(ctx, - broadcast_min_inputs, - &broadcast_min_outputs, - broadcast_min_functor); + // y broadcast to b_min + ExpandKernel(ctx, y, phi::vectorize(b_min.dims()), &b_min); inputs.emplace_back(&x); inputs.emplace_back(&b_min); inputs.emplace_back(&weight); } else { - broadcast_min_inputs.emplace_back(&weight); - phi::funcs::BroadcastKernel(ctx, - broadcast_min_inputs, - &broadcast_min_outputs, - broadcast_min_functor); + // weight broadcast to b_min + ExpandKernel( + ctx, weight, phi::vectorize(b_min.dims()), &b_min); inputs.emplace_back(&x); inputs.emplace_back(&y); inputs.emplace_back(&b_min); diff --git a/paddle/phi/kernels/gpu/logsumexp_kernel.cu b/paddle/phi/kernels/gpu/logsumexp_kernel.cu index 72f878c38dd364..ef2c29bbb2da0d 100644 --- a/paddle/phi/kernels/gpu/logsumexp_kernel.cu +++ b/paddle/phi/kernels/gpu/logsumexp_kernel.cu @@ -18,12 +18,15 @@ #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/activation_kernel.h" #include "paddle/phi/kernels/elementwise_add_kernel.h" #include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/phi/kernels/funcs/activation_functor.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/transpose_function.cu.h" #include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" namespace phi { @@ -42,27 +45,6 @@ struct ComputeType { using type = float; }; -template -struct LogCUDAFunctor { - HOSTDEVICE inline T operator()(const T x) const { return std::log(x); } -}; - -template <> -struct LogCUDAFunctor { - HOSTDEVICE inline float16 operator()(const float16 x) const { - auto x_ = static_cast(x); - return static_cast(std::log(x_)); - } -}; - -template 
<> -struct LogCUDAFunctor { - HOSTDEVICE inline bfloat16 operator()(const bfloat16 x) const { - auto x_ = static_cast(x); - return static_cast(std::log(x_)); - } -}; - template void LogsumexpFallbackKernel(const Context& dev_ctx, const DenseTensor& x, @@ -84,18 +66,14 @@ void LogsumexpFallbackKernel(const Context& dev_ctx, max_x.Resize(outdim); dev_ctx.template Alloc(&max_x); - phi::funcs::ReduceKernel>( - dev_ctx, *in_x, &max_x, kps::IdentityFunctor(), axis_vec); + phi::MaxKernel(dev_ctx, *in_x, axis_vec, false, &max_x); max_x.Resize(keeped_outdim); DenseTensor temp_x = Subtract(dev_ctx, *in_x, max_x); phi::funcs::ReduceKernel>( dev_ctx, temp_x, out_y, kps::ExpFunctor(), axis_vec); - const std::vector inputs = {out_y}; - std::vector outputs = {&temp_x}; - phi::funcs::ElementwiseKernel( - dev_ctx, inputs, &outputs, LogCUDAFunctor()); + phi::LogKernel(dev_ctx, *out_y, &temp_x); temp_x.Resize(outdim); out->Resize(outdim); phi::AddKernel(dev_ctx, temp_x, max_x, out); diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu index ba137b6fadc761..96fc3d1ac2b2e5 100644 --- a/paddle/phi/kernels/gpu/multinomial_kernel.cu +++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu @@ -191,7 +191,7 @@ void MultinomialKernel(const Context& dev_ctx, if (int_num_samples == 1) { ArgMaxKernel( - dev_ctx, rand, -1, true, false, 3 /*proto::VarType::INT64*/, out); + dev_ctx, rand, -1, true, false, DataType::INT64, out); } else { std::vector out_dim_vec = vectorize(out->dims()); DenseTensor value = Empty(dev_ctx, IntArray(out_dim_vec)); diff --git a/paddle/phi/kernels/gpu/quantize_linear_kernel.cu b/paddle/phi/kernels/gpu/quantize_linear_kernel.cu new file mode 100644 index 00000000000000..11c043e76f464e --- /dev/null +++ b/paddle/phi/kernels/gpu/quantize_linear_kernel.cu @@ -0,0 +1,130 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
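+// +// Dequantization: per-tensor mode computes out[i] = in[i] * scale[0] / max_range (KeDequantize); per-channel mode reads scale[(i / quant_stride) % n_scales], where quant_stride is the product of the dims after quant_axis (DequantizeOneScaleQuantAxisN).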
+ +#include + +#include "paddle/phi/kernels/quantize_linear_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/quantize_linear_impl.h" + +namespace phi { + +template +__global__ void KeDequantize( + const T* in, const T* scale, T max_range, int64_t num, T* out) { + int64_t idx = threadIdx.x + blockIdx.x * blockDim.x; + for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) { + out[i] = in[i] * scale[0] / max_range; + } +} + +template +__global__ void DequantizeOneScaleQuantAxisN(const T* in, + const T* scale, + const T max_range, + const int64_t num, + const int n_scales, + const int quant_stride, + T* out) { + int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; + for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) { + T s = scale[(i / quant_stride) % n_scales]; + out[i] = in[i] * s / max_range; + } +} + +template +struct ChannelDequantizeFunctorV2 { + void operator()(const phi::GPUContext& dev_ctx, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, + T max_range, + const int quant_axis, + phi::DenseTensor* out) { + auto in_dims = in->dims(); + const T* in_data = in->data(); + T* out_data = dev_ctx.template Alloc(out, out->numel() * sizeof(T)); + int64_t num = in->numel(); + const T* scale_factor = scale->data(); + int64_t block_size = std::min( + num, static_cast(dev_ctx.GetMaxThreadsPerBlock() / 4)); + int64_t max_threads = + dev_ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (num + block_size - 1) / block_size); + + int quant_stride = 1; + for (int i = quant_axis + 1; i < in_dims.size(); i++) { + quant_stride *= in_dims[i]; + } + + DequantizeOneScaleQuantAxisN + <<>>(in_data, + scale_factor, + max_range, + num, + in_dims[quant_axis], + quant_stride, + out_data); + } +}; + +template +struct DequantizeFunctor { + void operator()(const phi::GPUContext& dev_ctx, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, + T max_range, + phi::DenseTensor* out) { + const T* in_data = in->data(); + const T* scale_factor = scale->data(); + T* out_data = dev_ctx.template Alloc(out, out->numel() * sizeof(T)); + + int64_t num = in->numel(); + int64_t block_size = std::min( + num, static_cast(dev_ctx.GetMaxThreadsPerBlock() / 4)); + int64_t max_threads = + dev_ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (num + block_size - 1) / block_size); + KeDequantize<<>>( + in_data, scale_factor, max_range, num, out_data); + } +}; + +template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; +} // namespace phi + +PD_REGISTER_KERNEL(dequantize_linear, + GPU, + ALL_LAYOUT, + phi::DeQuantizeLinearKernel, + float, + int8_t, + double, + phi::dtype::float16) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/gpu/sign_kernel.cu.cc b/paddle/phi/kernels/gpu/sign_kernel.cu.cc index 71cd1d39b687d6..bbccc906a06e3c 100644 --- a/paddle/phi/kernels/gpu/sign_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/sign_kernel.cu.cc @@ -25,6 +25,10 
@@ PD_REGISTER_KERNEL(sign, GPU, ALL_LAYOUT, phi::SignKernel, + int8_t, + int16_t, + int32_t, + int64_t, float, double, phi::dtype::float16, diff --git a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu index be630f85ce07da..b69c4a691d0e33 100644 --- a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu +++ b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu @@ -33,8 +33,10 @@ namespace cub = hipcub; #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" @@ -80,21 +82,6 @@ int64_t ComputeBlockSize(int64_t col) { return 8; } -template - typename BinaryFunctor, - typename T> -struct BinaryOperation { - void operator()(const Context& dev_ctx, - const DenseTensor& lhs, - const DenseTensor& rhs, - DenseTensor* output) { - std::vector ins{&lhs, &rhs}; - std::vector outs{output}; - phi::funcs::BroadcastKernel(dev_ctx, ins, &outs, BinaryFunctor(), 0); - } -}; - template typename CompareFunctor, @@ -314,47 +301,46 @@ void ViterbiDecodeKernel(const Context& dev_ctx, start_trans.Resize({1, n_labels}); auto logit0 = input_exp.Slice(0, 1); logit0.Resize({batch_size, n_labels}); - BinaryOperation AddFloat; - BinaryOperation AddInt; - BinaryOperation MulFloat; - BinaryOperation MulInt; - BinaryOperation SubFloat; - BinaryOperation SubInt; if (include_bos_eos_tag) { - AddFloat(dev_ctx, logit0, start_trans, &alpha); + phi::AddKernel(dev_ctx, logit0, start_trans, &alpha); GetMask()( dev_ctx, left_length, one, &float_mask); - MulFloat(dev_ctx, stop_trans, float_mask, &alpha_nxt); - AddFloat(dev_ctx, alpha, alpha_nxt, &alpha); + phi::MultiplyKernel( + dev_ctx, stop_trans, float_mask, &alpha_nxt); + phi::AddKernel(dev_ctx, alpha, alpha_nxt, &alpha); } else { alpha = logit0; } - SubInt(dev_ctx, left_length, one, &left_length); + phi::SubtractKernel( + dev_ctx, left_length, one, &left_length); Argmax argmax; for (int64_t i = 1; i < max_seq_len; ++i) { DenseTensor logit = input_exp.Slice(i, i + 1); logit.Resize({batch_size, n_labels}); DenseTensor& alpha_exp = alpha.Resize({batch_size, n_labels, 1}); - AddFloat(dev_ctx, alpha_exp, trans_exp, &alpha_trn_sum); + phi::AddKernel(dev_ctx, alpha_exp, trans_exp, &alpha_trn_sum); auto alpha_argmax_temp = alpha_argmax_unbind[i - 1]; alpha_argmax_temp.Resize({batch_size, n_labels}); argmax(dev_ctx, alpha_trn_sum, &alpha_argmax_temp, &alpha_max, 1); historys.emplace_back(alpha_argmax_temp); - AddFloat(dev_ctx, alpha_max, logit, &alpha_nxt); + phi::AddKernel(dev_ctx, alpha_max, logit, &alpha_nxt); alpha.Resize({batch_size, n_labels}); GetMask()( dev_ctx, left_length, zero, &float_mask); - MulFloat(dev_ctx, alpha_nxt, float_mask, &alpha_nxt); - SubFloat(dev_ctx, float_one, float_mask, &float_mask); - MulFloat(dev_ctx, alpha, float_mask, &alpha); - AddFloat(dev_ctx, alpha, alpha_nxt, &alpha); + phi::MultiplyKernel(dev_ctx, alpha_nxt, float_mask, &alpha_nxt); + phi::SubtractKernel( + dev_ctx, float_one, float_mask, &float_mask); + phi::MultiplyKernel(dev_ctx, alpha, float_mask, &alpha); + 
phi::AddKernel(dev_ctx, alpha, alpha_nxt, &alpha); if (include_bos_eos_tag) { GetMask()( dev_ctx, left_length, one, &float_mask); - MulFloat(dev_ctx, stop_trans, float_mask, &alpha_nxt); - AddFloat(dev_ctx, alpha, alpha_nxt, &alpha); + phi::MultiplyKernel( + dev_ctx, stop_trans, float_mask, &alpha_nxt); + phi::AddKernel(dev_ctx, alpha, alpha_nxt, &alpha); } - SubInt(dev_ctx, left_length, one, &left_length); + phi::SubtractKernel( + dev_ctx, left_length, one, &left_length); } argmax(dev_ctx, alpha, &last_ids, scores, 1); left_length.Resize({batch_size}); @@ -363,7 +349,8 @@ void ViterbiDecodeKernel(const Context& dev_ctx, // last_ids_update = last_ids * tag_mask int last_ids_index = 1; int actual_len = (std::min)(seq_len, static_cast(max_seq_len)); - MulInt(dev_ctx, last_ids, int_mask, &batch_path[actual_len - last_ids_index]); + phi::MultiplyKernel( + dev_ctx, last_ids, int_mask, &batch_path[actual_len - last_ids_index]); // The algorithm below can refer to // https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/layers/crf.py#L438 ARange arange; @@ -371,24 +358,32 @@ void ViterbiDecodeKernel(const Context& dev_ctx, Gather gather; for (auto hist = historys.rbegin(); hist != historys.rend(); ++hist) { ++last_ids_index; - AddInt(dev_ctx, left_length, one, &left_length); - AddInt(dev_ctx, batch_offset, last_ids, &gather_idx); + phi::AddKernel(dev_ctx, left_length, one, &left_length); + phi::AddKernel( + dev_ctx, batch_offset, last_ids, &gather_idx); DenseTensor& last_ids_update = batch_path[actual_len - last_ids_index]; hist->Resize({batch_size * n_labels}); gather(dev_ctx, *hist, gather_idx, &last_ids_update); GetMask()( dev_ctx, left_length, zero, &int_mask); - MulInt(dev_ctx, last_ids_update, int_mask, &last_ids_update); + phi::MultiplyKernel( + dev_ctx, last_ids_update, int_mask, &last_ids_update); GetMask()( dev_ctx, left_length, zero, &zero_len_mask); - MulInt(dev_ctx, last_ids, zero_len_mask, &last_ids_tmp); - SubInt(dev_ctx, one, zero_len_mask, &zero_len_mask); - MulInt(dev_ctx, last_ids_update, zero_len_mask, &last_ids_update); - AddInt(dev_ctx, last_ids_update, last_ids_tmp, &last_ids_update); + phi::MultiplyKernel( + dev_ctx, last_ids, zero_len_mask, &last_ids_tmp); + phi::SubtractKernel( + dev_ctx, one, zero_len_mask, &zero_len_mask); + phi::MultiplyKernel( + dev_ctx, last_ids_update, zero_len_mask, &last_ids_update); + phi::AddKernel( + dev_ctx, last_ids_update, last_ids_tmp, &last_ids_update); GetMask()( dev_ctx, left_length, zero, &int_mask); - MulInt(dev_ctx, last_ids, int_mask, &last_ids); - AddInt(dev_ctx, last_ids_update, last_ids, &last_ids); + phi::MultiplyKernel( + dev_ctx, last_ids, int_mask, &last_ids); + phi::AddKernel( + dev_ctx, last_ids_update, last_ids, &last_ids); } TransposeKernel(dev_ctx, tpath, {1, 0}, path); } diff --git a/paddle/phi/kernels/gpu/weight_dequantize_kernel.cu b/paddle/phi/kernels/gpu/weight_dequantize_kernel.cu new file mode 100644 index 00000000000000..fce785804c344f --- /dev/null +++ b/paddle/phi/kernels/gpu/weight_dequantize_kernel.cu @@ -0,0 +1,53 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/weight_dequantize_kernel.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +#if defined(PADDLE_WITH_CUTLASS) +#include "paddle/phi/kernels/funcs/weight_dequant_functor.h" +#endif + +namespace phi { + +template +void WeightDequantizeKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const std::string& algo, + DataType out_dtype, + DenseTensor* out) { +#if defined(PADDLE_WITH_CUTLASS) + auto out_dims = out->dims(); + dev_ctx.template Alloc(out); + WeightDequantize(dev_ctx, x, scale, algo, true, out); + out->Resize({{out_dims[1], out_dims[0]}}); + auto out_tmp = Transpose(dev_ctx, *out, {1, 0}); + out->ShareDataWith(out_tmp); +#else + PADDLE_THROW( + phi::errors::PreconditionNotMet("Not compiled with WITH_CUTLASS=ON")); +#endif +} + +} // namespace phi + +PD_REGISTER_KERNEL(weight_dequantize, + GPU, + ALL_LAYOUT, + phi::WeightDequantizeKernel, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/weight_only_linear_grad_kernel.cu b/paddle/phi/kernels/gpu/weight_only_linear_grad_kernel.cu index f327ccef1a1aae..7ebe0c983a3442 100644 --- a/paddle/phi/kernels/gpu/weight_only_linear_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/weight_only_linear_grad_kernel.cu @@ -1,32 +1,16 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/phi/kernels/weight_only_linear_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -62,8 +46,12 @@ void WeightOnlyLinearGradKernel(const Context& dev_ctx, dev_ctx, weight, weight_scale, algo, true, &weight_dequantized); MatmulKernel( dev_ctx, out_grad, weight_dequantized, false, false, x_grad); +#else + PADDLE_THROW( + phi::errors::PreconditionNotMet("Not compiled with WITH_CUTLASS=ON")); #endif } + } // namespace phi PD_REGISTER_KERNEL(weight_only_linear_grad, diff --git a/paddle/phi/kernels/gpu/weight_only_linear_kernel.cu b/paddle/phi/kernels/gpu/weight_only_linear_kernel.cu index 65af9f9c6c2b5c..0d2ab397ad130a 100644 --- a/paddle/phi/kernels/gpu/weight_only_linear_kernel.cu +++ b/paddle/phi/kernels/gpu/weight_only_linear_kernel.cu @@ -1,16 +1,17 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include "paddle/phi/kernels/weight_only_linear_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/datatype_traits.h" diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index a4571b83e39e75..51d9fadc21c1c4 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -30,29 +30,6 @@ limitations under the License. */ #define MATRIX_SOFTMAX_ALIGN_BYTES 16 #define MATRIX_SOFTMAX_THREAHOLD 100000 -#define FIXED_BLOCK_DIM_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kBlockDim = (dim); \ - __VA_ARGS__; \ - } break - -#define FIXED_VEC_SIZE_BASE(vec_size, ...) \ - case (vec_size): { \ - constexpr auto VecSize = (vec_size); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM(...) \ - FIXED_BLOCK_DIM_BASE(512, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) - -#define FIXED_VEC_SIZE(...) 
\ - FIXED_VEC_SIZE_BASE(8, ##__VA_ARGS__); \ - FIXED_VEC_SIZE_BASE(4, ##__VA_ARGS__) - namespace phi { using ScopedTensorDescriptor = phi::backends::gpu::ScopedTensorDescriptor; @@ -112,7 +89,7 @@ static inline int Log2Ceil(int value) { return log2_value; } -inline int getBlockSize(int vec_size, uint64_t dim_size) { +inline int CalcBlockSize(int vec_size, uint64_t dim_size) { uint64_t block_size = 1; uint64_t max_block_size = std::min(dim_size / vec_size, static_cast(1024)); @@ -461,14 +438,11 @@ __device__ __forceinline__ void ThreadVecWrite(T* out, } } -template +template __global__ void KeMatrixSoftmaxForward(T* softmax, const T* src, int dim_size) { - using VecT = phi::AlignedVector; + constexpr int kVecSize = + MaxWithOne::kValue; + using VecT = phi::AlignedVector; int bid = blockIdx.x; T* batch_input = const_cast(src) + bid * dim_size; @@ -480,16 +454,16 @@ __global__ void KeMatrixSoftmaxForward(T* softmax, const T* src, int dim_size) { ((uint64_t)batch_output) % MATRIX_SOFTMAX_ALIGN_BYTES / sizeof(T); // get max value - AccT thread_max = ThreadVecReduce( + AccT thread_max = ThreadVecReduce( batch_input, dim_size, input_align_shift, MaxFunctor(), - std::numeric_limits::min()); + -std::numeric_limits::infinity()); BlockReduceMax(&thread_max); // get exp value and sum all - AccT thread_exp = ThreadVecReduce( + AccT thread_exp = ThreadVecReduce( batch_input, dim_size, input_align_shift, @@ -501,19 +475,19 @@ __global__ void KeMatrixSoftmaxForward(T* softmax, const T* src, int dim_size) { if (LogMode) { LogSoftmaxForwardFunctor reduction(thread_max, thread_exp); if (input_align_shift == output_align_shift) { - ThreadVecWriteVec( + ThreadVecWriteVec( batch_output, batch_input, dim_size, input_align_shift, reduction); } else { - ThreadVecWrite( + ThreadVecWrite( batch_output, batch_input, dim_size, reduction); } } else { SoftmaxForwardFunctor reduction(thread_max, thread_exp); if (input_align_shift == output_align_shift) { - ThreadVecWriteVec( + ThreadVecWriteVec( batch_output, batch_input, dim_size, input_align_shift, reduction); } else { - ThreadVecWrite( + ThreadVecWrite( batch_output, batch_input, dim_size, reduction); } } @@ -785,9 +759,9 @@ void SwitchWarpSoftmaxForward(const IndexType blocks, const IndexType batch_size, const IndexType stride, const IndexType element_count, - IndexType Log2Elements) { + IndexType log2_element_count) { using AccT = typename phi::dtype::MPTypeTrait::Type; - switch (Log2Elements) { + switch (log2_element_count) { SOFTMAX_WARP_FORWARD_CASE(0, AccT); SOFTMAX_WARP_FORWARD_CASE(1, AccT); SOFTMAX_WARP_FORWARD_CASE(2, AccT); @@ -800,6 +774,10 @@ void SwitchWarpSoftmaxForward(const IndexType blocks, SOFTMAX_WARP_FORWARD_CASE(9, AccT); SOFTMAX_WARP_FORWARD_CASE(10, AccT); default: + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported softmax dim: element_count=%d, log2_element_count=%d!", + element_count, + log2_element_count)); break; } } @@ -824,9 +802,9 @@ void SwitchWarpSoftmaxBackward(const int blocks, const int batch_size, const int stride, const int element_count, - int Log2Elements) { + int log2_element_count) { using AccT = typename phi::dtype::MPTypeTrait::Type; - switch (Log2Elements) { + switch (log2_element_count) { SOFTMAX_WARP_BACKWARD_CASE(0, AccT); SOFTMAX_WARP_BACKWARD_CASE(1, AccT); SOFTMAX_WARP_BACKWARD_CASE(2, AccT); @@ -839,6 +817,9 @@ void SwitchWarpSoftmaxBackward(const int blocks, SOFTMAX_WARP_BACKWARD_CASE(9, AccT); SOFTMAX_WARP_BACKWARD_CASE(10, AccT); default: + // PADDLE_THROW(phi::errors::Unimplemented( + // "Unsupported 
softmax dim: element_count=%d, + // log2_element_count=%d!", element_count, log2_element_count)); break; } } @@ -1202,24 +1183,11 @@ template void LaunchKeMatrixSoftmaxForwardKernel( const GPUContext& dev_ctx, T* out, const T* input, int N, int dim_size) { using AccT = typename phi::dtype::MPTypeTrait::Type; - const int vec_size = MATRIX_SOFTMAX_ALIGN_BYTES / sizeof(T); - switch (getBlockSize(vec_size, dim_size)) { - FIXED_BLOCK_DIM(switch (vec_size) { - FIXED_VEC_SIZE( - KeMatrixSoftmaxForward - <<>>(out, input, dim_size)); - default: - break; - }); - default: - PADDLE_THROW( - errors::Fatal("the input dim has error in the softmax cuda kernel.")); - } + constexpr int kVecSize = + MaxWithOne::kValue; + int block_dim = CalcBlockSize(kVecSize, dim_size); + KeMatrixSoftmaxForward + <<>>(out, input, dim_size); } #if CUDNN_VERSION < 8100 @@ -1262,8 +1230,7 @@ bool UseCudnnSoftmax(const GPUContext& ctx, } constexpr int max_dim = 1024; if (!cudnn_available || !last_dim || - (softmax_dim <= max_dim && sizeof(T) <= 4) || - softmax_dim >= MATRIX_SOFTMAX_THREAHOLD) { + (softmax_dim <= max_dim && sizeof(T) <= 4)) { return false; } else { return true; @@ -1450,9 +1417,5 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx, dev_ctx, dx_data, dout.data(), out.data(), N, dim, D); } } -#undef FIXED_BLOCK_DIM_BASE -#undef FIXED_BLOCK_DIM -#undef FIXED_VEC_SIZE_BASE -#undef FIXED_VEC_SIZE } // namespace phi diff --git a/paddle/phi/kernels/impl/elementwise_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_kernel_impl.h index 0121f35b3cecb4..137829b5193f24 100644 --- a/paddle/phi/kernels/impl/elementwise_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_kernel_impl.h @@ -54,13 +54,8 @@ namespace phi { const DenseTensor& y, \ int axis, \ DenseTensor* out) { \ - std::vector inputs; \ - inputs.reserve(2); \ - std::vector outputs; \ - outputs.reserve(1); \ - inputs.emplace_back(&x); \ - inputs.emplace_back(&y); \ - outputs.emplace_back(out); \ + std::vector inputs = {&x, &y}; \ + std::vector outputs = {out}; \ dev_ctx.template Alloc(out); \ funcs::BroadcastKernel( \ dev_ctx, inputs, &outputs, funcs::name##Functor(), axis); \ diff --git a/paddle/phi/kernels/impl/kron_grad_kernel_impl.h b/paddle/phi/kernels/impl/kron_grad_kernel_impl.h index 4829ae0a9f0c95..352e4d30067197 100644 --- a/paddle/phi/kernels/impl/kron_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/kron_grad_kernel_impl.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/kernels/impl/kron_kernel_impl.h" +#include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { @@ -234,12 +235,12 @@ struct KronGradOpFunctor { #if defined(__NVCC__) || defined(__HIPCC__) auto stream = dev_ctx.stream(); // it is a cuda device_context if (dx) { - funcs::ReduceKernel>( - dev_ctx, dout_x, dx, kps::IdentityFunctor(), {1}); + phi::SumKernel( + dev_ctx, dout_x, {1}, dout_x.dtype(), false, dx); } if (dy) { - funcs::ReduceKernel>( - dev_ctx, dout_y, dy, kps::IdentityFunctor(), {1}); + phi::SumKernel( + dev_ctx, dout_y, {1}, dout_y.dtype(), false, dy); } #else auto *place = dev_ctx.eigen_device(); diff --git a/paddle/phi/kernels/impl/quantize_linear_impl.h b/paddle/phi/kernels/impl/quantize_linear_impl.h new file mode 100644 index 00000000000000..9f86fd07447ee5 --- /dev/null +++ b/paddle/phi/kernels/impl/quantize_linear_impl.h @@ -0,0 +1,127 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cmath>
+
+#include "paddle/phi/kernels/quantize_linear_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/core/tensor_utils.h"
+#include "paddle/phi/kernels/cast_kernel.h"
+
+namespace phi {
+
+template <typename Context, typename T>
+struct DequantizeFunctor {
+  void operator()(const Context& dev_ctx,
+                  const phi::DenseTensor* in,
+                  const phi::DenseTensor* scale,
+                  T max_range,
+                  phi::DenseTensor* out);
+};
+
+template <typename Context, typename T>
+struct ChannelDequantizeFunctorV2 {
+  void operator()(const Context& dev_ctx,
+                  const phi::DenseTensor* in,
+                  const phi::DenseTensor** scales,
+                  const int scale_num,
+                  T max_range,
+                  const int quant_axis,
+                  phi::DenseTensor* out);
+};
+
+template <typename T, typename Context, typename D>
+void DeQuantizeLinearImpl(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& scale,
+                          int quant_axis,
+                          int bit_length,
+                          bool only_observer,
+                          DenseTensor* out) {
+  auto* in = &x;
+
+  auto in_tmp =
+      phi::Cast<T, Context>(dev_ctx, *in, phi::CppTypeToDataType<D>::Type());
+
+  dev_ctx.template Alloc<D>(out, out->numel() * sizeof(D));
+
+  if (only_observer) {
+    phi::Copy(dev_ctx, *in, dev_ctx.GetPlace(), false, out);
+    return;
+  }
+
+  if (quant_axis < 0) {
+    float max_range = (std::pow(2, bit_length - 1) - 1);
+    DequantizeFunctor<Context, D>()(
+        dev_ctx, &in_tmp, &scale, static_cast<D>(max_range), out);
+  } else {
+    PADDLE_ENFORCE_EQ(
+        scale.numel(),
+        in_tmp.dims()[quant_axis],
+        phi::errors::PreconditionNotMet(
+            "The number of first scale values must be the same with "
+            "quant_axis dimension value of Input(X) when the `scale` has "
+            "only one element, but %ld != %ld here.",
+            scale.numel(),
+            in_tmp.dims()[quant_axis]));
+    int max_range = (std::pow(2, bit_length - 1) - 1);
+
+    ChannelDequantizeFunctorV2<Context, D>()(
+        dev_ctx, &in_tmp, &scale, static_cast<D>(max_range), quant_axis, out);
+  }
+}
+
+template <typename T, typename Context>
+void DeQuantizeLinearKernel(const Context& dev_ctx,
+                            const DenseTensor& x,
+                            const DenseTensor& scale,
+                            const DenseTensor& zero_point,
+                            const paddle::optional<DenseTensor>& in_accum,
+                            const paddle::optional<DenseTensor>& in_state,
+                            int quant_axis,
+                            int bit_length,
+                            int round_type,
+                            bool is_test,
+                            bool only_observer,
+                            DenseTensor* out,
+                            DenseTensor* out_state,
+                            DenseTensor* out_accum,
+                            DenseTensor* out_scale) {
+  switch (scale.dtype()) {
+    case phi::DataType::FLOAT64:
+      DeQuantizeLinearImpl<T, Context, double>(
+          dev_ctx, x, scale, quant_axis, bit_length, only_observer, out);
+      break;
+    case phi::DataType::FLOAT32:
+      DeQuantizeLinearImpl<T, Context, float>(
+          dev_ctx, x, scale, quant_axis, bit_length, only_observer, out);
+      break;
+    case phi::DataType::FLOAT16:
+      DeQuantizeLinearImpl<T, Context, phi::dtype::float16>(
+          dev_ctx, x, scale, quant_axis, bit_length, only_observer, out);
+      break;
+    default:
+      PADDLE_THROW(phi::errors::Unimplemented(
+          "In DeQuantizeLinearKernel, "
+          "data type %d for scale/output is not supported ",
+          scale.dtype()));
+      break;
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/kps/compare_kernel.cu b/paddle/phi/kernels/kps/compare_kernel.cu
index 545a9df2961bf2..14bb86b4753207 100644
--- a/paddle/phi/kernels/kps/compare_kernel.cu
+++ b/paddle/phi/kernels/kps/compare_kernel.cu
@@ -14,7 +14,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
-#include "paddle/phi/kernels/impl/compare_kernel_impl.h"
+#include "paddle/phi/kernels/funcs/compare_functors.h"
 
 #ifdef PADDLE_WITH_XPU_KP
 #include "paddle/phi/backends/xpu/xpu_context.h"
@@ -27,6 +27,7 @@
 #include "paddle/phi/kernels/compare_kernel.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/legacy/compare_kernel.h"
 #include "paddle/phi/kernels/primitive/functor_primitives.h"
 #endif
@@ -43,37 +44,27 @@ struct BitwiseAdd {
   }
 };
 
-template <typename T, typename Context, typename Functor>
-inline void CompareKernelImpl(const Context& ctx,
-                              const DenseTensor& x,
-                              const DenseTensor& y,
-                              int axis,
-                              DenseTensor* out) {
-  ctx.template Alloc<bool>(out);
-  std::vector<const DenseTensor*> ins{&x, &y};
-  std::vector<DenseTensor*> outs{out};
-  funcs::BroadcastKernel<bool>(ctx, ins, &outs, Functor(), axis);
-}
+#define DEFINE_CUDA_COMPARE_KERNEL(name)              \
+  template <typename T, typename Context>             \
+  void name##Kernel(const Context& ctx,               \
+                    const DenseTensor& x,             \
+                    const DenseTensor& y,             \
+                    DenseTensor* out) {               \
+    if (out->IsSharedWith(x)) {                       \
+      auto x_origin = x;                              \
+      name##RawKernel<T>(ctx, x_origin, y, -1, out);  \
+    } else {                                          \
+      name##RawKernel<T>(ctx, x, y, -1, out);         \
+    }                                                 \
+  }
 
-template <typename T, typename Context, typename Functor>
-inline void InplaceCompareKernelImpl(const Context& ctx,
-                                     const DenseTensor& x,
-                                     const DenseTensor& y,
-                                     int axis,
-                                     DenseTensor* out) {
-  auto x_origin = x;
-  ctx.template Alloc<bool>(out);
-  out->set_type(phi::DataType::BOOL);
-  std::vector<const DenseTensor*> ins{&x_origin, &y};
-  std::vector<DenseTensor*> outs{out};
-  funcs::BroadcastKernel<bool>(ctx, ins, &outs, Functor(), axis);
-}
+DEFINE_CUDA_COMPARE_KERNEL(LessThan)
+DEFINE_CUDA_COMPARE_KERNEL(LessEqual)
+DEFINE_CUDA_COMPARE_KERNEL(GreaterThan)
+DEFINE_CUDA_COMPARE_KERNEL(GreaterEqual)
+DEFINE_CUDA_COMPARE_KERNEL(Equal)
+DEFINE_CUDA_COMPARE_KERNEL(NotEqual)
+#undef DEFINE_CUDA_COMPARE_KERNEL
 
 #ifndef PADDLE_WITH_XPU_KP
 template <typename T, typename Context, typename Functor>
@@ -106,6 +97,14 @@ inline void CompareAllKernelImpl(const Context& ctx,
   funcs::ReduceKernel<bool, bool, BitwiseAdd, kps::IdentityFunctor<bool>>(
       ctx, tmp, out, kps::IdentityFunctor<bool>(), reduce_dims);
 }
+
+template <typename T, typename Context>
+void EqualAllKernel(const Context& ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    DenseTensor* out) {
+  CompareAllKernelImpl<T, Context, funcs::EqualFunctor<T>>(ctx, x, y, out);
+}
 #endif
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu
index e2a33d76120f8c..584e026241bde3 100644
--- a/paddle/phi/kernels/kps/elementwise_kernel.cu
+++ b/paddle/phi/kernels/kps/elementwise_kernel.cu
@@ -18,9 +18,9 @@
 #include "paddle/phi/common/float16.h"
 #endif
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/elementwise_add_kernel.h"
 #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
 #include "paddle/phi/kernels/legacy/elementwise_add_kernel.h"
+#include "paddle/phi/kernels/legacy/elementwise_divide_kernel.h"
 #include "paddle/phi/kernels/legacy/elementwise_kernel.h"
 #include "paddle/phi/kernels/legacy/elementwise_multipy_kernel.h"
 #include "paddle/phi/kernels/legacy/elementwise_subtract_kernel.h"
@@ -146,13 +146,8 @@ void HeavisideKernel(const Context& dev_ctx,
                      const DenseTensor& x,
                      const DenseTensor& y,
                      DenseTensor* out) {
-  std::vector<const DenseTensor*> inputs;
-  inputs.reserve(2);
-  std::vector<DenseTensor*> outputs;
-  outputs.reserve(1);
-  inputs.emplace_back(&x);
-  inputs.emplace_back(&y);
-  outputs.emplace_back(out);
+  std::vector<const DenseTensor*> inputs = {&x, &y};
+  std::vector<DenseTensor*> outputs = {out};
   dev_ctx.template Alloc<T>(out);
   funcs::BroadcastKernel<T>(
       dev_ctx, inputs, &outputs, funcs::ElementwiseHeavisideFunctor<T>());
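Spelled out for a single op, DEFINE_CUDA_COMPARE_KERNEL(LessThan) above expands to an ordinary forwarding kernel. The sketch below is a hand expansion for illustration only; it assumes the conventional phi signature template <typename T, typename Context> and an explicit <T> on the forwarded call, as reconstructed in the macro above:

    // What DEFINE_CUDA_COMPARE_KERNEL(LessThan) expands to, modulo whitespace:
    template <typename T, typename Context>
    void LessThanKernel(const Context& ctx,
                        const DenseTensor& x,
                        const DenseTensor& y,
                        DenseTensor* out) {
      if (out->IsSharedWith(x)) {
        // Inplace path: out aliases x, and the raw kernel retypes out to
        // bool, so keep a separate descriptor for x before launching.
        auto x_origin = x;
        LessThanRawKernel<T>(ctx, x_origin, y, -1, out);
      } else {
        LessThanRawKernel<T>(ctx, x, y, -1, out);
      }
    }

Passing axis = -1 requests the default broadcast alignment, and the out->IsSharedWith(x) branch keeps a separate descriptor for x because the raw kernel (see legacy/kps/compare_kernel.cu further below) now sets the output dtype to DataType::BOOL before the broadcast launch.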
diff --git a/paddle/phi/kernels/legacy/compare_kernel.h b/paddle/phi/kernels/legacy/compare_kernel.h
new file mode 100644
index 00000000000000..541ec10d244da4
--- /dev/null
+++ b/paddle/phi/kernels/legacy/compare_kernel.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LessThanRawKernel(const Context& ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& y,
+                       int axis,
+                       DenseTensor* out);
+
+template <typename T, typename Context>
+void LessEqualRawKernel(const Context& ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& y,
+                        int axis,
+                        DenseTensor* out);
+
+template <typename T, typename Context>
+void GreaterThanRawKernel(const Context& ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& y,
+                          int axis,
+                          DenseTensor* out);
+
+template <typename T, typename Context>
+void GreaterEqualRawKernel(const Context& ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& y,
+                           int axis,
+                           DenseTensor* out);
+
+template <typename T, typename Context>
+void EqualRawKernel(const Context& ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    int axis,
+                    DenseTensor* out);
+
+template <typename T, typename Context>
+void NotEqualRawKernel(const Context& ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& y,
+                       int axis,
+                       DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/fluid/operators/lstmp_op.cu b/paddle/phi/kernels/legacy/elementwise_divide_kernel.h
similarity index 54%
rename from paddle/fluid/operators/lstmp_op.cu
rename to paddle/phi/kernels/legacy/elementwise_divide_kernel.h
index 5559d09f1b9ba9..b63bcaad116938 100644
--- a/paddle/fluid/operators/lstmp_op.cu
+++ b/paddle/phi/kernels/legacy/elementwise_divide_kernel.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,10 +12,18 @@
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/ -#include "paddle/fluid/operators/lstmp_op.h" +#pragma once -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - lstmp, GPU, ALL_LAYOUT, ops::LSTMPKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL( - lstmp_grad, GPU, ALL_LAYOUT, ops::LSTMPGradKernel, float, double) {} +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" + +namespace phi { + +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/legacy/elementwise_kernel.h b/paddle/phi/kernels/legacy/elementwise_kernel.h index 1d453ec790f7ce..b51704da7a6d61 100644 --- a/paddle/phi/kernels/legacy/elementwise_kernel.h +++ b/paddle/phi/kernels/legacy/elementwise_kernel.h @@ -19,13 +19,6 @@ namespace phi { -template -void DivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - template void MaximumRawKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/legacy/kps/compare_kernel.cu b/paddle/phi/kernels/legacy/kps/compare_kernel.cu index 5ab9dd8fea2d33..67bd491738346e 100644 --- a/paddle/phi/kernels/legacy/kps/compare_kernel.cu +++ b/paddle/phi/kernels/legacy/kps/compare_kernel.cu @@ -32,27 +32,14 @@ namespace phi { -template -struct BitwiseAdd { - // Bitwise add operator, returns a + b - inline T initial() { return static_cast(true); } - - __host__ __device__ __forceinline__ T operator()(const T& a, - const T& b) const { - return a & b; - } -}; - -template -inline void CompareCudaRawKernelImpl(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { +template +inline void CompareRawKernelImpl(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { ctx.template Alloc(out); + out->set_type(phi::DataType::BOOL); std::vector ins{&x, &y}; std::vector outs{out}; funcs::BroadcastKernel(ctx, ins, &outs, Functor(), axis); @@ -64,10 +51,8 @@ void LessThanRawKernel(const Context& ctx, const DenseTensor& y, int axis, DenseTensor* out) { - CompareCudaRawKernelImpl, - funcs::GreaterThanFunctor>(ctx, x, y, axis, out); + CompareRawKernelImpl>( + ctx, x, y, axis, out); } template @@ -76,10 +61,8 @@ void LessEqualRawKernel(const Context& ctx, const DenseTensor& y, int axis, DenseTensor* out) { - CompareCudaRawKernelImpl, - funcs::GreaterEqualFunctor>(ctx, x, y, axis, out); + CompareRawKernelImpl>( + ctx, x, y, axis, out); } template @@ -88,43 +71,38 @@ void GreaterThanRawKernel(const Context& ctx, const DenseTensor& y, int axis, DenseTensor* out) { - CompareCudaRawKernelImpl, - funcs::LessThanFunctor>(ctx, x, y, axis, out); + CompareRawKernelImpl>( + ctx, x, y, axis, out); } + template void GreaterEqualRawKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { - CompareCudaRawKernelImpl, - funcs::LessEqualFunctor>(ctx, x, y, axis, out); + CompareRawKernelImpl>( + ctx, x, y, axis, out); } + template void EqualRawKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { - CompareCudaRawKernelImpl, - funcs::EqualFunctor>(ctx, x, y, axis, out); + CompareRawKernelImpl>( + ctx, x, y, axis, out); } + template void NotEqualRawKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { - CompareCudaRawKernelImpl, - funcs::NotEqualFunctor>(ctx, x, y, axis, out); + 
CompareRawKernelImpl>( + ctx, x, y, axis, out); } } // namespace phi diff --git a/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu b/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu index 6cd7f2dc20a861..f07164bc16885b 100644 --- a/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu @@ -23,105 +23,14 @@ namespace phi { DEFINE_CUDA_ELEMENTWISE_OP(Add) - -// Create the definition of Divide DEFINE_CUDA_ELEMENTWISE_OP(Divide) - -// Create the definition of Multiply DEFINE_CUDA_ELEMENTWISE_OP(Multiply) - -// Create the definition of Subtract DEFINE_CUDA_ELEMENTWISE_OP(Subtract) - -template -void MaximumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); - dev_ctx.template Alloc(out); - funcs::BroadcastKernel( - dev_ctx, inputs, &outputs, funcs::MaximumFunctor(), axis); -} - -template -void MinimumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); - dev_ctx.template Alloc(out); - funcs::BroadcastKernel( - dev_ctx, inputs, &outputs, funcs::MinimumFunctor(), axis); -} - -template -void RemainderRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); - dev_ctx.template Alloc(out); - funcs::BroadcastKernel( - dev_ctx, inputs, &outputs, funcs::RemainderFunctor(), axis); -} - -template -void FloorDivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); - dev_ctx.template Alloc(out); - funcs::BroadcastKernel( - dev_ctx, inputs, &outputs, funcs::FloorDivideFunctor(), axis); -} - -template -void ElementwisePowRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); - dev_ctx.template Alloc(out); - funcs::BroadcastKernel( - dev_ctx, inputs, &outputs, funcs::ElementwisePowFunctor(), axis); -} +DEFINE_CUDA_ELEMENTWISE_OP(Maximum) +DEFINE_CUDA_ELEMENTWISE_OP(Minimum) +DEFINE_CUDA_ELEMENTWISE_OP(Remainder) +DEFINE_CUDA_ELEMENTWISE_OP(FloorDivide) +DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow) } // namespace phi diff --git a/paddle/phi/kernels/quantize_linear_kernel.h b/paddle/phi/kernels/quantize_linear_kernel.h new file mode 100644 index 00000000000000..c10a67f51e6030 --- /dev/null +++ b/paddle/phi/kernels/quantize_linear_kernel.h @@ -0,0 +1,40 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void DeQuantizeLinearKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& zero_point, + const paddle::optional& in_accum, + const paddle::optional& in_state, + int quant_axis, + int bit_length, + int round_type, + bool is_test, + bool only_observer, + DenseTensor* out, + DenseTensor* out_state, + DenseTensor* out_accum, + DenseTensor* out_scale); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_mean_kernel.cc b/paddle/phi/kernels/reduce_mean_kernel.cc index 2333de4b2e02a9..59f63c5d8cae5b 100644 --- a/paddle/phi/kernels/reduce_mean_kernel.cc +++ b/paddle/phi/kernels/reduce_mean_kernel.cc @@ -38,6 +38,8 @@ PD_REGISTER_KERNEL(mean, float, double, bool, + int, + int64_t, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/reshape_kernel.h b/paddle/phi/kernels/reshape_kernel.h index 972d72ad706d92..d03e44c0636c84 100644 --- a/paddle/phi/kernels/reshape_kernel.h +++ b/paddle/phi/kernels/reshape_kernel.h @@ -48,7 +48,9 @@ void Reshape(const Context& dev_ctx, DenseTensor* out) { MetaTensor meta_out(out); InferMetaFromVecValue(x, shape, &meta_out); - ReshapeInferKernel(dev_ctx, x, IntArray(shape), out); + if (x.initialized()) { + ReshapeInferKernel(dev_ctx, x, IntArray(shape), out); + } } template diff --git a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc index 78d34fa14295c8..1deddcf6dc0faf 100644 --- a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc +++ b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc @@ -118,7 +118,7 @@ void AdamDenseParamSparseGradKernel( } phi::SelectedRows tmp_grad_merge; - const phi::SelectedRows* grad_merge_ptr; + const phi::SelectedRows* grad_merge_ptr = nullptr; if (is_strict_sorted) { grad_merge_ptr = &grad; } else { diff --git a/paddle/phi/kernels/split_kernel.h b/paddle/phi/kernels/split_kernel.h index 2869bf3206f7d3..d752a40084a224 100644 --- a/paddle/phi/kernels/split_kernel.h +++ b/paddle/phi/kernels/split_kernel.h @@ -74,7 +74,9 @@ void Split(const Context& dev_ctx, outs.push_back(&result->at(i)); } - SplitKernel(dev_ctx, x, sections, axis, outs); + if (x.initialized()) { + SplitKernel(dev_ctx, x, sections, axis, outs); + } } template diff --git a/paddle/phi/kernels/squeeze_grad_kernel.cc b/paddle/phi/kernels/squeeze_grad_kernel.cc index 473acf9d7a1d15..a8a788e817472b 100644 --- a/paddle/phi/kernels/squeeze_grad_kernel.cc +++ b/paddle/phi/kernels/squeeze_grad_kernel.cc @@ -76,6 +76,7 @@ PD_REGISTER_KERNEL(squeeze_grad, float, double, phi::dtype::float16, + phi::dtype::bfloat16, bool, int, uint8_t, diff --git a/paddle/phi/kernels/squeeze_kernel.cc b/paddle/phi/kernels/squeeze_kernel.cc index d495b040921b59..a8d24423fcb45c 100644 --- a/paddle/phi/kernels/squeeze_kernel.cc +++ b/paddle/phi/kernels/squeeze_kernel.cc @@ -116,6 +116,7 @@ PD_REGISTER_KERNEL(squeeze_infer, float, double, phi::dtype::float16, + 
phi::dtype::bfloat16, bool, int, uint8_t, @@ -129,6 +130,7 @@ PD_REGISTER_KERNEL(squeeze, float, double, phi::dtype::float16, + phi::dtype::bfloat16, bool, int, uint8_t, diff --git a/paddle/phi/kernels/stride/diagonal_kernel.cc b/paddle/phi/kernels/stride/diagonal_kernel.cc index e8929e6773f533..b4ca6d9b277df5 100644 --- a/paddle/phi/kernels/stride/diagonal_kernel.cc +++ b/paddle/phi/kernels/stride/diagonal_kernel.cc @@ -36,7 +36,7 @@ void DiagonalStridedKernel(const Context& dev_ctx, axis2 += static_cast(x_rank); } - int64_t diag_size; + int64_t diag_size = 0; int64_t x_offset = static_cast(x.offset()); if (offset >= 0) { diag_size = std::max( diff --git a/paddle/phi/kernels/transpose_kernel.h b/paddle/phi/kernels/transpose_kernel.h index 5555586c043872..20c4af9cff1f9e 100644 --- a/paddle/phi/kernels/transpose_kernel.h +++ b/paddle/phi/kernels/transpose_kernel.h @@ -19,6 +19,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/empty_kernel.h" + namespace phi { template @@ -43,7 +44,9 @@ void Transpose(const Context& dev_ctx, // do not call TransposeStridedKernel, because some other kernels call // Transpose directly - TransposeKernel(dev_ctx, x, axis, dense_out); + if (x.initialized()) { + TransposeKernel(dev_ctx, x, axis, dense_out); + } } template diff --git a/paddle/phi/kernels/unsqueeze_grad_kernel.cc b/paddle/phi/kernels/unsqueeze_grad_kernel.cc index 3c119db2c73d6e..d26753ece47cdc 100644 --- a/paddle/phi/kernels/unsqueeze_grad_kernel.cc +++ b/paddle/phi/kernels/unsqueeze_grad_kernel.cc @@ -77,6 +77,7 @@ PD_REGISTER_KERNEL(unsqueeze_grad, float, double, phi::dtype::float16, + phi::dtype::bfloat16, bool, int, uint8_t, diff --git a/paddle/phi/kernels/unsqueeze_kernel.cc b/paddle/phi/kernels/unsqueeze_kernel.cc index c08c31da4ef0ce..3e1c8f8cc15e12 100644 --- a/paddle/phi/kernels/unsqueeze_kernel.cc +++ b/paddle/phi/kernels/unsqueeze_kernel.cc @@ -27,7 +27,7 @@ void UnsqueezeInferKernel(const Context& dev_ctx, DenseTensor* out) { auto x_dims = x.dims(); auto out_dims = out->dims(); - if (axes.FromTensor()) { + if (axes.FromTensor() && out->dims()[0] == -1) { out_dims = funcs::GetUnsqueezeShape(axes.GetData(), x_dims); } out->Resize(out_dims); @@ -124,6 +124,7 @@ PD_REGISTER_KERNEL(unsqueeze_infer, float, double, phi::dtype::float16, + phi::dtype::bfloat16, bool, int, uint8_t, @@ -137,6 +138,7 @@ PD_REGISTER_KERNEL(unsqueeze, float, double, phi::dtype::float16, + phi::dtype::bfloat16, bool, int, uint8_t, diff --git a/paddle/fluid/operators/diag_op.cu b/paddle/phi/kernels/weight_dequantize_kernel.h similarity index 51% rename from paddle/fluid/operators/diag_op.cu rename to paddle/phi/kernels/weight_dequantize_kernel.h index c9afc983b03bbc..3a0a10924b57e0 100644 --- a/paddle/fluid/operators/diag_op.cu +++ b/paddle/phi/kernels/weight_dequantize_kernel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/diag_op.h" -#include "paddle/fluid/framework/op_registry.h" +#pragma once -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(diag, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel); +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void WeightDequantizeKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const std::string& algo, + DataType out_dtype, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/weight_only_linear_grad_kernel.h b/paddle/phi/kernels/weight_only_linear_grad_kernel.h index 6cf44ef6d46887..518ef43c98d0f8 100644 --- a/paddle/phi/kernels/weight_only_linear_grad_kernel.h +++ b/paddle/phi/kernels/weight_only_linear_grad_kernel.h @@ -1,8 +1,11 @@ /* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/paddle/phi/kernels/weight_only_linear_kernel.h b/paddle/phi/kernels/weight_only_linear_kernel.h index 19d4d274964b8d..4e0de2ec9a6455 100644 --- a/paddle/phi/kernels/weight_only_linear_kernel.h +++ b/paddle/phi/kernels/weight_only_linear_kernel.h @@ -1,8 +1,11 @@ /* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/paddle/phi/kernels/weight_quantize_kernel.h b/paddle/phi/kernels/weight_quantize_kernel.h index ea49b3ffb2dced..ba4277e84e6378 100644 --- a/paddle/phi/kernels/weight_quantize_kernel.h +++ b/paddle/phi/kernels/weight_quantize_kernel.h @@ -1,8 +1,11 @@ /* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -21,4 +24,5 @@ void WeightQuantizeKernel(const Context& dev_ctx, const std::string& algo, DenseTensor* out, DenseTensor* scale); + } // namespace phi diff --git a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc index 2b637e9da09e86..b5b2ed7d328884 100644 --- a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc @@ -22,23 +22,18 @@ namespace phi { -namespace { -const int ARG_MAX_OUTPUT_DATATYPE_INT32 = 2; -const int ARG_MAX_OUTPUT_DATATYPE_INT64 = 3; -} // Anonymous namespace - template void ArgMaxKernel(const Context& dev_ctx, const DenseTensor& x, const Scalar& axis, bool keepdims, bool flatten, - int dtype, + DataType dtype, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == ARG_MAX_OUTPUT_DATATYPE_INT32 || - dtype == ARG_MAX_OUTPUT_DATATYPE_INT64), + (dtype == DataType::UNDEFINED || dtype == DataType::INT32 || + dtype == DataType::INT64), true, errors::InvalidArgument( "The attribute of dtype in xpu argmin/argmax must be [%s] or [%s], " @@ -60,7 +55,7 @@ void ArgMaxKernel(const Context& dev_ctx, } auto xdims_vec = phi::vectorize(x_dims); int r = 0; - if (dtype != ARG_MAX_OUTPUT_DATATYPE_INT32) { + if (dtype != DataType::INT32) { dev_ctx.template Alloc(out); if (x.dims().size() == 0) { xpu::constant(dev_ctx.x_context(), diff --git a/paddle/phi/kernels/xpu/batch_norm_kernel.cc b/paddle/phi/kernels/xpu/batch_norm_kernel.cc index b95dda1fed13d1..e2f2d28182b67d 100644 --- a/paddle/phi/kernels/xpu/batch_norm_kernel.cc +++ b/paddle/phi/kernels/xpu/batch_norm_kernel.cc @@ -140,4 +140,9 @@ PD_REGISTER_KERNEL(batch_norm, ALL_LAYOUT, phi::BatchNormKernel, float, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); +} diff --git a/paddle/phi/kernels/xpu/set_value_kernel.cc b/paddle/phi/kernels/xpu/set_value_kernel.cc index dc154657c729e5..a706ef00b9a41d 100644 --- a/paddle/phi/kernels/xpu/set_value_kernel.cc +++ b/paddle/phi/kernels/xpu/set_value_kernel.cc @@ -18,6 +18,7 @@ #include #include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/slice_utils.h" @@ -386,20 +387,31 @@ void SetValueKernel(const Context& dev_ctx, const std::vector& shape, const std::vector& values, DenseTensor* out) { - std::vector assign_values; - assign_values.reserve(values.size()); - for (const auto& val : values) { - assign_values.push_back(val.to()); + // avoid using vector if T is bool or phi::dtype::float16 + int value_size = sizeof(T); + int values_size = values.size(); + int values_length = values_size * value_size; + std::vector assign_values(values_length); + uint8_t* value_data_uint8_cpu = assign_values.data(); + for (int i = 0; i < values_size; i++) { + T value = values[i].to(); + memcpy(value_data_uint8_cpu + i * value_size, &value, value_size); } + using XPUType = typename XPUTypeTrait::Type; + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + T* value_data = + reinterpret_cast(RAII_GUARD.alloc_l3_or_gm(values_size)); + memory_utils::Copy(dev_ctx.GetPlace(), + value_data, + phi::CPUPlace(), + value_data_uint8_cpu, + values_length); auto value_dims = phi::make_ddim(shape); - 
DenseTensor value_tensor; - TensorFromVector(assign_values, dev_ctx, &value_tensor); - SetValueKernelImpl(dev_ctx, x, - value_tensor.data(), + value_data, value_dims, starts, ends, diff --git a/paddle/phi/ops/compat/fused_bn_add_activation_sig.cc b/paddle/phi/ops/compat/fused_bn_add_activation_sig.cc new file mode 100644 index 00000000000000..c32175b856397d --- /dev/null +++ b/paddle/phi/ops/compat/fused_bn_add_activation_sig.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature FusedBatchNormAddActOpArgumentMapping( + const ArgumentMappingContext& ctx UNUSED) { + return KernelSignature("fused_bn_add_activation", + {"X", "Z", "Scale", "Bias", "Mean", "Variance"}, + {"momentum", "epsilon", "act_type"}, + {"Y", + "MeanOut", + "VarianceOut", + "SavedMean", + "SavedVariance", + "ReserveSpace"}); +} + +KernelSignature FusedBatchNormAddActGradOpArgumentMapping( + const ArgumentMappingContext& ctx UNUSED) { + return KernelSignature("fused_bn_add_activation_grad", + {"X", + "Scale", + "Bias", + "Y", + "SavedMean", + "SavedVariance", + "ReserveSpace", + "Y@GRAD"}, + {"momentum", "epsilon", "act_type"}, + {"X@GRAD", "Z@GRAD", "Scale@GRAD", "Bias@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(fused_bn_add_activation, + phi::FusedBatchNormAddActOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_bn_add_activation_grad, + phi::FusedBatchNormAddActGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/quantize_linear_sig.cc b/paddle/phi/ops/compat/quantize_linear_sig.cc new file mode 100644 index 00000000000000..75e523bf55367d --- /dev/null +++ b/paddle/phi/ops/compat/quantize_linear_sig.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DeQuantizeLinearOpArgumentMapping( + const ArgumentMappingContext& ctx UNUSED) { + return KernelSignature( + "dequantize_linear", + {"X", "Scale", "ZeroPoint", "InAccum", "InState"}, + {"quant_axis", "bit_length", "round_type", "is_test", "only_observer"}, + {"Y", "OutState", "OutAccum", "OutScale"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(dequantize_linear, + phi::DeQuantizeLinearOpArgumentMapping); diff --git a/paddle/phi/ops/compat/strided_slice_sig.cc b/paddle/phi/ops/compat/strided_slice_sig.cc index 02b39147878661..0c0e5d0c868f4e 100644 --- a/paddle/phi/ops/compat/strided_slice_sig.cc +++ b/paddle/phi/ops/compat/strided_slice_sig.cc @@ -57,7 +57,7 @@ KernelSignature StridedSliceOpArgumentMapping( "decrease_axis"}; paddle::small_vector outputs = {"Out"}; - const char* kernel_name; + const char* kernel_name = nullptr; if (ctx.IsDenseTensorVectorInput("Input")) { kernel_name = "strided_slice_array"; } else { @@ -106,7 +106,7 @@ KernelSignature StridedSliceGradOpArgumentMapping( "decrease_axis"}; paddle::small_vector outputs = {"Input@GRAD"}; - const char* kernel_name; + const char* kernel_name = nullptr; if (ctx.IsDenseTensorVectorInput("Input")) { kernel_name = "strided_slice_array_grad"; } else { diff --git a/paddle/pir/core/block.h b/paddle/pir/core/block.h index b7a730715c12c5..c61a8f22e54256 100644 --- a/paddle/pir/core/block.h +++ b/paddle/pir/core/block.h @@ -84,6 +84,7 @@ class IR_API Block { ArgsIterator args_end() { return arguments_.end(); } bool args_empty() const { return arguments_.empty(); } uint32_t args_size() const { return arguments_.size(); } + const BlockArgListType &args() const { return arguments_; } BlockArgument argument(uint32_t index) { return arguments_[index]; } Type argument_type(uint32_t index) const { return arguments_[index].type(); } void ClearArguments(); diff --git a/paddle/pir/core/block_argument.cc b/paddle/pir/core/block_argument.cc index 4acbfe9176ef91..3b851c054b85ed 100644 --- a/paddle/pir/core/block_argument.cc +++ b/paddle/pir/core/block_argument.cc @@ -73,9 +73,9 @@ BlockArgument BlockArgument::Create(Type type, Block *owner, uint32_t index) { /// Destroy the argument. 
void BlockArgument::Destroy() {
  if (impl_) {
-    LOG(WARNING) << "Destroying a null block argument.";
-  } else {
    delete IMPL_;
+  } else {
+    LOG(WARNING) << "Destroying a null block argument.";
  }
}
diff --git a/paddle/pir/core/builtin_dialect.cc b/paddle/pir/core/builtin_dialect.cc
index 23ba43c3d292ec..60575da6d9472c 100644
--- a/paddle/pir/core/builtin_dialect.cc
+++ b/paddle/pir/core/builtin_dialect.cc
@@ -53,6 +53,7 @@ void BuiltinDialect::initialize() {
  RegisterOpsattributes();
+  auto iter = attributes.find("output_name");
+  IR_ENFORCE(iter != attributes.end() && iter->second.isa<pir::StrAttribute>(),
+             "Type of attribute: output_name is not right.");
+
+  // Verify outputs:
+  IR_ENFORCE(num_results() == 0u, "The size of outputs must be equal to 0.");
+}
+
void CombineOp::Build(Builder &builder,
                      OperationArgument &argument,
                      const std::vector<Value> &inputs) {
@@ -172,7 +198,7 @@ void CombineOp::Build(Builder &builder,
  PassStopGradientsDefaultly(argument);
}
-void CombineOp::Verify() const {
+void CombineOp::VerifySig() const {
  // outputs.size() == 1
  IR_ENFORCE(num_results() == 1u, "The size of outputs must be equal to 1.");
@@ -234,7 +260,7 @@ void SliceOp::PassStopGradients(OperationArgument &argument, int index) {
      pir::ArrayAttribute::get(pir::IrContext::Instance(), outs_stop_gradient));
}
-void SliceOp::Verify() const {
+void SliceOp::VerifySig() const {
  // inputs.size() == 1
  auto input_size = num_operands();
  IR_ENFORCE(
@@ -338,7 +364,7 @@ void SplitOp::PassStopGradients(OperationArgument &argument) {
      pir::ArrayAttribute::get(pir::IrContext::Instance(), outs_stop_gradient));
}
-void SplitOp::Verify() const {
+void SplitOp::VerifySig() const {
  // inputs.size() == 1
  IR_ENFORCE(num_operands() == 1u, "The size of inputs must be equal to 1.");
@@ -367,7 +393,7 @@ void ConstantOp::Build(Builder &builder,
  argument.output_types.push_back(output_type);
}
-void ConstantOp::Verify() const {
+void ConstantOp::VerifySig() const {
  IR_ENFORCE(num_operands() == 0, "The size of inputs must be equal to 0.");
  IR_ENFORCE(num_results() == 1, "The size of outputs must be equal to 1.");
  IR_ENFORCE(attributes().count("value") > 0, "must has value attribute");
@@ -380,6 +406,7 @@ Attribute ConstantOp::value() const { return attributes().at("value"); }
IR_DEFINE_EXPLICIT_TYPE_ID(pir::ModuleOp)
IR_DEFINE_EXPLICIT_TYPE_ID(pir::GetParameterOp)
IR_DEFINE_EXPLICIT_TYPE_ID(pir::SetParameterOp)
+IR_DEFINE_EXPLICIT_TYPE_ID(pir::ShadowOutputOp)
IR_DEFINE_EXPLICIT_TYPE_ID(pir::CombineOp)
IR_DEFINE_EXPLICIT_TYPE_ID(pir::SliceOp)
IR_DEFINE_EXPLICIT_TYPE_ID(pir::SplitOp)
diff --git a/paddle/pir/core/builtin_op.h b/paddle/pir/core/builtin_op.h
index e5327f4c5db45e..19ca96b0526928 100644
--- a/paddle/pir/core/builtin_op.h
+++ b/paddle/pir/core/builtin_op.h
@@ -31,7 +31,7 @@ class IR_API ModuleOp : public pir::Op {
  static const char *name() { return "builtin.module"; }
  static constexpr uint32_t attributes_num = 1;
  static const char *attributes_name[attributes_num];
-  void Verify() const;
+  void VerifySig() const;
  Program *program();
  Block *block();
@@ -56,7 +56,7 @@ class IR_API GetParameterOp : public pir::Op {
                    OperationArgument &argument,  // NOLINT
                    const std::string &name,
                    Type type);
-  void Verify() const;
+  void VerifySig() const;
 private:
  static void PassStopGradients(OperationArgument &argument);  // NOLINT
@@ -76,7 +76,24 @@ class IR_API SetParameterOp : public pir::Op {
                    OperationArgument &argument,  // NOLINT
                    Value parameter,
                    const std::string &name);
-  void Verify() const;
+  void VerifySig() const;
+};
+
+///
+/// \brief ShadowOutputOp: ShadowOutputOp(OpOperand, {StrAttribute})
+///
+class IR_API ShadowOutputOp : public pir::Op<ShadowOutputOp> {
+ public:
+  using Op::Op;
+  static const char *name() { return "builtin.shadow_output"; }
+  static constexpr uint32_t attributes_num = 1;
+  static const char *attributes_name[attributes_num];
+  static void Build(Builder &builder,             // NOLINT
+                    OperationArgument &argument,  // NOLINT
+                    Value parameter,
+                    const std::string &name);
+  void VerifySig() const;
+};
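A minimal usage sketch for the new op (illustrative only, not part of this patch): given a Builder positioned in some block, binding a value to a name takes one Build call. The helper name and the "output_0" string are placeholders.

// Hypothetical helper: marks `value` as a named shadow output.
// ShadowOutputOp has one operand, a single "output_name" StrAttribute,
// and zero results, matching its VerifySig checks above.
void MarkAsShadowOutput(pir::Builder &builder, pir::Value value) {
  builder.Build<pir::ShadowOutputOp>(value, "output_0");
}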
///
@@ -96,7 +113,7 @@ class IR_API CombineOp : public pir::Op {
                    OperationArgument &argument,  // NOLINT
                    const std::vector<Value> &inputs);
-  void Verify() const;
+  void VerifySig() const;
  std::vector<Value> inputs() {
    std::vector<Value> inputs;
    for (uint32_t idx = 0; idx < num_operands(); idx++) {
@@ -125,7 +142,7 @@ class IR_API SliceOp : public pir::Op {
                    Value input,
                    int index);
-  void Verify() const;
+  void VerifySig() const;
  pir::Value input() { return operand_source(0); }
 private:
@@ -150,7 +167,7 @@ class IR_API SplitOp : public pir::Op {
                    OperationArgument &argument,  // NOLINT
                    Value input);
-  void Verify() const;
+  void VerifySig() const;
  pir::Value input() { return operand_source(0); }
  std::vector<Value> outputs() {
    std::vector<Value> res;
@@ -186,7 +203,7 @@ class IR_API ConstantOp : public Op {
                    Attribute value,
                    Type output_type);
-  void Verify() const;
+  void VerifySig() const;
  Attribute value() const;
};
@@ -198,6 +215,7 @@ void PassStopGradientsDefaultly(OperationArgument &argument);  // NOLINT
IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::ModuleOp)
IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::GetParameterOp)
IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::SetParameterOp)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::ShadowOutputOp)
IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::CombineOp)
IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::SliceOp)
IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::SplitOp)
diff --git a/paddle/pir/core/dialect.h b/paddle/pir/core/dialect.h
index 07debaf1960410..8c66f3c1d6a159 100644
--- a/paddle/pir/core/dialect.h
+++ b/paddle/pir/core/dialect.h
@@ -100,7 +100,8 @@ class IR_API Dialect {
                    ConcreteOp::GetTraitSet(),
                    ConcreteOp::attributes_num,
                    ConcreteOp::attributes_name,
-                   ConcreteOp::VerifyInvariants);
+                   ConcreteOp::VerifySigInvariants,
+                   ConcreteOp::VerifyRegionInvariants);
  }
  void RegisterOp(const std::string &name, OpInfoImpl *op_info);
diff --git a/paddle/pir/core/enforce.h b/paddle/pir/core/enforce.h
index a3b1401b64d250..e8624b8bbe4e13 100644
--- a/paddle/pir/core/enforce.h
+++ b/paddle/pir/core/enforce.h
@@ -19,6 +19,13 @@
#include "paddle/utils/string/printf.h"
+#if defined(_WIN32)
+#define UNUSED
+#define __builtin_expect(EXP, C) (EXP)
+#else
+#define UNUSED __attribute__((unused))
+#endif
+
#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
diff --git a/paddle/pir/core/interface_support.h b/paddle/pir/core/interface_support.h
index 3814570eb4f3b9..de8e09403765c5 100644
--- a/paddle/pir/core/interface_support.h
+++ b/paddle/pir/core/interface_support.h
@@ -42,8 +42,7 @@ class ConstructInterfacesOrTraits {
  static void PlacementConstrctInterface(
      InterfaceValue *&p_interface) {  // NOLINT
    p_interface->swap(InterfaceValue::get());
-    VLOG(6) << "New a interface: id["
-            << (p_interface->type_id()).AsOpaquePointer() << "].";
+    VLOG(6) << "New an interface: id[" << p_interface->type_id() << "].";
    ++p_interface;
  }
@@ -51,7 +50,7 @@
  template <typename T>
  static void PlacementConstrctTrait(pir::TypeId *&p_trait) {  // NOLINT
    *p_trait = TypeId::get<T>();
-    VLOG(6) << "New a trait: id[" <<
p_trait->AsOpaquePointer() << "]."; + VLOG(6) << "New a trait: id[" << *p_trait << "]."; ++p_trait; } }; diff --git a/paddle/pir/core/ir_context.cc b/paddle/pir/core/ir_context.cc index b7aca14e8f60b3..1ebd9e4f0c6423 100644 --- a/paddle/pir/core/ir_context.cc +++ b/paddle/pir/core/ir_context.cc @@ -106,7 +106,7 @@ class IrContextImpl { void RegisterOpInfo(const std::string &name, OpInfo info) { std::lock_guard guard(registed_op_infos_lock_); VLOG(6) << "Register an operation of: [Name=" << name - << ", OpInfo ptr=" << info.AsOpaquePointer() << "]."; + << ", OpInfo ptr=" << info << "]."; registed_op_infos_.emplace(name, info); } @@ -115,7 +115,7 @@ class IrContextImpl { auto iter = registed_op_infos_.find(name); if (iter != registed_op_infos_.end()) { VLOG(8) << "Found a cached OpInfo of: [name=" << name - << ", OpInfo: ptr=" << iter->second.AsOpaquePointer() << "]."; + << ", OpInfo: ptr=" << iter->second << "]."; return iter->second; } VLOG(8) << "No cache found operation of: [Name=" << name << "]."; @@ -226,7 +226,8 @@ void IrContext::RegisterAbstractAttribute( pir::TypeId type_id, AbstractAttribute &&abstract_attribute) { if (GetRegisteredAbstractAttribute(type_id) == nullptr) { impl().RegisterAbstractAttribute( - type_id, new AbstractAttribute(std::move(abstract_attribute))); + type_id, + new AbstractAttribute(std::move(abstract_attribute))); // NOLINT } else { LOG(WARNING) << " Attribute already registered."; } @@ -258,14 +259,14 @@ Dialect *IrContext::GetOrRegisterDialect( std::vector IrContext::GetRegisteredDialects() { std::vector result; - for (auto dialect_map : impl().registed_dialect_) { + for (auto const &dialect_map : impl().registed_dialect_) { result.push_back(dialect_map.second); } return result; } Dialect *IrContext::GetRegisteredDialect(const std::string &dialect_name) { - for (auto dialect_map : impl().registed_dialect_) { + for (auto const &dialect_map : impl().registed_dialect_) { if (dialect_map.first == dialect_name) { return dialect_map.second; } @@ -277,8 +278,8 @@ Dialect *IrContext::GetRegisteredDialect(const std::string &dialect_name) { void IrContext::RegisterAbstractType(pir::TypeId type_id, AbstractType &&abstract_type) { if (GetRegisteredAbstractType(type_id) == nullptr) { - impl().RegisterAbstractType(type_id, - new AbstractType(std::move(abstract_type))); + impl().RegisterAbstractType( + type_id, new AbstractType(std::move(abstract_type))); // NOLINT } else { LOG(WARNING) << " type already registered."; } @@ -291,7 +292,8 @@ void IrContext::RegisterOpInfo(Dialect *dialect, const std::vector &trait_set, size_t attributes_num, const char **attributes_name, - VerifyPtr verify) { + VerifyPtr verify_sig, + VerifyPtr verify_region) { if (impl().IsOpInfoRegistered(name)) { LOG(WARNING) << name << " op already registered."; } else { @@ -302,7 +304,8 @@ void IrContext::RegisterOpInfo(Dialect *dialect, trait_set, attributes_num, attributes_name, - verify); + verify_sig, + verify_region); impl().RegisterOpInfo(name, info); } } diff --git a/paddle/pir/core/ir_context.h b/paddle/pir/core/ir_context.h index d459f915242290..c20a0d7bba2925 100644 --- a/paddle/pir/core/ir_context.h +++ b/paddle/pir/core/ir_context.h @@ -113,7 +113,8 @@ class IR_API IrContext { const std::vector &trait_set, size_t attributes_num, const char **attributes_name, - void (*verify)(Operation *)); + void (*verify_sig)(Operation *), + void (*verify_region)(Operation *)); /// /// \brief Get registered operaiton infomation. 
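To illustrate the split verifier registration added here, a hedged sketch of what a call site looks like after this change. `MyOp` is a placeholder, and `GetInterfaceMap` stands in for the interface argument whose exact name is not shown in this hunk.

// Illustrative only: every op now hands IrContext two hooks, one for
// signature invariants and one for region invariants.
ctx->RegisterOpInfo(dialect,
                    pir::TypeId::get<MyOp>(),
                    MyOp::name(),
                    MyOp::GetInterfaceMap(),  // assumed accessor name
                    MyOp::GetTraitSet(),
                    MyOp::attributes_num,
                    MyOp::attributes_name,
                    MyOp::VerifySigInvariants,
                    MyOp::VerifyRegionInvariants);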
diff --git a/paddle/pir/core/ir_printer.cc b/paddle/pir/core/ir_printer.cc index 528144437727f7..81cb3b4bcf2244 100644 --- a/paddle/pir/core/ir_printer.cc +++ b/paddle/pir/core/ir_printer.cc @@ -204,11 +204,17 @@ void IrPrinter::PrintValue(Value v) { os << ret->second; return; } - - std::string new_name = "%" + std::to_string(cur_var_number_); - cur_var_number_++; - aliases_[key] = new_name; - os << new_name; + if (v.isa()) { + std::string new_name = "%" + std::to_string(cur_result_number_); + cur_result_number_++; + aliases_[key] = new_name; + os << new_name; + } else { + std::string new_name = "%arg" + std::to_string(cur_block_argument_number_); + cur_block_argument_number_++; + aliases_[key] = new_name; + os << new_name; + } } void IrPrinter::PrintOpResult(Operation* op) { diff --git a/paddle/pir/core/ir_printer.h b/paddle/pir/core/ir_printer.h index 929da4fe332e1c..e4d821c01911bb 100644 --- a/paddle/pir/core/ir_printer.h +++ b/paddle/pir/core/ir_printer.h @@ -71,7 +71,8 @@ class IR_API IrPrinter : public BasicIrPrinter { void PrintOpReturnType(Operation* op); private: - size_t cur_var_number_{0}; + size_t cur_result_number_{0}; + size_t cur_block_argument_number_{0}; std::unordered_map aliases_; }; diff --git a/paddle/pir/core/op_base.h b/paddle/pir/core/op_base.h index f9de8dfc6cf8d0..f0710ff5ec6297 100644 --- a/paddle/pir/core/op_base.h +++ b/paddle/pir/core/op_base.h @@ -22,6 +22,8 @@ #include "paddle/pir/core/utils.h" namespace pir { +class Builder; +class IrPrinter; class IR_API OpBase { public: @@ -61,6 +63,10 @@ class IR_API OpBase { return operation()->attribute(name); } + void VerifySig() {} + + void VerifyRegion() {} + private: Operation *operation_; // Not owned }; @@ -160,14 +166,21 @@ class Op : public OpBase { class EmptyOp : public Op {}; return sizeof(ConcreteOp) == sizeof(EmptyOp); } - // Implementation of `VerifyInvariantsFn` OperationName hook. - static void VerifyInvariants(Operation *op) { + + // Implementation of `VerifySigInvariantsFn` OperationName hook. + static void VerifySigInvariants(Operation *op) { static_assert(HasNoDataMembers(), "Op class shouldn't define new data members"); - op->dyn_cast().Verify(); + op->dyn_cast().VerifySig(); (void)std::initializer_list{ 0, (VerifyTraitOrInterface::call(op), 0)...}; } + + static void VerifyRegionInvariants(Operation *op) { + static_assert(HasNoDataMembers(), + "Op class shouldn't define new data members"); + op->dyn_cast().VerifyRegion(); + } }; } // namespace pir diff --git a/paddle/pir/core/op_info.cc b/paddle/pir/core/op_info.cc index b018bec30448d4..499bfda0e69e7b 100644 --- a/paddle/pir/core/op_info.cc +++ b/paddle/pir/core/op_info.cc @@ -35,7 +35,18 @@ const char *OpInfo::name() const { return impl_ ? impl_->name() : nullptr; } TypeId OpInfo::id() const { return impl_ ? impl_->id() : TypeId(); } -void OpInfo::Verify(Operation *operation) const { impl_->verify()(operation); } +void OpInfo::Verify(Operation *operation) const { + VerifySig(operation); + VerifyRegion(operation); +} + +void OpInfo::VerifySig(Operation *operation) const { + impl_->VerifySig()(operation); +} + +void OpInfo::VerifyRegion(Operation *operation) const { + impl_->VerifyRegion()(operation); +} void *OpInfo::GetInterfaceImpl(TypeId interface_id) const { return impl_ ? 
impl_->GetInterfaceImpl(interface_id) : nullptr; diff --git a/paddle/pir/core/op_info.h b/paddle/pir/core/op_info.h index 7065a295be0821..a7416c146a90e5 100644 --- a/paddle/pir/core/op_info.h +++ b/paddle/pir/core/op_info.h @@ -54,6 +54,10 @@ class IR_API OpInfo { void Verify(Operation *) const; + void VerifySig(Operation *) const; + + void VerifyRegion(Operation *) const; + template bool HasTrait() const { return HasTrait(TypeId::get()); @@ -71,9 +75,8 @@ class IR_API OpInfo { template typename InterfaceT::Concept *GetInterfaceImpl() const; - operator const void *() const { return impl_; } - void *AsOpaquePointer() const { return impl_; } - static OpInfo RecoverFromOpaquePointer(void *pointer) { + operator void *() const { return impl_; } + static OpInfo RecoverFromVoidPointer(void *pointer) { return OpInfo(static_cast(pointer)); } @@ -105,7 +108,7 @@ namespace std { template <> struct hash { std::size_t operator()(const pir::OpInfo &obj) const { - return std::hash()(obj); + return std::hash()(obj); } }; } // namespace std diff --git a/paddle/pir/core/op_info_impl.cc b/paddle/pir/core/op_info_impl.cc index 12245f12a652a5..33320f1d523670 100644 --- a/paddle/pir/core/op_info_impl.cc +++ b/paddle/pir/core/op_info_impl.cc @@ -24,7 +24,8 @@ OpInfo OpInfoImpl::Create(Dialect *dialect, const std::vector &trait_set, size_t attributes_num, const char *attributes_name[], // NOLINT - VerifyPtr verify) { + VerifyPtr verify_sig, + VerifyPtr verify_region) { // (1) Malloc memory for interfaces, traits, opinfo_impl. size_t interfaces_num = interface_map.size(); size_t traits_num = trait_set.size(); @@ -59,7 +60,8 @@ OpInfo OpInfoImpl::Create(Dialect *dialect, traits_num, attributes_num, attributes_name, - verify)); + verify_sig, + verify_region)); return op_info; } void OpInfoImpl::Destroy(OpInfo info) { diff --git a/paddle/pir/core/op_info_impl.h b/paddle/pir/core/op_info_impl.h index cc63a52d40064a..a08084682f1d00 100644 --- a/paddle/pir/core/op_info_impl.h +++ b/paddle/pir/core/op_info_impl.h @@ -42,14 +42,17 @@ class OpInfoImpl { const std::vector &trait_set, size_t attributes_num, const char *attributes_name[], - VerifyPtr verify); + VerifyPtr verify_sig, + VerifyPtr verify_region); static void Destroy(OpInfo info); TypeId id() const { return op_id_; } Dialect *dialect() const { return dialect_; } - VerifyPtr verify() const { return verify_; } + VerifyPtr VerifySig() const { return verify_sig_; } + + VerifyPtr VerifyRegion() const { return verify_region_; } IrContext *ir_context() const; @@ -76,7 +79,8 @@ class OpInfoImpl { uint32_t num_traits, uint32_t num_attributes, const char **p_attributes, - VerifyPtr verify) + VerifyPtr verify_sig, + VerifyPtr verify_region) : dialect_(dialect), op_id_(op_id), op_name_(op_name), @@ -84,7 +88,8 @@ class OpInfoImpl { num_traits_(num_traits), num_attributes_(num_attributes), p_attributes_(p_attributes), - verify_(verify) {} + verify_sig_(verify_sig), + verify_region_(verify_region) {} void Destroy(); /// The dialect of this Op belong to. @@ -108,7 +113,9 @@ class OpInfoImpl { /// Attributes array address. const char **p_attributes_{nullptr}; - VerifyPtr verify_{nullptr}; + VerifyPtr verify_sig_{nullptr}; + + VerifyPtr verify_region_{nullptr}; }; } // namespace pir diff --git a/paddle/pir/core/op_trait.cc b/paddle/pir/core/op_trait.cc new file mode 100644 index 00000000000000..ccea4e3f06d9b9 --- /dev/null +++ b/paddle/pir/core/op_trait.cc @@ -0,0 +1,196 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pir/core/op_trait.h"
+#include "paddle/pir/core/enforce.h"
+#include "paddle/pir/core/type_util.h"
+
+namespace pir::detail {
+
+void VerifySameOperandsShapeTrait(Operation *op) {
+  VLOG(4) << "Verify SameOperandsShapeTrait for : " << op->name();
+
+  IR_ENFORCE(op->num_operands() > 0,
+             "Op %s with SameOperandsShapeTrait requires at least 1 operand, "
+             "but got %u operands.",
+             op->name(),
+             op->num_operands());
+
+  std::vector<pir::OpOperand> operands = op->operands();
+  std::vector<pir::Type> types;
+  std::for_each(operands.begin(), operands.end(), [&types](pir::OpOperand op) {
+    types.push_back(op.type());
+  });
+
+  IR_ENFORCE(VerifyCompatibleShapes(types),
+             "Op %s with SameOperandsShapeTrait requires the same shape for "
+             "all operands.",
+             op->name());
+}
+
+void VerifySameOperandsAndResultShapeTrait(Operation *op) {
+  VLOG(4) << "Verify SameOperandsAndResultShapeTrait for : " << op->name();
+
+  IR_ENFORCE(op->num_operands() > 0,
+             "Op %s with SameOperandsAndResultShapeTrait requires at least 1 "
+             "operand, but got %u operands.",
+             op->name(),
+             op->num_operands());
+
+  IR_ENFORCE(op->num_results() > 0,
+             "Op %s with SameOperandsAndResultShapeTrait requires at least 1 "
+             "result, but got %u results.",
+             op->name(),
+             op->num_results());
+
+  std::vector<pir::OpOperand> operands = op->operands();
+  std::vector<pir::OpResult> results = op->results();
+
+  std::vector<pir::Type> types;
+
+  std::for_each(operands.begin(), operands.end(), [&types](pir::OpOperand op) {
+    types.push_back(op.type());
+  });
+
+  std::for_each(results.begin(), results.end(), [&types](pir::OpResult op) {
+    types.push_back(op.type());
+  });
+
+  IR_ENFORCE(VerifyCompatibleShapes(types),
+             "Op %s with SameOperandsAndResultShapeTrait requires compatible "
+             "shapes for operands and results.",
+             op->name());
+}
+
+void VerifySameOperandsElementTypeTrait(Operation *op) {
+  VLOG(4) << "Verify SameOperandsElementTypeTrait for : " << op->name();
+
+  IR_ENFORCE(op->num_operands() > 0,
+             "Op %s with SameOperandsElementTypeTrait requires at least 1 "
+             "operand, but got %u operands.",
+             op->name(),
+             op->num_operands());
+
+  auto elementType = GetElementTypeOrSelf(op->operand(0).type());
+  for (auto operand : op->operands()) {
+    IR_ENFORCE(GetElementTypeOrSelf(operand.type()) == elementType,
+               "Op %s with SameOperandsElementTypeTrait requires the same "
+               "element type for all operands.",
+               op->name());
+  }
+}
+
+void VerifySameOperandsAndResultElementTypeTrait(Operation *op) {
+  VLOG(4) << "Verify SameOperandsAndResultElementTypeTrait for : "
+          << op->name();
+
+  IR_ENFORCE(op->num_operands() > 0,
+             "Op %s with SameOperandsAndResultElementTypeTrait requires at "
+             "least 1 operand, but got %u operands.",
+             op->name(),
+             op->num_operands());
+
+  IR_ENFORCE(op->num_results() > 0,
+             "Op %s with SameOperandsAndResultElementTypeTrait requires at "
+             "least 1 result, but got %u results.",
+             op->name(),
+             op->num_results());
+
+  auto elementType = GetElementTypeOrSelf(op->result(0).type());
+
+  // Verify result element type matches first result's element type.
+  for (auto result : op->results()) {
+    IR_ENFORCE(GetElementTypeOrSelf(result.type()) == elementType,
+               "Op %s with SameOperandsAndResultElementTypeTrait requires the "
+               "same element type for all operands and results.",
+               op->name());
+  }
+
+  // Verify operand's element type matches first result's element type.
+  for (auto operand : op->operands()) {
+    IR_ENFORCE(GetElementTypeOrSelf(operand.type()) == elementType,
+               "Op %s with SameOperandsAndResultElementTypeTrait requires the "
+               "same element type for all operands and results.",
+               op->name());
+  }
+}
+
+void VerifySameOperandsAndResultTypeTrait(Operation *op) {
+  VLOG(4) << "Verify SameOperandsAndResultTypeTrait for : " << op->name();
+
+  IR_ENFORCE(op->num_operands() > 0,
+             "Op %s with SameOperandsAndResultTypeTrait requires at least 1 "
+             "operand, but got %u operands.",
+             op->name(),
+             op->num_operands());
+
+  IR_ENFORCE(op->num_results() > 0,
+             "Op %s with SameOperandsAndResultTypeTrait requires at least 1 "
+             "result, but got %u results.",
+             op->name(),
+             op->num_results());
+
+  auto type = op->result(0).type();
+  auto elementType = GetElementTypeOrSelf(type);
+
+  for (auto result : op->results()) {
+    IR_ENFORCE(GetElementTypeOrSelf(result.type()) == elementType,
+               "Op %s with SameOperandsAndResultTypeTrait requires the same "
+               "type for all operands and results.",
+               op->name());
+
+    IR_ENFORCE(VerifyCompatibleShape(result.type(), type),
+               "Op %s with SameOperandsAndResultTypeTrait requires the same "
+               "type for all operands and results.",
+               op->name());
+  }
+
+  for (auto operand : op->operands()) {
+    IR_ENFORCE(GetElementTypeOrSelf(operand.type()) == elementType,
+               "Op %s with SameOperandsAndResultTypeTrait requires the same "
+               "type for all operands and results.",
+               op->name());
+
+    IR_ENFORCE(VerifyCompatibleShape(operand.type(), type),
+               "Op %s with SameOperandsAndResultTypeTrait requires the same "
+               "type for all operands and results.",
+               op->name());
+  }
+}
+
+void VerifySameTypeOperandsTrait(Operation *op) {
+  VLOG(4) << "Verify SameTypeOperandsTrait for : " << op->name();
+
+  // For zero or only one operand.
+  unsigned operand_nums = op->num_operands();
+  if (operand_nums < 2) return;
+
+  auto type = op->operand(0).type();
+
+  for (auto operand : op->operands()) {
+    IR_ENFORCE(operand.type() == type,
+               "Op %s with SameTypeOperandsTrait requires all operands to have "
+               "the same type.",
+               op->name());
+  }
+}
+
+}  // namespace pir::detail
+
+IR_DEFINE_EXPLICIT_TYPE_ID(pir::SameOperandsShapeTrait)
+IR_DEFINE_EXPLICIT_TYPE_ID(pir::SameOperandsAndResultShapeTrait)
+IR_DEFINE_EXPLICIT_TYPE_ID(pir::SameOperandsElementTypeTrait)
+IR_DEFINE_EXPLICIT_TYPE_ID(pir::SameOperandsAndResultElementTypeTrait)
+IR_DEFINE_EXPLICIT_TYPE_ID(pir::SameOperandsAndResultTypeTrait)
+IR_DEFINE_EXPLICIT_TYPE_ID(pir::SameTypeOperandsTrait)
diff --git a/paddle/pir/core/op_trait.h b/paddle/pir/core/op_trait.h
new file mode 100644
index 00000000000000..760799fd16165d
--- /dev/null
+++ b/paddle/pir/core/op_trait.h
@@ -0,0 +1,121 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pir/core/op_base.h" + +namespace pir { + +namespace detail { +void VerifySameOperandsShapeTrait(Operation *op); +void VerifySameOperandsAndResultShapeTrait(Operation *op); +void VerifySameOperandsElementTypeTrait(Operation *op); +void VerifySameOperandsAndResultElementTypeTrait(Operation *op); +void VerifySameOperandsAndResultTypeTrait(Operation *op); +void VerifySameTypeOperandsTrait(Operation *op); +} // namespace detail + +/// +/// \brief Provides verification for ops that are known to have the +/// same operand shape. +/// +class SameOperandsShapeTrait : public pir::OpTraitBase { + public: + explicit SameOperandsShapeTrait(pir::Operation *op) + : pir::OpTraitBase(op) {} + static void Verify(Operation *op) { + return detail::VerifySameOperandsShapeTrait(op); + } +}; + +/// +/// \brief Provides verification for ops that are known to have the +/// same operand and result shape. +/// +class SameOperandsAndResultShapeTrait + : public pir::OpTraitBase { + public: + explicit SameOperandsAndResultShapeTrait(pir::Operation *op) + : pir::OpTraitBase(op) {} + static void Verify(Operation *op) { + return detail::VerifySameOperandsAndResultShapeTrait(op); + } +}; + +/// +/// \brief Provides verification for ops that are known to have the +/// same operand element type (or the type itself if it is scalar). +/// +class SameOperandsElementTypeTrait + : public pir::OpTraitBase { + public: + explicit SameOperandsElementTypeTrait(pir::Operation *op) + : pir::OpTraitBase(op) {} + static void Verify(Operation *op) { + return detail::VerifySameOperandsElementTypeTrait(op); + } +}; + +/// +/// \brief Provides verification for ops that are known to have the +/// same operand and result element type (or the type itself if it is scalar). +/// +class SameOperandsAndResultElementTypeTrait + : public pir::OpTraitBase { + public: + explicit SameOperandsAndResultElementTypeTrait(pir::Operation *op) + : pir::OpTraitBase(op) {} + static void Verify(Operation *op) { + return detail::VerifySameOperandsAndResultElementTypeTrait(op); + } +}; + +/// +/// \brief Provides verification for ops that are known to have the +/// same operand and result type. It Subsumes both +/// SameOperandsAndResultShapeTrait and SameOperandsAndResultElementTypeTrait +/// +class SameOperandsAndResultTypeTrait + : public pir::OpTraitBase { + public: + explicit SameOperandsAndResultTypeTrait(pir::Operation *op) + : pir::OpTraitBase(op) {} + + static void Verify(Operation *op) { + return detail::VerifySameOperandsAndResultTypeTrait(op); + } +}; + +/// +/// \brief Provides verification that all operands of the specified op have the +/// same type. 
+/// +class SameTypeOperandsTrait : public pir::OpTraitBase { + public: + explicit SameTypeOperandsTrait(pir::Operation *op) + : pir::OpTraitBase(op) {} + static void Verify(Operation *op) { + return detail::VerifySameTypeOperandsTrait(op); + } +}; + +} // namespace pir + +IR_DECLARE_EXPLICIT_TYPE_ID(pir::SameOperandsShapeTrait) +IR_DECLARE_EXPLICIT_TYPE_ID(pir::SameOperandsAndResultShapeTrait) +IR_DECLARE_EXPLICIT_TYPE_ID(pir::SameOperandsElementTypeTrait) +IR_DECLARE_EXPLICIT_TYPE_ID(pir::SameOperandsAndResultElementTypeTrait) +IR_DECLARE_EXPLICIT_TYPE_ID(pir::SameOperandsAndResultTypeTrait) +IR_DECLARE_EXPLICIT_TYPE_ID(pir::SameTypeOperandsTrait) diff --git a/paddle/pir/core/operation.cc b/paddle/pir/core/operation.cc index 48f5ff85cd5cea..92e8cdfe8e2577 100644 --- a/paddle/pir/core/operation.cc +++ b/paddle/pir/core/operation.cc @@ -123,7 +123,12 @@ Operation *Operation::Create(const std::vector &inputs, // 0. Verify if (op_info) { - op_info.Verify(op); + try { + op_info.VerifySig(op); + } catch (const pir::IrNotMetException &e) { + op->Destroy(); + throw e; + } } return op; } @@ -283,6 +288,7 @@ void Operation::SetParent(Block *parent, const Block::Iterator &position) { } void Operation::MoveTo(Block *block, Block::Iterator position) { + IR_ENFORCE(parent_, "Operation does not have parent"); Operation *op = parent_->Take(this); block->insert(position, op); } diff --git a/paddle/pir/core/parser/ir_parser.cc b/paddle/pir/core/parser/ir_parser.cc index 008dcdea6c7b10..ef881771ff4cfa 100644 --- a/paddle/pir/core/parser/ir_parser.cc +++ b/paddle/pir/core/parser/ir_parser.cc @@ -77,13 +77,13 @@ Type IrParser::ParseType() { return builder->int16_type(); } else if (type_val == "i32") { ConsumeToken(); - return Int32Type::get(ctx); + return builder->int32_type(); } else if (type_val == "i64") { ConsumeToken(); return Int64Type::get(ctx); } else if (type_val == "index") { ConsumeToken(); - return IndexType::get(ctx); + return builder->index_type(); } else if (type_val == "c64") { ConsumeToken(); return builder->complex64_type(); @@ -95,12 +95,15 @@ Type IrParser::ParseType() { ConsumeAToken("["); std::vector vec_type; Token vec_type_token = PeekToken(); + if (vec_type_token.val_ == "]") { + ConsumeAToken("]"); + } while (vec_type_token.val_ != "]") { Type cur_type = ParseType(); vec_type.push_back(cur_type); vec_type_token = ConsumeToken(); } - return VectorType::get(ctx, vec_type); + return builder->vec_type(vec_type); } else { IR_ENFORCE(type_val.find('.') != std::string::npos, "No function parsing " + type_val + " exists!" 
+ @@ -138,12 +141,20 @@ Attribute IrParser::ParseAttribute() { ConsumeAToken("Float"); ConsumeAToken(")"); std::string val = ConsumeToken().val_; - return builder->float_attr(atof(val.c_str())); + if (val == "-") { + ConsumeAToken("inf"); + float neg_inf = -std::numeric_limits::infinity(); + return builder->float_attr(neg_inf); + } else if (val == "inf") { + float pos_inf = std::numeric_limits::infinity(); + return builder->float_attr(pos_inf); + } + return builder->float_attr(static_cast(atof(val.c_str()))); } else if (attribute_type == "Double") { ConsumeAToken("Double"); ConsumeAToken(")"); std::string val = ConsumeToken().val_; - return builder->double_attr(atof(val.c_str())); + return builder->double_attr(std::stod(val.c_str())); } else if (attribute_type == "Int32") { ConsumeAToken("Int32"); ConsumeAToken(")"); diff --git a/paddle/pir/core/parser/lexer.cc b/paddle/pir/core/parser/lexer.cc index 9bbfd7dbc804a7..8ab23e47576897 100644 --- a/paddle/pir/core/parser/lexer.cc +++ b/paddle/pir/core/parser/lexer.cc @@ -35,16 +35,23 @@ Token Lexer::ConsumeToken() { Token Lexer::PeekToken() { auto pos = is.tellg(); + size_t cache_line = line; + size_t cache_column = column; + auto token = ConsumeToken(); + if (is.eof()) { is.clear(); } is.seekg(pos); + line = cache_line; + column = cache_column; + return token; } char Lexer::GetChar() { - char c = is.get(); + char c = static_cast(is.get()); if (c == '\n') { line++; column = 1; @@ -59,13 +66,14 @@ size_t Lexer::GetColumn() { return column; } size_t Lexer::GetLine() { return line; } void Lexer::SkipWhitespace() { - while (IsSpace(is.peek())) { + while (IsSpace(static_cast(is.peek()))) { GetChar(); } } std::unique_ptr Lexer::LexIdentifer() { - if ((!isalpha(is.peek()) && is.peek() != '_') || IsEndTag(is.peek())) { + if ((!isalpha(is.peek()) && is.peek() != '_') || + IsEndTag(static_cast(is.peek()))) { return nullptr; } std::string token_identifier = ""; @@ -114,7 +122,7 @@ std::unique_ptr Lexer::LexNumberOrArraow() { } std::unique_ptr Lexer::LexEndTagOrNullVal() { - if (!IsEndTag(is.peek())) { + if (!IsEndTag(static_cast(is.peek()))) { return nullptr; } std::string token_end = ""; diff --git a/paddle/pir/core/type.cc b/paddle/pir/core/type.cc index fef0eb9c1a4437..91933019fb8359 100644 --- a/paddle/pir/core/type.cc +++ b/paddle/pir/core/type.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/pir/core/type.h" +#include "paddle/pir/core/builtin_type.h" #include "paddle/pir/core/dialect.h" #include "paddle/pir/core/type_base.h" @@ -24,4 +25,10 @@ TypeId Type::type_id() { return storage_->abstract_type().type_id(); } const AbstractType &Type::abstract_type() { return storage_->abstract_type(); } Dialect &Type::dialect() const { return storage_->abstract_type().dialect(); } + +bool Type::IsIntOrIndex() const { + return isa() || isa() || isa() || + isa() || isa() || isa(); +} + } // namespace pir diff --git a/paddle/pir/core/type.h b/paddle/pir/core/type.h index 0c2cb9d6bc7fa0..c1b2f155e8d5a4 100644 --- a/paddle/pir/core/type.h +++ b/paddle/pir/core/type.h @@ -39,7 +39,7 @@ class IR_API Type { using TypeBase = detail::StorageHelperBase; using Storage = TypeStorage; @@ -47,8 +47,7 @@ class IR_API Type { Type() = default; - Type(const Storage *storage) // NOLINT - : storage_(storage) {} + Type(const Storage *storage) : storage_(storage) {} // NOLINT Type(const Type &other) = default; @@ -74,8 +73,8 @@ class IR_API Type { /// \brief Support PointerLikeTypeTraits. 
/// operator const void *() const { return storage_; }
-  static Type RecoverFromOpaquePointer(const void *pointer) {
-    return Type(reinterpret_cast<Storage *>(const_cast<void *>(pointer)));
+  static Type RecoverFromVoidPointer(const void *pointer) {
+    return Type(reinterpret_cast<const Storage *>(pointer));
  }
///
@@ -116,6 +115,12 @@ class IR_API Type {
    return pir::cast(*this);
  }
+  ///
+  /// \brief Return true if this is an integer (any signedness) or an index
+  /// type.
+  ///
+  bool IsIntOrIndex() const;
+
 protected:
  const Storage *storage_{nullptr};
diff --git a/paddle/pir/core/type_id.h b/paddle/pir/core/type_id.h
index d2511be7fe9a6b..08bc7025e1df31 100644
--- a/paddle/pir/core/type_id.h
+++ b/paddle/pir/core/type_id.h
@@ -53,9 +53,8 @@ class TypeId {
  ///
  /// \brief Support PointerLikeTypeTraits.
  ///
-  operator const void *() const { return storage_; }
-  void *AsOpaquePointer() const { return storage_; }
-  static TypeId RecoverFromOpaquePointer(void *pointer) {
+  operator void *() const { return storage_; }
+  static TypeId RecoverFromVoidPointer(void *pointer) {
    return TypeId(static_cast<Storage *>(pointer));
  }
@@ -93,7 +92,7 @@ class alignas(8) UniqueingId {
  UniqueingId &operator=(UniqueingId &&) = delete;
  operator TypeId() { return id(); }
-  TypeId id() { return TypeId::RecoverFromOpaquePointer(this); }
+  TypeId id() { return TypeId::RecoverFromVoidPointer(this); }
}
template <typename T>
@@ -146,7 +145,7 @@ namespace std {
template <>
struct hash<pir::TypeId> {
  std::size_t operator()(const pir::TypeId &obj) const {
-    return std::hash<const void *>()(obj);
+    return std::hash<void *>()(obj);
  }
};
}  // namespace std
diff --git a/paddle/pir/core/type_util.cc b/paddle/pir/core/type_util.cc
new file mode 100644
index 00000000000000..0d6d137a897f0d
--- /dev/null
+++ b/paddle/pir/core/type_util.cc
@@ -0,0 +1,129 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pir/core/type_util.h"
+
+#include <algorithm>
+#include <numeric>
+
+namespace pir {
+
+Type GetElementTypeOrSelf(Type type) {
+  if (auto sType = type.dyn_cast<ShapedTypeInterface>())
+    return sType.GetElementType();
+  return type;
+}
+
+bool VerifyCompatibleShape(const phi::DDim &lhs_shape,
+                           const phi::DDim &rhs_shape) {
+  if (lhs_shape.size() != rhs_shape.size()) return false;
+
+  for (int i = 0; i < lhs_shape.size(); ++i) {
+    int64_t dim1 = lhs_shape[i];
+    int64_t dim2 = rhs_shape[i];
+    if (!ShapedTypeInterface::IsDynamic(dim1) &&
+        !ShapedTypeInterface::IsDynamic(dim2) && dim1 != dim2)
+      return false;
+  }
+  return true;
+}
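A few concrete cases for the dimension-wise rule above (illustrative; this assumes ShapedTypeInterface::IsDynamic treats -1 as the dynamic sentinel, as elsewhere in this patch).

// Pairwise comparison: a dynamic dimension is compatible with anything,
// two static dimensions must be equal, and ranks must match.
phi::DDim a = phi::make_ddim({2, -1, 8});
phi::DDim b = phi::make_ddim({2, 16, 8});
phi::DDim c = phi::make_ddim({4, 16, 8});
// VerifyCompatibleShape(a, b) -> true  (-1 is dynamic, so it matches 16)
// VerifyCompatibleShape(a, c) -> false (2 and 4 are both static and differ)
// VerifyCompatibleShape(b, phi::make_ddim({2, 16})) -> false (rank differs)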
+
+bool VerifyCompatibleShape(Type lhs_type, Type rhs_type) {
+  auto lhs_shaped_type = lhs_type.dyn_cast<ShapedTypeInterface>();
+  auto rhs_shaped_type = rhs_type.dyn_cast<ShapedTypeInterface>();
+
+  // Either both or neither type should be shaped.
+  if (!lhs_shaped_type) return !rhs_shaped_type;
+  if (!rhs_shaped_type) return false;
+
+  if (!lhs_shaped_type.HasRank() || !rhs_shaped_type.HasRank()) return true;
+
+  return VerifyCompatibleShape(lhs_shaped_type.GetShape(),
+                               rhs_shaped_type.GetShape());
+}
+
+bool VerifyCompatibleDims(const std::vector<int64_t> &dims) {
+  if (dims.empty()) return true;
+  auto static_dim = std::accumulate(
+      dims.begin(), dims.end(), dims.front(), [](auto &fold, auto &dim) {
+        return ShapedTypeInterface::IsDynamic(dim) ? fold : dim;
+      });
+  return std::all_of(dims.begin(), dims.end(), [&](auto dim) {
+    return ShapedTypeInterface::IsDynamic(dim) || dim == static_dim;
+  });
+}
+
+bool VerifyCompatibleShapes(const std::vector<Type> &lhs_types,
+                            const std::vector<Type> &rhs_types) {
+  if (lhs_types.size() != rhs_types.size()) return false;
+
+  for (size_t i = 0; i < lhs_types.size(); ++i) {
+    if (!VerifyCompatibleShape(lhs_types[i], rhs_types[i])) return false;
+  }
+  return true;
+}
+
+bool VerifyCompatibleShapes(const std::vector<Type> &types) {
+  std::vector<ShapedTypeInterface> shaped_type_interfaces;
+
+  std::for_each(
+      types.begin(), types.end(), [&shaped_type_interfaces](Type type) {
+        shaped_type_interfaces.push_back(type.dyn_cast<ShapedTypeInterface>());
+      });
+
+  // Return false if some, but not all are not shaped. Return early if none
+  // are shaped also.
+  if (std::none_of(shaped_type_interfaces.begin(),
+                   shaped_type_interfaces.end(),
+                   [](auto t) { return t; }))
+    return true;
+
+  if (!std::all_of(shaped_type_interfaces.begin(),
+                   shaped_type_interfaces.end(),
+                   [](auto t) { return t; }))
+    return false;
+
+  // Remove all unranked shapes
+  std::vector<ShapedTypeInterface> shapes;
+
+  std::for_each(shaped_type_interfaces.begin(),
+                shaped_type_interfaces.end(),
+                [&shapes](ShapedTypeInterface type) {
+                  if (type.HasRank())
+                    shapes.push_back(type.dyn_cast<ShapedTypeInterface>());
+                });
+  if (shapes.empty()) return true;
+
+  // All ranks should be equal
+  int64_t firstRank = shapes.front().GetRank();
+
+  if (std::any_of(shapes.begin(), shapes.end(), [&](auto shape) {
+        return firstRank != shape.GetRank();
+      }))
+    return false;
+
+  for (int64_t i = 0; i < firstRank; ++i) {
+    // For all ranked dimensions
+    std::vector<int64_t> dims;
+    std::for_each(shapes.begin(), shapes.end(), [&](ShapedTypeInterface shape) {
+      dims.push_back(shape.GetDimSize(i));
+    });
+
+    if (!VerifyCompatibleDims(dims)) return false;
+  }
+
+  return true;
+}
+
+}  // namespace pir
diff --git a/paddle/pir/core/type_util.h b/paddle/pir/core/type_util.h
new file mode 100644
index 00000000000000..5704ba2abea781
--- /dev/null
+++ b/paddle/pir/core/type_util.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+///
+/// \brief Utility Functions
+///
+
+#include "paddle/pir/core/builtin_type.h"
+#include "paddle/pir/core/builtin_type_interfaces.h"
+
+namespace pir {
+///
+/// \brief Return the element type or return the type itself.
+///
+Type GetElementTypeOrSelf(Type type);
+
+///
+/// \brief Returns true if the given two shapes are compatible.
That is, they +/// have the same size and each pair of the elements are equal or one of them is +/// dynamic. +/// +bool VerifyCompatibleShape(const phi::DDim& lhs_shape, + const phi::DDim& rhs_shape); + +/// +/// \brief Returns true if the given two types have compatible shape. That +/// is, they are both scalars (not shaped), or they are both shaped types and at +/// least one is unranked or they have compatible dimensions. Dimensions are +/// compatible if at least one is dynamic or both are equal. The element type +/// does not matter. +/// +bool VerifyCompatibleShape(Type lhs_type, Type rhs_type); + +/// +/// \brief Dimensions are compatible if all non-dynamic dims are equal. +/// +bool VerifyCompatibleDims(const std::vector& dims); + +/// +/// \brief Returns true if the given two arrays have the same number of elements +/// and each pair wise entries have compatible shape. +/// +bool VerifyCompatibleShapes(const std::vector& lhs_types, + const std::vector& rhs_types); + +/// +/// \brief Returns true if all given types have compatible shapes. That is, +/// they are all scalars (not shaped), or they are all shaped types and any +/// ranked shapes have compatible dimensions. Dimensions are compatible if all +/// non-dynamic dims are equal. The element type does not matter. +/// +bool VerifyCompatibleShapes(const std::vector& types); +} // namespace pir diff --git a/paddle/pir/core/value.cc b/paddle/pir/core/value.cc index a4bd4430507af2..13b0b4a5cfee88 100644 --- a/paddle/pir/core/value.cc +++ b/paddle/pir/core/value.cc @@ -40,16 +40,11 @@ bool Value::operator!=(const Value &other) const { bool Value::operator!() const { return impl_ == nullptr; } -bool Value::operator<(const Value &other) const { - return std::hash{}(*this) < std::hash{}(other); -} +bool Value::operator<(const Value &other) const { return impl_ < other.impl_; } Value::operator bool() const { return impl_; } -pir::Type Value::type() const { - CHECK_VALUE_NULL_IMPL(type); - return impl_->type(); -} +pir::Type Value::type() const { return impl_ ? impl_->type() : nullptr; } void Value::set_type(pir::Type type) { CHECK_VALUE_NULL_IMPL(set_type); @@ -66,8 +61,7 @@ Value::UseIterator Value::use_begin() const { return OpOperand(first_use()); } Value::UseIterator Value::use_end() const { return Value::UseIterator(); } OpOperand Value::first_use() const { - CHECK_VALUE_NULL_IMPL(first_use); - return impl_->first_use(); + return impl_ ? 
impl_->first_use() : nullptr; } bool Value::use_empty() const { return !first_use(); } diff --git a/paddle/pir/dialect/control_flow/ir/cf_dialect.cc b/paddle/pir/dialect/control_flow/ir/cf_dialect.cc index 7166af2ece6363..ed36c0c81cca6a 100644 --- a/paddle/pir/dialect/control_flow/ir/cf_dialect.cc +++ b/paddle/pir/dialect/control_flow/ir/cf_dialect.cc @@ -15,6 +15,6 @@ #include "paddle/pir/dialect/control_flow/ir/cf_ops.h" namespace pir { -void ControlFlowDialect::initialize() { RegisterOps(); } +void ControlFlowDialect::initialize() { RegisterOps(); } } // namespace pir IR_DEFINE_EXPLICIT_TYPE_ID(pir::ControlFlowDialect) diff --git a/paddle/pir/dialect/control_flow/ir/cf_ops.cc b/paddle/pir/dialect/control_flow/ir/cf_ops.cc index 69dce41e62badb..7981a6ab963965 100644 --- a/paddle/pir/dialect/control_flow/ir/cf_ops.cc +++ b/paddle/pir/dialect/control_flow/ir/cf_ops.cc @@ -24,4 +24,3 @@ void YieldOp::Build(Builder &builder, } // namespace pir IR_DEFINE_EXPLICIT_TYPE_ID(pir::YieldOp) -IR_DEFINE_EXPLICIT_TYPE_ID(pir::CondYieldOp) diff --git a/paddle/pir/dialect/control_flow/ir/cf_ops.h b/paddle/pir/dialect/control_flow/ir/cf_ops.h index 898f954e09d5f5..7d669c0b648ea0 100644 --- a/paddle/pir/dialect/control_flow/ir/cf_ops.h +++ b/paddle/pir/dialect/control_flow/ir/cf_ops.h @@ -28,33 +28,8 @@ class IR_API YieldOp : public Op { static void Build(Builder &builder, // NOLINT OperationArgument &argument, // NOLINT const std::vector &Value); - void Verify() {} + void VerifySig() {} }; - -class IR_API CondYieldOp : public Op { - public: - using Op::Op; - static const char *name() { return "cf.cond_yield"; } - static constexpr uint32_t attributes_num = 0; - static constexpr const char **attributes_name = nullptr; - - template - static void Build(Builder &builder, // NOLINT - OperationArgument &argument, // NOLINT - Value cond, - const ValueContainer &inputs); - void Verify() {} -}; - -template -void CondYieldOp::Build(Builder &builder, // NOLINT - OperationArgument &argument, // NOLINT - Value cond, - const ValueContainer &inputs) { - argument.AddInput(cond); - argument.AddInputs(inputs); -} } // namespace pir IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::YieldOp); -IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::CondYieldOp); diff --git a/paddle/pir/dialect/shape/ir/shape_dialect.h b/paddle/pir/dialect/shape/ir/shape_dialect.h index b4ae3aa6172108..b8fe39bd8d500f 100644 --- a/paddle/pir/dialect/shape/ir/shape_dialect.h +++ b/paddle/pir/dialect/shape/ir/shape_dialect.h @@ -24,12 +24,6 @@ namespace dialect { class IR_API ShapeDialect : public Dialect { public: explicit ShapeDialect(IrContext* context); - /// - /// \brief Each Dialect needs to provide a name function to return the name of - /// the Dialect. - /// - /// \return The name of this Dialect. 
- /// static const char* name() { return "shape"; } void PrintOperation(Operation* op, IrPrinter& printer) const override; // NOLINT diff --git a/paddle/pir/dialect/shape/ir/shape_op.cc b/paddle/pir/dialect/shape/ir/shape_op.cc index aa2e9c2e26e4ca..885f50d080143e 100644 --- a/paddle/pir/dialect/shape/ir/shape_op.cc +++ b/paddle/pir/dialect/shape/ir/shape_op.cc @@ -16,115 +16,122 @@ #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/core/builtin_op.h" #include "paddle/pir/core/builtin_type.h" +#include "paddle/pir/core/enforce.h" -namespace pir { -namespace dialect { +namespace pir::dialect { -const char *SymbolicDim::attributes_name[attributes_num] = {"knownNegativeOne", - "knownNonNegative", - "knownNonSizeOne", - "knownNonSizeZero", - "sym_name", - "value"}; // NOLINT +const char *SymbolicDim::attributes_name[attributes_num] = { + "known_negative_one", // value = -1 + "known_non_negative", // value >= 0 + "known_non_size_one", // value != 1 + "known_non_size_zero", // value != 0 + "sym_name", + "value"}; // NOLINT void SymbolicDim::Build(Builder &builder, OperationArgument &argument, const std::string &sym_name, int64_t value, - bool knownNonNegative, - bool knownNegativeOne, - bool knownNonSizeOne, - bool knownNonSizeZero) { - Attribute attr_sym_name = StrAttribute::get(IrContext::Instance(), sym_name); + bool known_non_negative, + bool known_negative_one, + bool known_non_size_one, + bool known_non_size_zero) { + IrContext *ctx = IrContext::Instance(); + auto attr_sym_name = StrAttribute::get(ctx, sym_name); + auto attr_value = Int64Attribute::get(ctx, value); + auto attr_known_none_negative = BoolAttribute::get(ctx, known_non_negative); + auto attr_known_negative_one = BoolAttribute::get(ctx, known_negative_one); + auto attr_known_non_size_one = BoolAttribute::get(ctx, known_non_size_one); + auto attr_known_non_size_zero = BoolAttribute::get(ctx, known_non_size_zero); + argument.AddAttribute("sym_name", attr_sym_name); - Attribute attr_value = Int64Attribute::get(IrContext::Instance(), value); argument.AddAttribute("value", attr_value); - Attribute attr_knownNonNegative = - BoolAttribute::get(IrContext::Instance(), knownNonNegative); - argument.AddAttribute("knownNonNegative", attr_knownNonNegative); - Attribute attr_knownNegativeOne = - BoolAttribute::get(IrContext::Instance(), knownNegativeOne); - argument.AddAttribute("knownNegativeOne", attr_knownNegativeOne); - Attribute attr_knownNonSizeOne = - BoolAttribute::get(IrContext::Instance(), knownNonSizeOne); - argument.AddAttribute("knownNonSizeOne", attr_knownNonSizeOne); - Attribute attr_knownNonSizeZero = - BoolAttribute::get(IrContext::Instance(), knownNonSizeZero); - argument.AddAttribute("knownNonSizeZero", attr_knownNonSizeZero); + argument.AddAttribute("known_non_negative", attr_known_none_negative); + argument.AddAttribute("known_negative_one", attr_known_negative_one); + argument.AddAttribute("known_non_size_one", attr_known_non_size_one); + argument.AddAttribute("known_non_size_zero", attr_known_non_size_zero); } -const std::string SymbolicDim::getSymName() { +const std::string SymbolicDim::GetSymName() { return attribute("sym_name").AsString(); } -int64_t SymbolicDim::getValue() { +int64_t SymbolicDim::GetDimSize() { return attribute("value").data(); } -bool SymbolicDim::getKnownNonNegative() { - return attribute("knownNonNegative").data(); +bool SymbolicDim::GetKnownNonNegative() { + return attribute("known_non_negative").data(); } -bool SymbolicDim::getKnownNegativeOne() { - return 
attribute("knownNegativeOne").data(); +bool SymbolicDim::GetKnownNegativeOne() { + return attribute("known_negative_one").data(); } -bool SymbolicDim::getKnownNonSizeOne() { - return attribute("knownNonSizeOne").data(); +bool SymbolicDim::GetKnownNonSizeOne() { + return attribute("known_non_size_one").data(); } -bool SymbolicDim::getKnownNonSizeZero() { - return attribute("knownNonSizeZero").data(); +bool SymbolicDim::GetKnownNonSizeZero() { + return attribute("known_non_size_zero").data(); } -void SymbolicDim::updateSymName(std::string attrValue) { +void SymbolicDim::SetSymName(const std::string &attr_value) { operation()->set_attribute( - "sym_name", StrAttribute::get(IrContext::Instance(), attrValue)); + "sym_name", StrAttribute::get(IrContext::Instance(), attr_value)); } -void SymbolicDim::updateValue(int64_t attrValue) { +void SymbolicDim::SetDimSize(int64_t attr_value) { operation()->set_attribute( - "value", Int64Attribute::get(IrContext::Instance(), attrValue)); + "value", Int64Attribute::get(IrContext::Instance(), attr_value)); } -void SymbolicDim::updateKnownNonNegative(bool attrValue) { - operation()->set_attribute( - "knownNonNegative", BoolAttribute::get(IrContext::Instance(), attrValue)); +void SymbolicDim::UpdateKnownNonNegative(bool flag) { + operation()->set_attribute("known_non_negative", + BoolAttribute::get(IrContext::Instance(), flag)); } -void SymbolicDim::updateKnownNegativeOne(bool attrValue) { - operation()->set_attribute( - "knownNegativeOne", BoolAttribute::get(IrContext::Instance(), attrValue)); +void SymbolicDim::UpdateKnownNegativeOne(bool flag) { + operation()->set_attribute("known_negative_one", + BoolAttribute::get(IrContext::Instance(), flag)); } -void SymbolicDim::updateKnownNonSizeOne(bool attrValue) { - operation()->set_attribute( - "knownNonSizeOne", BoolAttribute::get(IrContext::Instance(), attrValue)); +void SymbolicDim::UpdateKnownNonSizeOne(bool flag) { + operation()->set_attribute("known_non_size_one", + BoolAttribute::get(IrContext::Instance(), flag)); } -void SymbolicDim::updateKnownNonSizeZero(bool attrValue) { - operation()->set_attribute( - "knownNonSizeZero", BoolAttribute::get(IrContext::Instance(), attrValue)); +void SymbolicDim::UpdateKnownNonSizeZero(bool flag) { + operation()->set_attribute("known_non_size_zero", + BoolAttribute::get(IrContext::Instance(), flag)); } bool SymbolicDim::IsDynamic() { - return getValue() == ShapedTypeInterface::kDynamic; + return GetDimSize() == ShapedTypeInterface::kDynamic; } bool SymbolicDim::Merge(SymbolicDim other) { - if (!IsDynamic() && !other.IsDynamic() && getValue() != other.getValue()) + VLOG(4) << "Try to merge two SymbolicDim ops."; + + if (!IsDynamic() && !other.IsDynamic() && GetDimSize() != other.GetDimSize()) return false; - if (IsDynamic() && !other.IsDynamic()) updateValue(other.getValue()); - if (!IsDynamic() && other.IsDynamic()) other.updateValue(getValue()); - - bool knownNonNegativeFlag = - getKnownNonNegative() || other.getKnownNonNegative(); - bool knownNegativeOneFlag = - getKnownNegativeOne() || other.getKnownNegativeOne(); - bool knownNonSizeOneFlag = getKnownNonSizeOne() || - other.getKnownNonSizeOne() || knownNegativeOneFlag; - bool knownNonSizeZeroFlag = getKnownNonSizeZero() || - other.getKnownNonSizeZero() || - knownNegativeOneFlag; - - if (knownNonNegativeFlag && knownNegativeOneFlag) return false; - - updateKnownNonSizeZero(knownNonSizeZeroFlag); - updateKnownNonSizeOne(knownNonSizeOneFlag); - updateKnownNegativeOne(knownNegativeOneFlag); - 
updateKnownNonNegative(knownNonNegativeFlag);
+  if (IsDynamic() && !other.IsDynamic()) SetDimSize(other.GetDimSize());
+  if (!IsDynamic() && other.IsDynamic()) other.SetDimSize(GetDimSize());
+
+  // either value >= 0
+  bool known_non_negative_flag =
+      GetKnownNonNegative() || other.GetKnownNonNegative();
+
+  // either value == -1
+  bool known_negative_one_flag =
+      GetKnownNegativeOne() || other.GetKnownNegativeOne();
+
+  if (known_non_negative_flag && known_negative_one_flag) return false;
+
+  bool known_non_size_one_flag = GetKnownNonSizeOne() ||
+                                 other.GetKnownNonSizeOne() ||
+                                 known_negative_one_flag;
+
+  bool known_non_size_zero_flag = GetKnownNonSizeZero() ||
+                                  other.GetKnownNonSizeZero() ||
+                                  known_negative_one_flag;
+
+  UpdateKnownNonSizeZero(known_non_size_zero_flag);
+  UpdateKnownNonSizeOne(known_non_size_one_flag);
+  UpdateKnownNegativeOne(known_negative_one_flag);
+  UpdateKnownNonNegative(known_non_negative_flag);
  return true;
}
@@ -196,7 +203,7 @@ std::vector<Value> TieProductEqualOp::rhs() {
}
const char *TieShapeOp::attributes_name[attributes_num] = {
-    SymbolicDim::getSymbolicDimAttrName().c_str()};  // NOLINT
+    SymbolicDim::GetSymbolicDimAttrName().c_str()};  // NOLINT
void TieShapeOp::Build(Builder &builder,
                       OperationArgument &argument,
@@ -266,8 +273,7 @@ void TensorDimOp::Build(Builder &builder,
Value TensorDimOp::source() { return operand_source(0); }
Value TensorDimOp::index() { return operand_source(1); }
-}  // namespace dialect
-}  // namespace pir
+}  // namespace pir::dialect
IR_DEFINE_EXPLICIT_TYPE_ID(pir::dialect::SymbolicDim)
IR_DEFINE_EXPLICIT_TYPE_ID(pir::dialect::DimOp)
diff --git a/paddle/pir/dialect/shape/ir/shape_op.h b/paddle/pir/dialect/shape/ir/shape_op.h
index 3163d404a61ee4..c838624d2566df 100644
--- a/paddle/pir/dialect/shape/ir/shape_op.h
+++ b/paddle/pir/dialect/shape/ir/shape_op.h
@@ -19,13 +19,12 @@
#include "paddle/pir/core/ir_printer.h"
#include "paddle/pir/core/op_base.h"
-namespace pir {
-namespace dialect {
+namespace pir::dialect {
class IR_API SymbolicDim : public Op {
 public:
  using Op::Op;
-  static const char *name() { return "shape.SymbolicDim"; }
+  static const char *name() { return "shape.symbolic_dim"; }
  static constexpr uint32_t attributes_num = 6;
  static const char *attributes_name[attributes_num];
@@ -34,32 +33,45 @@ class IR_API SymbolicDim : public Op {
                    OperationArgument &argument,  // NOLINT
                    const std::string &sym_name,
                    int64_t value = ShapedTypeInterface::kDynamic,
-                   bool knownNonNegative = false,
-                   bool knownNegativeOne = false,
-                   bool knownNonSizeOne = false,
-                   bool knownNonSizeZero = false);
-  const std::string getSymName();
-  int64_t getValue();
-  bool getKnownNonNegative();
-  bool getKnownNegativeOne();
-  bool getKnownNonSizeOne();
-  bool getKnownNonSizeZero();
-
-  void updateSymName(std::string attrValue);
-  void updateValue(int64_t attrValue);
-  void updateKnownNonNegative(bool attrValue);
-  void updateKnownNegativeOne(bool attrValue);
-  void updateKnownNonSizeOne(bool attrValue);
-  void updateKnownNonSizeZero(bool attrValue);
+                   bool known_non_negative = false,
+                   bool known_negative_one = false,
+                   bool known_non_size_one = false,
+                   bool known_non_size_zero = false);
+  const std::string GetSymName();
+  int64_t GetDimSize();
+
+  bool GetKnownNonNegative();
+  bool GetKnownNegativeOne();
+  bool GetKnownNonSizeOne();
+  bool GetKnownNonSizeZero();
+
+  void SetSymName(const std::string &attr_value);
+  void SetDimSize(int64_t attr_value);
+
+  // Sets `known_non_negative` to the value of `flag`
+  void UpdateKnownNonNegative(bool flag);
+
+  // Sets `known_negative_one` to the value of `flag`
+  void UpdateKnownNegativeOne(bool flag);
+
+  // Sets `known_non_size_one` to the value of `flag`
+  void UpdateKnownNonSizeOne(bool flag);
+
+  // Sets `known_non_size_zero` to the value of `flag`
+  void UpdateKnownNonSizeZero(bool flag);
+
+  // Returns true if this SymbolicDim is not known at compile-time.
  bool IsDynamic();
+
+  // Try to merge two SymbolicDim ops.
  bool Merge(SymbolicDim other);
-  static const std::string getSymbolicDimAttrName() {
+  static const std::string GetSymbolicDimAttrName() {
    return "kSymbolicDimAttr";
  }
-  void Verify() {}
+  void VerifySig() {}
};
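To make the Merge flag algebra above concrete, a hedged re-derivation of its failure conditions as a standalone predicate; `CanMerge` is illustrative and not part of this patch.

// Illustrative only: Merge fails when two static sizes disagree, or when
// the OR-ed flags would claim both value >= 0 and value == -1.
bool CanMerge(pir::dialect::SymbolicDim lhs, pir::dialect::SymbolicDim rhs) {
  if (!lhs.IsDynamic() && !rhs.IsDynamic() &&
      lhs.GetDimSize() != rhs.GetDimSize())
    return false;
  bool non_negative = lhs.GetKnownNonNegative() || rhs.GetKnownNonNegative();
  bool negative_one = lhs.GetKnownNegativeOne() || rhs.GetKnownNegativeOne();
  return !(non_negative && negative_one);
}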
Sets `known_negative_one` to the value of `flag` + void UpdateKnownNegativeOne(bool flag); + + // Sets `known_non_size_one` to the value of `flag` + void UpdateKnownNonSizeOne(bool flag); + + // Sets `known_non_size_zero` to the value of `flag` + void UpdateKnownNonSizeZero(bool flag); + + // Returns true if this SymbolicDim is not known at compile-time. bool IsDynamic(); + + // Try to merge two SymbolicDim ops. bool Merge(SymbolicDim other); - static const std::string getSymbolicDimAttrName() { + static const std::string GetSymbolicDimAttrName() { return "kSymbolicDimAttr"; } - void Verify() {} + void VerifySig() {} }; class IR_API DimOp : public Op { @@ -77,7 +89,7 @@ class IR_API DimOp : public Op { const std::string getName(); void setName(std::string attrValue); OpResult out() { return result(0); } - void Verify() {} + void VerifySig() {} }; class IR_API TieProductEqualOp : public Op { @@ -99,7 +111,7 @@ class IR_API TieProductEqualOp : public Op { const std::vector &rhs); std::vector lhs(); std::vector rhs(); - void Verify() {} + void VerifySig() {} }; class IR_API TieShapeOp : public Op { @@ -120,7 +132,7 @@ class IR_API TieShapeOp : public Op { const std::vector &dims); Value value(); std::vector dims(); - void Verify() {} + void VerifySig() {} }; class IR_API FuncOp : public Op { @@ -135,7 +147,7 @@ class IR_API FuncOp : public Op { OperationArgument &argument); // NOLINT void Print(IrPrinter &printer); // NOLINT Block *block(); - void Verify() {} + void VerifySig() {} }; class IR_API TensorDimOp : public Op { @@ -157,11 +169,10 @@ class IR_API TensorDimOp : public Op { Value index(); Value source(); OpResult out() { return result(0); } - void Verify() {} + void VerifySig() {} }; -} // namespace dialect -} // namespace pir +} // namespace pir::dialect IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::dialect::SymbolicDim); IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::dialect::DimOp); diff --git a/paddle/pir/dialect/shape/transforms/shape_optimization_pass.h b/paddle/pir/dialect/shape/transforms/passes.h similarity index 95% rename from paddle/pir/dialect/shape/transforms/shape_optimization_pass.h rename to paddle/pir/dialect/shape/transforms/passes.h index 43bad532c920d5..9433ef9b570bd6 100644 --- a/paddle/pir/dialect/shape/transforms/shape_optimization_pass.h +++ b/paddle/pir/dialect/shape/transforms/passes.h @@ -21,6 +21,7 @@ namespace pir { class Pass; +// Apply some shape-related optimization. IR_API std::unique_ptr CreateShapeOptimizationPass(); } // namespace pir diff --git a/paddle/pir/dialect/shape/transforms/shape_optimization.cc b/paddle/pir/dialect/shape/transforms/shape_optimization.cc new file mode 100644 index 00000000000000..54f43c74cb4154 --- /dev/null +++ b/paddle/pir/dialect/shape/transforms/shape_optimization.cc @@ -0,0 +1,331 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
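+ 
+// Illustrative usage sketch (assuming the usual pir::PassManager API; the 
+// pass itself is registered as "shape_optimization_pass" at the bottom of 
+// this file): 
+// 
+//   pir::PassManager pm(pir::IrContext::Instance()); 
+//   pm.AddPass(pir::CreateShapeOptimizationPass()); 
+//   pm.Run(&program);  // `program` is a pir::Program holding the ModuleOp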
+ 
+#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" 
+#include "paddle/pir/dialect/shape/ir/shape_op.h" 
+ 
+#include "paddle/pir/core/builtin_op.h" 
+#include "paddle/pir/core/program.h" 
+#include "paddle/pir/dialect/shape/utils/shape_utils.h" 
+#include "paddle/pir/pass/pass.h" 
+#include "paddle/pir/pass/pass_manager.h" 
+#include "paddle/pir/pass/pass_registry.h" 
+ 
+namespace pir { 
+namespace { 
+using PassPipelineRunner = 
+    std::function; 
+ 
+bool InsertTieShapeOnValue(pir::Value value, 
+                           pir::Builder& builder) {  // NOLINT 
+  auto ty = value.type().dyn_cast(); 
+ 
+  if (!ty || ty.dims().size() == 0) return true; 
+  std::vector dimSizes; 
+  for (int64_t dim = 0, rank = ty.dims().size(); dim < rank; ++dim) { 
+    auto dimOp = builder.Build(value, dim); 
+    dimSizes.push_back(dimOp.out()); 
+  } 
+  builder.Build(value, dimSizes); 
+  return true; 
+} 
+ 
+bool InsertTieShapeOnRegion(pir::Region* region); 
+ 
+bool InsertTieShapeOnOperation(pir::Operation* op, 
+                               pir::Builder& builder) {  // NOLINT 
+  // TODO(zhangbo63): skip more specialized Ops. 
+  if (op->isa() || op->isa()) 
+    return true; 
+ 
+  for (size_t i = 0; i < op->num_regions(); ++i) { 
+    if (!InsertTieShapeOnRegion(&(op->region(i)))) return false; 
+  } 
+  builder.SetInsertionPointAfter(op); 
+  for (pir::OpResult v : op->results()) { 
+    if (!InsertTieShapeOnValue(v, builder)) return false; 
+  } 
+ 
+  return true; 
+} 
+ 
+bool InsertTieShapeOnBlock(pir::Block* block) { 
+  pir::Builder builder = 
+      pir::Builder(pir::IrContext::Instance(), block, block->begin()); 
+  // TODO(liujinnan): mapping block arguments 
+ 
+  std::vector op_list; 
+  for (pir::Operation* op : *block) op_list.push_back(op); 
+  for (pir::Operation* op : op_list) { 
+    if (!InsertTieShapeOnOperation(op, builder)) return false; 
+  } 
+  return true; 
+} 
+ 
+bool InsertTieShapeOnRegion(pir::Region* region) { 
+  for (pir::Block* block : *region) { 
+    if (!InsertTieShapeOnBlock(block)) return false; 
+  } 
+  return true; 
+} 
+ 
+bool MaterializeShapeComputation(pir::ModuleOp m) { 
+  if (!InsertTieShapeOnRegion(&(m->region(0)))) return false; 
+  // TODO(liujinnan): add rewriter pattern for reifyInferShape. 
+  return true; 
+} 
+ 
+// Returns true if the type is possible to be a shape tensor type. 
+// Shape tensor type : 
+//    - rank-1 static-shaped tensor type 
+//    - element type of the tensor is int or index 
+//    - number of elements of the tensor < 32, supposing that the 
+//      highest possible rank is smaller than 32. 
+bool IsCandidateShapeTensorType(Type type) { 
+  auto tensor_type = type.dyn_cast(); 
+  auto shaped_type = tensor_type.dyn_cast(); 
+ 
+  return (tensor_type && shaped_type && shaped_type.GetRank() == 1 && 
+          shaped_type.HasStaticShape() && 
+          shaped_type.GetElementType().IsIntOrIndex() && 
+          shaped_type.GetShape()[0] < 32); 
+} 
+ 
+class ShapeComputationIRAnalysis { 
+ public: 
+  using func = std::function; 
+  explicit ShapeComputationIRAnalysis(ModuleOp m, 
+                                      SymbolicDimMgr& mgr);  // NOLINT 
+  bool Run(); 
+ 
+ private: 
+  bool RunOnRegion(Region* region, func fn); 
+  bool RunOnBlock(Block* block, func fn); 
+  bool RunOnOperation(Operation* op, func fn); 
+ 
+  bool BuildShapeOnOperation(Operation* op); 
+  bool BuildShapeOnValue(Value value); 
+ 
+  bool ApplyOpConstraint(Operation* op); 
+  bool ApplyIndexOpConstraint(Operation* op); 
+  bool ApplyTieShapeOpConstraint(Operation* op); 
+ 
+  bool initialized_ = false; 
+  ModuleOp m_; 
+  SymbolicDimMgr& mgr_; 
+ 
+  std::unordered_map value_to_sym_dim_; 
+ 
+  // shape tensor is the 1D ranked tensor with int/index dtype. 
+  std::unordered_map> shape_tensor_to_sym_dims_; 
+ 
+  std::unordered_map> dense_tensor_to_sym_dims_; 
+}; 
+
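+// The analysis runs in two phases (see Run() below): BuildShapeOnOperation() 
+// first materializes SymbolicDim symbols for values, then ApplyOpConstraint() 
+// records equality constraints between those symbols.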
+ShapeComputationIRAnalysis::ShapeComputationIRAnalysis(ModuleOp m, 
+                                                       SymbolicDimMgr& mgr) 
+    : m_(m), mgr_(mgr) {} 
+ 
+bool ShapeComputationIRAnalysis::Run() { 
+  // Make sure the analysis runs only once. 
+  if (initialized_) return false; 
+  initialized_ = true; 
+  auto build_shape_func = 
+      std::bind(&ShapeComputationIRAnalysis::BuildShapeOnOperation, 
+                this, 
+                std::placeholders::_1); 
+  if (!RunOnRegion(&(m_->region(0)), build_shape_func)) return false; 
+  auto apply_op_constraint_func = 
+      std::bind(&ShapeComputationIRAnalysis::ApplyOpConstraint, 
+                this, 
+                std::placeholders::_1); 
+  if (!RunOnRegion(&(m_->region(0)), apply_op_constraint_func)) return false; 
+  return true; 
+} 
+ 
+bool ShapeComputationIRAnalysis::RunOnRegion(Region* region, func fn) { 
+  for (Block* block : *region) { 
+    if (!RunOnBlock(block, fn)) return false; 
+  } 
+  return true; 
+} 
+ 
+bool ShapeComputationIRAnalysis::RunOnBlock(Block* block, func fn) { 
+  // TODO(liujinnan): mapping block arguments 
+ 
+  std::vector op_list; 
+  for (Operation* op : *block) op_list.push_back(op); 
+  for (Operation* op : op_list) { 
+    if (!RunOnOperation(op, fn)) return false; 
+  } 
+  return true; 
+} 
+ 
+bool ShapeComputationIRAnalysis::RunOnOperation(Operation* op, func fn) { 
+  for (size_t i = 0; i < op->num_regions(); ++i) { 
+    if (!RunOnRegion(&(op->region(i)), fn)) return false; 
+  } 
+  return fn(op); 
+} 
+ 
+bool ShapeComputationIRAnalysis::BuildShapeOnOperation(Operation* op) { 
+  if (op->isa()) return true; 
+  if (op->isa()) { 
+    Value value = op->operand_source(0); 
+    std::vector symbols; 
+    if (op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) { 
+      auto attrs = 
+          op->attribute(SymbolicDim::GetSymbolicDimAttrName()) 
+              .AsVector(); 
+      for (Attribute attr : attrs) { 
+        auto sym = mgr_.symbolTable().Lookup( 
+            attr.dyn_cast().AsString()); 
+        assert(sym); 
+        SymbolicDim root = mgr_.GetRootSymbolicDim(sym); 
+        symbols.push_back(root); 
+      } 
+    } else { 
+      symbols = mgr_.CreateSymbolicDimsForRankedValue(value); 
+      std::vector attrs; 
+      for (SymbolicDim sym : symbols) { 
+        Attribute rootSymbol = 
+            StrAttribute::get(m_->ir_context(), sym.GetSymName()); 
+        attrs.push_back(rootSymbol); 
+      } 
+      op->set_attribute(SymbolicDim::GetSymbolicDimAttrName(), 
+                        ArrayAttribute::get(m_->ir_context(), attrs)); 
+    } 
+    dense_tensor_to_sym_dims_[value] = std::move(symbols); 
+    return true; 
+  } 
+  for (size_t i = 0; i < op->num_results(); ++i) { 
+    if (!BuildShapeOnValue(op->result(i))) return false; 
+  } 
+  return true; 
+} 
+ 
+bool ShapeComputationIRAnalysis::BuildShapeOnValue(Value value) { 
+  Type type = value.type(); 
+  if (type.IsIntOrIndex()) { 
+    SymbolicDim sym = mgr_.NewSymbolicDim(); 
+    value_to_sym_dim_[value] = sym; 
+  } else if (IsCandidateShapeTensorType(type)) { 
+    auto shaped_type = type.dyn_cast(); 
+    std::vector symbols; 
+    for (size_t i = 0, d = shaped_type.GetShape()[0]; i < d; ++i) 
+      symbols.push_back(mgr_.NewSymbolicDim()); 
+    shape_tensor_to_sym_dims_[value] = std::move(symbols); 
+  } 
+  return true; 
+} 
+ 
+bool ShapeComputationIRAnalysis::ApplyOpConstraint(Operation* op) { 
+  IR_ENFORCE(ApplyIndexOpConstraint(op), 
+             "Failed to apply constraint for index op"); 
+  IR_ENFORCE(ApplyTieShapeOpConstraint(op), 
+             "Failed to apply constraint for tie_shape op"); 
+ 
+  // TODO(zhangbo63): add more constraints 
+  return true; 
+} 
+ 
+bool ShapeComputationIRAnalysis::ApplyIndexOpConstraint(Operation* op) { 
+ 
if (op->num_results() == 0) return true; + + Type type = op->result(0).type(); + if (!type.IsIntOrIndex()) return true; + + if (auto dim_op = op->dyn_cast()) { + int64_t dim_index = dim_op.index() + .dyn_cast() + .owner() + ->attribute("value") + .data(); + value_to_sym_dim_[dim_op.out()].UpdateKnownNonNegative(true); + if (!mgr_.MapSymbolicDimEqual( + value_to_sym_dim_[dim_op.out()], + dense_tensor_to_sym_dims_[dim_op.source()][dim_index])) { + return false; + } + + } else if (auto const_op = op->dyn_cast()) { + int64_t val = const_op.value().dyn_cast().data(); + if (!mgr_.MapSymbolicDimEqual(value_to_sym_dim_[op->result(0)], + mgr_.NewConstantSymbolicDim(val))) { + return false; + } + } + // TODO(zhangbo63): add support for reifyInferShape. (e.g. mul/add) + return true; +} + +bool ShapeComputationIRAnalysis::ApplyTieShapeOpConstraint(Operation* op) { + if (auto tie_shape = op->dyn_cast()) { + auto& value = dense_tensor_to_sym_dims_[op->operand_source(0)]; + for (size_t idx = 0; idx < tie_shape.dims().size(); ++idx) { + if (!mgr_.MapSymbolicDimEqual(value_to_sym_dim_[tie_shape.dims()[idx]], + value[idx])) + return false; + mgr_.GetRootSymbolicDim(value[idx]).UpdateKnownNonNegative(true); + } + } + return true; +} + +bool OptimizeShapeComputation(pir::ModuleOp m, PassPipelineRunner runner) { + // TODO(liujinnan): Do some Canonicalizer. + pir::SymbolicDimMgr mgr(m); + IR_ENFORCE(mgr.Load(), + "SymbolicDimMgr Load failed in OptimizeShapeComputation."); + ShapeComputationIRAnalysis analysis(m, mgr); + if (!analysis.Run()) { + return false; + } + IR_ENFORCE(mgr.Save(), + "SymbolicDimMgr save failed in OptimizeShapeComputation."); + return true; +} + +class ShapeOptimizationPass : public pir::Pass { + public: + ShapeOptimizationPass() : pir::Pass("shape_optimization_pass", 0) {} + + void Run(pir::Operation* op) override { + auto module_op = op->dyn_cast(); + IR_ENFORCE(module_op, "ShapeOptimizationPass should run on module op."); + MaterializeShapeComputation(module_op); + // runner is for Canonicalizer. + PassPipelineRunner runner = [this](pir::PassManager& pm, pir::ModuleOp m) { + return pm.Run(m.program()); + }; + if (!OptimizeShapeComputation(module_op, runner)) { + return; + } + } + + bool CanApplyOn(pir::Operation* op) const override { + return op->isa() && op->num_regions() > 0; + } +}; + +} // namespace + +std::unique_ptr CreateShapeOptimizationPass() { + return std::make_unique(); +} + +} // namespace pir + +REGISTER_IR_PASS(shape_optimization_pass, pir::ShapeOptimizationPass); diff --git a/paddle/pir/dialect/shape/transforms/shape_optimization_pass.cc b/paddle/pir/dialect/shape/transforms/shape_optimization_pass.cc deleted file mode 100644 index 6bbb918ebc1f1f..00000000000000 --- a/paddle/pir/dialect/shape/transforms/shape_optimization_pass.cc +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/pir/dialect/shape/transforms/shape_optimization_pass.h" -#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" -#include "paddle/pir/dialect/shape/ir/shape_op.h" - -#include "paddle/pir/core/builtin_op.h" -#include "paddle/pir/core/program.h" -#include "paddle/pir/dialect/shape/utils/shape_utils.h" -#include "paddle/pir/pass/pass.h" -#include "paddle/pir/pass/pass_manager.h" -#include "paddle/pir/pass/pass_registry.h" - -namespace { -using PassPipelineRunner = - std::function; - -bool InsertTieShapeOnValue(pir::Value value, - pir::Builder& builder) { // NOLINT - auto ty = value.type().dyn_cast(); - - if (!ty || ty.dims().size() == 0) return true; - std::vector dimSizes; - for (int64_t dim = 0, rank = ty.dims().size(); dim < rank; ++dim) { - auto dimOp = builder.Build(value, dim); - dimSizes.push_back(dimOp.out()); - } - builder.Build(value, dimSizes); - return true; -} - -bool InsertTieShapeOnRegion(pir::Region* region); - -bool InsertTieShapeOnOperation(pir::Operation* op, - pir::Builder& builder) { // NOLINT - // TODO(zhangbo63): skip more specialized Ops. - if (op->isa() || op->isa()) - return true; - - for (size_t i = 0; i < op->num_regions(); ++i) { - if (!InsertTieShapeOnRegion(&(op->region(i)))) return false; - } - builder.SetInsertionPointAfter(op); - for (pir::OpResult v : op->results()) { - if (!InsertTieShapeOnValue(v, builder)) return false; - } - - return true; -} - -bool InsertTieShapeOnBlock(pir::Block* block) { - pir::Builder builder = - pir::Builder(pir::IrContext::Instance(), block, block->begin()); - // TODO(liujinnan): mapping block arguments - - std::vector op_list; - for (pir::Operation* op : *block) op_list.push_back(op); - for (pir::Operation* op : op_list) { - if (!InsertTieShapeOnOperation(op, builder)) return false; - } - return true; -} - -bool InsertTieShapeOnRegion(pir::Region* region) { - for (pir::Block* block : *region) { - if (!InsertTieShapeOnBlock(block)) return false; - } - return true; -} - -bool MaterializeShapeComputation(pir::ModuleOp m) { - if (!InsertTieShapeOnRegion(&(m->region(0)))) return false; - // TODO(liujinnan): add rewitter pattern for reifyInferShape. - return true; -} - -bool OptimizeShapeComputation(pir::ModuleOp m, PassPipelineRunner runner) { - // TODO(liujinnan): Do some Canonicalizer. - pir::SymbolicDimMgr mgr(m); - IR_ENFORCE(mgr.Load(), - "SymbolicDimMgr Load failed in OptimizeShapeComputation."); - pir::ShapeComputationIRAnalysis analysis(m, mgr); - if (!analysis.Run()) { - return false; - } - IR_ENFORCE(mgr.Save(), - "SymbolicDimMgr save failed in OptimizeShapeComputation."); - return true; -} - -class ShapeOptimizationPass : public pir::Pass { - public: - ShapeOptimizationPass() : pir::Pass("shape_optimization", 0) {} - - void Run(pir::Operation* op) override { - auto module_op = op->dyn_cast(); - IR_ENFORCE(module_op, "ShapeOptimizationPass should run on module op."); - MaterializeShapeComputation(module_op); - // runner is for Canonicalizer. 
- PassPipelineRunner runner = [this](pir::PassManager& pm, pir::ModuleOp m) { 
-      return pm.Run(m.program()); 
-    }; 
-    if (!OptimizeShapeComputation(module_op, runner)) { 
-      return; 
-    } 
-  } 
- 
-  bool CanApplyOn(pir::Operation* op) const override { 
-    return op->isa() && op->num_regions() > 0; 
-  } 
-}; 
- 
-}  // namespace 
- 
-namespace pir { 
- 
-std::unique_ptr CreateShapeOptimizationPass() { 
-  return std::make_unique(); 
-} 
- 
-}  // namespace pir 
- 
-REGISTER_IR_PASS(shape_optimization, ShapeOptimizationPass); 
diff --git a/paddle/pir/dialect/shape/utils/shape_optimization_utils.cc b/paddle/pir/dialect/shape/utils/shape_optimization_utils.cc new file mode 100644 index 00000000000000..07f7cf4129a4d9 --- /dev/null +++ b/paddle/pir/dialect/shape/utils/shape_optimization_utils.cc @@ -0,0 +1,621 @@ 
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+// 
+// Licensed under the Apache License, Version 2.0 (the "License"); 
+// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at 
+// 
+//     http://www.apache.org/licenses/LICENSE-2.0 
+// 
+// Unless required by applicable law or agreed to in writing, software 
+// distributed under the License is distributed on an "AS IS" BASIS, 
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and 
+// limitations under the License. 
+ 
+#include "paddle/pir/dialect/shape/utils/shape_optimization_utils.h" 
+#include "paddle/pir/core/builtin_type.h" 
+#include "paddle/pir/dialect/shape/utils/symbol_table.h" 
+ 
+namespace pir { 
+ 
+bool CompareSymbolicDimNames(const std::string& lhs, const std::string& rhs) { 
+  // S -> Symbol : unknown dimension size at compile time 
+  // C -> Constant : constant dimension size at compile time 
+  if (lhs.size() < 1 || (lhs[0] != 'S' && lhs[0] != 'C')) return lhs < rhs; 
+  if (rhs.size() < 1 || (rhs[0] != 'S' && rhs[0] != 'C')) return lhs < rhs; 
+  int64_t lhs_idx = 0, rhs_idx = 0; 
+  try { 
+    lhs_idx = stol(lhs.substr(1)); 
+    rhs_idx = stol(rhs.substr(1)); 
+  } catch (const std::exception& e) { 
+    IR_THROW("Invalid symbolic name"); 
+  } 
+  return (lhs[0] < rhs[0]) || (lhs[0] == rhs[0] && lhs_idx < rhs_idx); 
+} 
+ 
+// Gives a consistent order for a list of SymbolicDimProducts. 
+bool CompareSymbolicDimProduct(SymbolicDimProduct& lhs,    // NOLINT 
+                               SymbolicDimProduct& rhs) {  // NOLINT 
+  if (lhs.symbols.size() < rhs.symbols.size()) return true; 
+  if (lhs.symbols.size() == rhs.symbols.size()) { 
+    for (size_t idx = 0; idx < lhs.symbols.size(); ++idx) { 
+      const std::string lhs_name = lhs.symbols[idx].GetSymName(); 
+      const std::string rhs_name = rhs.symbols[idx].GetSymName(); 
+      if (CompareSymbolicDimNames(lhs_name, rhs_name)) return true; 
+      if (lhs_name != rhs_name) return false; 
+    } 
+  } 
+  return false; 
+} 
+ 
+SymbolicDimMgr::SymbolicDimMgr(ModuleOp m) : m_(m) { 
+  for (auto op : *(m.block())) { 
+    if (op->isa()) { 
+      symbol_table_ = SymbolTable(op); 
+      return; 
+    } 
+  } 
+  Builder builder = Builder(m_.ir_context(), m_.block(), m_.block()->begin()); 
+  dialect::FuncOp func = builder.Build(); 
+  symbol_table_ = SymbolTable(func); 
+} 
+ 
+bool SymbolicDimMgr::Load() { 
+  auto func_op = symbol_table_.getOp()->dyn_cast(); 
+  assert(func_op); 
+  for (auto op : *(func_op.block())) { 
+    symbol_table_.insert(op); 
+    if (SymbolicDim sym_dim_op = op->dyn_cast()) { 
+      symbol_dim_union_set_[sym_dim_op] = sym_dim_op; 
+      symbol_name_set_.insert(sym_dim_op.GetSymName()); 
+    } 
+  } 
+  return LoadShapeConstraintGraph(); 
+} 
+ 
+bool
SymbolicDimMgr::LoadShapeConstraintGraph() { 
+  // TODO(liujinnan): add more constraint functions. Currently, only 
+  // tie_product_equal is supported. 
+  auto constraint_vec = 
+      symbol_table_.Lookup("tie_product_equal"); 
+ 
+  if (!constraint_vec.size()) return true; 
+ 
+  auto build_sym_product = [&](std::vector range, 
+                               SymbolicDimProduct& product) { 
+    for (Value v : range) { 
+      auto defining_op = v.dyn_cast().owner(); 
+      if (auto constOp = defining_op->dyn_cast()) { 
+        product.factor *= constOp.value().dyn_cast().data(); 
+        continue; 
+      } else if (auto dimOp = defining_op->dyn_cast()) { 
+        auto sym = symbol_table_.Lookup(dimOp.getName()); 
+        if (!sym) return false; 
+        product.symbols.push_back(sym); 
+        continue; 
+      } 
+      return false; 
+    } 
+    return true; 
+  }; 
+ 
+  for (auto op : constraint_vec) { 
+    SymbolicDimProduct lhs, rhs; 
+    if (!build_sym_product(op.lhs(), lhs) || 
+        !build_sym_product(op.rhs(), rhs) || 
+        !MapSymbolicDimProductEqual(lhs, rhs)) 
+      return false; 
+  } 
+  return true; 
+} 
+ 
+bool SymbolicDimMgr::MapSymbolicDimProductEqual(const SymbolicDimProduct& lhs, 
+                                                const SymbolicDimProduct& rhs) { 
+  SymbolicDimProduct new_lhs, new_rhs; 
+  std::tie(new_lhs, new_rhs) = SimplifySymbolicDimProductPair(lhs, rhs); 
+ 
+  // Return true for identity case. 
+  if (new_lhs == new_rhs) return true; 
+ 
+  if (new_lhs.factor == new_rhs.factor && new_lhs.symbols.size() == 1 && 
+      new_rhs.symbols.size() == 1) { 
+    return MapSymbolicDimEqual(new_lhs.symbols[0], new_rhs.symbols[0]); 
+  } else if (new_lhs.symbols.size() == 0 && new_rhs.symbols.size() == 1 && 
+             new_rhs.factor == 1) { 
+    return MapSymbolicDimEqual(NewConstantSymbolicDim(new_lhs.factor), 
+                               new_rhs.symbols[0]); 
+  } else if (new_rhs.symbols.size() == 0 && new_lhs.symbols.size() == 1 && 
+             new_lhs.factor == 1) { 
+    return MapSymbolicDimEqual(NewConstantSymbolicDim(new_rhs.factor), 
+                               new_lhs.symbols[0]); 
+  } 
+ 
+  product_equality_map_[new_lhs][new_rhs] = 
+      product_equality_map_[new_rhs][new_lhs] = true; 
+ 
+  product_equality_map_updated_ = false; 
+  return true; 
+} 
+ 
+SymbolicDimProduct SymbolicDimMgr::SimplifySymbolicDimProduct( 
+    const SymbolicDimProduct& x) { 
+  std::vector copied; 
+  copied.reserve(x.symbols.size()); 
+  for (SymbolicDim op : x.symbols) copied.push_back(GetRootSymbolicDim(op)); 
+ 
+  std::sort( 
+      copied.begin(), copied.end(), [&](SymbolicDim lhs, SymbolicDim rhs) { 
+        return CompareSymbolicDimNames(lhs.GetSymName(), rhs.GetSymName()); 
+      }); 
+  SymbolicDimProduct new_x; 
+  new_x.factor = x.factor; 
+  for (SymbolicDim op : copied) { 
+    if (!op.IsDynamic()) { 
+      new_x.factor *= op.GetDimSize(); 
+    } else { 
+      new_x.symbols.push_back(op); 
+    } 
+  } 
+  return new_x; 
+} 
+ 
+std::pair 
+SymbolicDimMgr::SimplifySymbolicDimProductPair(const SymbolicDimProduct& x, 
+                                               const SymbolicDimProduct& y) { 
+  // First do some basic clean up (e.g. folding const symbolic dim op into the 
+  // factor field) 
+  auto lhs = SimplifySymbolicDimProduct(x); 
+  auto rhs = SimplifySymbolicDimProduct(y); 
+ 
+  SymbolicDimProduct new_lhs, new_rhs; 
+  int64_t gcd_factor = std::gcd(std::abs(lhs.factor), std::abs(rhs.factor)); 
+ 
+  // 0 * lhs_symbols = 0 * rhs_symbols, no more information. 
+  // Just return empty new_lhs & new_rhs 
+  if (!gcd_factor) 
+    return std::make_pair(std::move(new_lhs), std::move(new_rhs)); 
+ 
+  // Canonicalize the factor form: always make the smaller factor a positive 
+  // number. 
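+  // Example (illustrative): lhs.factor = -2, rhs.factor = 6 gives 
+  // gcd_factor = -2, so new_lhs.factor = 1 and new_rhs.factor = -3; the 
+  // factor with the smaller absolute value ends up positive.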
+  if (std::abs(lhs.factor) < std::abs(rhs.factor)) { 
+    if (lhs.factor < 0) gcd_factor = -gcd_factor; 
+  } else { 
+    if (rhs.factor < 0) gcd_factor = -gcd_factor; 
+  } 
+ 
+  new_lhs.factor = lhs.factor / gcd_factor; 
+  new_rhs.factor = rhs.factor / gcd_factor; 
+ 
+  std::unordered_map lhs_symbol_map; 
+  std::unordered_map rhs_symbol_map; 
+ 
+  for (SymbolicDim op : lhs.symbols) ++lhs_symbol_map[op]; 
+  for (SymbolicDim op : rhs.symbols) ++rhs_symbol_map[op]; 
+ 
+  for (SymbolicDim op : lhs.symbols) { 
+    auto it = rhs_symbol_map.find(op); 
+    if (it != rhs_symbol_map.end() && op.GetKnownNonSizeZero()) { 
+      if (--it->second == 0) rhs_symbol_map.erase(it); 
+      continue; 
+    } 
+    new_lhs.symbols.push_back(op); 
+  } 
+ 
+  for (SymbolicDim op : rhs.symbols) { 
+    auto it = lhs_symbol_map.find(op); 
+    if (it != lhs_symbol_map.end() && op.GetKnownNonSizeZero()) { 
+      if (--it->second == 0) lhs_symbol_map.erase(it); 
+      continue; 
+    } 
+    new_rhs.symbols.push_back(op); 
+  } 
+ 
+  if (!new_lhs.factor) new_lhs.symbols.clear(); 
+  if (!new_rhs.factor) new_rhs.symbols.clear(); 
+ 
+  return std::make_pair(std::move(new_lhs), std::move(new_rhs)); 
+} 
+ 
+const std::string SymbolicDimMgr::GetNextName() { 
+  std::string name; 
+  do { 
+    name = "S" + std::to_string(next_symbolic_idx_++); 
+  } while (!symbol_name_set_.insert(name).second); 
+  return name; 
+} 
+ 
+SymbolicDim SymbolicDimMgr::NewSymbolicDim(const std::string& name) { 
+  auto func_op = symbol_table_.getOp()->dyn_cast(); 
+  assert(func_op); 
+  Builder builder = Builder(m_.ir_context(), func_op.block()); 
+  // By default, set dim != 0 
+  dialect::SymbolicDim symbol = 
+      builder.Build(name.empty() ? GetNextName() : name, 
+                    ShapedTypeInterface::kDynamic, 
+                    false, 
+                    false, 
+                    false, 
+                    true); 
+  symbol_dim_union_set_[symbol] = symbol; 
+  symbol_table_.insert(symbol); 
+  return symbol; 
+} 
+ 
+SymbolicDim SymbolicDimMgr::NewConstantSymbolicDim(int64_t val) { 
+  auto it = constant_symbolic_dim_map_.find(val); 
+  if (it == constant_symbolic_dim_map_.end()) { 
+    auto name = "C" + std::to_string(val); 
+    it = constant_symbolic_dim_map_ 
+             .insert(std::make_pair(val, NewSymbolicDim(name))) 
+             .first; 
+    it->second.SetDimSize(val); 
+    if (val == -1) it->second.UpdateKnownNegativeOne(true); 
+    if (val >= 0) it->second.UpdateKnownNonNegative(true); 
+    if (val != 1) it->second.UpdateKnownNonSizeOne(true); 
+    if (val != 0) it->second.UpdateKnownNonSizeZero(true); 
+  } 
+  return GetRootSymbolicDim(it->second); 
+} 
+ 
+std::vector SymbolicDimMgr::CreateSymbolicDimsForRankedValue( 
+    Value value) { 
+  std::vector symbols; 
+  auto dims = value.type().dyn_cast().dims(); 
+  for (int idx = 0; idx < dims.size(); ++idx) { 
+    symbols.push_back(dims[idx] == ShapedTypeInterface::kDynamic 
+                          ?
NewSymbolicDim() + : NewConstantSymbolicDim(dims[idx])); + } + return symbols; +} + +SymbolicDim SymbolicDimMgr::GetRootSymbolicDim(SymbolicDim symbol) { + SymbolicDim current = symbol; + std::vector path; + while (symbol_dim_union_set_[current] != current) { + path.push_back(current); + current = symbol_dim_union_set_[current]; + } + for (SymbolicDim sym : path) symbol_dim_union_set_[sym] = current; + return current; +} + +bool SymbolicDimMgr::IsSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs) { + SymbolicDim lhs_root = GetRootSymbolicDim(lhs); + SymbolicDim rhs_root = GetRootSymbolicDim(rhs); + return lhs_root == rhs_root; +} + +bool SymbolicDimMgr::MapSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs) { + SymbolicDim lhs_root = GetRootSymbolicDim(lhs); + SymbolicDim rhs_root = GetRootSymbolicDim(rhs); + + if (lhs_root != rhs_root) { + if (CompareSymbolicDimNames(lhs_root.GetSymName(), rhs_root.GetSymName())) { + if (!lhs_root.Merge(rhs_root)) return false; + symbol_dim_union_set_[rhs_root] = lhs_root; + } else { + if (!rhs_root.Merge(lhs_root)) return false; + symbol_dim_union_set_[lhs_root] = rhs_root; + } + product_equality_map_updated_ = false; + } + return true; +} + +SymbolicDimProduct* SymbolicDimMgr::SymbolicDimProductDivide( + const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs) { + SymbolicDimProduct new_lhs, new_rhs; + std::tie(new_lhs, new_rhs) = SimplifySymbolicDimProductPair(lhs, rhs); + + if (new_lhs.factor == 0 || new_rhs.factor == 0) return nullptr; + if (new_lhs.factor % new_rhs.factor != 0) return nullptr; + if (new_lhs.symbols.size() < new_rhs.symbols.size()) return nullptr; + + SymbolicDimProduct* result = new SymbolicDimProduct(); + result->factor = new_lhs.factor / new_rhs.factor; + + std::unordered_map sym_proc_map; + for (SymbolicDim sym : new_rhs.symbols) ++sym_proc_map[sym]; + + for (SymbolicDim sym : new_lhs.symbols) { + auto it = sym_proc_map.find(sym); + if (it == sym_proc_map.end()) { + result->symbols.push_back(sym); + continue; + } + if (--it->second == 0) { + sym_proc_map.erase(it); + continue; + } + } + + if (!sym_proc_map.empty()) return nullptr; + return result; +} + +bool SymbolicDimMgr::IsMultipleOfKnownSymbolicDimProductEqualPair( + const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs) { + for (auto& pair_outter : product_equality_map_) { + const SymbolicDimProduct& x = pair_outter.first; + auto factor_x = SymbolicDimProductDivide(lhs, x); + if (!factor_x) continue; + for (auto& pair_inner : pair_outter.second) { + if (!pair_inner.second) continue; + const SymbolicDimProduct& y = pair_inner.first; + auto factor_y = SymbolicDimProductDivide(rhs, y); + if (!factor_y || (*factor_x) != (*factor_y)) continue; + return true; + } + } + + return false; +} + +bool SymbolicDimMgr::UpdateProductEqualityMap() { + // Return true if nothing is updated. 
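+  // Rebuilding the equality closure below is quadratic in the number of 
+  // products, so it is skipped when no new equality has been recorded.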
+ if (product_equality_map_updated_) return true; + + SymbolicDimProductMap new_map; + std::unordered_set product_set; + for (auto& pair_outter : product_equality_map_) { + const SymbolicDimProduct& x = pair_outter.first; + for (auto& pair_inner : pair_outter.second) { + if (!pair_inner.second) continue; + + const SymbolicDimProduct& y = pair_inner.first; + SymbolicDimProduct new_x, new_y; + std::tie(new_x, new_y) = SimplifySymbolicDimProductPair(x, y); + if (new_x == new_y) continue; + + new_map[new_x][new_y] = new_map[new_y][new_x] = true; + product_set.insert(new_x); + product_set.insert(new_y); + } + } + // hash function of SymbolicDimProduct is expensive, thus we map it to integer + // domain first. + std::unordered_map symProd2Idx; + std::vector idx2SymProd(product_set.size()); + std::vector idx2root(product_set.size()); + for (auto& x : product_set) { + size_t idx = symProd2Idx.size(); + symProd2Idx[&x] = idx; + idx2SymProd[idx] = &x; + idx2root[idx] = idx; + } + + auto getRootIdx = [&](size_t root) { + std::vector path; + while (idx2root[root] != root) { + path.push_back(root); + root = idx2root[root]; + } + for (size_t idx : path) idx2root[idx] = root; + return root; + }; + + for (size_t x = 0; x < symProd2Idx.size(); ++x) { + auto& xProd = *idx2SymProd[x]; + auto& rowMap = new_map[xProd]; + size_t xRoot = getRootIdx(x); + for (size_t y = x; y < symProd2Idx.size(); ++y) { + auto& yProd = *idx2SymProd[y]; + if (!rowMap[yProd]) continue; + idx2root[getRootIdx(y)] = xRoot; + } + } + + for (size_t x = 0; x < symProd2Idx.size(); ++x) + for (size_t y = x; y < symProd2Idx.size(); ++y) { + if (getRootIdx(x) != getRootIdx(y)) continue; + auto& xSymProd = *idx2SymProd[x]; + auto& ySymProd = *idx2SymProd[y]; + + new_map[xSymProd][ySymProd] = new_map[ySymProd][xSymProd] = true; + } + + product_equality_map_ = std::move(new_map); + + for (auto& x : product_set) + for (auto& y : product_set) { + if (!product_equality_map_[x][y]) continue; + product_equality_map_[x][y] = product_equality_map_[y][x] = false; + if (!IsMultipleOfKnownSymbolicDimProductEqualPair(x, y)) { + product_equality_map_[x][y] = product_equality_map_[y][x] = true; + } + } + + std::unordered_set toRemove; + for (auto& x : product_set) { + if (std::all_of(product_set.begin(), + product_set.end(), + [&](const SymbolicDimProduct& y) { + return !product_equality_map_[x][y]; + })) { + toRemove.insert(x); + } + } + + for (auto& x : toRemove) { + product_equality_map_.erase(x); + } + + product_equality_map_updated_ = true; + return true; +} + +bool SymbolicDimMgr::IsSymbolicDimProductEqual(const SymbolicDimProduct& lhs, + const SymbolicDimProduct& rhs) { + SymbolicDimProduct new_lhs, new_rhs; + std::tie(new_lhs, new_rhs) = SimplifySymbolicDimProductPair(lhs, rhs); + + // Return true for identity case. 
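+  // e.g. both sides may simplify to the same canonical form, such as 
+  // `2 * S0`, after constant folding and root substitution.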
+ if (new_lhs == new_rhs) return true; + IR_ENFORCE(UpdateProductEqualityMap(), "Update product equality map failed."); + return IsMultipleOfKnownSymbolicDimProductEqualPair(new_lhs, new_rhs); +} + +bool SymbolicDimMgr::Save() { + using Name2SymbolFn = std::function; + auto update_attrs = [&](ArrayAttribute attrs, Name2SymbolFn fn) { + std::vector new_attrs; + for (Attribute attr : attrs.AsVector()) { + auto sym = fn(attr.dyn_cast().AsString()); + assert(sym); + SymbolicDim root = GetRootSymbolicDim(sym); + Attribute root_symbol = + StrAttribute::get(m_->ir_context(), root.GetSymName()); + new_attrs.push_back(root_symbol); + } + return ArrayAttribute::get(m_->ir_context(), new_attrs); + }; + + // TODO(liujinnan): update attributes attached in DenseTensorType + for (auto op : *(m_.block())) { + if (!op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) continue; + auto attrs = + op->attribute(SymbolicDim::GetSymbolicDimAttrName()); + auto symbolic_shape_attr = + update_attrs(attrs, [&](const std::string& name) { + return symbol_table_.Lookup(name); + }); + op->set_attribute(SymbolicDim::GetSymbolicDimAttrName(), + symbolic_shape_attr); + } + if (!UpdateProductEqualityMap()) { + return false; + } + std::unordered_set used_symbolic_ops; + std::vector used_symbol_names; + // TODO(liujinnan): collect uses in value. + auto collect_used_symbols = [&](ArrayAttribute attrs) { + for (Attribute attr : attrs.AsVector()) { + auto sym = symbol_table_.Lookup( + attr.dyn_cast().AsString()); + assert(sym); + if (used_symbolic_ops.insert(sym).second) + used_symbol_names.push_back(sym.GetSymName()); + } + }; + for (auto op : *(m_.block())) { + if (!op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) continue; + auto attrs = + op->attribute(SymbolicDim::GetSymbolicDimAttrName()); + collect_used_symbols(attrs); + } + auto func_op = symbol_table_.getOp()->dyn_cast(); + assert(func_op); + for (auto& p : symbol_dim_union_set_) { + if (!used_symbolic_ops.count(p.first)) { + func_op.block()->erase(*(p.first.operation())); + } + } + + std::vector candidates; + for (auto& outter : product_equality_map_) { + if (std::any_of( + outter.first.symbols.begin(), + outter.first.symbols.end(), + [&](SymbolicDim sym) { return used_symbolic_ops.count(sym) == 0; })) + candidates.push_back(outter.first); + } + + for (auto& prod : candidates) product_equality_map_.erase(prod); + for (auto& outter : product_equality_map_) { + std::vector candidates; + for (auto& inner : outter.second) { + if (std::any_of(inner.first.symbols.begin(), + inner.first.symbols.end(), + [&](SymbolicDim sym) { + return used_symbolic_ops.count(sym) == 0; + })) + candidates.push_back(outter.first); + } + for (auto& prod : candidates) outter.second.erase(prod); + } + + std::sort(used_symbol_names.begin(), + used_symbol_names.end(), + [&](const std::string& lhs, const std::string& rhs) { + return CompareSymbolicDimNames(lhs, rhs); + }); + int non_const_dims_num = 0; + std::unordered_map name_mapping; + for (const auto& name : used_symbol_names) { + if (name.size() > 0 && name[0] == 'C') { + name_mapping[name] = name; + } else { + name_mapping[name] = ("S" + std::to_string(non_const_dims_num++)); + } + } + + std::unordered_map name_to_symbol; + for (SymbolicDim op : used_symbolic_ops) { + auto name = op.GetSymName(); + op.SetSymName(name_mapping[name]); + name_to_symbol[name] = op; + } + + for (auto op : *(m_.block())) { + if (!op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) continue; + auto attrs = + 
op->attribute(SymbolicDim::GetSymbolicDimAttrName()); 
+    auto symbolic_shape_attr = update_attrs( 
+        attrs, [&](const std::string& name) { return name_to_symbol[name]; }); 
+    op->set_attribute(SymbolicDim::GetSymbolicDimAttrName(), 
+                      symbolic_shape_attr); 
+  } 
+ 
+  // TODO(liujinnan): update attributes attached to values. 
+ 
+  return SaveShapeConstraintGraph(); 
+} 
+ 
+bool SymbolicDimMgr::SaveShapeConstraintGraph() { 
+  auto func_op = symbol_table_.getOp()->dyn_cast(); 
+  assert(func_op); 
+  auto op_it = func_op.block()->rbegin(); 
+  while (op_it != func_op.block()->rend()) { 
+    if (((*op_it)->isa()) || 
+        ((*op_it)->isa())) 
+      op_it++; 
+    else 
+      op_it = decltype(op_it)(func_op.block()->erase(*(*op_it))); 
+  } 
+ 
+  // Save product-equality predicates. 
+  Builder builder = Builder(m_->ir_context(), func_op.block()); 
+  auto build_operands = [&](const SymbolicDimProduct& prod) { 
+    std::vector values; 
+ 
+    if (prod.factor != 1) { 
+      values.push_back( 
+          builder 
+              .Build( 
+                  Int32Attribute::get(m_->ir_context(), prod.factor), 
+                  Int32Type::get(m_->ir_context())) 
+              ->result(0)); 
+    } 
+    for (SymbolicDim sym : prod.symbols) { 
+      values.push_back(builder.Build(sym.GetSymName()).out()); 
+    } 
+    return values; 
+  }; 
+  std::vector sorted_product_vec; 
+  for (auto& p : product_equality_map_) sorted_product_vec.push_back(p.first); 
+  std::sort(sorted_product_vec.begin(), 
+            sorted_product_vec.end(), 
+            CompareSymbolicDimProduct); 
+  for (auto& x : sorted_product_vec) { 
+    for (auto& y : sorted_product_vec) { 
+      if (!CompareSymbolicDimProduct(x, y)) continue; 
+      if (!product_equality_map_[x][y]) continue; 
+      auto lhs_operands = build_operands(x); 
+      auto rhs_operands = build_operands(y); 
+      builder.Build(lhs_operands, rhs_operands); 
+    } 
+  } 
+  return true; 
+} 
+}  // namespace pir 
diff --git a/paddle/pir/dialect/shape/utils/shape_optimization_utils.h b/paddle/pir/dialect/shape/utils/shape_optimization_utils.h new file mode 100644 index 00000000000000..5541e8a8ee2f19 --- /dev/null +++ b/paddle/pir/dialect/shape/utils/shape_optimization_utils.h @@ -0,0 +1,156 @@ 
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+// 
+// Licensed under the Apache License, Version 2.0 (the "License"); 
+// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at 
+// 
+//     http://www.apache.org/licenses/LICENSE-2.0 
+// 
+// Unless required by applicable law or agreed to in writing, software 
+// distributed under the License is distributed on an "AS IS" BASIS, 
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and 
+// limitations under the License. 
+ 
+#pragma once 
+#include 
+#include "paddle/pir/dialect/shape/utils/symbol_table.h" 
+ 
+namespace pir { 
+using dialect::SymbolicDim; 
+ 
+// Represents a product of symbolic and concrete factors. 
+// Used to prove product equalities symbolically. 
+struct SymbolicDimProduct { 
+  // List all symbolic factors that cannot be aggregated. 
+  std::vector symbols; 
+ 
+  // Product of all const factors.
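+  // e.g. the product `6 * S0 * S1` is stored as factor = 6 with 
+  // symbols = [S0, S1].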
+  int64_t factor = 1; 
+  bool empty() { return factor == 1 && symbols.empty(); } 
+}; 
+ 
+// Returns true if two SymbolicDimProducts are equal. 
+inline bool operator==(const SymbolicDimProduct& lhs, 
+                       const SymbolicDimProduct& rhs) { 
+  return lhs.factor == rhs.factor && lhs.symbols == rhs.symbols; 
+} 
+ 
+// Returns true if two SymbolicDimProducts are not equal. 
+inline bool operator!=(const SymbolicDimProduct& lhs, 
+                       const SymbolicDimProduct& rhs) { 
+  return !(lhs == rhs); 
+} 
+ 
+struct SymDimHasher { 
+  size_t operator()(const dialect::SymbolicDim& symbol) const noexcept { 
+    return std::hash{}(symbol.operation()); 
+  } 
+}; 
+ 
+struct SymProductHasher { 
+  size_t operator()(const SymbolicDimProduct& symProd) const noexcept { 
+    size_t hash = std::hash{}(symProd.symbols.size()); 
+    for (auto& symbol : symProd.symbols) { 
+      hash = hash_combine(hash, SymDimHasher{}(symbol));  // NOLINT 
+    } 
+    hash = hash_combine(hash, std::hash{}(symProd.factor)); 
+    return hash; 
+  } 
+}; 
+ 
+// A class to manage shape-constraint-related IR. 
+class SymbolicDimMgr { 
+ public: 
+  explicit SymbolicDimMgr(ModuleOp m); 
+ 
+  // Loads pre-defined SymbolicDim ops from the module this mgr runs on. 
+  bool Load(); 
+ 
+  // Creates a new SymbolicDim instance owned by this mgr. 
+  SymbolicDim NewSymbolicDim(const std::string& name = {}); 
+ 
+  // Creates a SymbolicDim with static dim size == `val`. 
+  SymbolicDim NewConstantSymbolicDim(int64_t val); 
+ 
+  // Creates one SymbolicDim per dimension of the given ranked value. 
+  std::vector CreateSymbolicDimsForRankedValue(Value value); 
+ 
+  // All symbolically-equal dims form a group. 
+  // Returns the root SymbolicDim of the group that `symbol` belongs to. 
+  SymbolicDim GetRootSymbolicDim(SymbolicDim symbol); 
+ 
+  // Returns true if lhs and rhs are known to be equal. 
+  bool IsSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs); 
+ 
+  // Marks lhs and rhs as having the same size and tries to merge their 
+  // statically known info. Returns false if the merge fails. 
+  bool MapSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs); 
+ 
+  // Returns the simplified version of SymbolicDimProduct. 
+  // This will try to fold some SymbolicDim ops with const values. 
+  SymbolicDimProduct SimplifySymbolicDimProduct(const SymbolicDimProduct& x); 
+ 
+  // Returns the simplified version of a SymbolicDimProduct pair. 
+  // This will try to reduce some common symbolic ops if they are known nonzero. 
+  std::pair 
+  SimplifySymbolicDimProductPair(const SymbolicDimProduct& x, 
+                                 const SymbolicDimProduct& y); 
+ 
+  // Returns null if x is not exactly divisible by y, otherwise the result of 
+  // x / y. Supposes that all symbols are nonzero, thus common symbolic dim 
+  // factors can be eliminated safely. For example: 
+  //     x = 6 * symbol_0 * symbol_1 * symbol_2 
+  //     y = 3 * symbol_0 * symbol_1 
+  //     x / y == 2 * symbol_2 (all symbols are nonzero) 
+  SymbolicDimProduct* SymbolicDimProductDivide(const SymbolicDimProduct& x, 
+                                               const SymbolicDimProduct& y); 
+ 
+  // Returns true if group [a0, b0, ...] and [a1, b1, ...] are known to be 
+  // multiplication equal : `a0 * b0 * ... == a1 * b1 * ...` 
+  bool IsSymbolicDimProductEqual(const SymbolicDimProduct& lhs, 
+                                 const SymbolicDimProduct& rhs); 
+ 
+  // Marks `product([a0, b0, ...]) == product([a1, b1, ...])` 
+  bool MapSymbolicDimProductEqual(const SymbolicDimProduct& lhs, 
+                                  const SymbolicDimProduct& rhs); 
+ 
+  // Saves the updated shape-constraint IR. 
+  bool Save(); 
+ 
+  // Returns the SymbolTable.
+ SymbolTable& symbolTable() { return symbol_table_; } + + private: + const std::string GetNextName(); + bool SaveShapeConstraintGraph(); + bool LoadShapeConstraintGraph(); + bool UpdateProductEqualityMap(); + bool IsMultipleOfKnownSymbolicDimProductEqualPair( + const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs); + + private: + ModuleOp m_; + + SymbolTable symbol_table_; + + int64_t next_symbolic_idx_ = 0; + + std::unordered_set symbol_name_set_; + + std::unordered_map + symbol_dim_union_set_; + + std::unordered_map constant_symbolic_dim_map_; + + // product_equality_map_[A][B] == true : Product[A] == Product[B] + using SymbolicDimProductMap = std::unordered_map< + SymbolicDimProduct, + std::unordered_map, + SymProductHasher>; + SymbolicDimProductMap product_equality_map_; + bool product_equality_map_updated_ = true; +}; + +} // namespace pir diff --git a/paddle/pir/dialect/shape/utils/shape_utils.cc b/paddle/pir/dialect/shape/utils/shape_utils.cc index d9f8aee3043256..d746831835ed89 100644 --- a/paddle/pir/dialect/shape/utils/shape_utils.cc +++ b/paddle/pir/dialect/shape/utils/shape_utils.cc @@ -17,878 +17,121 @@ #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" namespace pir { -bool CompareSymbolicDimNames(const std::string& lhs, const std::string& rhs) { - if (lhs.size() < 1 || (lhs[0] != 'S' && lhs[0] != 'C')) return lhs < rhs; - if (rhs.size() < 1 || (rhs[0] != 'S' && rhs[0] != 'C')) return lhs < rhs; - int64_t lhsIdx = 0, rhsIdx = 0; - try { - lhsIdx = stol(lhs.substr(1)); - rhsIdx = stol(rhs.substr(1)); - } catch (const std::exception& e) { - IR_THROW("Invalid symbolic name"); - } - return (lhs[0] < rhs[0]) || (lhs[0] == rhs[0] && lhsIdx < rhsIdx); -} - -bool CompareSymbolicDimProduct(SymbolicDimProduct& lhs, // NOLINT - SymbolicDimProduct& rhs) { // NOLINT - if (lhs.symbols.size() < rhs.symbols.size()) return true; - if (lhs.symbols.size() == rhs.symbols.size()) { - for (size_t idx = 0; idx < lhs.symbols.size(); ++idx) { - const std::string lhsName = lhs.symbols[idx].getSymName(); - const std::string rhsName = rhs.symbols[idx].getSymName(); - if (CompareSymbolicDimNames(lhsName, rhsName)) return true; - if (lhsName != rhsName) return false; - } - } - return false; -} - -const std::string SymbolTable::insert(Operation* symbol) { - std::string name; - if (symbol->isa()) { - name = symbol->dyn_cast().getSymName(); - symbolTableMap_.insert({name, symbol}); - } - - // TODO(liujinnan): add more constraint_func name branch. - if (symbol->isa()) { - name = "tie_product_equal"; - symbolFuncMap_[name].emplace_back(symbol); - } - - return name; -} - -bool SymbolicDimMgr::Load() { - auto funcOp = symbolTable_.getOp()->dyn_cast(); - assert(funcOp); - for (auto op_ : *(funcOp.block())) { - symbolTable_.insert(op_); - if (SymbolicDim op = op_->dyn_cast()) { - symbolDimUnionSet_[op] = op; - symbolNameSet_.insert(op.getSymName()); - } - } - return LoadShapeConstraintGraph(); -} - -bool SymbolicDimMgr::LoadShapeConstraintGraph() { - // TODO(liujinnan): add more constraint function. currently, only support - // tie_product_equal. 
- auto constraint_vec = - symbolTable_.Lookup("tie_product_equal"); - - if (!constraint_vec.size()) return true; - - auto build_sym_product = [&](std::vector range, - SymbolicDimProduct& product) { - for (Value v : range) { - auto definingOp = v.dyn_cast().owner(); - if (auto constOp = definingOp->dyn_cast()) { - product.factor *= constOp.value().dyn_cast().data(); - continue; - } else if (auto dimOp = definingOp->dyn_cast()) { - auto sym = symbolTable_.Lookup(dimOp.getName()); - if (!sym) return false; - product.symbols.push_back(sym); - continue; - } - return false; - } - return true; - }; - - for (auto op : constraint_vec) { - SymbolicDimProduct lhs, rhs; - if (!build_sym_product(op.lhs(), lhs) || - !build_sym_product(op.rhs(), rhs) || - !MapSymbolicDimProductEqual(lhs, rhs)) - return false; - } - return true; -} - -int64_t gcd(int64_t m, int64_t n) { - if (!m) return n; - if (!n) return m; - return (m < n) ? gcd(m, n % m) : gcd(m % n, n); -} - -bool SymbolicDimMgr::MapSymbolicDimProductEqual(const SymbolicDimProduct& lhs, - const SymbolicDimProduct& rhs) { - SymbolicDimProduct newLhs, newRhs; - std::tie(newLhs, newRhs) = SimplifySymbolicDimProductPair(lhs, rhs); - - // early return for identity case. - if (newLhs == newRhs) return true; - - if (newLhs.factor == newRhs.factor && newLhs.symbols.size() == 1 && - newRhs.symbols.size() == 1) { - return MapSymbolicDimEqual(newLhs.symbols[0], newRhs.symbols[0]); - } else if (newLhs.symbols.size() == 0 && newRhs.symbols.size() == 1 && - newRhs.factor == 1) { - return MapSymbolicDimEqual(NewConstantSymbolicDim(newLhs.factor), - newRhs.symbols[0]); - } else if (newRhs.symbols.size() == 0 && newLhs.symbols.size() == 1 && - newLhs.factor == 1) { - return MapSymbolicDimEqual(NewConstantSymbolicDim(newRhs.factor), - newLhs.symbols[0]); - } - - productEqualityMap_[newLhs][newRhs] = productEqualityMap_[newRhs][newLhs] = - true; - - productEqualityMapUpdated_ = false; - return true; -} - -std::pair -SymbolicDimMgr::SimplifySymbolicDimProductPair(const SymbolicDimProduct& x, - const SymbolicDimProduct& y) { - auto lhs = SimplifySymbolicDimProduct(x); - auto rhs = SimplifySymbolicDimProduct(y); - - SymbolicDimProduct newLhs, newRhs; - int64_t gcdFactor = gcd(std::abs(lhs.factor), std::abs(rhs.factor)); - if (!gcdFactor) return std::make_pair(std::move(newLhs), std::move(newRhs)); - if (std::abs(lhs.factor) < std::abs(rhs.factor)) { - if (lhs.factor < 0) gcdFactor = -gcdFactor; - } else { - if (rhs.factor < 0) gcdFactor = -gcdFactor; - } - - newLhs.factor = lhs.factor / gcdFactor; - newRhs.factor = rhs.factor / gcdFactor; - - std::unordered_map lhsSymbolMap; - std::unordered_map rhsSymbolMap; - for (SymbolicDim op : lhs.symbols) ++lhsSymbolMap[op]; - for (SymbolicDim op : rhs.symbols) ++rhsSymbolMap[op]; - - for (SymbolicDim op : lhs.symbols) { - auto it = rhsSymbolMap.find(op); - if (it != rhsSymbolMap.end() && op.getKnownNonSizeZero()) { - if (--it->second == 0) rhsSymbolMap.erase(it); - continue; - } - newLhs.symbols.push_back(op); - } - - for (SymbolicDim op : rhs.symbols) { - auto it = lhsSymbolMap.find(op); - if (it != lhsSymbolMap.end() && op.getKnownNonSizeZero()) { - if (--it->second == 0) lhsSymbolMap.erase(it); - continue; - } - newRhs.symbols.push_back(op); - } - - if (!newLhs.factor) newLhs.symbols.clear(); - if (!newRhs.factor) newRhs.symbols.clear(); - - return std::make_pair(std::move(newLhs), std::move(newRhs)); -} - -SymbolicDimProduct SymbolicDimMgr::SimplifySymbolicDimProduct( - const SymbolicDimProduct& x) { - std::vector copied; - 
copied.reserve(x.symbols.size()); - for (SymbolicDim op : x.symbols) copied.push_back(GetRootSymbolicDim(op)); - - sort(copied.begin(), copied.end(), [&](SymbolicDim lhs, SymbolicDim rhs) { - return CompareSymbolicDimNames(lhs.getSymName(), rhs.getSymName()); - }); - SymbolicDimProduct newX; - newX.factor = x.factor; - for (SymbolicDim op : copied) { - if (!op.IsDynamic()) { - newX.factor *= op.getValue(); - } else { - newX.symbols.push_back(op); - } - } - return newX; -} - -const std::string SymbolicDimMgr::GetNextName() { - std::string name; - do { - name = "S" + std::to_string(nextSymbolicIdx_++); - } while (!symbolNameSet_.insert(name).second); - return name; -} - -SymbolicDimMgr::SymbolicDimMgr(ModuleOp m) : m_(m) { - for (auto op : *(m.block())) { - if (op->isa()) { - symbolTable_ = SymbolTable(op); - return; - } - } - Builder builder = Builder(m_.ir_context(), m_.block(), m_.block()->begin()); - dialect::FuncOp func = builder.Build(); - symbolTable_ = SymbolTable(func); -} - -SymbolicDim SymbolicDimMgr::NewSymbolicDim(const std::string& name) { - auto funcOp = symbolTable_.getOp()->dyn_cast(); - assert(funcOp); - Builder builder = Builder(m_.ir_context(), funcOp.block()); - // default settting dim != 0 - dialect::SymbolicDim symbol = - builder.Build(name.empty() ? GetNextName() : name, - ShapedTypeInterface::kDynamic, - false, - false, - false, - true); - symbolDimUnionSet_[symbol] = symbol; - symbolTable_.insert(symbol); - return symbol; -} - -SymbolicDim SymbolicDimMgr::NewConstantSymbolicDim(int64_t val) { - auto it = constantSymbolicDimMap_.find(val); - if (it == constantSymbolicDimMap_.end()) { - auto name = "C" + std::to_string(val); - it = constantSymbolicDimMap_ - .insert(std::make_pair(val, NewSymbolicDim(name))) - .first; - it->second.updateValue(val); - if (val == -1) it->second.updateKnownNegativeOne(true); - if (val >= 0) it->second.updateKnownNonNegative(true); - if (val != 1) it->second.updateKnownNonSizeOne(true); - if (val != 0) it->second.updateKnownNonSizeZero(true); - } - return GetRootSymbolicDim(it->second); -} - -std::vector SymbolicDimMgr::CreateSymbolicDimsForRankedValue( - Value value) { - std::vector symbols; - auto dims = value.type().dyn_cast().dims(); - for (int idx = 0; idx < dims.size(); ++idx) { - symbols.push_back(dims[idx] == ShapedTypeInterface::kDynamic - ? 
NewSymbolicDim() - : NewConstantSymbolicDim(dims[idx])); - } - return symbols; -} - -SymbolicDim SymbolicDimMgr::GetRootSymbolicDim(SymbolicDim symbol) { - SymbolicDim current = symbol; - std::vector path; - while (symbolDimUnionSet_[current] != current) { - path.push_back(current); - current = symbolDimUnionSet_[current]; - } - for (SymbolicDim sym : path) symbolDimUnionSet_[sym] = current; - return current; -} - -bool SymbolicDimMgr::IsSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs) { - SymbolicDim lhsRoot = GetRootSymbolicDim(lhs); - SymbolicDim rhsRoot = GetRootSymbolicDim(rhs); - return lhsRoot == rhsRoot; -} - -bool SymbolicDimMgr::MapSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs) { - SymbolicDim lhsRoot = GetRootSymbolicDim(lhs); - SymbolicDim rhsRoot = GetRootSymbolicDim(rhs); - - if (lhsRoot != rhsRoot) { - if (CompareSymbolicDimNames(lhsRoot.getSymName(), rhsRoot.getSymName())) { - if (!lhsRoot.Merge(rhsRoot)) return false; - symbolDimUnionSet_[rhsRoot] = lhsRoot; - } else { - if (!rhsRoot.Merge(lhsRoot)) return false; - symbolDimUnionSet_[lhsRoot] = rhsRoot; - } - } - return true; -} - -SymbolicDimProduct* SymbolicDimMgr::SymbolicDimProductDivide( - const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs) { - SymbolicDimProduct newLhs, newRhs; - std::tie(newLhs, newRhs) = SimplifySymbolicDimProductPair(lhs, rhs); - - if (newLhs.factor == 0 || newRhs.factor == 0) return nullptr; - if (newLhs.factor % newRhs.factor != 0) return nullptr; - if (newLhs.symbols.size() < newRhs.symbols.size()) return nullptr; - - SymbolicDimProduct* result = new SymbolicDimProduct(); - result->factor = newLhs.factor / newRhs.factor; - - std::unordered_map symProcMap; - for (SymbolicDim sym : newRhs.symbols) ++symProcMap[sym]; - - for (SymbolicDim sym : newLhs.symbols) { - auto it = symProcMap.find(sym); - if (it == symProcMap.end()) { - result->symbols.push_back(sym); - continue; - } - if (--it->second == 0) { - symProcMap.erase(it); - continue; - } - } - - if (!symProcMap.empty()) return nullptr; - return result; -} - -bool SymbolicDimMgr::IsMultipleOfKnownSymbolicDimProductEqualPair( - const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs) { - for (auto& pairOutter : productEqualityMap_) { - const SymbolicDimProduct& x = pairOutter.first; - auto factorX = SymbolicDimProductDivide(lhs, x); - if (!factorX) continue; - for (auto& pairInner : pairOutter.second) { - if (!pairInner.second) continue; - const SymbolicDimProduct& y = pairInner.first; - auto factorY = SymbolicDimProductDivide(rhs, y); - if (!factorY || (*factorX) != (*factorY)) continue; - return true; - } - } - - return false; -} - -bool SymbolicDimMgr::UpdateProductEqualityMap() { - // early return if nothing is updated. - if (productEqualityMapUpdated_) return true; - - SymbolicDimProductMap newMap; - std::unordered_set productSet; - for (auto& pairOutter : productEqualityMap_) { - const SymbolicDimProduct& x = pairOutter.first; - for (auto& pairInner : pairOutter.second) { - if (!pairInner.second) continue; - const SymbolicDimProduct& y = pairInner.first; - SymbolicDimProduct newX, newY; - std::tie(newX, newY) = SimplifySymbolicDimProductPair(x, y); - if (newX == newY) continue; - newMap[newX][newY] = newMap[newY][newX] = true; - productSet.insert(newX); - productSet.insert(newY); - } - } - // hash function of SymbolicDimProduct is expensive, thus we map it to integer - // domain first. 
- std::unordered_map symProd2Idx; - std::vector idx2SymProd(productSet.size()); - std::vector idx2root(productSet.size()); - for (auto& x : productSet) { - size_t idx = symProd2Idx.size(); - symProd2Idx[&x] = idx; - idx2SymProd[idx] = &x; - idx2root[idx] = idx; - } - - auto getRootIdx = [&](size_t root) { - std::vector path; - while (idx2root[root] != root) { - path.push_back(root); - root = idx2root[root]; - } - for (size_t idx : path) idx2root[idx] = root; - return root; - }; - - for (size_t x = 0; x < symProd2Idx.size(); ++x) { - auto& xProd = *idx2SymProd[x]; - auto& rowMap = newMap[xProd]; - size_t xRoot = getRootIdx(x); - for (size_t y = x; y < symProd2Idx.size(); ++y) { - auto& yProd = *idx2SymProd[y]; - if (!rowMap[yProd]) continue; - idx2root[getRootIdx(y)] = xRoot; - } - } - - for (size_t x = 0; x < symProd2Idx.size(); ++x) - for (size_t y = x; y < symProd2Idx.size(); ++y) { - if (getRootIdx(x) != getRootIdx(y)) continue; - auto& xSymProd = *idx2SymProd[x]; - auto& ySymProd = *idx2SymProd[y]; - - newMap[xSymProd][ySymProd] = newMap[ySymProd][xSymProd] = true; - } - - productEqualityMap_ = std::move(newMap); - - for (auto& x : productSet) - for (auto& y : productSet) { - if (!productEqualityMap_[x][y]) continue; - productEqualityMap_[x][y] = productEqualityMap_[y][x] = false; - if (!IsMultipleOfKnownSymbolicDimProductEqualPair(x, y)) { - productEqualityMap_[x][y] = productEqualityMap_[y][x] = true; - } - } - - std::unordered_set toRemove; - for (auto& x : productSet) { - if (std::all_of(productSet.begin(), - productSet.end(), - [&](const SymbolicDimProduct& y) { - return !productEqualityMap_[x][y]; - })) { - toRemove.insert(x); - } - } - - for (auto& x : toRemove) { - productEqualityMap_.erase(x); - } - - productEqualityMapUpdated_ = true; - return true; -} - -bool SymbolicDimMgr::IsSymbolicDimProductEqual(const SymbolicDimProduct& lhs, - const SymbolicDimProduct& rhs) { - SymbolicDimProduct newLhs, newRhs; - std::tie(newLhs, newRhs) = SimplifySymbolicDimProductPair(lhs, rhs); - - // early return for identity case. - if (newLhs == newRhs) return true; - IR_ENFORCE(UpdateProductEqualityMap(), "Update product equality map failed."); - return IsMultipleOfKnownSymbolicDimProductEqualPair(newLhs, newRhs); -} - -bool SymbolicDimMgr::Save() { - using Name2SymbolFn = std::function; - auto updateAttrs = [&](ArrayAttribute attrs, Name2SymbolFn fn) { - std::vector newAttrs; - for (Attribute attr : attrs.AsVector()) { - auto sym = fn(attr.dyn_cast().AsString()); - assert(sym); - SymbolicDim root = GetRootSymbolicDim(sym); - Attribute rootSymbol = - StrAttribute::get(m_->ir_context(), root.getSymName()); - newAttrs.push_back(rootSymbol); - } - return ArrayAttribute::get(m_->ir_context(), newAttrs); - }; - - // TODO(liujinnan): update attributes attached in DenseTensorType - for (auto op : *(m_.block())) { - if (!op->HasAttribute(SymbolicDim::getSymbolicDimAttrName())) continue; - auto attrs = - op->attribute(SymbolicDim::getSymbolicDimAttrName()); - auto symbolicShapeAttr = updateAttrs(attrs, [&](const std::string& name) { - return symbolTable_.Lookup(name); - }); - op->set_attribute(SymbolicDim::getSymbolicDimAttrName(), symbolicShapeAttr); - } - if (!UpdateProductEqualityMap()) { - return false; - } - std::unordered_set usedSymbolicOps; - std::vector usedSymbolNames; - // TODO(liujinnan): collect uses in value. 
- auto collectUsedSymbols = [&](ArrayAttribute attrs) { - for (Attribute attr : attrs.AsVector()) { - auto sym = symbolTable_.Lookup( - attr.dyn_cast().AsString()); - assert(sym); - if (usedSymbolicOps.insert(sym).second) - usedSymbolNames.push_back(sym.getSymName()); - } - }; - for (auto op : *(m_.block())) { - if (!op->HasAttribute(SymbolicDim::getSymbolicDimAttrName())) continue; - auto attrs = - op->attribute(SymbolicDim::getSymbolicDimAttrName()); - collectUsedSymbols(attrs); - } - auto funcOp = symbolTable_.getOp()->dyn_cast(); - assert(funcOp); - for (auto& p : symbolDimUnionSet_) { - if (!usedSymbolicOps.count(p.first)) { - funcOp.block()->erase(*(p.first.operation())); - } - } - - std::vector candidates; - for (auto& outter : productEqualityMap_) { - if (std::any_of( - outter.first.symbols.begin(), - outter.first.symbols.end(), - [&](SymbolicDim sym) { return usedSymbolicOps.count(sym) == 0; })) - candidates.push_back(outter.first); - } - - for (auto& prod : candidates) productEqualityMap_.erase(prod); - for (auto& outter : productEqualityMap_) { - std::vector candidates; - for (auto& inner : outter.second) { - if (std::any_of( - inner.first.symbols.begin(), - inner.first.symbols.end(), - [&](SymbolicDim sym) { return usedSymbolicOps.count(sym) == 0; })) - candidates.push_back(outter.first); - } - for (auto& prod : candidates) outter.second.erase(prod); - } - - std::sort(usedSymbolNames.begin(), - usedSymbolNames.end(), - [&](const std::string& lhs, const std::string& rhs) { - return CompareSymbolicDimNames(lhs, rhs); - }); - int numNonConstDims = 0; - std::unordered_map nameMapping; - for (const auto& name : usedSymbolNames) { - if (name.size() > 0 && name[0] == 'C') { - nameMapping[name] = name; - } else { - nameMapping[name] = ("S" + std::to_string(numNonConstDims++)); - } - } - - std::unordered_map name2Symbol; - for (SymbolicDim op : usedSymbolicOps) { - auto name = op.getSymName(); - op.updateSymName(nameMapping[name]); - name2Symbol[name] = op; - } - - for (auto op : *(m_.block())) { - if (!op->HasAttribute(SymbolicDim::getSymbolicDimAttrName())) continue; - auto attrs = - op->attribute(SymbolicDim::getSymbolicDimAttrName()); - auto symbolicShapeAttr = updateAttrs( - attrs, [&](const std::string& name) { return name2Symbol[name]; }); - op->set_attribute(SymbolicDim::getSymbolicDimAttrName(), symbolicShapeAttr); - } - - // TODO(liujinnan): update attributes attached to values. 
- - return SaveShapeConstraintGraph(); -} - -bool SymbolicDimMgr::SaveShapeConstraintGraph() { - auto funcOp = symbolTable_.getOp()->dyn_cast(); - assert(funcOp); - auto op_it = funcOp.block()->rbegin(); - while (op_it != funcOp.block()->rend()) { - if (((*op_it)->isa()) || - ((*op_it)->isa())) - op_it++; - else - op_it = decltype(op_it)(funcOp.block()->erase(*(*op_it))); - } - - Builder builder = Builder(m_->ir_context(), funcOp.block()); - auto build_operands = [&](const SymbolicDimProduct& prod) { - std::vector values; - - if (prod.factor != 1) { - values.push_back( - builder - .Build( - Int32Attribute::get(m_->ir_context(), prod.factor), - Int32Type::get(m_->ir_context())) - ->result(0)); - } - for (SymbolicDim sym : prod.symbols) { - values.push_back(builder.Build(sym.getSymName()).out()); - } - return values; - }; - std::vector sortedProductVec; - for (auto& p : productEqualityMap_) sortedProductVec.push_back(p.first); - std::sort(sortedProductVec.begin(), - sortedProductVec.end(), - CompareSymbolicDimProduct); - for (auto& x : sortedProductVec) { - for (auto& y : sortedProductVec) { - if (!CompareSymbolicDimProduct(x, y)) continue; - if (!productEqualityMap_[x][y]) continue; - auto lhsOperands = build_operands(x); - auto rhsOperands = build_operands(y); - builder.Build(lhsOperands, rhsOperands); - } - } - return true; -} - bool ShapeAnalysis::IsSameNumElements(Value lhs, Value rhs) { if (lhs == rhs) return true; - auto lhsTy = lhs.type().dyn_cast(); - auto rhsTy = rhs.type().dyn_cast(); + auto lhs_type = lhs.type().dyn_cast(); + auto rhs_type = rhs.type().dyn_cast(); - if (!lhsTy || !rhsTy || !lhsTy.HasRank() || !rhsTy.HasRank()) return false; + if (!lhs_type || !rhs_type || !lhs_type.HasRank() || !rhs_type.HasRank()) + return false; - return IsProductEqual(lhs, 0, lhsTy.GetRank(), rhs, 0, rhsTy.GetRank()); + return IsProductEqual(lhs, + 0, + static_cast(lhs_type.GetRank()), + rhs, + 0, + static_cast(rhs_type.GetRank())); } bool ShapeAnalysis::IsProductEqual( - Value lhs, int lhsFrom, int lhsTo, Value rhs, int rhsFrom, int rhsTo) { - std::vector lhsDimIdxs, rhsDimIdxs; - lhsDimIdxs.reserve(lhsTo - lhsFrom); - rhsDimIdxs.reserve(rhsTo - rhsFrom); - for (int i = lhsFrom; i < lhsTo; ++i) lhsDimIdxs.push_back(i); - for (int i = rhsFrom; i < rhsTo; ++i) rhsDimIdxs.push_back(i); + Value lhs, int lhs_from, int lhs_to, Value rhs, int rhs_from, int rhs_to) { + std::vector lhs_dim_idxs, rhs_dim_idxs; + + lhs_dim_idxs.reserve(lhs_to - lhs_from); + rhs_dim_idxs.reserve(rhs_to - rhs_from); - return IsProductEqual(lhs, lhsDimIdxs, rhs, rhsDimIdxs); + for (int i = lhs_from; i < lhs_to; ++i) lhs_dim_idxs.push_back(i); + for (int i = rhs_from; i < rhs_to; ++i) rhs_dim_idxs.push_back(i); + + return IsProductEqual(lhs, lhs_dim_idxs, rhs, rhs_dim_idxs); } -SymbolicDimShapeAnalysis::SymbolicDimShapeAnalysis(ModuleOp m) +ShapeConstraintIRAnalysis::ShapeConstraintIRAnalysis(ModuleOp m) : m_(m), mgr_(m) { mgr_.Load(); for (auto op : *(m_.block())) { - auto tieShapeOp = op->dyn_cast(); - if (!tieShapeOp) continue; - Value result = tieShapeOp.value(); - auto& symbols = value2SymDims_[result]; + auto tie_shape_op = op->dyn_cast(); + if (!tie_shape_op) continue; + Value result = tie_shape_op.value(); + auto& symbols = value_to_sym_dims_[result]; auto attrs = - tieShapeOp - .attribute(SymbolicDim::getSymbolicDimAttrName()) + tie_shape_op + .attribute(SymbolicDim::GetSymbolicDimAttrName()) .AsVector(); for (const auto& attr : attrs) { - auto symOp = mgr_.symbolTable().Lookup( + auto sym_op = 
mgr_.symbolTable().Lookup( attr.dyn_cast().AsString()); - if (!symOp) continue; - symbols.push_back(symOp); + if (!sym_op) continue; + symbols.push_back(sym_op); } } } -SymbolicDimShapeAnalysis::~SymbolicDimShapeAnalysis() { mgr_.Save(); } +ShapeConstraintIRAnalysis::~ShapeConstraintIRAnalysis() { mgr_.Save(); } -bool SymbolicDimShapeAnalysis::IsShapeEqual(Value lhs, Value rhs) { +bool ShapeConstraintIRAnalysis::IsShapeEqual(Value lhs, Value rhs) { if (lhs == rhs) return true; - auto lhsTy = lhs.type().dyn_cast(); - auto rhsTy = rhs.type().dyn_cast(); + auto lhs_type = lhs.type().dyn_cast(); + auto rhs_type = rhs.type().dyn_cast(); - if (!lhsTy || !rhsTy || !lhsTy.HasRank() || !rhsTy.HasRank()) return false; + if (!lhs_type || !rhs_type || !lhs_type.HasRank() || !rhs_type.HasRank()) + return false; - if (lhsTy.HasStaticShape() && rhsTy.HasStaticShape()) { - return vectorize(lhsTy.GetShape()) == vectorize(rhsTy.GetShape()); + if (lhs_type.HasStaticShape() && rhs_type.HasStaticShape()) { + return vectorize(lhs_type.GetShape()) == vectorize(rhs_type.GetShape()); } - auto lhsIt = value2SymDims_.find(lhs); - auto rhsIt = value2SymDims_.find(rhs); + auto lhs_it = value_to_sym_dims_.find(lhs); + auto rhs_it = value_to_sym_dims_.find(rhs); - if (lhsIt == value2SymDims_.end() || rhsIt == value2SymDims_.end() || - lhsIt->second.size() != rhsIt->second.size()) + if (lhs_it == value_to_sym_dims_.end() || + rhs_it == value_to_sym_dims_.end() || + lhs_it->second.size() != rhs_it->second.size()) return false; - std::vector lhsSyms; - std::vector rhsSyms; - for (auto sym : lhsIt->second) { - lhsSyms.push_back(mgr_.GetRootSymbolicDim(sym)); + std::vector lhs_syms; + std::vector rhs_syms; + for (auto sym : lhs_it->second) { + lhs_syms.push_back(mgr_.GetRootSymbolicDim(sym)); } - for (auto sym : rhsIt->second) { - rhsSyms.push_back(mgr_.GetRootSymbolicDim(sym)); + for (auto sym : rhs_it->second) { + rhs_syms.push_back(mgr_.GetRootSymbolicDim(sym)); } - return lhsSyms == rhsSyms; + return lhs_syms == rhs_syms; } -bool SymbolicDimShapeAnalysis::IsProductEqual(Value lhs, - std::vector lhsDimIdxs, - Value rhs, - std::vector rhsDimIdxs) { - SymbolicDimProduct lhsProd; - SymbolicDimProduct rhsProd; +bool ShapeConstraintIRAnalysis::IsProductEqual(Value lhs, + std::vector lhs_dim_idxs, + Value rhs, + std::vector rhs_dim_idxs) { + SymbolicDimProduct lhs_prod; + SymbolicDimProduct rhs_prod; - auto buildSymbolicDimProduct = - [&](SymbolicDimProduct& prod, Value value, std::vector dimIdxs) { - auto ty = value.type().dyn_cast(); - auto it = value2SymDims_.find(value); - if (!ty || !ty.HasRank()) return false; - for (int idx : dimIdxs) { - if (ty.GetShape()[idx] == ShapedTypeInterface::kDynamic) { - if (it == value2SymDims_.end() || + auto build_symbolic_dim_product = + [&](SymbolicDimProduct& prod, Value value, std::vector dim_idxs) { + auto type = value.type().dyn_cast(); + auto it = value_to_sym_dims_.find(value); + if (!type || !type.HasRank()) return false; + for (int idx : dim_idxs) { + if (type.GetShape()[idx] == ShapedTypeInterface::kDynamic) { + if (it == value_to_sym_dims_.end() || static_cast(it->second.size()) <= idx) return false; prod.symbols.push_back(it->second[idx]); } else { - prod.factor *= ty.GetShape()[idx]; + prod.factor *= type.GetShape()[idx]; } } return true; }; - if (!buildSymbolicDimProduct(lhsProd, lhs, lhsDimIdxs) || - !buildSymbolicDimProduct(rhsProd, rhs, rhsDimIdxs)) { + if (!build_symbolic_dim_product(lhs_prod, lhs, lhs_dim_idxs) || + !build_symbolic_dim_product(rhs_prod, rhs, 
rhs_dim_idxs)) { return false; } - return mgr_.IsSymbolicDimProductEqual(lhsProd, rhsProd); -} - -ShapeComputationIRAnalysis::ShapeComputationIRAnalysis(ModuleOp m, - SymbolicDimMgr& mgr) - : m_(m), mgr_(mgr) {} - -bool ShapeComputationIRAnalysis::Run() { - // Make sure only run once. - if (initialized_) return false; - initialized_ = true; - auto buildShapeFunc = - std::bind(&ShapeComputationIRAnalysis::BuildShapeOnOperation, - this, - std::placeholders::_1); - if (!RunOnRegion(&(m_->region(0)), buildShapeFunc)) return false; - auto applyOpConstraintFunc = - std::bind(&ShapeComputationIRAnalysis::ApplyOpConstraint, - this, - std::placeholders::_1); - if (!RunOnRegion(&(m_->region(0)), applyOpConstraintFunc)) return false; - return true; -} - -bool ShapeComputationIRAnalysis::RunOnRegion(Region* region, func fn) { - for (Block* block : *region) { - if (!RunOnBlock(block, fn)) return false; - } - return true; -} - -bool ShapeComputationIRAnalysis::RunOnBlock(Block* block, func fn) { - // TODO(liujinnan): mapping block arguments - - std::vector op_list; - for (Operation* op : *block) op_list.push_back(op); - for (Operation* op : op_list) { - if (!RunOnOperation(op, fn)) return false; - } - return true; -} - -bool ShapeComputationIRAnalysis::RunOnOperation(Operation* op, func fn) { - for (size_t i = 0; i < op->num_regions(); ++i) { - if (!RunOnRegion(&(op->region(i)), fn)) return false; - } - return fn(op); -} - -bool ShapeComputationIRAnalysis::BuildShapeOnOperation(Operation* op) { - if (op->isa()) return true; - if (op->isa()) { - Value value = op->operand_source(0); - std::vector symbols; - if (op->HasAttribute(SymbolicDim::getSymbolicDimAttrName())) { - auto attrs = - op->attribute(SymbolicDim::getSymbolicDimAttrName()) - .AsVector(); - for (Attribute attr : attrs) { - auto sym = mgr_.symbolTable().Lookup( - attr.dyn_cast().AsString()); - assert(sym); - SymbolicDim root = mgr_.GetRootSymbolicDim(sym); - symbols.push_back(root); - } - } else { - symbols = mgr_.CreateSymbolicDimsForRankedValue(value); - std::vector attrs; - for (SymbolicDim sym : symbols) { - Attribute rootSymbol = - StrAttribute::get(m_->ir_context(), sym.getSymName()); - attrs.push_back(rootSymbol); - } - op->set_attribute(SymbolicDim::getSymbolicDimAttrName(), - ArrayAttribute::get(m_->ir_context(), attrs)); - } - rankedTensor2SymDims_[value] = std::move(symbols); - return true; - } - for (size_t i = 0; i < op->num_results(); ++i) { - if (!BuildShapeOnValue(op->result(i))) return false; - } - return true; -} - -bool ShapeComputationIRAnalysis::BuildShapeOnValue(Value value) { - Type ty = value.type(); - if (IsIntOrIndex(ty)) { - SymbolicDim sym = mgr_.NewSymbolicDim(); - value2SymDim_[value] = sym; - } else if (IsCandidateShapeTensorType(ty)) { - auto shapedTy = ty.dyn_cast(); - std::vector symbols; - for (size_t i = 0, d = shapedTy.GetShape()[0]; i < d; ++i) - symbols.push_back(mgr_.NewSymbolicDim()); - shapeTensor2SymDims_[value] = std::move(symbols); - } - return true; -} - -bool ShapeComputationIRAnalysis::ApplyOpConstraint(Operation* op) { - IR_ENFORCE(ApplyIndexOpConstraint(op), - "Fail to apply constraint for index op"); - IR_ENFORCE(ApplyTieShapeOpConstraint(op), - "Fail to apply constraint for tie_shape op"); - - // TODO(zhangbo63): add more constraints - return true; -} - -bool ShapeComputationIRAnalysis::ApplyIndexOpConstraint(Operation* op) { - if (op->num_results() == 0) return true; - - Type ty = op->result(0).type(); - if (!IsIntOrIndex(ty)) return true; - - if (auto dimOp = op->dyn_cast()) { - int64_t 
dimIndex = dimOp.index() - .dyn_cast() - .owner() - ->attribute("value") - .data(); - value2SymDim_[dimOp.out()].updateKnownNonNegative(true); - if (!mgr_.MapSymbolicDimEqual( - value2SymDim_[dimOp.out()], - rankedTensor2SymDims_[dimOp.source()][dimIndex])) { - return false; - } - - } else if (auto constOp = op->dyn_cast()) { - int64_t val = constOp.value().dyn_cast().data(); - if (!mgr_.MapSymbolicDimEqual(value2SymDim_[op->result(0)], - mgr_.NewConstantSymbolicDim(val))) { - return false; - } - } - // TODO(zhangbo63): add support for reifyInferShape. (e.g. mul/add) - return true; -} - -bool ShapeComputationIRAnalysis::ApplyTieShapeOpConstraint(Operation* op) { - if (auto tieShape = op->dyn_cast()) { - auto& value = rankedTensor2SymDims_[op->operand_source(0)]; - for (size_t idx = 0; idx < tieShape.dims().size(); ++idx) { - if (!mgr_.MapSymbolicDimEqual(value2SymDim_[tieShape.dims()[idx]], - value[idx])) - return false; - mgr_.GetRootSymbolicDim(value[idx]).updateKnownNonNegative(true); - } - } - return true; -} - -bool IsIntOrIndex(Type type) { - return type.isa() || type.isa() || - type.isa() || type.isa() || - type.isa() || type.isa(); -} - -bool IsCandidateShapeTensorType(Type ty) { - if (auto tensorTy = ty.dyn_cast()) { - auto shapedTy = tensorTy.dyn_cast(); - return (shapedTy.GetRank() == 1 && shapedTy.HasStaticShape() && - IsIntOrIndex(shapedTy.GetElementType()) && - shapedTy.GetShape()[0] < 32); - } - return false; + return mgr_.IsSymbolicDimProductEqual(lhs_prod, rhs_prod); } } // namespace pir diff --git a/paddle/pir/dialect/shape/utils/shape_utils.h b/paddle/pir/dialect/shape/utils/shape_utils.h index bb6dd58cebb26a..0842313962d36b 100644 --- a/paddle/pir/dialect/shape/utils/shape_utils.h +++ b/paddle/pir/dialect/shape/utils/shape_utils.h @@ -14,211 +14,70 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include "paddle/pir/core/builtin_attribute.h" -#include "paddle/pir/core/builtin_op.h" -#include "paddle/pir/core/builtin_type_interfaces.h" -#include "paddle/pir/core/utils.h" -#include "paddle/pir/dialect/shape/ir/shape_op.h" +#include "paddle/pir/dialect/shape/utils/shape_optimization_utils.h" +#include "paddle/pir/dialect/shape/utils/symbol_table.h" namespace pir { -using dialect::SymbolicDim; - -struct SymbolicDimProduct { - std::vector symbols; - int64_t factor = 1; - bool empty() { return factor == 1 && symbols.empty(); } - friend inline bool operator==(const SymbolicDimProduct& lhs, - const SymbolicDimProduct& rhs) { - return lhs.factor == rhs.factor && lhs.symbols == rhs.symbols; - } - - friend inline bool operator!=(const SymbolicDimProduct& lhs, - const SymbolicDimProduct& rhs) { - return !(lhs == rhs); - } -}; - -class SymbolTable { - public: - explicit SymbolTable(Operation* symbolTableOp) - : symbolTableOp_(symbolTableOp) {} - SymbolTable() = default; - template - typename std::enable_if::value, - SymbolicDim>::type - Lookup(const std::string& name) const { - auto it = symbolTableMap_.find(name); - return it != symbolTableMap_.end() ? 
it->second->dyn_cast() - : SymbolicDim(nullptr); - } - template - typename std::enable_if::value, - std::vector>::type - Lookup(const std::string& name) const { - std::vector res; - auto it = symbolFuncMap_.find(name); - if (it != symbolFuncMap_.end()) { - for (auto& p : it->second) { - res.push_back(p->dyn_cast()); - } - } - return res; - } - - const std::string insert(Operation* symbol); - Operation* getOp() const { return symbolTableOp_; } - - private: - Operation* symbolTableOp_; - std::unordered_map symbolTableMap_; - std::unordered_map> symbolFuncMap_; -}; - -struct SymDimHasher { - size_t operator()(const dialect::SymbolicDim& symbol) const noexcept { - return std::hash{}(symbol.operation()); - } -}; - -struct SymProductHasher { - size_t operator()(const SymbolicDimProduct& symProd) const noexcept { - size_t hash = std::hash{}(symProd.symbols.size()); - for (auto& symbol : symProd.symbols) { - hash = hash_combine(hash, SymDimHasher{}(symbol)); // NOLINT - } - hash = hash_combine(hash, std::hash{}(symProd.factor)); - return hash; - } -}; - -class SymbolicDimMgr { - public: - explicit SymbolicDimMgr(ModuleOp m); - bool Load(); - SymbolicDim NewSymbolicDim(const std::string& name = {}); - SymbolicDim NewConstantSymbolicDim(int64_t val); - std::vector CreateSymbolicDimsForRankedValue(Value value); - SymbolicDim GetRootSymbolicDim(SymbolicDim symbol); - bool IsSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs); - SymbolTable& symbolTable() { return symbolTable_; } - bool MapSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs); - SymbolicDimProduct SimplifySymbolicDimProduct(const SymbolicDimProduct& x); - std::pair - SimplifySymbolicDimProductPair(const SymbolicDimProduct& x, - const SymbolicDimProduct& y); - SymbolicDimProduct* SymbolicDimProductDivide(const SymbolicDimProduct& x, - const SymbolicDimProduct& y); - - bool Save(); - - bool IsSymbolicDimProductEqual(const SymbolicDimProduct& lhs, - const SymbolicDimProduct& rhs); - bool MapSymbolicDimProductEqual(const SymbolicDimProduct& lhs, - const SymbolicDimProduct& rhs); - - private: - const std::string GetNextName(); - bool UpdateProductEqualityMap(); - bool IsMultipleOfKnownSymbolicDimProductEqualPair( - const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs); - bool SaveShapeConstraintGraph(); - bool LoadShapeConstraintGraph(); - - private: - ModuleOp m_; - - SymbolTable symbolTable_; - - int64_t nextSymbolicIdx_ = 0; - - std::unordered_set symbolNameSet_; - - std::unordered_map symbolDimUnionSet_; - - std::unordered_map constantSymbolicDimMap_; - - // productEqualityMap_[A][B] == true : Product[A] == Product[B] - using SymbolicDimProductMap = std::unordered_map< - SymbolicDimProduct, - std::unordered_map, - SymProductHasher>; - SymbolicDimProductMap productEqualityMap_; - bool productEqualityMapUpdated_ = true; -}; - +// Helper class to query and manipulate shape constraint IR on buffer level. class ShapeAnalysis { public: virtual ~ShapeAnalysis() = default; + // Returns true if the two values have the same symbolic shape. virtual bool IsShapeEqual(Value lhs, Value rhs) = 0; + // Suppose: + // lhs_dim_idxs = {ld0, ld1, ...} + // rhs_dim_idxs = {rd0, rd1, ...} + // Returns true if: + // lhs.shape[ld0] * lhs.shape[ld1] * ... == + // rhs.shape[rd0] * rhs.shape[rd1] * ... virtual bool IsProductEqual(Value lhs, - std::vector lhsDimIdxs, + std::vector lhs_dim_idxs, Value rhs, - std::vector rhsDimIdxs) = 0; + std::vector rhs_dim_idxs) = 0; + + // Returns true if: + // lhs.shape[lhs_from] * ...
lhs.shape[lhs_to-1] == + // rhs.shape[rhs_from] * ... rhs.shape[rhs_to-1] virtual bool IsProductEqual( - Value lhs, int lhsFrom, int lhsTo, Value rhs, int rhsFrom, int rhsTo); + Value lhs, int lhs_from, int lhs_to, Value rhs, int rhs_from, int rhs_to); + + // Returns true if the two values have the same number of elements. virtual bool IsSameNumElements(Value lhs, Value rhs); }; -class SymbolicDimShapeAnalysis : public ShapeAnalysis { +// A subclass to implement `ShapeAnalysis` on buffer level. +// The implementation is based on shape constraint ir. +class ShapeConstraintIRAnalysis : public ShapeAnalysis { public: - explicit SymbolicDimShapeAnalysis(ModuleOp m); - ~SymbolicDimShapeAnalysis(); + explicit ShapeConstraintIRAnalysis(ModuleOp m); + // Auto-saves the updated shape constraint IR on destruction. + ~ShapeConstraintIRAnalysis(); + + // Returns the `SymbolicDimMgr` this object holds. SymbolicDimMgr& symbolicDimMgr() { return mgr_; } const SymbolicDimMgr& symbolicDimMgr() const { return mgr_; } + + // Returns true if the two values have the same symbolic shape. bool IsShapeEqual(Value lhs, Value rhs) override; bool IsProductEqual(Value lhs, - std::vector lhsDimIdxs, + std::vector lhs_dim_idxs, Value rhs, - std::vector rhsDimIdxs) override; + std::vector rhs_dim_idxs) override; private: + // The operation this analysis runs on. ModuleOp m_; + // The `SymbolicDimMgr` this analysis holds. SymbolicDimMgr mgr_; - std::unordered_map> value2SymDims_; -}; - -class ShapeComputationIRAnalysis { - public: - using func = std::function; - explicit ShapeComputationIRAnalysis(ModuleOp m, - SymbolicDimMgr& mgr); // NOLINT - bool Run(); - - private: - bool RunOnRegion(Region* region, func fn); - bool RunOnBlock(Block* block, func fn); - bool RunOnOperation(Operation* op, func fn); - - bool BuildShapeOnOperation(Operation* op); - bool BuildShapeOnValue(Value value); - - bool ApplyOpConstraint(Operation* op); - bool ApplyIndexOpConstraint(Operation* op); - bool ApplyTieShapeOpConstraint(Operation* op); - - bool initialized_ = false; - ModuleOp m_; - SymbolicDimMgr& mgr_; - - std::unordered_map value2SymDim_; - - // shape tensor is the 1D ranked tensor with int/index dtype. - std::unordered_map> shapeTensor2SymDims_; - - std::unordered_map> rankedTensor2SymDims_; + // Map a ranked memref value to an array of symbolicDims, each representing + // one dimension size of the memref value. + std::unordered_map> + value_to_sym_dims_; }; -bool IsIntOrIndex(Type type); -bool IsCandidateShapeTensorType(Type ty); } // namespace pir diff --git a/paddle/pir/dialect/shape/utils/symbol_table.cc b/paddle/pir/dialect/shape/utils/symbol_table.cc new file mode 100644 index 00000000000000..c4ed0807b0b43b --- /dev/null +++ b/paddle/pir/dialect/shape/utils/symbol_table.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "paddle/pir/dialect/shape/utils/symbol_table.h" + +namespace pir { + +const std::string SymbolTable::insert(Operation* symbol) { + std::string name; + if (symbol->isa()) { + name = symbol->dyn_cast().GetSymName(); + symbol_table_map_.insert({name, symbol}); + } + + // TODO(liujinnan): add more constraint_func name branch. + if (symbol->isa()) { + name = "tie_product_equal"; + symbol_func_map_[name].emplace_back(symbol); + } + + return name; +} +} // namespace pir diff --git a/paddle/pir/dialect/shape/utils/symbol_table.h b/paddle/pir/dialect/shape/utils/symbol_table.h new file mode 100644 index 00000000000000..f85ba2cfb8099f --- /dev/null +++ b/paddle/pir/dialect/shape/utils/symbol_table.h @@ -0,0 +1,68 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/builtin_type_interfaces.h" +#include "paddle/pir/core/utils.h" +#include "paddle/pir/dialect/shape/ir/shape_op.h" + +namespace pir { + +using dialect::SymbolicDim; +class SymbolTable { + public: + explicit SymbolTable(Operation* symbol_table_op) + : symbol_table_op_(symbol_table_op) {} + SymbolTable() = default; + template + typename std::enable_if::value, + SymbolicDim>::type + Lookup(const std::string& name) const { + auto it = symbol_table_map_.find(name); + return it != symbol_table_map_.end() ? it->second->dyn_cast() + : SymbolicDim(nullptr); + } + template + typename std::enable_if::value, + std::vector>::type + Lookup(const std::string& name) const { + std::vector res; + auto it = symbol_func_map_.find(name); + if (it != symbol_func_map_.end()) { + for (auto& p : it->second) { + res.push_back(p->dyn_cast()); + } + } + return res; + } + + const std::string insert(Operation* symbol); + Operation* getOp() const { return symbol_table_op_; } + + private: + Operation* symbol_table_op_; + std::unordered_map symbol_table_map_; + std::unordered_map> symbol_func_map_; +}; + +} // namespace pir diff --git a/paddle/pir/pass/ir_printing.cc b/paddle/pir/pass/ir_printing.cc index 6171b71c090fcf..901c8bdd89da78 100644 --- a/paddle/pir/pass/ir_printing.cc +++ b/paddle/pir/pass/ir_printing.cc @@ -31,12 +31,8 @@ void PrintIR(Operation *op, bool print_module, std::ostream &os) { return; } - // Find the top-level operation. 
- auto *top_op = op; - while (auto *parent_op = top_op->GetParentOp()) { - top_op = parent_op; - } - top_op->Print(os); + auto *program = op->GetParentProgram(); + program->Print(os); } } // namespace diff --git a/paddle/pir/pass/pass.h b/paddle/pir/pass/pass.h index f916fcbb1e3542..772b9192c6cfba 100644 --- a/paddle/pir/pass/pass.h +++ b/paddle/pir/pass/pass.h @@ -18,10 +18,8 @@ #include #include -#include "paddle/phi/core/enforce.h" #include "paddle/pir/core/enforce.h" #include "paddle/pir/pass/analysis_manager.h" -#include "paddle/pir/pass/pass_registry.h" namespace pir { diff --git a/paddle/pir/pass/pass_manager.h b/paddle/pir/pass/pass_manager.h index f606be139c42f2..92faed24f1f5d2 100644 --- a/paddle/pir/pass/pass_manager.h +++ b/paddle/pir/pass/pass_manager.h @@ -20,13 +20,13 @@ #include #include "paddle/pir/core/program.h" +#include "paddle/pir/pass/pass.h" namespace pir { class IrContext; class Operation; class Program; -class Pass; class PassInstrumentation; class PassInstrumentor; diff --git a/paddle/pir/pass/pass_registry.h b/paddle/pir/pass/pass_registry.h index 71140810b0324a..01887f74879f08 100644 --- a/paddle/pir/pass/pass_registry.h +++ b/paddle/pir/pass/pass_registry.h @@ -21,9 +21,8 @@ #include "paddle/pir/core/enforce.h" #include "paddle/pir/core/macros.h" #include "paddle/pir/pass/pass.h" -namespace pir { -class Pass; +namespace pir { using PassCreator = std::function()>; diff --git a/paddle/pir/pattern_rewrite/pattern_match.cc b/paddle/pir/pattern_rewrite/pattern_match.cc index eccaf66cca9ce1..028d0779dbf94f 100644 --- a/paddle/pir/pattern_rewrite/pattern_match.cc +++ b/paddle/pir/pattern_rewrite/pattern_match.cc @@ -29,7 +29,7 @@ Pattern::Pattern(const std::string& root_name, PatternBenefit benefit, IrContext* context, const std::vector& generated_names) - : Pattern(context->GetRegisteredOpInfo(root_name).AsOpaquePointer(), + : Pattern(context->GetRegisteredOpInfo(root_name), RootKind::OperationInfo, generated_names, benefit, @@ -46,7 +46,7 @@ Pattern::Pattern(MatchInterfaceOpTypeTag tag, PatternBenefit benefit, IrContext* context, const std::vector& generated_names) - : Pattern(interface_id.AsOpaquePointer(), + : Pattern(interface_id, RootKind::InterfaceId, generated_names, benefit, @@ -57,11 +57,7 @@ Pattern::Pattern(MatchTraitOpTypeTag tag, PatternBenefit benefit, IrContext* context, const std::vector& generated_names) - : Pattern(trait_id.AsOpaquePointer(), - RootKind::TraitId, - generated_names, - benefit, - context) {} + : Pattern(trait_id, RootKind::TraitId, generated_names, benefit, context) {} Pattern::Pattern(void* root_val, RootKind root_kind, diff --git a/paddle/pir/pattern_rewrite/pattern_match.h b/paddle/pir/pattern_rewrite/pattern_match.h index 0a91c226c519bd..9e7553f4217cae 100644 --- a/paddle/pir/pattern_rewrite/pattern_match.h +++ b/paddle/pir/pattern_rewrite/pattern_match.h @@ -76,19 +76,19 @@ class IR_API Pattern { std::optional root_kind() const { if (root_kind_ == RootKind::OperationInfo) - return OpInfo::RecoverFromOpaquePointer(root_val_); + return OpInfo::RecoverFromVoidPointer(root_val_); return std::nullopt; } std::optional GetRootInterfaceID() const { if (root_kind_ == RootKind::InterfaceId) - return TypeId::RecoverFromOpaquePointer(root_val_); + return TypeId::RecoverFromVoidPointer(root_val_); return std::nullopt; } std::optional GetRootTraitID() const { if (root_kind_ == RootKind::TraitId) - return TypeId::RecoverFromOpaquePointer(root_val_); + return TypeId::RecoverFromVoidPointer(root_val_); return std::nullopt; } diff --git 
a/paddle/pir/pattern_rewrite/pattern_rewrite_driver.cc b/paddle/pir/pattern_rewrite/pattern_rewrite_driver.cc index 00d6cb2f4d3064..ff75f86d6da55a 100644 --- a/paddle/pir/pattern_rewrite/pattern_rewrite_driver.cc +++ b/paddle/pir/pattern_rewrite/pattern_rewrite_driver.cc @@ -131,6 +131,7 @@ class GreedyPatternRewriteDriver : public pir::PatternRewriter { for (uint32_t i = 0; i < op->num_operands(); ++i) { AddOperandToWorklist(op->operand_source(i)); } + if (op->num_regions() == 0) { RemoveFromWorklist(op); } else { diff --git a/paddle/pir/transforms/dead_code_elimination_pass.cc b/paddle/pir/transforms/dead_code_elimination_pass.cc index 6216fca5037e1d..bca3394d1c55d8 100644 --- a/paddle/pir/transforms/dead_code_elimination_pass.cc +++ b/paddle/pir/transforms/dead_code_elimination_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/pir/transforms/dead_code_elimination_pass.h" + #include "paddle/pir/core/builtin_op.h" #include "paddle/pir/core/program.h" #include "paddle/pir/pass/pass.h" @@ -25,11 +26,12 @@ namespace { // Now just a naive implementation. class DeadCodeEliminationPass : public pir::Pass { public: - DeadCodeEliminationPass() : pir::Pass("dead_code_elimination", 0) {} + DeadCodeEliminationPass() : pir::Pass("dead_code_elimination_pass", 0) {} void Run(pir::Operation *op) override { auto module_op = op->dyn_cast(); - IR_ENFORCE(module_op, "DcePass should run on module op."); + IR_ENFORCE(module_op, + "dead_code_elimination_pass should run on module op."); auto *block = module_op.block(); std::vector erased_op; for (auto &op : *block) { @@ -76,4 +78,4 @@ std::unique_ptr CreateDeadCodeEliminationPass() { } // namespace pir -REGISTER_IR_PASS(dead_code_elimination, DeadCodeEliminationPass); +REGISTER_IR_PASS(dead_code_elimination_pass, DeadCodeEliminationPass); diff --git a/paddle/pir/transforms/reorder_block_ops_pass.cc b/paddle/pir/transforms/reorder_block_ops_pass.cc index db2d29fe9b0a73..0e25cc5f180ba9 100644 --- a/paddle/pir/transforms/reorder_block_ops_pass.cc +++ b/paddle/pir/transforms/reorder_block_ops_pass.cc @@ -24,11 +24,11 @@ namespace { class ReorderBlockOpsPass : public pir::Pass { public: - ReorderBlockOpsPass() : pir::Pass("ReorderBlockOpsPass", 0) {} + ReorderBlockOpsPass() : pir::Pass("reorder_block_ops_pass", 0) {} void Run(pir::Operation *op) override { IR_ENFORCE(op->num_regions() > 0, - "ReorderBlockOpsPass should run on Operation which regions " + "reorder_block_ops_pass should run on Operation which regions " "number greater than 0."); for (size_t i = 0; i < op->num_regions(); ++i) { for (auto *block : op->region(i)) { diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 7228d97fa46c74..61adae7128ff96 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -932,6 +932,95 @@ set -ex fi } +function check_run_sot_ci() { + set +x + # use "git commit -m 'message, test=sot'" to force ci to run + COMMIT_RUN_CI=$(git log -1 --pretty=format:"%s" | grep -w "test=sot" || true) + # check pr title + TITLE_RUN_CI=$(curl -s https://github.com/PaddlePaddle/Paddle/pull/${GIT_PR_ID} | grep "" | grep -i "sot" || true) + if [[ ${COMMIT_RUN_CI} || ${TITLE_RUN_CI} ]]; then + set -x + return + fi + + # git diff + SOT_FILE_LIST=( + paddle/fluid/operators/run_program_op.h + paddle/fluid/operators/run_program_op.cu + paddle/fluid/operators/run_program_op.cc + paddle/fluid/eager/to_static + paddle/fluid/pybind/ + python/ + test/sot + ) + + run_sot_ut="OFF" + for change_file in $(git diff 
--name-only upstream/develop); + do + for sot_file in ${SOT_FILE_LIST[@]}; + do + if [[ ${change_file} =~ ^"${sot_file}".* ]]; then + echo "Detect change about SOT: " + echo "Changes related to the sot code were detected: " ${change_file} + run_sot_ut="ON" + break + fi + done + if [[ "ON" == ${run_sot_ut} ]]; then + break + fi + done + + if [[ "OFF" == ${run_sot_ut} ]]; then + echo "No SOT-related changes were found" + echo "Skip SOT UT CI" + exit 0 + fi + set -x +} + +function run_sot_test() { + PY_VERSION=$1 + PYTHON_WITH_SPECIFY_VERSION=python$PY_VERSION + PY_VERSION_NO_DOT=`echo $PY_VERSION | sed 's/\.//g'` + + export STRICT_MODE=1 + export COST_MODEL=False + export MIN_GRAPH_SIZE=0 + export SOT_LOG_LEVEL=0 + + # Install PaddlePaddle + $PYTHON_WITH_SPECIFY_VERSION -m pip install ${PADDLE_ROOT}/dist/paddlepaddle-0.0.0-cp${PY_VERSION_NO_DOT}-cp${PY_VERSION_NO_DOT}-linux_x86_64.whl + # Install PaddleSOT + cd $PADDLE_ROOT/test/sot/ + + # Run unittest + failed_tests=() + + for file in ./test_*.py; do + # check file is python file + if [ -f "$file" ]; then + echo Running: PYTHONPATH=$PYTHONPATH " STRICT_MODE=1 python " $file + # run unittests + python_output=$($PYTHON_WITH_SPECIFY_VERSION $file 2>&1) + + if [ $? -ne 0 ]; then + echo "run $file failed" + failed_tests+=("$file") + echo -e "$python_output" + fi + fi + done + + if [ ${#failed_tests[@]} -ne 0 ]; then + echo "failed tests file:" + for failed_test in "${failed_tests[@]}"; do + echo "$failed_test" + done + exit 1 + fi +} + function get_precision_ut_mac() { on_precision=0 UT_list=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d') @@ -1317,6 +1406,8 @@ function get_quickly_disable_ut() { echo ${disable_ut_quickly} echo "=========================================" else + + exit 102 disable_ut_quickly='disable_ut' fi } @@ -2350,6 +2441,11 @@ set -x ut_endTime_s=`date +%s` echo "CINN testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s" if [[ "$EXIT_CODE" != "0" ]]; then + rm -f $tmp_dir/* + echo "Summary Failed Tests... " + echo "========================================" + echo "The following tests FAILED: " + echo "${failuretest}" | sort -u exit 8; fi fi @@ -3285,21 +3381,23 @@ function build_pr_and_develop() { mkdir ${PADDLE_ROOT}/build/dev_whl && wget -q -P ${PADDLE_ROOT}/build/dev_whl ${dev_url} cp ${PADDLE_ROOT}/build/dev_whl/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl ${PADDLE_ROOT}/build/python/dist else + cp -r ${PADDLE_ROOT}/build /tmp/ if [[ ${cmake_change} ]];then rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt ${PADDLE_ROOT}/build/build.ninja rm -rf ${PADDLE_ROOT}/build/third_party fi - git checkout -b develop_base_pr upstream/$BRANCH git submodule update --init run_setup ${PYTHON_ABI:-""} "rerun-cmake bdist_wheel" ${parallel_number} + rm -rf ${PADDLE_ROOT}/build + mv /tmp/build ${PADDLE_ROOT} if [ ! 
-d "${PADDLE_ROOT}/build/python/dist/" ]; then mkdir ${PADDLE_ROOT}/build/python/dist/ fi mv ${PADDLE_ROOT}/dist/*.whl ${PADDLE_ROOT}/build/python/dist/ mkdir ${PADDLE_ROOT}/build/dev_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/dev_whl fi - + generate_api_spec "$1" "DEV" } @@ -3555,7 +3653,7 @@ EOF export WITH_CUDNN_FRONTEND=${WITH_CUDNN_FRONTEND:-OFF} export WITH_SHARED_PHI=${WITH_SHARED_PHI:-OFF} export WITH_NVCC_LAZY=${WITH_NVCC_LAZY:-ON} - + if [ "$SYSTEM" == "Linux" ];then if [ `nproc` -gt 16 ];then parallel_number=$(expr `nproc` - 8) @@ -3568,7 +3666,7 @@ EOF if [ "$3" != "" ]; then parallel_number=$3 fi - + # reset ccache zero stats for collect PR's actual hit rate if [ "${MAX_JOBS}" == "" ]; then export MAX_JOBS=${parallel_number} @@ -3845,7 +3943,7 @@ EOF fi # ci will collect ccache hit rate collect_ccache_hits - + if [ "$build_error" != 0 ];then exit 7; fi @@ -4077,6 +4175,18 @@ function main() { test_cicheck_py37) run_linux_cpu_test ${PYTHON_ABI:-""} ${PROC_RUN:-1} ;; + cicheck_sot) + check_run_sot_ci + export WITH_SHARED_PHI=ON + PYTHON_VERSIONS=(3.8 3.9 3.10 3.11) + for PY_VERSION in ${PYTHON_VERSIONS[@]}; do + ln -sf $(which python${PY_VERSION}) /usr/local/bin/python + ln -sf $(which pip${PY_VERSION}) /usr/local/bin/pip + run_setup ${PYTHON_ABI:-""} bdist_wheel ${parallel_number} + run_sot_test $PY_VERSION + rm -rf ${PADDLE_ROOT}/build/CMakeCache.txt + done + ;; build_gpubox) run_setup ${PYTHON_ABI:-""} install ${parallel_number} ;; diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 667045eaebf97c..8e615f7a6cb114 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "gtest/gtest.h" -#include "paddle/fluid/framework/phi_utils.h" +#include "paddle/fluid/framework/init_default_kernel_signature_map.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/platform/init.h" #include "paddle/phi/core/flags.h" diff --git a/paddle/utils/flags_native_test.cc b/paddle/utils/flags_native_test.cc index 26ef8c12c18753..397072bf2914b7 100644 --- a/paddle/utils/flags_native_test.cc +++ b/paddle/utils/flags_native_test.cc @@ -52,8 +52,8 @@ TEST(flags_native_test, ParseCommandLineFlags) { std::string commandline = "test --paddle_test_int32=3 --paddle_test_uint32=\"4\" " "--paddle_test_string \"modified string\""; - int argc; - char** argv; + int argc = 0; + char** argv = nullptr; SplitCommandlineArg(commandline, &argc, &argv); // Parse commandline flags and check diff --git a/paddle/utils/pybind.h b/paddle/utils/pybind.h index 67927031594e0d..065cd49297ab4c 100644 --- a/paddle/utils/pybind.h +++ b/paddle/utils/pybind.h @@ -15,6 +15,9 @@ #pragma once #include "paddle/phi/api/include/tensor.h" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h" +#endif #include "paddle/utils/optional.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" @@ -74,8 +77,16 @@ struct type_caster<paddle::Tensor> { static handle cast(const paddle::Tensor& src, return_value_policy /* policy */, handle /* parent */) { + // TODO(GhostScreaming): pipeline parallel may return a uninitialized + // DistTensor, it should not return None. +#ifdef PADDLE_WITH_DISTRIBUTE + bool return_none = + phi::distributed::DistTensor::classof(src.impl().get()) ? 
false : true; +#else + bool return_none = true; +#endif return handle(paddle::pybind::ToPyObject( - src, true /* return_py_none_if_not_initialize */)); + src, return_none /* return_py_none_if_not_initialize */)); } }; diff --git a/pyproject.toml b/pyproject.toml index 4a49ec99f4ec64..3e8da7d18ed6fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ extend_skip_glob = [ # These files do not need to be formatted, # see .flake8 for more details "python/paddle/utils/gast/**", - "python/paddle/base/framework.py", ] [tool.ruff] @@ -73,6 +72,9 @@ select = [ "PLR1711", "PLR1722", "PLW3301", + + # Pygrep-hooks + "PGH004", ] unfixable = [ "NPY001" @@ -102,17 +104,6 @@ ignore = [ # Ignore unnecessary lambda in dy2st unittest test_lambda "test/dygraph_to_static/test_lambda.py" = ["PLC3002"] -# Temporarily ignored -"python/paddle/base/**" = [ - "C408", - "UP030", - "C405", - "B019", # Confirmation required - "C416", - "F821", - "PLC0414", -] - # B017 "test/auto_parallel/spmd_rules/test_reshape_rule.py" = ["B017"] "test/dygraph_to_static/test_assert.py" = ["B017"] diff --git a/python/cinn/__init__.py b/python/cinn/__init__.py index 9411b774e38360..55ab35e7e56242 100644 --- a/python/cinn/__init__.py +++ b/python/cinn/__init__.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .version import full_version as __version__ +from .runtime.cinn_jit import to_cinn_llir import os cinndir = os.path.dirname(os.path.abspath(__file__)) @@ -189,4 +191,3 @@ reduce_mul, reduce_sum, ) -from .version import full_version as __version__ diff --git a/python/cinn/compiler/__init__.py b/python/cinn/compiler/__init__.py new file mode 100644 index 00000000000000..644bf2d949ca4e --- /dev/null +++ b/python/cinn/compiler/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .compiler import compile + +__all__ = ["compile"] diff --git a/python/cinn/compiler/compiler.py b/python/cinn/compiler/compiler.py new file mode 100644 index 00000000000000..064b97c31f243b --- /dev/null +++ b/python/cinn/compiler/compiler.py @@ -0,0 +1,55 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
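For context on how the new compiler entry point defined next is meant to be driven, here is a minimal usage sketch. It assumes a built `cinn` wheel; the kernel body, the `DataArray` annotations, and the shape `(16,)` are illustrative assumptions, not part of this patch.

# Hedged usage sketch (assumes a built cinn wheel; names below are illustrative).
from cinn import to_cinn_llir
from cinn.compiler import compile as cinn_compile
from cinn.runtime.data_array import DataArray  # assumed import path

@to_cinn_llir
def elementwise_copy(a: DataArray((16,)), b: DataArray((16,))):
    for i in range(16):  # range() is remapped to ir.sequential by the compute generator
        b[i] = a[i]

# just_convert=True stops after the AST -> CINN LLIR conversion, which is
# useful for inspecting the lowered function without building a runtime module.
llir_func = cinn_compile(elementwise_copy, just_convert=True)
print(llir_func)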
+ +import cinn + +from ..runtime import CinnLowerLevelIrJit +from .compute_code_generator import ComputeCodeGenerator +from .schedule_code_generator import ScheduleCodeGenerator + + +def ast_to_llir(fn, inputs_signature): + function_name = fn.__name__ + # 1. Parse CINN Compute + llir_compute_generator = ComputeCodeGenerator( + fn, function_name, inputs_signature + ) + cinn_llir_func = llir_compute_generator.parse() + + # 2. Parse CINN Schedule + llir_schedule_generator = ScheduleCodeGenerator(fn, cinn_llir_func) + return llir_schedule_generator.parse() + + +def llir_to_runtime_module(llir_func, target, function_name, arg_names): + cinn_builder = cinn.lang.Module.Builder(function_name, target) + cinn_builder.add_function(llir_func) + llir_module = cinn_builder.build() + return cinn.runtime.Module(llir_module, target, function_name, arg_names) + + +def compile(fn, just_convert=False, jit_inputs_signature=None, **kwargs): + # Avoid a shared mutable default argument. + if jit_inputs_signature is None: + jit_inputs_signature = [] + if isinstance(fn, CinnLowerLevelIrJit): + llir_func = ast_to_llir(fn, jit_inputs_signature) + else: + raise Exception( + "Currently only compiling from CinnLowerLevelIrJit is supported" + ) + + if just_convert: + return llir_func + + rt_module = llir_to_runtime_module( + llir_func, kwargs["target"], fn.__name__, kwargs["arg_names"] + ) + + return rt_module diff --git a/python/cinn/compiler/compute_code_generator.py b/python/cinn/compiler/compute_code_generator.py new file mode 100644 index 00000000000000..9a54c504306f3f --- /dev/null +++ b/python/cinn/compiler/compute_code_generator.py @@ -0,0 +1,245 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ast +import contextlib + +from cinn import ir + +from .expr_executor import ExprExecutor, exec_assign +from .utils import VariableTable, is_node_parsed_in_schedule + + +class ComputeCodeGenerator(ast.NodeVisitor): + """ + Convert python ast to CINN Lower Level IR, + containing only the semantics of the compute part + """ + + def __init__(self, fn, function_name, inputs_signature): + self.fn = fn + self.function_name = function_name + self.inputs_signature = inputs_signature + self.cinn_llir_func = None + self.variables_table = VariableTable() + self.extra_scope = {"range": ir.sequential} + + def parse(self): + ast_node = self.fn.parse() + with ir.IRBuilder() as builder, self.variables_table: + for k, v in self.fn.scope.items(): + self.variables_table.add(k, v) + for k, v in self.extra_scope.items(): + self.variables_table.add(k, v) + self.visit(ast_node) + return builder.get() + + def visit_FunctionDef(self, node) -> None: + """ + Parse CINN Low Level IR FunctionDef. + + Args: + node(ast.FunctionDef): The ast FunctionDef Node + """ + with ir.LowerFuncContext(self.function_name) as func_ctx: + arg_names = self.visit(node.args) + + assert len(node.args.defaults) == 0, "Default args are not supported" + + # 1. 
Construct args of function + for i, arg_name in enumerate(arg_names): + # Obj of Argument is ir::Buffer + if hasattr(self.inputs_signature[i], "dtype"): + tensor_shape = [ + ir.Expr(dim) for dim in self.inputs_signature[i].shape + ] + llir_value = ir._Buffer_.make( + arg_name, self.inputs_signature[i].dtype + ) + ir.Arg(arg_name, llir_value) + llir_value = ir._Tensor_.make( + arg_name, + self.inputs_signature[i].dtype, + tensor_shape, + tensor_shape, + ) + self.variables_table.add(arg_name, llir_value) + # Obj of Argument is ir::Var + else: + llir_value = ir.Var(arg_name) + ir.Arg(arg_name, llir_value) + llir_value = ir.Expr(llir_value) + self.variables_table.add(arg_name, llir_value) + + # 2. Construct body of function + body = self.visit_compound_statement(node.body) + + def visit_compound_statement(self, stmts): + for stmt in stmts: + self.visit(stmt) + + def visit_arguments(self, node): + """ + Parse CINN Low Level IR Argument. + If it is not in JIT mode, it will get information from arg.annotation. + + Args: + node(ast.arguments): The ast argument Node + + Returns: + list[string]: A list of parameter names + """ + arg_names = [arg.arg for arg in node.args] + + if len(self.inputs_signature) != len(arg_names): + self.inputs_signature = [] + for arg in node.args: + arg_annotation = arg.annotation + if isinstance(arg_annotation, ast.Call): + self.inputs_signature.append( + ExprExecutor(self.variables_table.get()).exec( + arg_annotation + ) + ) + elif isinstance(arg_annotation, int): + if ( + -(2**31) <= arg_annotation + and arg_annotation <= 2**31 - 1 + ): + self.inputs_signature.append("i32") + elif ( + 2**63 <= arg_annotation + and arg_annotation <= 2**64 - 1 + ): + self.inputs_signature.append("u64") + else: + self.inputs_signature.append("i64") + elif isinstance(arg_annotation, float): + self.inputs_signature.append("fp32") + else: + raise TypeError( + f'Unsupported type {type(arg_annotation)} for {arg_annotation}' + ) + + return arg_names + + def visit_For(self, node) -> ir.Expr: + """ + Parse CINN Low Level IR For. + + Args: + node(ast.For): The ast For node + """ + for_ctx = ExprExecutor(self.variables_table.get()).exec(node.iter) + with self.variables_table: + with for_ctx as loop_var: + local_var_table = exec_assign( + target=node.target, source=loop_var + ) + for k, v in local_var_table.items(): + loop_var.rename(k) + self.variables_table.add(k, ir.Expr(v)) + self.visit_compound_statement(node.body) + + def visit_Assign(self, node): + """ + Parse CINN Low Level IR Store.
+ + Args: + node(ast.Assign): The ast Assign node + + Returns: + ir.Expr, pointing to the Expr of ir::ExprNode<Store> + """ + + if isinstance(node.value, ast.Call) and is_node_parsed_in_schedule( + node.value + ): + return "no compute" + + assert ( + len(node.targets) == 1 + ), "Unsupported: targets as a \ + list of nodes, like 'a = b = c'" + lhs = node.targets[0] + + # 1 parse RHS + rhs_expr = ExprExecutor(self.variables_table.get()).exec(node.value) + + # 2 parse LHS + # 2.1 Type of arg is Tensor + if isinstance(lhs, ast.Subscript): + expr_tensor = ExprExecutor(self.variables_table.get()).exec( + lhs.value + ) + if isinstance(lhs.slice, ast.Tuple): + expr_indices = [] + for idx in lhs.slice.elts: + expr_indices.append( + ExprExecutor(self.variables_table.get()).exec(idx) + ) + else: + expr_indices = [ + ExprExecutor(self.variables_table.get()).exec(lhs.slice) + ] + if not isinstance(rhs_expr, ir.Expr): + rhs_expr = ir.Expr(rhs_expr) + ir.TensorStore(expr_tensor.Expr(), rhs_expr, expr_indices) + # 2.2 Type of arg is Var + else: + local_var_table = exec_assign(target=lhs, source=rhs_expr) + if isinstance(lhs, ast.Tuple): + for k, v in local_var_table.items(): + v.as_var_ref().rename(k) + self.variables_table.add(k, v) + else: + for k, v in local_var_table.items(): + v[0].as_var_ref().rename(k) + self.variables_table.add(k, v[0]) + + def visit_If(self, node): + with self.variables_table: + with ir.IfContext( + ExprExecutor(self.variables_table.get()).exec(node.test) + ): + with ir.ThenContext(): + with self.variables_table: + self.visit_compound_statement(node.body) + if node.orelse: + with ir.ElseContext(): + with self.variables_table: + self.visit_compound_statement(node.orelse) + + def visit_With(self, node): + with self.variables_table: + with contextlib.ExitStack() as context_stack: + for item in node.items: + cur_ctx = ExprExecutor(self.variables_table.get()).exec( + item.context_expr + ) + cur_ctx = context_stack.enter_context(cur_ctx) + if item.optional_vars is not None: + local_var_table = exec_assign( + target=item.optional_vars, source=cur_ctx + ) + for k, v in local_var_table.items(): + self.variables_table.add(k, v) + body = self.visit_compound_statement(node.body) + + def visit_Expr(self, node): + if is_node_parsed_in_schedule(node.value): + return + res = ExprExecutor(self.variables_table.get()).exec(node.value) + if isinstance(res, ir.Expr): + ir.link_to_parent_context(res) diff --git a/python/cinn/compiler/expr_executor.py b/python/cinn/compiler/expr_executor.py new file mode 100644 index 00000000000000..cff9a9d62d7c43 --- /dev/null +++ b/python/cinn/compiler/expr_executor.py @@ -0,0 +1,159 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
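Before the ExprExecutor implementation below, a self-contained toy analogue of its core idea may help: walk a Python expression AST and fold it through an operator table, the same dispatch pattern that AST2CINN drives with CINN ir nodes. This sketch uses only the standard library and plain integers; it is an illustration, not code from this patch.

# Toy analogue of ExprExecutor's dispatch: fold an expression AST
# through an operator table, resolving names against an environment.
import ast
import operator

OP_TABLE = {ast.Add: operator.add, ast.Sub: operator.sub, ast.Mult: operator.mul}

def fold(node, env):
    if isinstance(node, ast.Expression):
        return fold(node.body, env)
    if isinstance(node, ast.BinOp):
        # Dispatch on the operator's type, as AST2CINN does with ir nodes.
        return OP_TABLE[type(node.op)](fold(node.left, env), fold(node.right, env))
    if isinstance(node, ast.Name):
        return env[node.id]
    if isinstance(node, ast.Constant):
        return node.value
    raise TypeError(f"unsupported node: {type(node).__name__}")

assert fold(ast.parse("x * 2 + 1", mode="eval"), {"x": 20}) == 41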
+ +import ast + +from cinn import ir + +# The Python native AST nodes that cinn ir supports +AST2CINN = { + ast.Add: ir.Add, + ast.Sub: ir.Sub, + ast.Mult: ir.Mul, + ast.Div: ir.Div, + ast.Mod: ir.Mod, + ast.And: ir.And, + ast.Or: ir.Or, + ast.USub: ir.Minus, + ast.Not: ir.Not, + ast.Eq: ir.EQ, + ast.NotEq: ir.NE, + ast.Lt: ir.LT, + ast.LtE: ir.LE, + ast.Gt: ir.GT, + ast.GtE: ir.GE, +} + + +class ExprExecutor: + def __init__(self, var_table): + self.var_table = var_table + self.tmp_value_count = 1 + + def exec(self, node): + ret = self.visit(node) + if isinstance(ret, ast.Name): + return self.var_table[ret.id] + if isinstance(ret, ast.Constant): + return ret.value + raise Exception(f"Error result type: {type(ret)}") + + def visit(self, node): + if isinstance(node, list): + return [self.visit(item) for item in node] + if isinstance(node, tuple): + return tuple(self.visit(item) for item in node) + assert isinstance(node, ast.AST) + if isinstance(node, ast.Name): + return node + + if isinstance(node, ast.Constant): + return node + + if not isinstance(node, (ast.expr, ast.slice)): + # some nodes don't need parsing, such as ast.Load + return node + if isinstance(node, (ast.Lambda, ast.Starred)): + raise Exception("Currently not supported: Lambda, Starred") + + cls_fields = {} + for field in node.__class__._fields: + attr = getattr(node, field) + if isinstance(attr, (ast.AST, tuple, list)): + cls_fields[field] = self.visit(attr) + else: + cls_fields[field] = attr + + node_type_name = f'eval_{type(node).__name__}' + if hasattr(self, node_type_name): + exec_func = getattr(self, node_type_name) + value = exec_func(cls_fields) + else: + new_node = node.__class__(**cls_fields) + ast.copy_location(new_node, node) + new_node = ast.Expression(new_node) + value = self.exec_expr(new_node) + return self.save_temp_value(value) + + def exec_expr(self, node): + if isinstance(node, ast.expr): + node = ast.Expression(body=node) + node = ast.fix_missing_locations(node) + code = compile(node, filename="<ast>", mode="eval") + return eval(code, self.var_table) + + def eval_BinOp(self, fields): + args = [self.exec_expr(fields["left"]), self.exec_expr(fields["right"])] + args = [ + ir.Expr(item) if not isinstance(item, ir.Expr) else item + for item in args + ] + return AST2CINN[type(fields["op"])].make(*args) + + def eval_UnaryOp(self, fields): + args = [self.exec_expr(fields["operand"])] + args = [ + ir.Expr(item) if not isinstance(item, ir.Expr) else item + for item in args + ] + return AST2CINN[type(fields["op"])].make(*args) + + def eval_Compare(self, fields): + assert ( + len(fields["ops"]) == 1 + ), "Only binary comparison symbols are supported. Expressions such as '1 <= a < 10' are not supported."
+ args = [ + self.exec_expr(fields["left"]), + self.exec_expr(fields["comparators"][0]), + ] + args = [ + ir.Expr(item) if not isinstance(item, ir.Expr) else item + for item in args + ] + return AST2CINN[type(fields["ops"][0])].make(*args) + + def save_temp_value(self, value): + name = f"__cinn_python_script_tmp_value_{self.tmp_value_count}" + self.tmp_value_count += 1 + self.var_table[name] = value + return ast.Name( + id=name, + ctx=ast.Load( + lineno=0, col_offset=0, end_lineno=None, end_col_offset=None + ), + lineno=0, + col_offset=0, + end_lineno=None, + end_col_offset=None, + ) + + +def exec_assign(target, source): + right_value_var_name = "__CINN_RIGHT_VALUE_VAR_NAME__" + local_var_table = {right_value_var_name: source} + mod = ast.fix_missing_locations( + ast.Module( + body=[ + ast.Assign( + targets=[target], + value=ast.Name(id=right_value_var_name, ctx=ast.Load()), + ) + ], + type_ignores=[], + ) + ) + exe = compile(mod, filename="<ast>", mode="exec") + exec(exe, {}, local_var_table) + del local_var_table[right_value_var_name] + return local_var_table diff --git a/python/cinn/compiler/schedule_code_generator.py b/python/cinn/compiler/schedule_code_generator.py new file mode 100644 index 00000000000000..6cc4c2973464b9 --- /dev/null +++ b/python/cinn/compiler/schedule_code_generator.py @@ -0,0 +1,189 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
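The exec_assign helper above is worth a second look: it reuses Python's own assignment semantics by compiling a synthetic ast.Assign, so tuple destructuring works for free. A standalone toy version of the same trick follows; for brevity it takes the target as source text rather than an AST node, which is a simplifying assumption.

# Toy version of the exec_assign trick: let the interpreter perform the
# assignment, then harvest the resulting bindings from the locals dict.
import ast

def toy_exec_assign(target_src, source):
    local_vars = {"__rhs__": source}
    mod = ast.parse(f"{target_src} = __rhs__", mode="exec")
    exec(compile(mod, "<ast>", "exec"), {}, local_vars)
    del local_vars["__rhs__"]
    return local_vars

# Tuple targets destructure exactly as in normal Python code.
assert toy_exec_assign("x, y", (1, 2)) == {"x": 1, "y": 2}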
+ +import ast + +from cinn.schedule import IRSchedule + +from .expr_executor import ExprExecutor, exec_assign +from .utils import ( + VariableTable, + is_node_parsed_in_schedule, + node_is_schedule_block_context, +) + + +class ScheduleCodeGenerator(ast.NodeVisitor): + """ + Convert python ast to CINN Lower Level IR, + containing only the semantics of the schedule part + """ + + def __init__(self, fn, cinn_llir_func): + self.fn = fn + self.cinn_llir_func = cinn_llir_func + self.scheduler = IRSchedule.make(self.cinn_llir_func) + self.variable_table = VariableTable() + self.global_variable_table = VariableTable() + # Set the schedule-related variable to global + self.extra_scope = { + "ScheduleBlockVariable": ScheduleBlockVariable, + "scheduler": self.scheduler, + } + self.loop_var_stack = [] + self.block_stack = [] + self.sch_block_tmp_var_name = "__CINN_SCHEDULE_BLOCK_VAR_NAME__" + self.tmp_var_count = 1 + + def parse(self): + with self.variable_table, self.global_variable_table: + ast_node = self.fn.parse() + for k, v in self.fn.scope.items(): + self.variable_table.add(k, v) + for k, v in self.extra_scope.items(): + self.variable_table.add(k, v) + self.visit(ast_node) + return self.cinn_llir_func + + def visit_For(self, node): + assert isinstance( + node.target, ast.Name + ), "Current only support range() to make ForLoop" + with self.variable_table: + self.loop_var_stack.append(node.target) + self.generic_visit(node) + self.loop_var_stack.pop() + + def visit_compound_statement(self, stmts): + for stmt in stmts: + self.visit(stmt) + + def visit_With(self, node): + with self.variable_table: + for item in node.items: + if isinstance( + item.context_expr, ast.Call + ) and not node_is_schedule_block_context(item.context_expr): + continue + # 1. replace ScheduleBlockContext to ScheduleBlockVariable + sch_ctx_node = item.context_expr + sch_block_node = ast.copy_location( + ast.Call( + func=ast.Name( + id="ScheduleBlockVariable", ctx=ast.Load() + ), + args=sch_ctx_node.args, + keywords=[], + starargs=None, + kwargs=None, + ), + item.context_expr, + ) + item.context_expr = sch_block_node + + # 2. store ScheduleBlockVariable node + sch_block = ExprExecutor(self.variable_table.get()).exec( + item.context_expr + ) + if item.optional_vars is None: + tmp_var_name = self.sch_block_tmp_var_name + str( + self.tmp_var_count + ) + sch_block_var_node = ast.Name( + id=tmp_var_name, ctx=ast.Store() + ) + item.optional_vars = sch_block_var_node + local_var_table = exec_assign( + target=item.optional_vars, source=sch_block + ) + # 3. 
Set the block's loop to its attribute
+                sch_block.set_scheduler(self.scheduler)
+                self.block_stack.append(sch_block)
+                for k, v in local_var_table.items():
+                    self.variable_table.add(k, v)
+                    self.global_variable_table.add(k, v)
+                    for loop_var in self.loop_var_stack:
+                        loop_var_value = ast.Attribute(
+                            value=ast.Name(id=k, ctx=ast.Load()),
+                            attr=loop_var.id,
+                            ctx=ast.Load(),
+                        )
+                        loop_var_value = ExprExecutor(
+                            self.variable_table.get()
+                        ).exec(loop_var_value)
+                        for_loop_var_table = exec_assign(
+                            loop_var, loop_var_value
+                        )
+                        for (
+                            loop_var_k,
+                            loop_var_v,
+                        ) in for_loop_var_table.items():
+                            self.variable_table.add(loop_var_k, loop_var_v)
+
+            body = self.visit_compound_statement(node.body)
+
+    def visit_Assign(self, node):
+        if isinstance(node.value, ast.Call) and is_node_parsed_in_schedule(
+            node.value
+        ):
+            sch_ret = self.exec_schedule_primitive(node.value)
+            local_var_table = exec_assign(
+                target=node.targets[0], source=sch_ret
+            )
+            for k, v in local_var_table.items():
+                self.variable_table.add(k, v)
+            return
+        self.generic_visit(node)
+
+    def visit_Call(self, node):
+        if isinstance(node, ast.Call) and is_node_parsed_in_schedule(node):
+            self.exec_schedule_primitive(node)
+            return
+
+    def exec_schedule_primitive(self, node):
+        # Prepend the scheduler as the first argument of the schedule primitive
+        sch_primitive = node
+        args = [ast.Name(id="scheduler", ctx=ast.Load()), *sch_primitive.args]
+        sch_primitive.args = args
+        all_variable_table = self.variable_table.get()
+        for k, v in self.global_variable_table.get().items():
+            all_variable_table[k] = v
+        sch_ret = ExprExecutor(all_variable_table).exec(node)
+
+        return sch_ret
+
+
+class ScheduleBlockVariable:
+    """
+    During schedule parsing, ScheduleBlockContext is replaced with this class
+    at the AST layer to improve schedule usability at the Python layer.
+    For example, splitting a loop in C++ requires two steps:
+    1. Get the loops of the corresponding block: `x, y = sch.get_loops(block)`
+    2. Apply the schedule to a loop: `tx, xi = sch.split(x, [2])`
+    This class allows a block's loops to be manipulated directly by name:
+    `sch.split(block.x, [2])`
+    """
+
+    def __init__(self, name):
+        self.name = name
+        self.scheduler = None
+
+    def set_scheduler(self, scheduler):
+        self.scheduler = scheduler
+
+    def __getattr__(self, k):
+        if k == "block":
+            return self.scheduler.get_block(self.name)
+        else:
+            name2loops = self.scheduler.get_name2loops_dict(self.name)
+            return name2loops[k]
diff --git a/python/cinn/compiler/utils.py b/python/cinn/compiler/utils.py
new file mode 100644
index 00000000000000..6f78446245fb42
--- /dev/null
+++ b/python/cinn/compiler/utils.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
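To make the `__getattr__` dispatch of `ScheduleBlockVariable` above concrete, here is a hedged, pure-Python sketch with a hypothetical stub scheduler (`_StubScheduler` is illustrative and not part of the patch):

    # `block.block` resolves the schedule block itself; any other attribute
    # name is treated as a loop name and resolved via get_name2loops_dict.
    class _StubScheduler:
        def get_block(self, name):
            return f"<block {name}>"

        def get_name2loops_dict(self, name):
            return {"i": f"<loop i of {name}>"}

    blk = ScheduleBlockVariable("C")
    blk.set_scheduler(_StubScheduler())
    assert blk.block == "<block C>"
    assert blk.i == "<loop i of C>"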
+import ast
+
+try:
+    from _collections import defaultdict
+except ImportError:
+    from collections import defaultdict
+
+
+from cinn.schedule import IRSchedule
+
+
+def is_node_parsed_in_schedule(node: ast.Call):
+    func_name = ""
+    if isinstance(node.func, ast.Name):
+        func_name = node.func.id
+    elif isinstance(node.func, ast.Attribute):
+        func_name = node.func.attr
+    if func_name == "make":
+        return False
+    if func_name == "print":
+        return True
+
+    return getattr(IRSchedule, func_name, None) is not None
+
+
+def node_is_schedule_block_context(node: ast.Call):
+    if isinstance(node.func, ast.Name):
+        return node.func.id == "ScheduleBlockContext"
+    if isinstance(node.func, ast.Attribute):
+        return node.func.attr == "ScheduleBlockContext"
+    return False
+
+
+class VariableTable:
+    def __init__(self):
+        # var names added by the current context
+        self.var_name_list = []
+        # maps a var name to its values, i.e. {string: list};
+        # the list records the value assigned at each context layer
+        self.name2value = defaultdict(list)
+
+    def __enter__(self):
+        self.var_name_list.append([])
+        return self
+
+    def __exit__(self, ptype, value, trace) -> None:
+        # clear vars assigned in the current context
+        if ptype is None and value is None:
+            var_names = self.var_name_list.pop()
+            for var_name in var_names:
+                self.name2value[var_name].pop()
+                if len(self.name2value[var_name]) == 0:
+                    self.name2value.pop(var_name)
+
+    def add(self, name, value, cover=False):
+        if cover and name in self.var_name_list[-1]:
+            self.name2value[name][-1] = value
+        else:
+            self.var_name_list[-1].append(name)
+            self.name2value[name].append(value)
+
+    def get(self):
+        return {k: v[-1] for k, v in self.name2value.items()}
diff --git a/python/cinn/ir/ir.py b/python/cinn/ir/ir.py
index 5c683de04e705b..7d51a302a3dfb8 100644
--- a/python/cinn/ir/ir.py
+++ b/python/cinn/ir/ir.py
@@ -17,7 +17,7 @@
 from .ir_context import ForContext
 
-# Python's rang() function calls the sequential()
+# sequential() mirrors Python's built-in range()
 def sequential(min, extent=None):
     if extent is None:
         extent = min
diff --git a/python/cinn/runtime/__init__.py b/python/cinn/runtime/__init__.py
index a9f32b12d0e226..244567bd855c22 100644
--- a/python/cinn/runtime/__init__.py
+++ b/python/cinn/runtime/__init__.py
@@ -66,3 +66,8 @@
     seed,
     set_cinn_cudnn_deterministic,
 )
+
+from .cinn_jit import CinnLowerLevelIrJit
+from .module import Module
+
+__all__ = ["CinnLowerLevelIrJit", "Module"]
diff --git a/python/cinn/runtime/cinn_jit.py b/python/cinn/runtime/cinn_jit.py
new file mode 100644
index 00000000000000..7b85808593d625
--- /dev/null
+++ b/python/cinn/runtime/cinn_jit.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
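A short, hedged illustration of the scoping behavior of `VariableTable` above (pure Python, no CINN dependency): each `with` level keeps its own bindings, and names added in a scope are popped when the scope exits.

    vt = VariableTable()
    with vt:
        vt.add("x", 1)
        with vt:
            vt.add("x", 2)               # shadows the outer binding
            assert vt.get()["x"] == 2
        assert vt.get()["x"] == 1        # inner binding popped on exit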
+
+
+import ast
+import functools
+import inspect
+import textwrap
+from typing import Callable, Generic, Optional, TypeVar, cast
+
+from .utils import inspect_function_scope
+
+T = TypeVar('T')
+
+
+class CinnLowerLevelIrJit(Generic[T]):
+    def __init__(self, fn):
+        self.fn = fn
+        # function prototype
+        signature = inspect.signature(fn)
+        self.arg_names = [v.name for v in signature.parameters.values()]
+
+        self.src = textwrap.dedent(inspect.getsource(fn))
+        self.src = self.src[self.src.find("def") :]
+        self.scope = inspect_function_scope(fn)
+
+        # docs of wrapped function
+        self.__doc__ = fn.__doc__
+        self.__name__ = fn.__name__
+        self.__globals__ = fn.__globals__
+        self.__module__ = fn.__module__
+
+        # Encapsulates the compile and run processes
+        self.run = self._make_launcher()
+
+    def _make_launcher(self):
+        # Gets information about runtime input parameters
+        jit_input_args = ', '.join(arg_name for arg_name in self.arg_names)
+        lazy_compile = f"""
+import cinn
+def {self.fn.__name__}({jit_input_args}, target=cinn.common.DefaultHostTarget()):
+    from cinn.compiler import compile
+    jit_inputs = {', '.join([f'{arg}' for arg in self.arg_names])}
+    jit_inputs_signature = {{ i: self._convert_arg_type(arg) \
+        for i, arg in enumerate(jit_inputs)}}
+    module = compile(self, jit_inputs_signature=jit_inputs_signature, arg_names={
+        self.arg_names}, target=target)
+    module({jit_input_args})
+
+    return module
+    """
+        scope = {
+            "self": self,
+        }
+        exec(lazy_compile, scope)
+        return scope[self.fn.__name__]
+
+    def convert_to_llir(self):
+        from cinn.compiler import compile
+
+        return compile(self, just_convert=True)
+
+    def parse(self):
+        tree = ast.parse(self.src)
+        assert isinstance(tree, ast.Module)
+        return tree
+
+    def __getitem__(self, target):
+        return cast(
+            T, functools.partial(cast(Callable, self.run), target=target)
+        )
+
+    def _convert_arg_type(self, arg):
+        # arg is a Tensor
+        if hasattr(arg, "dtype"):
+            return arg
+        # arg is a Var
+        else:
+            if isinstance(arg, int):
+                if -(2**31) <= arg and arg <= 2**31 - 1:
+                    return "i32"
+                elif 2**63 <= arg and arg <= 2**64 - 1:
+                    return "u64"
+                else:
+                    return "i64"
+            elif isinstance(arg, float):
+                return "fp32"
+            else:
+                raise TypeError(f'Unsupported type {type(arg)} for {arg}')
+
+    def __str__(self):
+        return str(self.convert_to_llir())
+
+
+def to_cinn_llir(
+    fn: Optional[T] = None,
+) -> CinnLowerLevelIrJit[T]:
+    def decorator(fn: T) -> CinnLowerLevelIrJit[T]:
+        return CinnLowerLevelIrJit(fn)
+
+    if fn is not None:
+        return decorator(fn)
+    else:
+        return decorator
diff --git a/python/cinn/runtime/data_array.py b/python/cinn/runtime/data_array.py
new file mode 100644
index 00000000000000..e422005622cac7
--- /dev/null
+++ b/python/cinn/runtime/data_array.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
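As a hedged sketch of how `_convert_arg_type` above classifies scalar arguments (it assumes a working CINN installation, since constructing the JIT object builds a launcher; the function body is illustrative only):

    @to_cinn_llir
    def identity(x):
        return x

    assert identity._convert_arg_type(7) == "i32"      # fits in int32
    assert identity._convert_arg_type(2**40) == "i64"  # falls through to int64
    assert identity._convert_arg_type(2**63) == "u64"  # only fits uint64
    assert identity._convert_arg_type(0.5) == "fp32"   # floats map to fp32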
+import numpy as np
+from cinn import common, runtime
+from cinn.common import BFloat16, Bool, Float, Float16, Int, UInt
+
+
+class DataArray:
+    """
+    Provides Python encapsulation of the cinn_buffer_t
+    data interface in the CINN RunTime module.
+    """
+
+    def __init__(
+        self,
+        shape: list,
+        dtype: common.Type = common.Float(32),
+        data: runtime.cinn_buffer_t = None,
+    ) -> None:
+        self.shape = shape
+        self.dtype = dtype
+        self.data = data
+
+    def to_numpy(self):
+        """
+        Convert DataArray to numpy array
+        """
+        np_dtype = "unk"
+        if self.dtype.is_bfloat16():
+            # numpy has no 'bfloat16', we use uint16 to hold bfloat16 data, same as Paddle
+            np_dtype = "uint16"
+        elif self.dtype.is_float16():
+            np_dtype = "float16"
+        elif self.dtype.is_float(32, common.Type.specific_type_t.UNK):
+            np_dtype = "float32"
+        elif self.dtype.is_float(64, common.Type.specific_type_t.UNK):
+            np_dtype = "float64"
+        elif self.dtype.is_int(8):
+            np_dtype = "int8"
+        elif self.dtype.is_int(16):
+            np_dtype = "int16"
+        elif self.dtype.is_int(32):
+            np_dtype = "int32"
+        elif self.dtype.is_int(64):
+            np_dtype = "int64"
+        elif self.dtype.is_uint(8):
+            np_dtype = "uint8"
+        elif self.dtype.is_uint(32):
+            np_dtype = "uint32"
+        elif self.dtype.is_uint(64):
+            np_dtype = "uint64"
+        elif self.dtype.is_bool():
+            np_dtype = "bool"
+        else:
+            raise TypeError(f"unsupported dtype {self.dtype} in CINN")
+
+        np_arr = np.empty(self.shape, np_dtype)
+        assert np_arr.flags["C_CONTIGUOUS"]
+        self.data.copy_to(np_arr)
+        return np_arr
+
+    @staticmethod
+    def from_numpy(np_array, target=common.DefaultHostTarget()):
+        """
+        Create DataArray from numpy array
+        """
+        assert isinstance(np_array, np.ndarray)
+        data = runtime.cinn_buffer_t(np_array, target)
+        dtype_np_to_common = {
+            # numpy has no 'bfloat16', we use uint16 to hold bfloat16 data, same as Paddle
+            "uint16": BFloat16(),
+            "bfloat16": BFloat16(),
+            "float16": Float16(),
+            "float32": Float(32),
+            "float64": Float(64),
+            "int8": Int(8),
+            "int16": Int(16),
+            "int32": Int(32),
+            "int64": Int(64),
+            "uint8": UInt(8),
+            # the "uint16" key above holds bfloat16 data, so UInt(16) is not mapped
+            # "uint16": UInt(16),
+            "uint32": UInt(32),
+            "uint64": UInt(64),
+            "bool": Bool(),
+        }
+        dtype_np = str(np_array.dtype).split(".")[-1]
+        assert dtype_np in dtype_np_to_common, (
+            str(dtype_np) + " not supported in CINN"
+        )
+
+        return DataArray(np_array.shape, dtype_np_to_common[dtype_np], data)
diff --git a/python/cinn/runtime/module.py b/python/cinn/runtime/module.py
new file mode 100644
index 00000000000000..24a31691015944
--- /dev/null
+++ b/python/cinn/runtime/module.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
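A hedged round-trip sketch for the `DataArray` class above (it assumes a CINN build with a default host target, since `from_numpy` allocates a cinn_buffer_t):

    import numpy as np

    src = np.arange(6, dtype="float32").reshape(2, 3)
    arr = DataArray.from_numpy(src)   # wraps the data in a cinn_buffer_t
    out = arr.to_numpy()              # copies back into a fresh numpy array
    assert out.shape == (2, 3) and out.dtype == np.float32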
+import cinn +from cinn import framework +from cinn.backends import Compiler + + +class Module: + def __init__(self, llir_module, target, fn_name, arg_names): + self.arg_names = arg_names + self.fn_name = fn_name + self.compiler = Compiler.create(target) + self.compiler.build(llir_module) + self._instruction = framework.Instruction( + target, None, [], arg_names, fn_name + ) + + def __call__(self, *args): + name2pod = {} + for i, name in enumerate(self.arg_names): + if isinstance(args[i], cinn.runtime.data_array.DataArray): + name2pod[name] = cinn.runtime.cinn_pod_value_t(args[i].data) + else: + name2pod[name] = cinn.runtime.cinn_pod_value_t(args[i]) + + self._instruction.run(self.compiler, self.fn_name, name2pod) diff --git a/python/cinn/runtime/utils.py b/python/cinn/runtime/utils.py new file mode 100644 index 00000000000000..8df8cccc772d1c --- /dev/null +++ b/python/cinn/runtime/utils.py @@ -0,0 +1,35 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect + + +def get_func_global_vars(func): + if inspect.ismethod(func): + func = func.__func__ + + code = func.__code__ + global_vars = {} + if func.__closure__ is not None: + for k, v in zip(code.co_freevars, func.__closure__): + global_vars[k] = v.cell_contents + return global_vars + + +def inspect_function_scope(func): + scope = { + **func.__globals__, + **get_func_global_vars(func), + } + return scope diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 6fd3bbba099a2a..5de54e1d89d125 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -23,6 +23,9 @@ import paddle from the source directory; please install paddlepaddle*.whl firstly.''' ) +# NOTE(SigureMo): We should place the import of base.core before other modules, +# because there are some initialization codes in base/core/__init__.py. +from .base import core # noqa: F401 from .batch import batch # Do the *DUPLICATED* monkey-patch for the tensor object. @@ -30,9 +33,11 @@ # the illogical implement in the monkey-patch methods later. from .framework import monkey_patch_variable from .framework import monkey_patch_math_tensor +from .pir import monkey_patch_opresult monkey_patch_variable() monkey_patch_math_tensor() +monkey_patch_opresult() from .framework import ( disable_signal_handler, @@ -402,6 +407,8 @@ i1e, polygamma, polygamma_, + hypot, + hypot_, combinations, ) @@ -533,8 +540,8 @@ from .pir_utils import IrGuard -ir_change = IrGuard() -ir_change._switch_to_pir() +ir_guard = IrGuard() +ir_guard._switch_to_pir() __all__ = [ 'iinfo', @@ -900,5 +907,7 @@ 'i1e', 'polygamma', 'polygamma_', + 'hypot', + 'hypot_', 'combinations', ] diff --git a/python/paddle/amp/__init__.py b/python/paddle/amp/__init__.py index 615e6c4f36d551..9984ba450afe76 100644 --- a/python/paddle/amp/__init__.py +++ b/python/paddle/amp/__init__.py @@ -12,20 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
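A hedged sketch of `get_func_global_vars` in runtime/utils.py above: despite its name, the loop collects closure cells, so names captured from an enclosing function become part of the scope returned by `inspect_function_scope`.

    def make_fn():
        scale = 3

        def fn(x):
            return x * scale

        return fn

    fn = make_fn()
    assert get_func_global_vars(fn) == {"scale": 3}
    assert inspect_function_scope(fn)["scale"] == 3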
-from .auto_cast import auto_cast # noqa: F401 -from .auto_cast import decorate # noqa: F401 -from .auto_cast import amp_guard # noqa: F401 -from .auto_cast import amp_decorate # noqa: F401 -from .amp_lists import white_list # noqa: F401 -from .amp_lists import black_list # noqa: F401 - -from . import grad_scaler # noqa: F401 -from .grad_scaler import GradScaler # noqa: F401 -from .grad_scaler import AmpScaler # noqa: F401 -from .grad_scaler import OptimizerState # noqa: F401 - -from . import debugging # noqa: F401 -from . import accuracy_compare # noqa: F401 +from .auto_cast import ( # noqa: F401 + auto_cast, + decorate, + amp_guard, + amp_decorate, +) +from .amp_lists import ( # noqa: F401 + white_list, + black_list, +) + +from . import ( # noqa: F401 + debugging, + grad_scaler, + accuracy_compare, +) + +from .grad_scaler import ( # noqa: F401 + GradScaler, + AmpScaler, + OptimizerState, +) from paddle.base import core from paddle.base.framework import ( diff --git a/python/paddle/amp/amp_lists.py b/python/paddle/amp/amp_lists.py index 7d014b1bf14f92..b4b4fc95cb0499 100644 --- a/python/paddle/amp/amp_lists.py +++ b/python/paddle/amp/amp_lists.py @@ -44,7 +44,6 @@ 'cosh', 'atanh', 'tanh_shrink', - 'cos_sim', 'erfinv', 'exp', 'expm1', diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index d39666f6f1c532..d612b93bd1cf31 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -198,9 +198,10 @@ def set_excluded_layers(models, excluded_layers): include_self=True ): layer._cast_to_low_precison = False + excluded_layers_types = tuple(excluded_layers_types) for idx in range(len(models)): for layer in models[idx].sublayers(include_self=True): - if type(layer) in excluded_layers_types: + if isinstance(layer, excluded_layers_types): layer._cast_to_low_precison = False @@ -358,37 +359,38 @@ def amp_guard( % tracer._expected_place ) enable = False - # For xpu: - if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'): - warnings.warn('XPUPlace only support float16 amp.') - enable = False - # For custom device: - if tracer._expected_place.is_custom_place() and (dtype == 'bfloat16'): - warnings.warn('CustomPlace only support float16 amp.') - enable = False - # For gpu float16: Compute Capability should >= 7. - # For gpu bfloat16: Compute Capability should >= 8 & CUDA Version should >= 11. - if tracer._expected_place.is_gpu_place(): - if (dtype == 'float16') and not _is_gpu_float16_supported(): - prop = paddle.device.cuda.get_device_capability() - warnings.warn( - "For float16, amp only support NVIDIA GPU with Compute Capability 7.0 or higher, current GPU is: %s, with Compute Capability: %d.%d." - % (paddle.device.cuda.get_device_name(), prop[0], prop[1]) - ) + if enable: + # For xpu: + if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'): + warnings.warn('XPUPlace only support float16 amp.') enable = False - elif (dtype == 'bfloat16') and not _is_gpu_bfloat16_supported(): - prop = paddle.device.cuda.get_device_capability() - cuda_version = paddle.version.cuda() - warnings.warn( - "For bfloat16, amp only support NVIDIA GPU with Compute Capability 8.0 or higher and CUDA Version 11.0 or higher, current GPU is: %s, with Compute Capability: %d.%d, current CUDA Version is: %s." 
- % ( - paddle.device.cuda.get_device_name(), - prop[0], - prop[1], - cuda_version, - ) - ) + # For custom device: + if tracer._expected_place.is_custom_place() and (dtype == 'bfloat16'): + warnings.warn('CustomPlace only support float16 amp.') enable = False + # For gpu float16: Compute Capability should >= 7. + # For gpu bfloat16: Compute Capability should >= 8 & CUDA Version should >= 11. + if tracer._expected_place.is_gpu_place(): + if (dtype == 'float16') and not _is_gpu_float16_supported(): + prop = paddle.device.cuda.get_device_capability() + warnings.warn( + "For float16, amp only support NVIDIA GPU with Compute Capability 7.0 or higher, current GPU is: %s, with Compute Capability: %d.%d." + % (paddle.device.cuda.get_device_name(), prop[0], prop[1]) + ) + enable = False + elif (dtype == 'bfloat16') and not _is_gpu_bfloat16_supported(): + prop = paddle.device.cuda.get_device_capability() + cuda_version = paddle.version.cuda() + warnings.warn( + "For bfloat16, amp only support NVIDIA GPU with Compute Capability 8.0 or higher and CUDA Version 11.0 or higher, current GPU is: %s, with Compute Capability: %d.%d, current CUDA Version is: %s." + % ( + paddle.device.cuda.get_device_name(), + prop[0], + prop[1], + cuda_version, + ) + ) + enable = False amp_dtype = dtype amp_global_state().amp_dtype = amp_dtype diff --git a/python/paddle/audio/backends/__init__.py b/python/paddle/audio/backends/__init__.py index ac19a14c69a01a..f089e5bfe9cd30 100644 --- a/python/paddle/audio/backends/__init__.py +++ b/python/paddle/audio/backends/__init__.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. from . import init_backend -from .init_backend import get_current_backend # noqa: F401 -from .init_backend import list_available_backends # noqa: F401 -from .init_backend import set_backend +from .init_backend import ( + get_current_backend, + list_available_backends, + set_backend, +) init_backend._init_set_audio_backend() diff --git a/python/paddle/audio/features/__init__.py b/python/paddle/audio/features/__init__.py index 3c0bf499f1eff4..ac48d594ded935 100644 --- a/python/paddle/audio/features/__init__.py +++ b/python/paddle/audio/features/__init__.py @@ -11,12 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .layers import LogMelSpectrogram # noqa: F401 -from .layers import MelSpectrogram # noqa: F401 -from .layers import MFCC # noqa: F401 -from .layers import Spectrogram # noqa: F401 +from .layers import ( + LogMelSpectrogram, + MelSpectrogram, + MFCC, + Spectrogram, +) -__all__ = [ # noqa +__all__ = [ 'LogMelSpectrogram', 'MelSpectrogram', 'MFCC', diff --git a/python/paddle/audio/functional/__init__.py b/python/paddle/audio/functional/__init__.py index b7db53d6c22a6f..caf1cf18c1a35d 100644 --- a/python/paddle/audio/functional/__init__.py +++ b/python/paddle/audio/functional/__init__.py @@ -11,16 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
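The `set_excluded_layers` change in the auto_cast.py hunk above replaces an exact-type membership test with `isinstance` over a tuple, so subclasses of an excluded layer type are now excluded as well; a hedged, self-contained illustration (the class names are hypothetical):

    class Base: ...
    class Derived(Base): ...

    excluded_layers_types = (Base,)
    layer = Derived()
    assert type(layer) not in list(excluded_layers_types)  # old check misses it
    assert isinstance(layer, excluded_layers_types)        # new check matches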
-from .functional import compute_fbank_matrix  # noqa: F401
-from .functional import create_dct  # noqa: F401
-from .functional import fft_frequencies  # noqa: F401
-from .functional import hz_to_mel  # noqa: F401
-from .functional import mel_frequencies  # noqa: F401
-from .functional import mel_to_hz  # noqa: F401
-from .functional import power_to_db  # noqa: F401
-from .window import get_window  # noqa: F401
+from .functional import (
+    compute_fbank_matrix,
+    create_dct,
+    fft_frequencies,
+    hz_to_mel,
+    mel_frequencies,
+    mel_to_hz,
+    power_to_db,
+)
 
-__all__ = [  # noqa
+from .window import get_window
+
+__all__ = [
     'compute_fbank_matrix',
     'create_dct',
     'fft_frequencies',
diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py
index acfb6447283c5e..1cfd2734386f37 100644
--- a/python/paddle/autograd/__init__.py
+++ b/python/paddle/autograd/__init__.py
@@ -12,20 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ..base.dygraph.base import grad  # noqa: F401
-from ..base.dygraph.base import enable_grad  # noqa: F401
-from ..base.dygraph.base import no_grad_ as no_grad  # noqa: F401
-from ..base.dygraph.base import is_grad_enabled  # noqa: F401
-from ..base.dygraph.base import set_grad_enabled  # noqa: F401
-from . import backward_mode  # noqa: F401
-from .autograd import jacobian, hessian  # noqa: F401
-from .backward_mode import backward  # noqa: F401
-from .py_layer import PyLayer  # noqa: F401
-from .py_layer import PyLayerContext  # noqa: F401
+from ..base.dygraph.base import (  # noqa: F401
+    grad,
+    enable_grad,
+    no_grad_ as no_grad,
+    is_grad_enabled,
+    set_grad_enabled,
+)
+from . import (  # noqa: F401
+    backward_mode,
+    ir_backward,
+)
+from .autograd import jacobian, hessian
+from .backward_mode import backward
+from .py_layer import PyLayer, PyLayerContext
 from .saved_tensors_hooks import saved_tensors_hooks
-from . import ir_backward
 
-__all__ = [  # noqa
+__all__ = [
     'jacobian',
     'hessian',
     'backward',
diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py
index 97a315c1010566..ad5a7cc02aef9e 100644
--- a/python/paddle/autograd/ir_backward.py
+++ b/python/paddle/autograd/ir_backward.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import collections
+import logging
 from collections.abc import Sequence
 
 import paddle.pir
@@ -363,8 +364,8 @@ def make_output_grad(op):
         for i, value in enumerate(op.results()):
             if (
                 value in state.value_to_valuegrad
-                and len(state.value_to_valuegrad[value])
-            ) > 1:
+                and len(state.value_to_valuegrad[value]) > 1
+            ):
                 # one value is input of more than one fwd_op,
                 # so more than one bwd_op create input_grad,
                 # need add sum op to accumulate gradient
@@ -556,7 +557,7 @@ def create_backward_prune_set(inputs, outputs, no_grad_set, state):
             if state.value_to_valuegrad[item] != []:
                 outputs_set.add(state.value_to_valuegrad[item][0][0])
             else:
-                raise ValueError("input privided by inputs has no use")
+                logging.warning("input provided by inputs has no use")
 
     inputs_set = set()
     for output in outputs:
diff --git a/python/paddle/base/__init__.py b/python/paddle/base/__init__.py
index 4acf21c4657763..5bab0d5cf84f07 100644
--- a/python/paddle/base/__init__.py
+++ b/python/paddle/base/__init__.py
@@ -15,6 +15,7 @@
 import os
 import sys
 import atexit
+import platform
 
 # The legacy core need to be removed before "import core",
 # in case of users installing paddlepaddle without -U option
@@ -32,6 +33,8 @@
 except Exception as e:
     raise e
 
+from .
import core + # import all class inside framework into base module from . import framework from .framework import ( @@ -138,11 +141,6 @@ def __bootstrap__(): Returns: None """ - import sys - import os - import platform - from . import core - # NOTE(zhiqiu): When (1)numpy < 1.19; (2) python < 3.7, # unittest is always imported in numpy (maybe some versions not). # so is_test is True and p2p is not inited. diff --git a/python/paddle/base/backward.py b/python/paddle/base/backward.py index 6d30823d4bf4a9..876db0abc3aa70 100755 --- a/python/paddle/base/backward.py +++ b/python/paddle/base/backward.py @@ -20,8 +20,7 @@ from collections.abc import Sequence import paddle.base -from paddle.base import framework as framework -from paddle.base import program_guard +from paddle.base import framework, program_guard from . import core, log_helper, unique_name from .data_feeder import check_type @@ -812,7 +811,7 @@ def insert_output(self, var): assert isinstance(var, Var) self.outputs.append(var) - var_versions = dict() + var_versions = {} def _create_node(name): if name not in var_versions.keys(): @@ -1671,17 +1670,8 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): or var in parent_op_vars ] if not existing_grad_var_ins: - ''' - FIXME(paddle-dev, zengjinle): rnn_memory_helper_grad is used - in recurrent op. The input of this op does not even exist in - the program! Therefore, any dependency analysis would not - work to this op! If I do not add the following code, this op - would be pruned, and the calculation result would be wrong. - Maybe we should re-design this op later... - ''' - if op_desc.type() not in ['rnn_memory_helper_grad']: - ops_to_remove.append(op_idx) - continue + ops_to_remove.append(op_idx) + continue # sum may create invalid variable, here to deal with it. if op_desc.type() == 'sum': @@ -1817,7 +1807,7 @@ def _rename_grad_( def _get_stop_gradients_(program): - no_grad_dict = dict() + no_grad_dict = {} assert isinstance(program, framework.Program) for block in program.blocks: assert isinstance(block, framework.Block) @@ -2041,7 +2031,7 @@ def append_backward( for idx in son_parent_block_idx_dict: block_fwd_op_num_dict[idx] = program.block(idx).desc.op_size() - grad_to_var = dict() + grad_to_var = {} # pass the cuda_graph_attr to the fill_constant which generates the loss_grad op_desc = _create_loss_op_desc_(loss) @@ -2055,16 +2045,16 @@ def append_backward( map(_strip_grad_suffix_, no_grad_dict[block_idx]) ) - op_path_dict = dict() + op_path_dict = {} op_path = _find_op_path_( block, [loss], [], block_no_grad_set, op_path_dict ) - no_grad_vars = _find_no_grad_vars( + no_grad_set = _find_no_grad_vars( block, op_path, [loss], block_no_grad_set ) - block_no_grad_set.update(no_grad_vars) + block_no_grad_set.update(no_grad_set) no_grad_dict[block_idx].update( list(map(_append_grad_suffix_, block_no_grad_set)) ) @@ -2077,7 +2067,7 @@ def append_backward( # not support double grad in control flow sub-block now. if not is_in_control_flow: if program._appending_grad_times > 1: - input_grad_names_set = set([_append_grad_suffix_(loss.name)]) + input_grad_names_set = {_append_grad_suffix_(loss.name)} # TODO: support _append_backward_ops_with_checkpoints_ in # sub-block (control flow) @@ -2118,7 +2108,7 @@ def append_backward( grad_op_id_to_fwd_op=grad_op_id_to_fwd_op, ) - grad_info_map = dict() + grad_info_map = {} # if in control flow, target_grad_block is a created new block which only contains grad ops, # so fwd_op_num is set to 0. 
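The pir branch added to `gradients` above normalizes `no_grad_set` before delegating to `calc_gradient`; note that `no_grad_set is not set` compares identity against the builtin `set` type, so every non-None value takes the conversion branch. A hedged, standalone sketch of the effective behavior (`_normalize_no_grad_set` is a hypothetical helper, not part of the patch):

    def _normalize_no_grad_set(no_grad_set):
        # mirrors the branch above: None -> empty set, else coerce to a set
        if no_grad_set is None:
            return set()
        return set(no_grad_set)

    assert _normalize_no_grad_set(None) == set()
    assert _normalize_no_grad_set([1, 2]) == {1, 2}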
@@ -2319,7 +2309,7 @@ def _find_op_path_( input_names = {inp.name for inp in inputs} output_names = _get_output_names(block, targets) if op_path_dict is None: - op_path_dict = dict() + op_path_dict = {} relevant_op_flags = [True] * len(block.ops) @@ -2465,7 +2455,7 @@ def calc_gradient_helper( raise ValueError("input must be in the same program as targets") block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) - op_path_dict = dict() + op_path_dict = {} op_path = _find_op_path_( block, targets, inputs, block_no_grad_set, op_path_dict ) @@ -2510,14 +2500,14 @@ def calc_gradient_helper( block.program._sync_with_cpp() # find no grad var by op_path - no_grad_vars = _find_no_grad_vars( + no_grad_set = _find_no_grad_vars( block, op_path, tmp_targets, block_no_grad_set ) - block_no_grad_set.update(no_grad_vars) + block_no_grad_set.update(no_grad_set) no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set))) - grad_to_var = dict() - grad_info_map = dict() + grad_to_var = {} + grad_info_map = {} _append_backward_ops_( block, op_path, @@ -2636,6 +2626,56 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None): >>> print(z) [var x@GRAD : LOD_TENSOR.shape(-1, 2, 8, 8).dtype(float32).stop_gradient(False)] """ + if framework.in_pir_mode(): + check_type( + targets, + 'targets', + ((paddle.pir.Value, paddle.pir.OpResult), list, tuple), + 'paddle.autograd.ir_backward.grad', + ) + check_type( + inputs, + 'inputs', + ((paddle.pir.Value, paddle.pir.OpResult), list, tuple), + 'paddle.autograd.ir_backward.grad', + ) + check_type( + target_gradients, + 'target_gradients', + ((paddle.pir.Value, paddle.pir.OpResult), list, tuple, type(None)), + 'paddle.autograd.ir_backward.grad', + ) + + check_type( + no_grad_set, + 'no_grad_set', + ( + (paddle.pir.Value, paddle.pir.OpResult), + list, + tuple, + set, + type(None), + ), + 'paddle.autograd.ir_backward.grad', + ) + targets = _as_list(targets) + inputs = _as_list(inputs) + target_gradients = _as_list(target_gradients) + if no_grad_set is None: + no_grad_set = set() + elif no_grad_set is not set: + no_grad_set = set(no_grad_set) + else: + no_grad_set = no_grad_set + from paddle.autograd.ir_backward import ( + calc_gradient as pir_calc_gradient, + ) + + input_grad = pir_calc_gradient( + targets, inputs, target_gradients, no_grad_set + ) + return input_grad + check_type( targets, 'targets', diff --git a/python/paddle/base/data_feeder.py b/python/paddle/base/data_feeder.py index 536f524c596e6c..81c6f32a893ca4 100644 --- a/python/paddle/base/data_feeder.py +++ b/python/paddle/base/data_feeder.py @@ -13,11 +13,11 @@ # limitations under the License. import struct -import warnings import numpy as np from ..pir import OpResult +from ..pir.core import ParameterMeta from . import core from .framework import ( Variable, @@ -148,7 +148,9 @@ def check_variable_and_dtype( input, input_name, expected_dtype, op_name, extra_message='' ): if in_pir_mode(): - check_type(input, input_name, OpResult, op_name, extra_message) + check_type( + input, input_name, (OpResult, ParameterMeta), op_name, extra_message + ) else: check_type(input, input_name, Variable, op_name, extra_message) check_dtype(input.dtype, input_name, expected_dtype, op_name, extra_message) @@ -196,22 +198,7 @@ def check_dtype( # See NOTE [ Why skip dynamic graph check ] if in_dygraph_mode(): return - if convert_dtype(input_dtype) in ['float16']: - warnings.warn( - "The data type of '{}' in {} only support float16 in GPU now. 
{}".format( - input_name, op_name, extra_message - ) - ) - if convert_dtype(input_dtype) in ['uint16'] and op_name not in [ - 'reshape', - 'lookup_table', - 'scale', - ]: - warnings.warn( - "The data type of '{}' in {} only support bfloat16 in OneDNN now. {}".format( - input_name, op_name, extra_message - ) - ) + if convert_dtype(input_dtype) not in expected_dtype: raise TypeError( "The data type of '{}' in {} must be {}, but received {}. {}".format( diff --git a/python/paddle/base/default_scope_funcs.py b/python/paddle/base/default_scope_funcs.py index dd820572e5edca..225da00088d9a7 100644 --- a/python/paddle/base/default_scope_funcs.py +++ b/python/paddle/base/default_scope_funcs.py @@ -42,7 +42,7 @@ def get_cur_scope(): """ cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None) if cur_scope_stack is None: - __tl_scope__.cur_scope = list() + __tl_scope__.cur_scope = [] if len(__tl_scope__.cur_scope) == 0: __tl_scope__.cur_scope.append(paddle.base.core.Scope()) return __tl_scope__.cur_scope[-1] diff --git a/python/paddle/base/dygraph/base.py b/python/paddle/base/dygraph/base.py index 3c89b56d660066..cadb6bcb089377 100644 --- a/python/paddle/base/dygraph/base.py +++ b/python/paddle/base/dygraph/base.py @@ -161,9 +161,8 @@ def _convert_into_variable(tensor): def enabled(): """ This function checks whether the program runs in dynamic graph mode or not. - You can enter dynamic graph mode with :ref:`api_base_dygraph_guard` api, - or enable and disable dynamic graph mode with :ref:`api_base_dygraph_enable_dygraph` - and :ref:`api_base_dygraph_disable_dygraph` api . + You can enable dynamic graph mode with :ref:`api_paddle_disable_static` api, + or disable dynamic graph mode with :ref:`api_paddle_enable_static` . **Note**: ``base.dygraph.enabled`` is the alias of ``base.in_dygraph_mode``, and @@ -175,12 +174,14 @@ def enabled(): Examples: .. code-block:: python - import paddle.base as base + >>> import paddle.base as base - base.enable_dygraph() # Now we are in dygragh mode - print(base.dygraph.enabled()) # True - base.disable_dygraph() - print(base.dygraph.enabled()) # False + >>> base.enable_dygraph() # Now we are in dygragh mode + >>> print(base.dygraph.enabled()) + True + >>> base.disable_dygraph() + >>> print(base.dygraph.enabled()) + False """ # TODO(jiabin): Make this check as in_dygraph_mode when we support default eager mode. return framework.in_dygraph_mode() @@ -205,14 +206,17 @@ def enable_dygraph(place=None): Examples: .. code-block:: python - import paddle - print(paddle.in_dynamic_mode()) # True, dynamic mode is turn ON by default since paddle 2.0.0 + >>> import paddle + >>> print(paddle.in_dynamic_mode()) + True - paddle.enable_static() - print(paddle.in_dynamic_mode()) # False, Now we are in static graph mode + >>> paddle.enable_static() + >>> print(paddle.in_dynamic_mode()) + False - paddle.disable_static() - print(paddle.in_dynamic_mode()) # True, Now we are in dynamic mode + >>> paddle.disable_static() + >>> print(paddle.in_dynamic_mode()) + True """ global global_var @@ -240,14 +244,17 @@ def disable_dygraph(): Examples: .. 
code-block:: python - import paddle - print(paddle.in_dynamic_mode()) # True, dynamic mode is turn ON by default since paddle 2.0.0 + >>> import paddle + >>> print(paddle.in_dynamic_mode()) + True - paddle.enable_static() - print(paddle.in_dynamic_mode()) # False, Now we are in static graph mode + >>> paddle.enable_static() + >>> print(paddle.in_dynamic_mode()) + False - paddle.disable_static() - print(paddle.in_dynamic_mode()) # True, Now we are in dynamic mode + >>> paddle.disable_static() + >>> print(paddle.in_dynamic_mode()) + True """ global global_var @@ -281,40 +288,40 @@ def no_grad(func=None): Examples: - .. code-block:: python - - import numpy as np - import paddle.base as base - - # use as generator - - data = np.array([[2, 3], [4, 5]]).astype('float32') - with base.dygraph.guard(): - l0 = base.Linear(2, 2) # l0.weight.gradient() is None - l1 = base.Linear(2, 2) - with base.dygraph.no_grad(): - # l1.weight.stop_gradient is False - tmp = l1.weight * 2 # tmp.stop_gradient is True - x = base.dygraph.to_variable(data) - y = l0(x) + tmp - o = l1(y) - o.backward() - print(tmp.gradient() is None) # True - print(l0.weight.gradient() is None) # False - - # use as decorator - - @base.dygraph.no_grad - def test_layer(): - with base.dygraph.guard(): - inp = np.ones([3, 1024], dtype='float32') - t = base.dygraph.base.to_variable(inp) - linear1 = base.Linear(1024, 4, bias_attr=False) - linear2 = base.Linear(4, 4) - ret = linear1(t) - dy_ret = linear2(ret) - - test_layer() + .. code-block:: python + + >>> import numpy as np + >>> import paddle.base as base + + >>> # use as generator + + >>> data = np.array([[2, 3], [4, 5]]).astype('float32') + >>> with base.dygraph.guard(): + ... l0 = paddle.nn.Linear(2, 2) # l0.weight.gradient() is None + ... l1 = paddle.nn.Linear(2, 2) + ... with base.dygraph.no_grad(): + ... # l1.weight.stop_gradient is False + ... tmp = l1.weight * 2 # tmp.stop_gradient is True + ... x = base.dygraph.to_variable(data) + ... y = l0(x) + tmp + ... o = l1(y) + ... o.backward() + ... print(tmp.gradient() is None) + ... print(l0.weight.gradient() is None) + True + False + + >>> @base.dygraph.no_grad + >>> def test_layer(): + ... with base.dygraph.guard(): + ... inp = np.ones([3, 1024], dtype='float32') + ... t = base.dygraph.base.to_variable(inp) + ... linear1 = paddle.nn.Linear(1024, 4, bias_attr=False) + ... linear2 = paddle.nn.Linear(4, 4) + ... ret = linear1(t) + ... dy_ret = linear2(ret) + ... + >>> test_layer() """ if in_to_static_mode(): @@ -374,16 +381,19 @@ def is_grad_enabled(): Examples: .. code-block:: python - import paddle + >>> import paddle - # Dygraph gradient calculation mode is enabled by default. - paddle.is_grad_enabled() # True + >>> # Dygraph gradient calculation mode is enabled by default. + >>> paddle.is_grad_enabled() + True - with paddle.set_grad_enabled(False): - paddle.is_grad_enabled() # False + >>> with paddle.set_grad_enabled(False): + ... paddle.is_grad_enabled() + False - paddle.enable_static() - paddle.is_grad_enabled() # False + >>> paddle.enable_static() + >>> paddle.is_grad_enabled() + False """ tracer = framework._dygraph_tracer() return tracer._has_grad if tracer else False @@ -408,20 +418,23 @@ class set_grad_enabled(_DecoratorContextManager): Examples: .. 
code-block:: python - import paddle - x = paddle.to_tensor([1.], stop_gradient=False) - is_train = False - with paddle.set_grad_enabled(is_train): - y = x * 2 - assert(y.stop_gradient == True) - - paddle.set_grad_enabled(True) - y = x * 2 - assert(y.stop_gradient == False) - - paddle.set_grad_enabled(False) - y = x * 2 - assert(y.stop_gradient == True) + >>> import paddle + >>> x = paddle.to_tensor([1.], stop_gradient=False) + >>> is_train = False + >>> with paddle.set_grad_enabled(is_train): + ... y = x * 2 + >>> print(y.stop_gradient) + True + + >>> paddle.set_grad_enabled(True) + >>> y = x * 2 + >>> print(y.stop_gradient) + False + + >>> paddle.set_grad_enabled(False) + >>> y = x * 2 + >>> print(y.stop_gradient) + True """ def __init__(self, mode): @@ -451,38 +464,40 @@ class no_grad_(_DecoratorContextManager): Examples: - .. code-block:: python - - import numpy as np - import paddle - - # use as generator - - data = np.array([[2, 3], [4, 5]]).astype('float32') - l0 = paddle.nn.Linear(2, 2) # l0.weight.gradient() is None - l1 = paddle.nn.Linear(2, 2) - with paddle.no_grad(): - # l1.weight.stop_gradient is False - tmp = l1.weight * 2 # tmp.stop_gradient is True - x = paddle.to_tensor(data) - y = l0(x) + tmp - o = l1(y) - o.backward() - print(tmp.gradient() is None) # True - print(l0.weight.gradient() is None) # False - - # use as decorator - - @paddle.no_grad() - def test_layer(): - inp = np.ones([3, 1024], dtype='float32') - t = paddle.to_tensor(inp) - linear1 = paddle.nn.Linear(1024, 4, bias_attr=False) - linear2 = paddle.nn.Linear(4, 4) - ret = linear1(t) - dy_ret = linear2(ret) - - test_layer() + .. code-block:: python + + >>> import numpy as np + >>> import paddle + + >>> # use as generator + + >>> data = np.array([[2, 3], [4, 5]]).astype('float32') + >>> l0 = paddle.nn.Linear(2, 2) # l0.weight.gradient() is None + >>> l1 = paddle.nn.Linear(2, 2) + >>> with paddle.no_grad(): + ... # l1.weight.stop_gradient is False + ... tmp = l1.weight * 2 # tmp.stop_gradient is True + >>> x = paddle.to_tensor(data) + >>> y = l0(x) + tmp + >>> o = l1(y) + >>> o.backward() + >>> print(tmp.gradient() is None) + True + >>> print(l0.weight.gradient() is None) + False + + >>> # use as decorator + + >>> @paddle.no_grad() + >>> def test_layer(): + ... inp = np.ones([3, 1024], dtype='float32') + ... t = paddle.to_tensor(inp) + ... linear1 = paddle.nn.Linear(1024, 4, bias_attr=False) + ... linear2 = paddle.nn.Linear(4, 4) + ... ret = linear1(t) + ... dy_ret = linear2(ret) + ... + >>> test_layer() """ def __enter__(self): @@ -507,30 +522,30 @@ class enable_grad(_DecoratorContextManager): Examples: - .. code-block:: python - - import paddle - - # use as generator + .. code-block:: python - x = paddle.to_tensor([1.], stop_gradient=False) - with paddle.no_grad(): - with paddle.enable_grad(): - y = x * 2 - assert(y.stop_gradient == False) - y.backward() - assert(x.grad is not None) + >>> import paddle - # use as decorator + >>> # use as generator - @paddle.enable_grad() - def double(x): - return x * 2 + >>> x = paddle.to_tensor([1.], stop_gradient=False) + >>> with paddle.no_grad(): + ... with paddle.enable_grad(): + ... y = x * 2 + >>> assert(y.stop_gradient == False) + >>> y.backward() + >>> assert(x.grad is not None) - with paddle.no_grad(): - z = double(x) + >>> # use as decorator - assert(z.stop_gradient == False) + >>> @paddle.enable_grad() + >>> def double(x): + ... return x * 2 + ... + >>> with paddle.no_grad(): + ... z = double(x) + ... 
+ >>> assert(z.stop_gradient == False) """ def __enter__(self): @@ -559,19 +574,19 @@ def guard(place=None): Examples: - .. code-block:: python - - import numpy as np - import paddle.base as base - - with base.dygraph.guard(): - inp = np.ones([3, 1024], dtype='float32') - t = base.dygraph.base.to_variable(inp) - linear1 = base.Linear(1024, 4, bias_attr=False) - linear2 = base.Linear(4, 4) - ret = linear1(t) - dy_ret = linear2(ret) + .. code-block:: python + >>> import numpy as np + >>> import paddle.base as base + + >>> with base.dygraph.guard(): + ... inp = np.ones([3, 1024], dtype='float32') + ... t = base.dygraph.base.to_variable(inp) + ... linear1 = paddle.nn.Linear(1024, 4, bias_attr=False) + ... linear2 = paddle.nn.Linear(4, 4) + ... ret = linear1(t) + ... dy_ret = linear2(ret) + ... """ train = framework.Program() startup = framework.Program() @@ -580,7 +595,7 @@ def guard(place=None): if place is not None: expected_place = _get_paddle_place(place) else: - expected_place = framework._current_expected_place() + expected_place = framework._current_expected_place_() with framework.program_guard(train, startup): with framework.unique_name.guard(): @@ -652,79 +667,85 @@ def grad( .. code-block:: python :name: code-example-1 - import paddle - - def test_dygraph_grad(create_graph): - x = paddle.ones(shape=[1], dtype='float32') - x.stop_gradient = False - y = x * x - - # Since y = x * x, dx = 2 * x - dx = paddle.grad( - outputs=[y], - inputs=[x], - create_graph=create_graph, - retain_graph=True)[0] - - z = y + dx - - # If create_graph = False, the gradient of dx - # would not be backpropagated. Therefore, - # z = x * x + dx, and x.gradient() = 2 * x = 2.0 - - # If create_graph = True, the gradient of dx - # would be backpropagated. Therefore, - # z = x * x + dx = x * x + 2 * x, and - # x.gradient() = 2 * x + 2 = 4.0 - - z.backward() - return x.gradient() - - print(test_dygraph_grad(create_graph=False)) # [2.] - print(test_dygraph_grad(create_graph=True)) # [4.] + >>> import paddle + + >>> def test_dygraph_grad(create_graph): + ... x = paddle.ones(shape=[1], dtype='float32') + ... x.stop_gradient = False + ... y = x * x + ... + ... # Since y = x * x, dx = 2 * x + ... dx = paddle.grad( + ... outputs=[y], + ... inputs=[x], + ... create_graph=create_graph, + ... retain_graph=True)[0] + ... + ... z = y + dx + ... + ... # If create_graph = False, the gradient of dx + ... # would not be backpropagated. Therefore, + ... # z = x * x + dx, and x.gradient() = 2 * x = 2.0 + ... + ... # If create_graph = True, the gradient of dx + ... # would be backpropagated. Therefore, + ... # z = x * x + dx = x * x + 2 * x, and + ... # x.gradient() = 2 * x + 2 = 4.0 + ... + ... z.backward() + ... return x.gradient() + ... + >>> print(test_dygraph_grad(create_graph=False)) + [2.] + >>> print(test_dygraph_grad(create_graph=True)) + [4.] .. code-block:: python :name: code-example-2 - import paddle - - def test_dygraph_grad(grad_outputs=None): - x = paddle.to_tensor(2.0) - x.stop_gradient = False - - y1 = x * x - y2 = x * 3 - - # If grad_outputs=None, dy1 = [1], dy2 = [1]. - # If grad_outputs=[g1, g2], then: - # - dy1 = [1] if g1 is None else g1 - # - dy2 = [1] if g2 is None else g2 - - # Since y1 = x * x, dx = 2 * x * dy1. - # Since y2 = x * 3, dx = 3 * dy2. - # Therefore, the final result would be: - # dx = 2 * x * dy1 + 3 * dy2 = 4 * dy1 + 3 * dy2. 
- - dx = paddle.grad( - outputs=[y1, y2], - inputs=[x], - grad_outputs=grad_outputs)[0] - - return dx.numpy() - - grad_value = paddle.to_tensor(4.0) - # dy1 = [1], dy2 = [1] - print(test_dygraph_grad(None)) # [7.] - - # dy1 = [1], dy2 = [4] - print(test_dygraph_grad([None, grad_value])) # [16.] - - # dy1 = [4], dy2 = [1] - print(test_dygraph_grad([grad_value, None])) # [19.] - - # dy1 = [3], dy2 = [4] - grad_y1 = paddle.to_tensor(3.0) - print(test_dygraph_grad([grad_y1, grad_value])) # [24.] + >>> import paddle + + >>> def test_dygraph_grad(grad_outputs=None): + ... x = paddle.to_tensor(2.0) + ... x.stop_gradient = False + ... + ... y1 = x * x + ... y2 = x * 3 + ... + ... # If grad_outputs=None, dy1 = [1], dy2 = [1]. + ... # If grad_outputs=[g1, g2], then: + ... # - dy1 = [1] if g1 is None else g1 + ... # - dy2 = [1] if g2 is None else g2 + ... + ... # Since y1 = x * x, dx = 2 * x * dy1. + ... # Since y2 = x * 3, dx = 3 * dy2. + ... # Therefore, the final result would be: + ... # dx = 2 * x * dy1 + 3 * dy2 = 4 * dy1 + 3 * dy2. + ... + ... dx = paddle.grad( + ... outputs=[y1, y2], + ... inputs=[x], + ... grad_outputs=grad_outputs)[0] + ... + ... return dx.numpy() + ... + >>> grad_value = paddle.to_tensor(4.0) + >>> # dy1 = [1], dy2 = [1] + >>> print(test_dygraph_grad(None)) + 7. + + >>> # dy1 = [1], dy2 = [4] + >>> print(test_dygraph_grad([None, grad_value])) + 16. + + >>> # dy1 = [4], dy2 = [1] + >>> print(test_dygraph_grad([grad_value, None])) + 19. + + >>> # dy1 = [3], dy2 = [4] + >>> grad_y1 = paddle.to_tensor(3.0) + >>> print(test_dygraph_grad([grad_y1, grad_value])) + 24. ''' if in_to_static_mode(): # In dy2static context, we call static interface `gradients` @@ -779,8 +800,6 @@ def check_in_out(in_out_list, name): no_grad_vars = [] elif isinstance(no_grad_vars, core.eager.Tensor): no_grad_vars = [no_grad_vars] - elif isinstance(no_grad_vars, core.eager.Tensor): - no_grad_vars = [no_grad_vars] elif isinstance(no_grad_vars, (list, tuple, set)): no_grad_vars = list(no_grad_vars) for var in no_grad_vars: @@ -850,30 +869,35 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): Examples: - .. code-block:: python - - import numpy as np - import paddle.base as base - - with base.dygraph.guard(base.CPUPlace()): - x = np.ones([2, 2], np.float32) - y = base.dygraph.to_variable(x, zero_copy=False) - x[0][0] = -1 - y[0][0].numpy() # array([1.], dtype=float32) - y = base.dygraph.to_variable(x) - x[0][0] = 0 - y[0][0].numpy() # array([0.], dtype=float32) - c = np.array([2+1j, 2]) - z = base.dygraph.to_variable(c) - z.numpy() # array([2.+1.j, 2.+0.j]) - z.dtype # 'complex128' - - y = base.dygraph.to_variable([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]]) - y.shape # [3L, 2L] - - y = base.dygraph.to_variable(((0.1, 1.2), (2.2, 3.1), (4.9, 5.2)), dtype='int32') - y.shape # [3L, 2L] + .. code-block:: python + >>> import numpy as np + >>> import paddle.base as base + + >>> with base.dygraph.guard(base.CPUPlace()): + ... x = np.ones([2, 2], np.float32) + ... y = base.dygraph.to_variable(x, zero_copy=False) + ... x[0][0] = -1 + ... print(y[0][0].numpy()) + ... y = base.dygraph.to_variable(x) + ... x[0][0] = 0 + ... print(y[0][0].numpy()) + ... c = np.array([2+1j, 2]) + ... z = base.dygraph.to_variable(c) + ... print(z.numpy()) + ... print(z.dtype) + ... + ... y = base.dygraph.to_variable([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]]) + ... print(y.shape) + ... + ... y = base.dygraph.to_variable(((0.1, 1.2), (2.2, 3.1), (4.9, 5.2)), dtype='int32') + ... 
print(y.shape) + 1 + -1 + [2.+1.j, 2.+0.j] + paddle.complex128 + [3, 2] + [3, 2] """ support_type = ( list, diff --git a/python/paddle/base/dygraph/math_op_patch.py b/python/paddle/base/dygraph/math_op_patch.py index 5972b545f93e23..172f73bf7f531f 100644 --- a/python/paddle/base/dygraph/math_op_patch.py +++ b/python/paddle/base/dygraph/math_op_patch.py @@ -150,7 +150,7 @@ def _index_(var): return int(np.array(var)) @property - def _ndim_(var): + def _ndim(var): return len(var.shape) def ndimension(var): @@ -183,7 +183,7 @@ def _T_(var): ('astype', astype), ('dim', dim), ('ndimension', ndimension), - ('ndim', _ndim_), + ('ndim', _ndim), ('size', _size_), ('T', _T_), # for logical compare diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py index b01c7a70e44066..1f5b414ebb559e 100644 --- a/python/paddle/base/dygraph/tensor_patch_methods.py +++ b/python/paddle/base/dygraph/tensor_patch_methods.py @@ -869,7 +869,7 @@ def cuda(self, device_id=None, blocking=True): if self.place._equals(res_place): return self else: - res = self._copy_to(res_place, True) + res = self._copy_to(res_place, blocking) res.stop_gradient = self.stop_gradient res.persistable = self.persistable return res diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index c2db2f04f663dd..037657ee0ad94c 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -58,11 +58,11 @@ def global_scope(): Examples: .. code-block:: python - import paddle - import numpy + >>> import paddle + >>> import numpy - paddle.static.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), paddle.CPUPlace()) - numpy.array(paddle.static.global_scope().find_var("data").get_tensor()) + >>> paddle.static.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), paddle.CPUPlace()) + >>> numpy.array(paddle.static.global_scope().find_var("data").get_tensor()) """ return g_scope @@ -98,14 +98,16 @@ def scope_guard(scope): .. code-block:: python - import paddle - import numpy - paddle.enable_static() + >>> import paddle + >>> import numpy + >>> paddle.enable_static() - new_scope = paddle.static.Scope() - with paddle.static.scope_guard(new_scope): - paddle.static.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), paddle.CPUPlace()) - numpy.array(new_scope.find_var("data").get_tensor()) + >>> new_scope = paddle.static.Scope() + >>> with paddle.static.scope_guard(new_scope): + ... paddle.static.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), paddle.CPUPlace()) + >>> numpy.array(new_scope.find_var("data").get_tensor()) + array([[1., 1.], + [1., 1.]]) """ ex = _switch_scope(scope) @@ -123,14 +125,14 @@ def as_numpy(tensor, copy=False): Examples: .. code-block:: python - import paddle.base as base - import numpy + >>> import paddle.base as base + >>> import numpy - new_scope = base.Scope() - with base.scope_guard(new_scope): - base.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), base.CPUPlace()) - tensor = new_scope.find_var("data").get_tensor() - base.executor.as_numpy(tensor) # or numpy.array(new_scope.find_var("data").get_tensor()) + >>> new_scope = base.Scope() + >>> with base.scope_guard(new_scope): + ... 
base.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), base.CPUPlace()) + >>> tensor = new_scope.find_var("data").get_tensor() + >>> base.executor.as_numpy(tensor) # or numpy.array(new_scope.find_var("data").get_tensor()) Args: tensor(Variable): a instance of Tensor @@ -670,12 +672,15 @@ def _as_lodtensor(data, place, dtype=None): For higher dimensional sequence data, please use LoDTensor directly. Examples: - >>> import paddle.base as base - >>> place = base.CPUPlace() - >>> exe = base.executor(place) - >>> data = np.array(size=(100, 200, 300)) - >>> np_outs = map(lambda x: base.executor._as_lodtensor(x, place), data) - >>> ... + + .. code-block:: python + + >>> import numpy as np + >>> import paddle.base as base + >>> place = base.CPUPlace() + >>> exe = base.Executor(place) + >>> data = np.array((100, 200, 300)) + >>> np_outs = map(lambda x: base.executor._as_lodtensor(x, place), data) Args: data(numpy.ndarray|list|tuple|scalar): a instance of array, scalar, list or tuple @@ -739,6 +744,11 @@ def _can_use_interpreter_core(program, place): return True +@lru_cache() +def _warning_once(msg): + logging.warning(msg) + + class FetchHandler: def __init__(self, var_dict=None, period_secs=60): assert var_dict is not None @@ -966,7 +976,9 @@ def _get_program_and_executor(self, cached_data): else False ) - if os.getenv("FLAGS_enable_new_ir_in_executor"): + if get_flags('FLAGS_enable_new_ir_in_executor')[ + 'FLAGS_enable_new_ir_in_executor' + ]: # todo(phlrain), skip inplace add addto pass in new IR enable_inplace = False enable_addto = False @@ -1044,44 +1056,45 @@ class Executor: Executor Examples: + .. code-block:: python - import paddle - import numpy - import os - - # Executor is only used in static graph mode - paddle.enable_static() - - # Set place explicitly. - # use_cuda = True - # place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() - # exe = paddle.static.Executor(place) - - # If you don't set place, PaddlePaddle sets the default device. - exe = paddle.static.Executor() - - train_program = paddle.static.Program() - startup_program = paddle.static.Program() - with paddle.static.program_guard(train_program, startup_program): - data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') - hidden = paddle.static.nn.fc(data, 10) - loss = paddle.mean(hidden) - paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) - - # Run the startup program once and only once. - # Not need to optimize/compile the startup program. - exe.run(startup_program) - - # Run the main program directly without compile. - x = numpy.random.random(size=(10, 1)).astype('float32') - loss_data, = exe.run(train_program, feed={"X": x}, fetch_list=[loss.name]) - - # Or, compiled the program and run. See `CompiledProgram` - # for more details. - compiled_prog = paddle.static.CompiledProgram( - train_program) - loss_data, = exe.run(compiled_prog, feed={"X": x}, fetch_list=[loss.name]) + >>> import paddle + >>> import numpy + >>> import os + + >>> # Executor is only used in static graph mode + >>> paddle.enable_static() + + >>> # Set place explicitly. + >>> # use_cuda = True + >>> # place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + >>> # exe = paddle.static.Executor(place) + + >>> # If you don't set place, PaddlePaddle sets the default device. + >>> exe = paddle.static.Executor() + + >>> train_program = paddle.static.Program() + >>> startup_program = paddle.static.Program() + >>> with paddle.static.program_guard(train_program, startup_program): + ... 
data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') + ... hidden = paddle.static.nn.fc(data, 10) + ... loss = paddle.mean(hidden) + ... paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) + ... + >>> # Run the startup program once and only once. + >>> # Not need to optimize/compile the startup program. + >>> exe.run(startup_program) + + >>> # Run the main program directly without compile. + >>> x = numpy.random.random(size=(10, 1)).astype('float32') + >>> loss_data, = exe.run(train_program, feed={"X": x}, fetch_list=[loss.name]) + + >>> # Or, compiled the program and run. See `CompiledProgram` + >>> # for more details. + >>> compiled_prog = paddle.static.CompiledProgram( + ... train_program) + >>> loss_data, = exe.run(compiled_prog, feed={"X": x}, fetch_list=[loss.name]) """ @@ -1091,18 +1104,18 @@ def __init__(self, place=None): self.place = expected_place else: self.place = framework._get_paddle_place(place) - self.program_caches = dict() - self.ctx_caches = dict() - self.trainer_caches = dict() - self.scope_caches = dict() - self.micro_scope_cache = dict() - self.var_caches = dict() - self.pruned_program_caches = dict() + self.program_caches = {} + self.ctx_caches = {} + self.trainer_caches = {} + self.scope_caches = {} + self.micro_scope_cache = {} + self.var_caches = {} + self.pruned_program_caches = {} p = core.Place() p.set_place(self.place) self._default_executor = core.Executor(p) self._closed = False - self.pruned_program_scope_caches = dict() + self.pruned_program_scope_caches = {} self._prepare_to_run_called = False self._auto_checkpoint_name = unique_name.generate( @@ -1171,10 +1184,8 @@ def _add_micro_scopes_cache(self, program_cache_key, micro_scopes: list): def _get_micro_scopes_cache(self, program_cache_key): return self.micro_scope_cache.get(program_cache_key, None) - # just for testing, will be removed later - @lru_cache() def _log_force_set_program_cache(self, use_program_cache): - logging.warning( + _warning_once( f"use_program_cache is force set to {use_program_cache} by FLAGS_FORCE_USE_PROGRAM_CACHE" ) @@ -1440,14 +1451,15 @@ def close(self): None Examples: + .. code-block:: python - import paddle + >>> import paddle - cpu = paddle.CPUPlace() - exe = paddle.static.Executor(cpu) - # execute training or testing - exe.close() + >>> cpu = paddle.CPUPlace() + >>> exe = paddle.static.Executor(cpu) + >>> # execute training or testing + >>> exe.close() """ if not self._closed: self._closed = True @@ -1519,78 +1531,82 @@ def run( List: The fetched result list. Examples: + .. code-block:: python :name: code-example-1 - import paddle - import numpy - - # First create the Executor. - paddle.enable_static() - place = paddle.CPUPlace() # paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - - data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') - hidden = paddle.static.nn.fc(data, 10) - loss = paddle.mean(hidden) - adam = paddle.optimizer.Adam() - adam.minimize(loss) - i = paddle.zeros(shape=[1], dtype='int64') - array = paddle.tensor.array_write(x=loss, i=i) - - # Run the startup program once and only once. - exe.run(paddle.static.default_startup_program()) - - x = numpy.random.random(size=(10, 1)).astype('float32') - loss_val, array_val = exe.run(feed={'X': x}, - fetch_list=[loss.name, array.name]) - print(array_val) - # [array([0.02153828], dtype=float32)] + >>> import paddle + >>> import numpy + + >>> # First create the Executor. 
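One pattern worth calling out from this file: the new module-level `_warning_once` uses `functools.lru_cache` so that a given message is logged only on its first occurrence; later calls hit the cache and never reach `logging.warning`. A minimal standalone sketch of the idiom (names here are illustrative, not part of the patch):

from functools import lru_cache
import logging

@lru_cache()
def warn_once(msg):
    # First call with a given msg executes the body; repeats return the cached None.
    logging.warning(msg)

warn_once("use_program_cache is force set")  # emitted
warn_once("use_program_cache is force set")  # suppressed by the cache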
+ >>> paddle.enable_static() + >>> place = paddle.CPUPlace() # paddle.CUDAPlace(0) + >>> exe = paddle.static.Executor(place) + + >>> data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') + >>> hidden = paddle.static.nn.fc(data, 10) + >>> loss = paddle.mean(hidden) + >>> adam = paddle.optimizer.Adam() + >>> adam.minimize(loss) + >>> i = paddle.zeros(shape=[1], dtype='int64') + >>> array = paddle.tensor.array_write(x=loss, i=i) + + >>> # Run the startup program once and only once. + >>> exe.run(paddle.static.default_startup_program()) + + >>> x = numpy.random.random(size=(10, 1)).astype('float32') + >>> loss_val, array_val = exe.run(feed={'X': x}, + ... fetch_list=[loss.name, array.name]) + >>> print(array_val) + >>> # doctest: +SKIP("Random output") + [array(0.16870381, dtype=float32)] + >>> # doctest: -SKIP .. code-block:: python :name: code-example-2 - # required: gpu - import paddle - import numpy as np - - # First create the Executor. - paddle.enable_static() - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - - data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') - class_dim = 2 - prediction = paddle.static.nn.fc(data, class_dim) - loss = paddle.mean(prediction) - adam = paddle.optimizer.Adam() - adam.minimize(loss) - - # Run the startup program once and only once. - exe.run(paddle.static.default_startup_program()) - build_strategy = paddle.static.BuildStrategy() - binary = paddle.static.CompiledProgram( - paddle.static.default_main_program(), build_strategy=build_strategy) - batch_size = 6 - x = np.random.random(size=(batch_size, 1)).astype('float32') - - prediction, = exe.run(binary, - feed={'X': x}, - fetch_list=[prediction.name]) - # If the user uses two GPU cards to run this python code, the printed result will be - # (6, class_dim). The first dimension value of the printed result is the batch_size. - print("The prediction shape: {}".format( - np.array(prediction).shape)) - print(prediction) - - # Out: - # The prediction shape: (6, 2) - # [[-0.37789783 -0.19921964] - # [-0.3577645 -0.18863106] - # [-0.24274671 -0.12814042] - # [-0.24635398 -0.13003758] - # [-0.49232286 -0.25939852] - # [-0.44514108 -0.2345845 ]] + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> import numpy as np + + >>> # First create the Executor. + >>> paddle.enable_static() + >>> place = paddle.CUDAPlace(0) + >>> exe = paddle.static.Executor(place) + + >>> data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') + >>> class_dim = 2 + >>> prediction = paddle.static.nn.fc(data, class_dim) + >>> loss = paddle.mean(prediction) + >>> adam = paddle.optimizer.Adam() + >>> adam.minimize(loss) + + >>> # Run the startup program once and only once. + >>> exe.run(paddle.static.default_startup_program()) + >>> build_strategy = paddle.static.BuildStrategy() + >>> binary = paddle.static.CompiledProgram( + ... paddle.static.default_main_program(), build_strategy=build_strategy) + >>> batch_size = 6 + >>> x = np.random.random(size=(batch_size, 1)).astype('float32') + + >>> prediction, = exe.run(binary, + ... feed={'X': x}, + ... fetch_list=[prediction.name]) + >>> # If the user uses two GPU cards to run this python code, the printed result will be + >>> # (6, class_dim). The first dimension value of the printed result is the batch_size. + >>> print("The prediction shape: {}".format( + ... 
np.array(prediction).shape)) + The prediction shape: (6, 2) + + >>> print(prediction) + >>> # doctest: +SKIP("Random output") + [[-0.37789783 -0.19921964] + [-0.3577645 -0.18863106] + [-0.24274671 -0.12814042] + [-0.24635398 -0.13003758] + [-0.49232286 -0.25939852] + [-0.44514108 -0.2345845 ]] + >>> # doctest: -SKIP """ # Temporary FLAGS, just for testing the performance of program cache @@ -1894,13 +1910,7 @@ def _run_pir_impl( "Please ensure you create model correctly or you can pass " "the Program or the CompiledProgram manually." ) - else: - error_info = ( - "There are no operators in the program to be executed. " - "If you pass Program manually, please use base.program_guard " - "to ensure the current Program is being used." - ) - warnings.warn(error_info) + warnings.warn(error_info) if scope is None: scope = global_scope() @@ -2717,7 +2727,7 @@ def _run_using_fleet_executor( if return_numpy: tensor = as_numpy(tensor) else: - tensor = [t for t in tensor] + tensor = list(tensor) if tensor: scope_result_list.append(tensor) @@ -2915,23 +2925,22 @@ def infer_from_dataset( .. code-block:: python - import paddle - - paddle.enable_static() - place = paddle.CPUPlace() # you can set place = paddle.CUDAPlace(0) to use gpu - exe = paddle.static.Executor(place) - x = paddle.static.data(name="x", shape=[None, 10, 10], dtype="int64") - y = paddle.static.data(name="y", shape=[None, 1], dtype="int64", lod_level=1) - dataset = paddle.base.DatasetFactory().create_dataset() - dataset.set_use_var([x, y]) - dataset.set_thread(1) - # you should set your own filelist, e.g. filelist = ["dataA.txt"] - filelist = [] - dataset.set_filelist(filelist) - exe.run(paddle.static.default_startup_program()) - exe.infer_from_dataset(program=paddle.static.default_main_program(), - dataset=dataset) - + >>> import paddle + + >>> paddle.enable_static() + >>> place = paddle.CPUPlace() # you can set place = paddle.CUDAPlace(0) to use gpu + >>> exe = paddle.static.Executor(place) + >>> x = paddle.static.data(name="x", shape=[None, 10, 10], dtype="int64") + >>> y = paddle.static.data(name="y", shape=[None, 1], dtype="int64", lod_level=1) + >>> dataset = paddle.base.DatasetFactory().create_dataset() + >>> dataset.set_use_var([x, y]) + >>> dataset.set_thread(1) + >>> # you should set your own filelist, e.g. filelist = ["dataA.txt"] + >>> filelist = [] + >>> dataset.set_filelist(filelist) + >>> exe.run(paddle.static.default_startup_program()) + >>> exe.infer_from_dataset(program=paddle.static.default_main_program(), + ... dataset=dataset) """ return self._run_from_dataset( program, @@ -3038,23 +3047,22 @@ def train_from_dataset( .. code-block:: python - import paddle - - paddle.enable_static() - place = paddle.CPUPlace() # you can set place = paddle.CUDAPlace(0) to use gpu - exe = paddle.static.Executor(place) - x = paddle.static.data(name="x", shape=[None, 10, 10], dtype="int64") - y = paddle.static.data(name="y", shape=[None, 1], dtype="int64", lod_level=1) - dataset = paddle.base.DatasetFactory().create_dataset() - dataset.set_use_var([x, y]) - dataset.set_thread(1) - # you should set your own filelist, e.g. 
filelist = ["dataA.txt"] - filelist = [] - dataset.set_filelist(filelist) - exe.run(paddle.static.default_startup_program()) - exe.train_from_dataset(program=paddle.static.default_main_program(), - dataset=dataset) - + >>> import paddle + + >>> paddle.enable_static() + >>> place = paddle.CPUPlace() # you can set place = paddle.CUDAPlace(0) to use gpu + >>> exe = paddle.static.Executor(place) + >>> x = paddle.static.data(name="x", shape=[None, 10, 10], dtype="int64") + >>> y = paddle.static.data(name="y", shape=[None, 1], dtype="int64", lod_level=1) + >>> dataset = paddle.base.DatasetFactory().create_dataset() + >>> dataset.set_use_var([x, y]) + >>> dataset.set_thread(1) + >>> # you should set your own filelist, e.g. filelist = ["dataA.txt"] + >>> filelist = [] + >>> dataset.set_filelist(filelist) + >>> exe.run(paddle.static.default_startup_program()) + >>> exe.train_from_dataset(program=paddle.static.default_main_program(), + ... dataset=dataset) """ return self._run_from_dataset( program, diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index 0e1c62f4fb850d..ca9bcf5fd8db5b 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -12,33 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. -import textwrap import collections -from collections.abc import Iterable -from .wrapped_decorator import signature_safe_contextmanager, wrap_decorator +import copy +import functools +import multiprocessing import os import re +import subprocess +import sys +import textwrap +import threading import traceback -import copy -from types import MethodType, FunctionType +import warnings +from collections.abc import Iterable +from types import FunctionType, MethodType import numpy as np -import subprocess -import multiprocessing -import sys -from .proto import framework_pb2 -from .proto import data_feed_pb2 # noqa: F401 +import paddle.version as paddle_version -from . import core -from . import unique_name from .. import pir -from paddle.base.libpaddle import DataType -import paddle.version as fluid_version -import warnings -import functools -from .variable_index import _getitem_static, _setitem_static, _setitem_impl_ -import threading +from . import core, unique_name +from .libpaddle import DataType +from .proto import data_feed_pb2 # noqa: F401 +from .proto import framework_pb2 +from .variable_index import _getitem_static, _setitem_impl_, _setitem_static +from .wrapped_decorator import signature_safe_contextmanager, wrap_decorator __all__ = [] @@ -467,13 +466,13 @@ def require_version(min_version, max_version=None): Examples: .. code-block:: python - >>> import paddle.base as base + >>> import paddle >>> # any version >= 0.1.0 is acceptable. - >>> base.require_version('0.1.0') + >>> paddle.utils.require_version('0.1.0') >>> # if 0.1.0 <= version <= 10.0.0, it is acceptable. 
- >>> base.require_version(min_version='0.1.0', max_version='10.0.0') + >>> paddle.utils.require_version(min_version='0.1.0', max_version='10.0.0') """ if not isinstance(min_version, str): raise TypeError( @@ -503,10 +502,10 @@ def require_version(min_version, max_version=None): ) version_installed = [ - fluid_version.major, - fluid_version.minor, - fluid_version.patch, - fluid_version.rc, + paddle_version.major, + paddle_version.minor, + paddle_version.patch, + paddle_version.rc, ] zero_version = ['0', '0', '0', '0'] @@ -524,7 +523,7 @@ def version_cmp(ver_a, ver_b): "PaddlePaddle version in [{}, {}] required, but {} installed. " "Maybe you are using a develop version, " "please make sure the version is good with your code.".format( - min_version, max_version, fluid_version.full_version + min_version, max_version, paddle_version.full_version ) ) else: @@ -532,7 +531,7 @@ def version_cmp(ver_a, ver_b): "PaddlePaddle version {} or higher is required, but {} installed, " "Maybe you are using a develop version, " "please make sure the version is good with your code.".format( - min_version, fluid_version.full_version + min_version, paddle_version.full_version ) ) return @@ -554,7 +553,7 @@ def version_cmp(ver_a, ver_b): ): raise Exception( "VersionError: PaddlePaddle version in [{}, {}] required, but {} installed.".format( - min_version, max_version, fluid_version.full_version + min_version, max_version, paddle_version.full_version ) ) else: @@ -562,7 +561,7 @@ def version_cmp(ver_a, ver_b): raise Exception( "VersionError: PaddlePaddle version {} or higher is required, but {} installed, " "please upgrade your PaddlePaddle to {} or other higher version.".format( - min_version, fluid_version.full_version, min_version + min_version, paddle_version.full_version, min_version ) ) @@ -1023,7 +1022,7 @@ def cuda_pinned_places(device_count=None): class NameScope: def __init__(self, name="", parent=None): - self._children = dict() + self._children = {} self._name = name self._parent = parent @@ -1219,7 +1218,7 @@ def _debug_string_(proto, throw_on_error=True): Returns(str): The debug string of the protobuf message """ - error_fields = list() + error_fields = [] if not proto.IsInitialized(error_fields) and throw_on_error: raise ValueError( f"{error_fields} are not initialized.\nThe message is {proto}:\n" @@ -1385,7 +1384,7 @@ class Variable(metaclass=VariableMetaClass): In Static Graph Mode: Please use ** `Block.create_var` ** to create a Static variable which has no data until being feed. - In Dygraph Mode: Please use ** :ref:`api_base_dygraph_to_variable` ** to create a dygraph variable with real data. + In Dygraph Mode: Please use ** :ref:`api_paddle_to_tensor` ** to create a dygraph variable with real data. In Fluid, every input and output of an OP is a variable. 
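For orientation, `require_version` compares the installed `[major, minor, patch, rc]` components against the requested bounds via `version_cmp`. The body of `version_cmp` sits outside this hunk, so the following is a hedged sketch of its assumed semantics, not the actual implementation:

def version_cmp(ver_a, ver_b):
    # Compare versions given as lists of numeric strings, e.g. ['2', '5', '1', '0'];
    # returns 1, -1, or 0, like a classic three-way cmp().
    for a, b in zip(ver_a, ver_b):
        if int(a) != int(b):
            return 1 if int(a) > int(b) else -1
    return 0

# version_cmp(['2', '5', '1', '0'], ['0', '1', '0', '0']) == 1,
# so require_version('0.1.0') passes on an installed 2.5.1.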
In most cases, variables are used for holding different kinds of data or training @@ -2879,7 +2878,6 @@ class Operator: 'fetch', 'recurrent', 'go', - 'rnn_memory_helper_grad', 'conditional_block', 'pylayer', 'while', @@ -2933,7 +2931,7 @@ def __init__( # https://github.com/PaddlePaddle/Paddle/pull/12583#pullrequestreview-145093173 op_attrs = attrs if op_attrs is None: - op_attrs = dict() + op_attrs = {} del attrs # attr for static graph mode cuda graph @@ -3957,7 +3955,7 @@ class Block: def __init__(self, program, idx): self.desc = program.desc.block(idx) self.vars = collections.OrderedDict() # var_name --> var - self.ops = list() # operator list + self.ops = [] # operator list self.program = program def __str__(self): @@ -4115,7 +4113,7 @@ def _find_var_recursive(self, name): Returns: Variable: the Variable with the giving name. Or None if not found. """ - frontier = list() + frontier = [] visited = set() frontier.append(self) @@ -5428,7 +5426,7 @@ def safe_remove_nodes(self, remove_nodes): def resolve_hazard(self): ordered_nodes = core.topology_sort(self.graph) - var_nodes = dict() + var_nodes = {} for node in ordered_nodes: if node.is_op() and node.op() is not None: for each_var_name in node.op().input_arg_names(): @@ -5485,7 +5483,7 @@ def build_adjacency_list(self): dict{IrNode: set(IrNode)}: the adjacency list. """ adj_list = core.build_adjacency_list(self.graph) - wrapped_adj_list = dict() + wrapped_adj_list = {} for k, v in adj_list.items(): wrapped_adj_list[IrNode(k)] = {IrNode(n) for n in v} return wrapped_adj_list @@ -7123,7 +7121,7 @@ def condition(var): var_list = filter(condition, self.list_vars()) - state_dict = dict() + state_dict = {} for var in var_list: var_temp = scope.find_var(var.name) if var_temp is None: @@ -7704,8 +7702,8 @@ def _get_var(name, program=None): @signature_safe_contextmanager def dygraph_guard_if_declarative(): - from .dygraph.base import in_to_static_mode from .dygraph import Tracer + from .dygraph.base import in_to_static_mode if in_to_static_mode(): # Under @paddle.jit.to_static decorator, we switch back dygraph mode temporarily. diff --git a/python/paddle/base/layers/__init__.py b/python/paddle/base/layers/__init__.py index 002fd068930d91..3d3e629c31f467 100644 --- a/python/paddle/base/layers/__init__.py +++ b/python/paddle/base/layers/__init__.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import io # noqa: F401 -from . import math_op_patch - +from . import io, math_op_patch # noqa: F401 __all__ = [] diff --git a/python/paddle/base/layers/layer_function_generator.py b/python/paddle/base/layers/layer_function_generator.py index b0f35af4fefed7..2cec3b7e58fa17 100644 --- a/python/paddle/base/layers/layer_function_generator.py +++ b/python/paddle/base/layers/layer_function_generator.py @@ -193,7 +193,7 @@ def infer_and_check_dtype(op_proto, *args, **kwargs): dtype = each.dtype elif dtype != each.dtype: raise ValueError( - "operator {0} must input same dtype. {1} vs {2}".format( + "operator {} must input same dtype. 
{} vs {}".format( op_type, dtype, each.dtype ) ) @@ -214,7 +214,7 @@ def func(*args, **kwargs): dtype = infer_and_check_dtype(op_proto, *args, **kwargs) - inputs = dict() + inputs = {} for ipt in op_proto.inputs: name = _convert_(ipt.name) val = kwargs.pop(name, []) @@ -225,7 +225,7 @@ def func(*args, **kwargs): args = args[1:] inputs[ipt.name] = val - outputs = dict() + outputs = {} out = kwargs.pop(_convert_(o_name), []) if out: out_var = out[0] if (isinstance(out, (list, tuple))) else out @@ -337,8 +337,8 @@ def func(x, name=None): func.__name__ = inplace_op_type func.__doc__ = """ -Inplace version of ``{0}`` API, the output Tensor will be inplaced with input ``x``. -Please refer to :ref:`api_base_layers_{1}`. +Inplace version of ``{}`` API, the output Tensor will be inplaced with input ``x``. +Please refer to :ref:`api_base_layers_{}`. """.format( origin_op_type, origin_op_type ) diff --git a/python/paddle/base/layers/math_op_patch.py b/python/paddle/base/layers/math_op_patch.py index f2b1ac7c6d04d1..1f070882758b92 100644 --- a/python/paddle/base/layers/math_op_patch.py +++ b/python/paddle/base/layers/math_op_patch.py @@ -355,7 +355,7 @@ def pop(self, *args): if self.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY: raise TypeError( - "Only Variable with VarType.LOD_TENSOR_ARRAY support `append` method, but received type: {}".format( + "Only Variable with VarType.LOD_TENSOR_ARRAY support `pop` method, but received type: {}".format( self.type ) ) @@ -376,7 +376,7 @@ def _neg_(var): return _scalar_op_(var, -1.0, 0.0) @property - def _ndim_(self): + def _ndim(self): """ Returns the dimension of current Variable @@ -393,7 +393,7 @@ def _ndim_(self): >>> # create a static Variable >>> x = paddle.static.data(name='x', shape=[3, 2, 1]) >>> # print the dimension of the Variable - >>> print(x.ndim()) + >>> print(x.ndim) 3 """ return len(self.shape) @@ -627,7 +627,7 @@ def to_dense(var): ('pop', pop), ('dim', dim), ('ndimension', ndimension), - ('ndim', _ndim_), + ('ndim', _ndim), ( '__add__', _binary_creator_('__add__', 'elementwise_add', False, _scalar_add_), diff --git a/python/paddle/base/multiprocess_utils.py b/python/paddle/base/multiprocess_utils.py index 8d18db0bb3ea85..9b70cacd1c2cd8 100644 --- a/python/paddle/base/multiprocess_utils.py +++ b/python/paddle/base/multiprocess_utils.py @@ -73,7 +73,6 @@ def _func_register(function): if not callable(function): raise TypeError("%s is not callable object." 
% (function)) # check function object whether hash-able - set([function]) if function not in cls._registered_func_set: atexit.register(_func_exectuor) cls._registered_func_set.add(function) diff --git a/python/paddle/base/reader.py b/python/paddle/base/reader.py index 8c2ddd16961dac..7fcccf8910fc46 100644 --- a/python/paddle/base/reader.py +++ b/python/paddle/base/reader.py @@ -42,12 +42,12 @@ _copy_reader_var_, monkey_patch_reader_methods, ) -from .multiprocess_utils import _cleanup # noqa: F401 -from .multiprocess_utils import multiprocess_queue_set # noqa: F401 -from .multiprocess_utils import ( +from .multiprocess_utils import ( # noqa: F401 CleanupFuncRegistrar, + _cleanup, _cleanup_mmap, _set_SIGCHLD_handler, + multiprocess_queue_set, ) from .unique_name import UniqueNameGenerator diff --git a/python/paddle/base/variable_index.py b/python/paddle/base/variable_index.py index 0ff628ed48f4f9..7d5b10fefeda47 100644 --- a/python/paddle/base/variable_index.py +++ b/python/paddle/base/variable_index.py @@ -14,6 +14,7 @@ import itertools import warnings +from functools import reduce import numpy as np @@ -224,7 +225,8 @@ def replace_ellipsis(var, item): item_remove_var = [ ele for ele in item - if not isinstance(ele, (Variable, np.ndarray)) and ele is not None + if not isinstance(ele, (Variable, paddle.pir.OpResult, np.ndarray)) + and ele is not None ] ell_count = item_remove_var.count(Ellipsis) if ell_count == 0: @@ -284,6 +286,9 @@ def is_integer_or_scalar_tensor(ele): return True if len(ele.shape) == 0 and ele.dtype != paddle.bool: return True + elif isinstance(ele, paddle.pir.OpResult): + if len(ele.shape) == 0 and ele.dtype != paddle.base.libpaddle.BOOL: + return True return False @@ -292,6 +297,11 @@ def is_bool_tensor(ele): if isinstance(ele, Variable) and ele.dtype == paddle.bool: return True + elif ( + isinstance(ele, paddle.pir.OpResult) + and ele.dtype == paddle.base.libpaddle.BOOL + ): + return True return False @@ -303,7 +313,7 @@ def deal_attrs(attrs, attr, attr_name, tensor_attr_name, inputs, infer_flags): attr, dtype="int64" ) for i, dim in enumerate(attr): - if isinstance(dim, Variable): + if isinstance(dim, (Variable, paddle.pir.OpResult)): attrs[attr_name].append(-1) infer_flags[i] = -1 else: @@ -335,14 +345,10 @@ def get_value_for_bool_tensor(var, item): empty_shape = [0] + list(var.shape[i:]) def idx_not_empty(var, item): - from ..tensor import gather_nd - - bool_2_idx = paddle.nonzero(item == True) - return gather_nd(var, bool_2_idx) + bool_2_idx = paddle.nonzero(item) + return paddle.gather_nd(var, bool_2_idx) - from paddle.static.nn import cond - - return cond( + return paddle.static.nn.cond( item.any(), lambda: idx_not_empty(var, item), lambda: paddle.empty(empty_shape, var.dtype), @@ -758,9 +764,14 @@ def parse_index(x, indices): has_advanced_index = True estimated_dim += 1 - elif isinstance(slice_item, paddle.base.Variable): + elif isinstance( + slice_item, (paddle.base.Variable, paddle.pir.OpResult) + ): # In this case, the Variable is not 0-dim Tensor and will be treated as advanced-indexing. - if slice_item.dtype == paddle.bool: + if ( + slice_item.dtype == paddle.bool + or slice_item.dtype == paddle.base.libpaddle.BOOL + ): if slice_item.ndim == 0: # 0-D bool Tensor, same as single PY-bool. 
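(Aside on the bool-tensor branch in `get_value_for_bool_tensor` above: `paddle.nonzero(mask)` turns a boolean mask into integer coordinates and `paddle.gather_nd` selects those entries, which is why the redundant `item == True` comparison could be dropped. A small eager-mode illustration:

import paddle

x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
mask = paddle.to_tensor([True, False, True])
coords = paddle.nonzero(mask)         # [[0], [2]]
picked = paddle.gather_nd(x, coords)  # [[1., 2.], [5., 6.]]
print(picked.shape)                   # [2, 2]
)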
none_axes.append(dim) @@ -788,7 +799,12 @@ def parse_index(x, indices): axes.append(dim) use_strided_slice = ( True - if (isinstance(step, paddle.base.Variable) or step != 1) + if ( + isinstance( + step, (paddle.base.Variable, paddle.pir.OpResult) + ) + or step != 1 + ) else use_strided_slice ) return ( @@ -1032,7 +1048,9 @@ def get_tensor_with_basic_indexing( ) attrs['infer_flags'] = infer_flags - if paddle.in_dynamic_mode(): + from . import in_dynamic_or_pir_mode, in_pir_mode + + if in_dynamic_or_pir_mode(): if "StartsTensorList" in inputs.keys(): st = inputs['StartsTensorList'] else: @@ -1050,6 +1068,13 @@ def get_tensor_with_basic_indexing( if len(decrease_axes) > 0: out = paddle._C_ops.squeeze(out, decrease_axes) else: + if in_pir_mode(): + if isinstance(st, (list, tuple)): + if paddle.utils._contain_var(st): + st = paddle.utils.get_int_tensor_list(st) + if isinstance(end, (list, tuple)): + if paddle.utils._contain_var(end): + end = paddle.utils.get_int_tensor_list(end) out = paddle._C_ops.slice( x, axes, diff --git a/python/paddle/callbacks.py b/python/paddle/callbacks.py index 960399c6b97967..6e94a9d7b67dfe 100644 --- a/python/paddle/callbacks.py +++ b/python/paddle/callbacks.py @@ -12,16 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .hapi.callbacks import Callback # noqa: F401 -from .hapi.callbacks import EarlyStopping # noqa: F401 -from .hapi.callbacks import LRScheduler # noqa: F401 -from .hapi.callbacks import ModelCheckpoint # noqa: F401 -from .hapi.callbacks import ProgBarLogger # noqa: F401 -from .hapi.callbacks import ReduceLROnPlateau # noqa: F401 -from .hapi.callbacks import VisualDL # noqa: F401 -from .hapi.callbacks import WandbCallback # noqa: F401 +from .hapi.callbacks import ( + Callback, + EarlyStopping, + LRScheduler, + ModelCheckpoint, + ProgBarLogger, + ReduceLROnPlateau, + VisualDL, + WandbCallback, +) -__all__ = [ # noqa +__all__ = [ 'Callback', 'ProgBarLogger', 'ModelCheckpoint', diff --git a/python/paddle/cost_model/__init__.py b/python/paddle/cost_model/__init__.py index e6907128642c66..6fd0ef63f3c5de 100644 --- a/python/paddle/cost_model/__init__.py +++ b/python/paddle/cost_model/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .cost_model import CostModel # noqa: F401 +from .cost_model import CostModel __all__ = ['CostModel'] diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py index 4b71ff6ac66f1e..eaa8ab0ddfed1a 100644 --- a/python/paddle/dataset/__init__.py +++ b/python/paddle/dataset/__init__.py @@ -15,18 +15,20 @@ Dataset package. """ -import paddle.dataset.mnist # noqa: F401 -import paddle.dataset.imikolov # noqa: F401 -import paddle.dataset.imdb # noqa: F401 -import paddle.dataset.cifar # noqa: F401 -import paddle.dataset.movielens # noqa: F401 -import paddle.dataset.conll05 # noqa: F401 -import paddle.dataset.uci_housing # noqa: F401 -import paddle.dataset.wmt14 # noqa: F401 -import paddle.dataset.wmt16 # noqa: F401 -import paddle.dataset.flowers # noqa: F401 -import paddle.dataset.voc2012 # noqa: F401 -import paddle.dataset.image # noqa: F401 +from . 
import ( # noqa: F401 + mnist, + imikolov, + imdb, + cifar, + movielens, + conll05, + uci_housing, + wmt14, + wmt16, + flowers, + voc2012, + image, +) # set __all__ as empty for not showing APIs under paddle.dataset __all__ = [] diff --git a/python/paddle/decomposition/decomp.py b/python/paddle/decomposition/decomp.py index bfb9b6e9ba2c66..e89b5abc392211 100644 --- a/python/paddle/decomposition/decomp.py +++ b/python/paddle/decomposition/decomp.py @@ -16,7 +16,9 @@ import typing from paddle import pir -from paddle.base.libpaddle.pir import Block, Program +from paddle.autograd import ir_backward +from paddle.base.core import call_decomp, has_decomp +from paddle.base.libpaddle.pir import Block, Operation, Program from paddle.framework import core from . import register @@ -30,6 +32,18 @@ def _build_tensor_tuple(xs): return TypeError(f"Type {type(xs)} is not supported.") +def _analyse_decomp_results(orig_outs, decomp_outs): + assert len(orig_outs) == len(decomp_outs) + res = [] + for org_item, new_item in zip(orig_outs, decomp_outs): + if isinstance(org_item, pir.OpResult): + assert len(new_item) == 1 and isinstance(new_item[0], pir.OpResult) + res.append(new_item[0]) + else: + res.append(new_item) + return res + + def _prepare_python_api_arguments(op): """ For standard api of operator, its inputs should keep consistent with organization of its inputs and attrs. @@ -37,19 +51,35 @@ def _prepare_python_api_arguments(op): Args: op (Operator): The target operator. """ - op_inputs = [x.source() for x in op.operands()] + combine_op_name = "builtin.combine" + inputs = [] + for x in op.operands(): + input = x.source() + if input and input.initialized(): + prev_op = input.get_defining_op() + if ( + isinstance(prev_op, Operation) + and prev_op.name() == combine_op_name + ): + input = [item.source() for item in prev_op.operands()] + inputs.append(input) + else: + # for optional input, such as scale for layer_norm op, + # if it is not set, there will be an empty OpResult which is not initialized in ops.operands + # therefore append None for it. + inputs.append(None) + # The inputs of PIR op builtin.combine will be restored as list of tensor. - if op.name() in ["builtin.combine"]: - return (op_inputs,) + if op.name() == combine_op_name: + return (inputs,) - op_attrs_dict = op.attrs() - op_attrs_name = op.get_attr_names() - op_attrs = [op_attrs_dict[x] for x in op_attrs_name] - api_arguments = op_inputs + op_attrs + api_arguments = inputs + [op.attrs()[x] for x in op.get_attr_names()] return tuple(api_arguments) -def _check_op_results(op_name, orig_outs, new_outs, orig_vars, dst_vars): +def _check_op_results( + op_name, orig_outs, new_outs, orig_vars=None, dst_vars=None +): """ Check whether the replaced outputs are consistent with origin outputs. 
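Before moving on, the control flow of the new `_prepare_python_api_arguments` is worth restating: operands produced by a `builtin.combine` op are flattened back into a Python list of tensors, and uninitialized optional operands become `None`. A schematic of that logic (the three helper callables are assumptions standing in for PIR queries, not the real API):

def normalize_operands(operands, is_combine, combine_sources, initialized):
    # Schematic of _prepare_python_api_arguments' input handling.
    inputs = []
    for v in operands:
        if v is not None and initialized(v):
            # Inputs built by builtin.combine are restored to a list of tensors.
            inputs.append(combine_sources(v) if is_combine(v) else v)
        else:
            # Optional inputs (e.g. scale for layer_norm) arrive uninitialized;
            # represent them as None for the Python API call.
            inputs.append(None)
    return inputs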
@@ -79,8 +109,9 @@ def _check_op_results(op_name, orig_outs, new_outs, orig_vars, dst_vars): # to keep same as phi op definition, orig_out may receive None continue elif new_out is not None: - if orig_out in orig_vars.keys(): - dst_vars[orig_vars[orig_out]] = new_out + if orig_vars is not None and dst_vars is not None: + if orig_out in orig_vars.keys(): + dst_vars[orig_vars[orig_out]] = new_out orig_dtype = orig_out.dtype new_dtype = new_out.dtype orig_shape = orig_out.shape @@ -203,15 +234,14 @@ def _decompose_subgraph(block, orig_vars, dst_vars, op_filter): if isinstance(block, Block): ops_list = block.ops temp_op = None - temp_inputs = None for idx, op in enumerate(ops_list): op_name = op.name() decom_rule = register.get_decomp_rule(op_name) - lower = decom_rule and op_filter(op) + has_sink_decomp_rule = has_decomp(op) + lower = (decom_rule or has_sink_decomp_rule) and op_filter(op) if op.name() == "builtin.combine": temp_op = op - temp_inputs = _prepare_python_api_arguments(op) if lower: core.prim_config["composite_ops_record"].add(op_name) @@ -219,20 +249,25 @@ def _decompose_subgraph(block, orig_vars, dst_vars, op_filter): temp_op is not None and ops_list[idx - 1].name() == "builtin.combine" ): - input_args = temp_inputs pir.set_insertion_point(temp_op) else: - input_args = _prepare_python_api_arguments(op) pir.set_insertion_point(op) + input_args = _prepare_python_api_arguments(op) orig_outs = op.results() - new_outs = _build_tensor_tuple(decom_rule(*input_args)) + if has_sink_decomp_rule: + decomp_outs = call_decomp(op) + new_outs = _analyse_decomp_results(orig_outs, decomp_outs) + else: + new_outs = _build_tensor_tuple(decom_rule(*input_args)) # Todo: To cover such case: some outputs are no longer needed after decomposition. _check_op_results( op_name, orig_outs, new_outs, orig_vars, dst_vars ) - - op.replace_all_uses_with(new_outs) + if op.name() in ("pd_op.unsqueeze", "pd_op.squeeze"): + orig_outs[0].replace_all_uses_with(new_outs[0]) + else: + op.replace_all_uses_with(new_outs) block.remove_op(op) if temp_op is not None: @@ -253,3 +288,221 @@ def _decompose_subgraph(block, orig_vars, dst_vars, op_filter): raise TypeError( f"Expect type Block or Sequence of Block, but got type {type(block)}" ) + + +def get_leaf_ops(block, global_outputs): + ''' + This API checks which op contributes to the outputs of the entire computation graph, + as well as determining the corresponding output index. + + Args: + block (Block): the block of program to be processed. + global_outputs (tuple(Value)): the outputs of the entire computation graph. + + Returns: + related_ops (tuple(pir.Operation)): a tuple of op that contributes to the outputs of the entire graph. 
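The index bookkeeping described here is easiest to see on toy data. The following pure-Python reimplementation of the pairing logic (mock strings instead of `Value`s, so it is a sketch rather than the PIR code) mirrors what `get_leaf_ops` records:

def map_leaf_outputs(ops_outputs, global_outputs):
    # ops_outputs: one output list per op; global_outputs: the graph's outputs.
    # For each contributing op, record pairs (op output index, global output index).
    related, indexes = [], []
    for op_id, outs in enumerate(ops_outputs):
        pairs = [(outs.index(g), gi) for gi, g in enumerate(global_outputs) if g in outs]
        if pairs:
            related.append(op_id)
            indexes.append(pairs)
    return tuple(related), tuple(indexes)

# map_leaf_outputs([['a', 'b'], ['c']], ['a', 'c', 'b'])
# -> ((0, 1), ([(0, 0), (1, 2)], [(0, 1)]))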
+ related_ops_output_indexes (tuple(tuple())) : a tuple records the mapping of tuple(the output index of the op, the output index of the entire graph) + ''' + if not isinstance(block, Block): + raise TypeError(f"block should be Block, but got type {type(block)}") + if not isinstance(global_outputs, list): + raise TypeError("The type of global_outputs should be list") + + related_ops = [] + related_ops_output_indexes = [] + + op_to_op_valid_result = {} + for op in block.ops: + op_valid_result = [] + for x in op.results(): + if x.initialized(): + op_valid_result.append(x) + op_to_op_valid_result[op] = op_valid_result + + for global_output in global_outputs: + for op in op_to_op_valid_result.keys(): + if global_output in op_to_op_valid_result[op]: + if op not in related_ops: + related_ops.append(op) + related_ops_output_indexes.append( + [ + [ + op.results().index(global_output), + global_outputs.index(global_output), + ] + ] + ) + else: + related_ops_output_indexes[related_ops.index(op)].append( + [ + op.results().index(global_output), + global_outputs.index(global_output), + ] + ) + + return tuple(related_ops), tuple(related_ops_output_indexes) + + +def replace_graph_outputs( + global_outputs, + op_outputs, + op_index, + related_ops_output_indexes, +): + ''' + This API replace the outputs of the entire computation graph with the new outputs of the op, + when the op contributes to the outputs of the entire computation graph. + ''' + for index in related_ops_output_indexes[op_index]: + global_outputs[index[1]] = op_outputs[index[0]] + + +def decompose_fwd_op( + block: Block, fwd_op: pir.Operation, grad_var_to_var_map: dict +) -> tuple: + ''' + Decompose the fwd_op into a list of primitive ops. + + Args: + block (Block): the block to which the fwd_op belongs. + fwd_op (pir.Operation): the forward op to be decomposed. + grad_var_to_var_map (dict): a dict obtained from distributed processing, + which maps the backward grad variable to its corresponding forward variable. + Returns: + new_outputs (tuple(Value)): the new outputs after decomposing. 
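`replace_graph_outputs` then consumes those pairs to patch the graph-level output list in place. Stripped of PIR types (plain strings stand in for `Value`s; values are invented for illustration):

global_outputs = ['old_a', 'b', 'old_c']
op_outputs = ['new_a', 'new_c']                   # outputs of the decomposed op
related_ops_output_indexes = ([(0, 0), (1, 2)],)  # pairs recorded for op index 0

# Mirrors: global_outputs[index[1]] = op_outputs[index[0]]
for op_out_idx, global_idx in related_ops_output_indexes[0]:
    global_outputs[global_idx] = op_outputs[op_out_idx]
print(global_outputs)  # ['new_a', 'b', 'new_c']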
+ ''' + + if not core._is_fwd_prim_enabled(): + raise RuntimeError( + "To decompose forward op, please set `core._set_prim_forward_enabled(True)` firstly" + ) + + with pir.core.program_guard(block.program): + op_name = fwd_op.name() + orig_outs = fwd_op.results() + decom_rule = register.get_decomp_rule(op_name) + has_sink_decomp_rule = has_decomp(fwd_op) + lower = decom_rule or has_sink_decomp_rule + + if lower: + input_args = _prepare_python_api_arguments(fwd_op) + pir.set_insertion_point(fwd_op) + if has_sink_decomp_rule: + decomp_outs = call_decomp(fwd_op) + new_outs = _analyse_decomp_results(orig_outs, decomp_outs) + else: + new_outs = _build_tensor_tuple(decom_rule(*input_args)) + + _check_op_results(op_name, orig_outs, new_outs) + + # update_grad_var_to_var_map + for grad_var, var in grad_var_to_var_map.items(): + if var in orig_outs: + grad_var_to_var_map[grad_var] = new_outs[ + orig_outs.index(var) + ] + + fwd_op.replace_all_uses_with(new_outs) + block.remove_op(fwd_op) + return new_outs + else: + return tuple(orig_outs) + + +def decompose_bwd_op( + block: Block, + bwd_op: pir.Operation, + grad_var_to_var_map: dict, + fwd_outputs: tuple, + fwd_inputs: tuple, +) -> tuple: + ''' + Lowering a first-order derivative PHI operator into primitive operators, steps are as follows: + step1: get grad_outputs from the bwd_op's operands, which is a subset of bwd_op's operands after excluding the inputs and outputs of fwd_op; + step2: get the new_fwd_outputs via grad_var_to_var_map, which correspond one-to-one with grad_outputs; + step3: get the new_fwd_inputs, iterate over the initialized result in bwd_op's results, then find the corresponding fwd_input based on grad_var_to_var_map; + step4: call grad() API, decompose the bwd_op, and get new gradients; + step5: replace bwd_op with a set of primitive ops; + step6: update grad_var_to_var_map. + + Args: + block (Block): the block to which the backward op belongs. + bwd_op (pir.Operation): the backward op to be decomposed. + grad_var_to_var_map (dict): a dict obtained from distributed processing, + which maps the backward grad variable to its corresponding forward variable. + fwd_outputs (tuple(Value)): the output value tuple of the forward op. + fwd_inputs (tuple(Value)): the input value tuple of the forward op. + + Returns: + new_input_grads (tuple(Value)): the input grad value tuple, the i-th returned value is the sum of gradients of `fwd_outputs` with respect to the i-th `fwd_inputs`. 
+ ''' + + if not core._is_bwd_prim_enabled(): + raise RuntimeError( + "To get composite backward op, please set `core._set_prim_backward_enabled(True)` firstly" + ) + + # intercept grad_outputs from the original bwd_op + # grad_outputs = bwd_op.operands() - fwd_inputs - fwd_outputs + bwd_inputs = tuple(x.source() for x in bwd_op.operands()) + grad_outputs = tuple( + bwd_input + for bwd_input in bwd_inputs + if not (bwd_input in fwd_inputs or bwd_input in fwd_outputs) + ) + + # new_fwd_outputs is a subset of fwd_outputs, because some fwd_output does not hold the gradients, + # e.g., layer_norm op's output is [out, mean, variance], but only out holds gradient, + # therefore, parse the new_fwd_outputs according to grad_outputs and grad_var_to_var_map + new_fwd_outputs = tuple( + grad_var_to_var_map[grad_output] for grad_output in grad_outputs + ) + + # new_fwd_inputs is a subset of fwd_inputs, because some fwd_input does not need to compute the gradients, + # e.g., dropout op's input is [x, seed_tensor], but the seed_tensor is generated by the forward op, and does not need to compute the gradients, + # therefore, parse the new_fwd_inputs according to bwd_op.results() and grad_var_to_var_map + new_fwd_inputs = tuple( + grad_var_to_var_map[grad_input] + for grad_input in bwd_op.results() + if grad_input.initialized() + ) + + # when replace bwd_op with a list of primitive ops, a insertion point is needed + bwd_op_idx = block.ops.index(bwd_op) + # decompose bwd_op into a list of primitive ops + before_num_ops = len(block.ops) + input_grads = ir_backward.grad( + new_fwd_outputs, new_fwd_inputs, grad_outputs + ) + after_num_ops = len(block.ops) + + # update the bwd_op's results + # when the original result of the bwd_op is None, then fake an OpResult for replacement + # when the original result of the bwd_op is not None, then replace it with the new result of primitive ops + new_input_grads = [] + input_grads_idx = 0 + for idx, input_grad in enumerate(bwd_op.results()): + if input_grad.initialized(): + new_input_grads.append(input_grads[input_grads_idx]) + input_grads_idx += 1 + else: + new_input_grads.append(pir.fake_op_result()) + + # move the primitive ops to the insertion point + insert_idx = bwd_op_idx + for i in range(before_num_ops, after_num_ops): + block.move_op(block.ops[i], insert_idx) + insert_idx += 1 + + # update_grad_var_to_var_map + for idx, grad_var in enumerate(bwd_op.results()): + if grad_var in grad_var_to_var_map.keys(): + grad_var_to_var_map[new_input_grads[idx]] = grad_var_to_var_map.pop( + grad_var + ) + + # replace the following use of original bwd_op's results with new primitive ops' results, and then remove original bwd_op + bwd_op.replace_all_uses_with(new_input_grads) + block.remove_op(bwd_op) + + return tuple(new_input_grads) diff --git a/python/paddle/decomposition/primitives.py b/python/paddle/decomposition/primitives.py index c2c6fcb08dadc6..0ac15489ae557d 100644 --- a/python/paddle/decomposition/primitives.py +++ b/python/paddle/decomposition/primitives.py @@ -12,58 +12,58 @@ # See the License for the specific language governing permissions and # limitations under the License. 
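A note on the realignment step inside `decompose_bwd_op` above: original backward-op results that actually held gradients consume the freshly generated gradients in order, while uninitialized slots receive `pir.fake_op_result()` placeholders. The alignment in isolation (plain Python over stand-in values):

def align_grads(result_initialized, new_grads, fake):
    # result_initialized: per original bwd_op result, whether it carried a gradient.
    # new_grads: gradients returned by the primitive-op grad() call, in order.
    # fake: placeholder factory (pir.fake_op_result in the patch).
    aligned, k = [], 0
    for has_grad in result_initialized:
        if has_grad:
            aligned.append(new_grads[k])
            k += 1
        else:
            aligned.append(fake())
    return aligned

# align_grads([True, False, True], ['dx', 'dw'], lambda: None) -> ['dx', None, 'dw']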
-from paddle.tensor import abs # noqa: F401 -from paddle.tensor import acos # noqa: F401 -from paddle.tensor import acosh # noqa: F401 -from paddle.tensor import add # noqa: F401 -from paddle.tensor import asin # noqa: F401 -from paddle.tensor import asinh # noqa: F401 -from paddle.tensor import atan # noqa: F401 -from paddle.tensor import atanh # noqa: F401 -from paddle.tensor import broadcast_shape # noqa: F401 -from paddle.tensor import broadcast_to # noqa: F401 -from paddle.tensor import concat # noqa: F401 -from paddle.tensor import cos # noqa: F401 -from paddle.tensor import cosh # noqa: F401 -from paddle.tensor import cumprod # noqa: F401 -from paddle.tensor import cumsum # noqa: F401 -from paddle.tensor import digamma # noqa: F401 -from paddle.tensor import divide # noqa: F401 -from paddle.tensor import erf # noqa: F401 -from paddle.tensor import erfinv # noqa: F401 -from paddle.tensor import exp # noqa: F401 -from paddle.tensor import expm1 # noqa: F401 -from paddle.tensor import fill_constant # noqa: F401 -from paddle.tensor import full # noqa: F401 -from paddle.tensor import gather # noqa: F401 -from paddle.tensor import greater_equal # noqa: F401 -from paddle.tensor import lgamma # noqa: F401 -from paddle.tensor import log # noqa: F401 -from paddle.tensor import log1p # noqa: F401 -from paddle.tensor import logcumsumexp # noqa: F401 -from paddle.tensor import logit # noqa: F401 -from paddle.tensor import logsumexp # noqa: F401 -from paddle.tensor import max # noqa: F401 -from paddle.tensor import min # noqa: F401 -from paddle.tensor import multiply # noqa: F401 -from paddle.tensor import ones # noqa: F401 -from paddle.tensor import pow # noqa: F401 -from paddle.tensor import prod # noqa: F401 -from paddle.tensor import reshape # noqa: F401 -from paddle.tensor import rsqrt # noqa: F401 -from paddle.tensor import sign # noqa: F401 -from paddle.tensor import sin # noqa: F401 -from paddle.tensor import sinh # noqa: F401 -from paddle.tensor import sqrt # noqa: F401 -from paddle.tensor import subtract # noqa: F401 -from paddle.tensor import sum # noqa: F401 -from paddle.tensor import tan # noqa: F401 -from paddle.tensor import tanh # noqa: F401 -from paddle.tensor import tile # noqa: F401 -from paddle.tensor import uniform # noqa: F401 -from paddle.tensor import zeros # noqa: F401 -from paddle.tensor.creation import assign # noqa: F401 -from paddle.tensor.creation import zeros_like # noqa: F401 +from paddle.tensor import ( # noqa: F401 + abs, + acos, + acosh, + add, + asin, + asinh, + atan, + atanh, + broadcast_shape, + broadcast_to, + concat, + cos, + cosh, + cumprod, + cumsum, + digamma, + divide, + erf, + erfinv, + exp, + expm1, + fill_constant, + full, + gather, + greater_equal, + lgamma, + log, + log1p, + logcumsumexp, + logit, + logsumexp, + max, + min, + multiply, + ones, + pow, + prod, + reshape, + rsqrt, + sign, + sin, + sinh, + sqrt, + subtract, + sum, + tan, + tanh, + tile, + uniform, + zeros, +) +from paddle.tensor.creation import assign, zeros_like # noqa: F401 from paddle.tensor.manipulation import cast # noqa: F401 -from paddle.tensor.math import maximum # noqa: F401 -from paddle.tensor.math import minimum # noqa: F401 +from paddle.tensor.math import maximum, minimum # noqa: F401 diff --git a/python/paddle/decomposition/rules.py b/python/paddle/decomposition/rules.py index 4dd291475308e2..d64cba8d657ba1 100644 --- a/python/paddle/decomposition/rules.py +++ b/python/paddle/decomposition/rules.py @@ -18,7 +18,6 @@ from .register import register_decomp 
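The rules in this file bind to operator names through the `register_decomp` decorator imported above, and `_decompose_subgraph` later retrieves them by op name. The registry pattern reduces to a dict keyed by op name; a minimal sketch under that assumption (not Paddle's actual `register` module):

_DECOMP_RULES = {}

def register_decomp(op_name):
    # Decorator: record fn as the decomposition rule for op_name.
    def wrapper(fn):
        _DECOMP_RULES[op_name] = fn
        return fn
    return wrapper

def get_decomp_rule(op_name):
    return _DECOMP_RULES.get(op_name)

@register_decomp('pd_op.rsqrt')
def rsqrt_rule(x):
    return x ** -0.5  # stand-in body for illustration

assert get_decomp_rule('pd_op.rsqrt') is rsqrt_rule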
-@register_decomp('pd_op.mean') def mean(x, axis, keepdim): """define composite rule of op mean""" x_shape = x.shape @@ -56,15 +55,32 @@ def gelu(x, approximate): tanh_out = tanh(kAlpha * (x + GELU_CONSTANT * x * x * x)) out = x * half * (one + tanh_out) return out - else: # gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) - cdf = half * (one + _pir_ops.erf(x * full(x.shape, M_SQRT1_2, x.dtype))) out = x * cdf return out +@register_decomp('pd_op.sqrt') +def sqrt(x): + """ + define composite rule of op sqrt + res = pow(x, 0.5) + """ + is_amp = False + from paddle.base.data_feeder import convert_dtype + + dtype = convert_dtype(x.dtype) + if dtype in ["float16", "uint16"]: + is_amp = True + x = cast(x, "float32") + + y = full(x.shape if len(x.shape) == 0 else [1], 0.5, x.dtype) + res = pow_composite(x, y) + return res if not is_amp else cast(res, dtype) + + @register_decomp('pd_op.rsqrt') def rsqrt(x): """define composite rule of op rsqrt.""" @@ -155,7 +171,7 @@ def dropout(x, seed_tensor, p, is_test, mode, seed, fix_seed): train: out = input * mask inference: out = input * (1.0 - p) """ - from paddle import scale as pd_scale + from paddle import assign from paddle.base import core from paddle.base.data_feeder import convert_dtype @@ -179,9 +195,7 @@ def dropout(x, seed_tensor, p, is_test, mode, seed, fix_seed): shape=x.shape, value=(1.0 - p), dtype=x.dtype ), cast(mask, uint8_type) else: - return pd_scale(x, 1.0), cast( - mask, uint8_type - ) # assign(x), cast(mask, mask, core.VarDesc.VarType.UINT8) + return assign(x), cast(mask, uint8_type) else: if not is_test: return x * mask, cast(mask, uint8_type) @@ -213,3 +227,120 @@ def add_n(x): for xi in x[1:]: ans = xi + ans return ans + + +@register_decomp('pd_op.silu') +def silu(x): + """ + define composite rule of op silu + res = x / (1 + exp(-x)) + """ + is_amp = False + from paddle.base.data_feeder import convert_dtype + + dtype = convert_dtype(x.dtype) + if dtype in ["float16", "uint16"]: + is_amp = True + x = cast(x, "float32") + + sum_temp = exp(-x) + 1 + res = x / sum_temp + return res if not is_amp else cast(res, dtype) + + +@register_decomp('pd_op.softmax') +def softmax(x, axis): + """define composite rule of op softmax""" + is_amp = False + from paddle.base.data_feeder import convert_dtype + + # Softmax need fp32 compute since it has sum op in + dtype = convert_dtype(x.dtype) + if dtype in ["float16", "uint16"]: + is_amp = True + x = cast(x, "float32") + if not x.shape: + # do not return 1, to ensure gradients + res = exp(x - x) + if is_amp: + res = cast(res, "float16") + return res + max_temp = max(x, axis, keepdim=True) + max_temp.stop_gradient = True + molecular = exp(x - max_temp) + denominator = sum(molecular, axis=axis, keepdim=True) + res = divide(molecular, denominator) + if is_amp: + res = cast(res, dtype) + return res + + +@register_decomp('pd_op.full_like') +def full_like(x, fill_value, dtype, place=None): + """define composite rule of op full_like.""" + """op name: full_like op type name: fill_any_like.""" + """arg place is not used, add it here to keep same as python api.""" + fill_value = fill_value.get_defining_op().attrs()["value"] + val = full(x.shape, fill_value, dtype) + return val + + +@register_decomp('pd_op.stack') +def stack(x, axis): + """ + define composite rule of op stack + unsqueeze each dimension of the input (use reshape), and then concat + """ + x_shape = x[0].shape + if axis < 0: + axis += len(x_shape) + 1 + out_shape = x_shape[:axis] + [1] + x_shape[axis:] + out = concat([reshape(item, out_shape) for item in 
x], axis) + return out + + +@register_decomp('pd_op.squeeze') +def squeeze(x, axis): + """define composite rule of squeeze""" + """ + canonicalize dim within range 0 to rank and + determine new shape after squeeze op + if axis not specified, remove all dims equal to 1 + otherwise, remove dims equal to 1 in axis + axis can only be list, not int + """ + axis = axis.get_defining_op().attrs()["value"] + rank = len(x.shape) + if rank == 0: + return [assign(x), None] + if len(axis) == 0: + dims = set(range(rank)) + else: + dims = {ax % rank for ax in axis} + new_shape = [] + for d, s in enumerate(x.shape): + if not (s == 1 and (d in dims)): + new_shape.append(s) + out = reshape(x, new_shape) + return [out, None] + + +@register_decomp('pd_op.unsqueeze') +def unsqueeze(x, axis): + """define composite rule of op unsqueeze""" + """using reshape to implement unsqueeze op""" + axis = axis.get_defining_op().attrs()["value"] + x_shape = list(x.shape) + axis_list = list(axis) + for i in axis_list: + if i < 0: + i += len(x_shape) + 1 + x_shape = ( + x_shape[:i] + + [ + 1, + ] + + x_shape[i:] + ) + out = reshape(x, x_shape) + return [out, None] diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index f8bf7b3b919687..7ee16ffcf5464e 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -19,13 +19,17 @@ import paddle from paddle.base import core from paddle.base import framework -from paddle.base.framework import is_compiled_with_cinn # noqa: F401 -from paddle.base.framework import is_compiled_with_cuda # noqa: F401 -from paddle.base.framework import is_compiled_with_rocm # noqa: F401 -from . import cuda -from . import xpu - -__all__ = [ # noqa +from paddle.base.framework import ( + is_compiled_with_cinn, + is_compiled_with_cuda, + is_compiled_with_rocm, +) +from . import ( # noqa: F401 + cuda, + xpu, +) + +__all__ = [ 'get_cudnn_version', 'set_device', 'get_device', diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index cb57e674e20176..0a094319f893f6 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -17,8 +17,7 @@ from paddle.base.wrapped_decorator import signature_safe_contextmanager from paddle.utils import deprecated -from .streams import Stream # noqa: F401 -from .streams import Event # noqa: F401 +from .streams import Stream, Event __all__ = [ 'Stream', diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 7f641a5e6fa54d..ce777fa73fd870 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -12,27 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. -import atexit +import atexit # noqa: F401 from . 
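Stepping back to the `softmax` rule defined earlier in this file: it subtracts the per-axis max before exponentiating. Softmax is shift-invariant, so the result is unchanged, but `exp` stays in a representable range. A NumPy sanity check of the effect (illustrative only):

import numpy as np

x = np.array([1000.0, 1001.0, 1002.0])
naive = np.exp(x) / np.exp(x).sum()   # exp overflows to inf -> nan entries
shifted = np.exp(x - x.max())         # exp([-2, -1, 0]) is perfectly safe
stable = shifted / shifted.sum()
print(stable)                         # [0.09003057 0.24472847 0.66524096]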
import io -from .spawn import spawn # noqa: F401 -from .launch.main import launch # noqa: F401 -from .parallel import init_parallel_env # noqa: F401 -from .parallel import get_rank # noqa: F401 -from .parallel import get_world_size # noqa: F401 -from .parallel import ParallelEnv # noqa: F401 -from .parallel import DataParallel -from .parallel_with_gloo import gloo_init_parallel_env -from .parallel_with_gloo import gloo_barrier -from .parallel_with_gloo import gloo_release +from .spawn import spawn +from .launch.main import launch +from .parallel import ( # noqa: F401 + init_parallel_env, + get_rank, + get_world_size, + ParallelEnv, + DataParallel, +) +from .parallel_with_gloo import ( + gloo_init_parallel_env, + gloo_barrier, + gloo_release, +) -from paddle.distributed.fleet.dataset import InMemoryDataset # noqa: F401 -from paddle.distributed.fleet.dataset import QueueDataset # noqa: F401 -from paddle.distributed.fleet.base.topology import ParallelMode # noqa: F401 +from paddle.distributed.fleet.dataset import InMemoryDataset, QueueDataset +from paddle.distributed.fleet.base.topology import ParallelMode -from .collective import split # noqa: F401 -from .collective import new_group # noqa: F401 -from .collective import is_available -from .communication import ( +from .collective import ( + split, + new_group, + is_available, +) +from .communication import ( # noqa: F401 stream, ReduceOp, all_gather, @@ -59,31 +64,38 @@ wait, barrier, get_backend, -) # noqa: F401 +) -from .auto_parallel.process_mesh import ProcessMesh # noqa: F401 -from .auto_parallel.api import DistAttr # noqa: F401 +from .auto_parallel.process_mesh import ProcessMesh from .auto_parallel import shard_op # noqa: F401 -from .auto_parallel.api import shard_tensor # noqa: F401 -from .auto_parallel.api import dtensor_from_fn # noqa: F401 -from .auto_parallel.api import reshard # noqa: F401 -from .auto_parallel.api import shard_layer # noqa: F401 + +from .auto_parallel.api import ( + DistAttr, + shard_tensor, + dtensor_from_fn, + reshard, + shard_layer, +) from .fleet import BoxPSDataset # noqa: F401 -from .entry_attr import ProbabilityEntry # noqa: F401 -from .entry_attr import CountFilterEntry # noqa: F401 -from .entry_attr import ShowClickEntry # noqa: F401 +from .entry_attr import ( # noqa: F401 + ProbabilityEntry, + CountFilterEntry, + ShowClickEntry, +) from . import cloud_utils # noqa: F401 -from .sharding import group_sharded_parallel # noqa: F401 -from .sharding import save_group_sharded_model # noqa: F401 +from .sharding import ( # noqa: F401 + group_sharded_parallel, + save_group_sharded_model, +) -from . import rpc +from . import rpc # noqa: F401 -__all__ = [ # noqa +__all__ = [ "io", "spawn", "launch", diff --git a/python/paddle/distributed/auto_parallel/static/completion.py b/python/paddle/distributed/auto_parallel/static/completion.py index 372144982327c0..d1024a226c64ee 100644 --- a/python/paddle/distributed/auto_parallel/static/completion.py +++ b/python/paddle/distributed/auto_parallel/static/completion.py @@ -130,7 +130,7 @@ def _can_apply_infer_spmd_rule(dist_op): enable = True if enable == 'true' else False enable = bool(enable) - # TODO remove me. ops to be adapted: lookup_table_v2, reshape2, split, transpose2, + # TODO remove me. 
ops to be adapted: squeeze2 __adapted_ops__ = [ "matmul_v2", "elementwise_div", @@ -143,6 +143,11 @@ def _can_apply_infer_spmd_rule(dist_op): "dropout", "reduce_sum", "layer_norm", + "lookup_table_v2", + "reshape2", + "transpose2", + "split", + "unsqueeze2", ] op_type = dist_op.serial_op.type return enable and contains_spmd_rule(op_type) and op_type in __adapted_ops__ diff --git a/python/paddle/distributed/auto_parallel/static/dist_attribute.py b/python/paddle/distributed/auto_parallel/static/dist_attribute.py index a8ee0e313669a0..46e36e51138f55 100644 --- a/python/paddle/distributed/auto_parallel/static/dist_attribute.py +++ b/python/paddle/distributed/auto_parallel/static/dist_attribute.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License -from paddle.base.core import DistTensorSpec # noqa: F401 -from paddle.base.core import OperatorDistAttr # noqa: F401 -from paddle.base.core import TensorDistAttr # noqa: F401 +from paddle.base.core import ( # noqa: F401 + DistTensorSpec, + OperatorDistAttr, + TensorDistAttr, +) diff --git a/python/paddle/distributed/auto_parallel/static/dist_op.py b/python/paddle/distributed/auto_parallel/static/dist_op.py index d60457054245e5..63040f24dfe679 100644 --- a/python/paddle/distributed/auto_parallel/static/dist_op.py +++ b/python/paddle/distributed/auto_parallel/static/dist_op.py @@ -150,8 +150,17 @@ def __str__(self): is_parameter_str = "non-parameter" else: is_parameter_str = "non-parameter" - str += ", {}'s dims_mapping (input, {}, {}): {}".format( - arg_name, annotated_str, is_parameter_str, dims_mapping + + # partial + input_dist_attr = self.dist_attr.get_input_dist_attr(arg_name) + partial_dims = sorted(input_dist_attr._partial_dims()) + + str += "; {}'s dims_mapping (input, {}, {}): {}, partial on dims: {}".format( + arg_name, + annotated_str, + is_parameter_str, + dims_mapping, + partial_dims, ) for arg_name in self.serial_op.desc.output_arg_names(): @@ -174,8 +183,17 @@ def __str__(self): is_parameter_str = "non-parameter" else: is_parameter_str = "non-parameter" - str += ", {}'s dims_mapping (output, {}, {}): {}".format( - arg_name, annotated_str, is_parameter_str, dims_mapping + + # partial + output_dist_attr = self.dist_attr.get_output_dist_attr(arg_name) + partial_dims = sorted(output_dist_attr._partial_dims()) + + str += "; {}'s dims_mapping (output, {}, {}): {}, partial on dims: {}".format( + arg_name, + annotated_str, + is_parameter_str, + dims_mapping, + partial_dims, ) str += ", dist_impl idx: {} , dist_impl type {} }}".format( diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index ac452565624409..999ad0cf94f926 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -729,7 +729,7 @@ def _optimization_tuning(self, mode, dataset, batch_size): def _plan(self, mode): if self._planned_mode is None: self._planned_mode = mode - else: + elif self._strategy.auto_mode != "semi": self._init_dist_context(mode) self._planners[mode] = Planner(mode, self._dist_contexts[mode]) @@ -1132,43 +1132,69 @@ def evaluate( else: self._switch_mode(self._mode) - micro_batch_size = self._validate_batch_size(batch_size) - valid_dataloader = self._prepare_dataloader_from_generator( - dataset=valid_data, - capacity=70, - iterable=False, - batch_size=micro_batch_size, - steps_per_epoch=steps, - collate_fn=collate_fn, - ) + if 
auto_utils.use_new_executor(): + local_batch_size = self._validate_batch_size(batch_size) + valid_dataloader = self._prepare_dataloader( + valid_data, + return_list=False, + batch_size=local_batch_size, + collate_fn=collate_fn, + ) + steps_per_epoch = len(valid_dataloader) if steps is None else steps + else: + micro_batch_size = self._validate_batch_size(batch_size) + valid_dataloader = self._prepare_dataloader_from_generator( + dataset=valid_data, + capacity=70, + iterable=False, + batch_size=micro_batch_size, + steps_per_epoch=steps, + collate_fn=collate_fn, + ) + steps_per_epoch = valid_dataloader._steps + local_batch_size = micro_batch_size + if self._strategy.pipeline.enable: + local_batch_size = micro_batch_size * self._acc_steps fetch_names, fetch_indices = self._prepare_fetch(None, mode=self._mode) cbks = config_callbacks( callbacks, engine=self, - batch_size=micro_batch_size, + batch_size=local_batch_size, log_freq=log_freq, verbose=verbose, metrics=self._metrics_name(), ) - eval_steps = valid_dataloader._steps + eval_steps = steps_per_epoch cbks.on_begin( 'eval', {'steps': eval_steps, 'metrics': self._metrics_name()} ) logs = {} - for step, _ in enumerate(valid_dataloader): - cbks.on_batch_begin('eval', step, logs) + for step, batch in enumerate(valid_dataloader): + if auto_utils.use_new_executor(): + batches = self._validate_batch(batch) + else: + batches = [{}] + try: - outs = self._executor.run( - self.main_program, - fetch_list=fetch_names, - use_program_cache=self._strategy.use_cache, - return_numpy=self._strategy.return_numpy, - ) + for micro_batch in batches: + cbks.on_batch_begin('eval', step, logs) + outs = self._executor.run( + self.main_program, + feed=micro_batch, + fetch_list=fetch_names, + use_program_cache=self._strategy.use_cache, + return_numpy=self._strategy.return_numpy, + ) except core.EOFException: break + + if steps_per_epoch and step >= steps_per_epoch: + if not auto_utils.use_new_executor(): + valid_dataloader._reset() + break logs = self._prepare_logger( outs, None, step, None, fetch_names, fetch_indices, self._mode ) @@ -1240,34 +1266,57 @@ def predict( else: self._switch_mode(self._mode) - micro_batch_size = self._validate_batch_size(batch_size) - test_dataloader = self._prepare_dataloader_from_generator( - dataset=test_data, - capacity=70, - iterable=False, - batch_size=micro_batch_size, - steps_per_epoch=steps, - collate_fn=collate_fn, - ) + if auto_utils.use_new_executor(): + local_batch_size = self._validate_batch_size(batch_size) + test_dataloader = self._prepare_dataloader( + test_data, + return_list=False, + batch_size=local_batch_size, + collate_fn=collate_fn, + ) + steps_per_epoch = len(test_dataloader) if steps is None else steps + else: + micro_batch_size = self._validate_batch_size(batch_size) + test_dataloader = self._prepare_dataloader_from_generator( + dataset=test_data, + capacity=70, + iterable=False, + batch_size=micro_batch_size, + steps_per_epoch=steps, + collate_fn=collate_fn, + ) + steps_per_epoch = test_dataloader._steps fetch_names, fetch_indices = self._prepare_fetch(None, mode=self._mode) outputs = [] cbks = config_callbacks(callbacks, engine=self, verbose=verbose) - test_steps = test_dataloader._steps + test_steps = steps_per_epoch cbks.on_begin('predict', {'steps': test_steps}) logs = {} - for step, _ in enumerate(test_dataloader): - cbks.on_batch_begin('predict', step, logs) + for step, batch in enumerate(test_dataloader): + if auto_utils.use_new_executor(): + batches = self._validate_batch(batch) + else: + batches = [{}] + 
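+            # (illustrative, not part of this patch) _validate_batch is
+            # assumed to split one local batch into per-micro-batch feed
+            # dicts under the new executor, e.g. with two accumulation steps:
+            #     batch   = {"x": x}                      # local batch
+            #     batches = [{"x": x[:2]}, {"x": x[2:]}]  # micro-batch feeds
+            # each feed dict is then passed to self._executor.run below.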
try: - outs = self._executor.run( - self.main_program, - fetch_list=fetch_names, - use_program_cache=self._strategy.use_cache, - return_numpy=self._strategy.return_numpy, - ) + for micro_batch in batches: + cbks.on_batch_begin('predict', step, logs) + outs = self._executor.run( + self.main_program, + feed=micro_batch, + fetch_list=fetch_names, + use_program_cache=self._strategy.use_cache, + return_numpy=self._strategy.return_numpy, + ) except core.EOFException: break + + if steps_per_epoch and step >= steps_per_epoch: + if not auto_utils.use_new_executor(): + test_dataloader._reset() + break logs = self._prepare_logger( outs, None, step, None, fetch_names, fetch_indices, self._mode ) @@ -1281,7 +1330,7 @@ def dataloader( dataset, batch_size=1, shuffle=False, - drop_last=False, + drop_last=True, collate_fn=None, num_workers=0, use_buffer_reader=True, @@ -1451,7 +1500,7 @@ def _prepare_dataloader( return_list=True, batch_size=1, shuffle=False, - drop_last=False, + drop_last=True, collate_fn=None, num_workers=0, use_buffer_reader=True, diff --git a/python/paddle/distributed/auto_parallel/static/helper.py b/python/paddle/distributed/auto_parallel/static/helper.py index f705ee49688484..6fe6700b996ada 100644 --- a/python/paddle/distributed/auto_parallel/static/helper.py +++ b/python/paddle/distributed/auto_parallel/static/helper.py @@ -235,7 +235,9 @@ def build_program(self, mode): self._logger.info("start to build program for mode = %s." % mode) input_spec = [self.inputs_spec, self.labels_spec] - static_func = to_static(self.static_func(), input_spec=input_spec) + static_func = to_static( + self.static_func(), input_spec=input_spec, full_graph=True + ) func_name = '_' + mode setattr(self.proxy_layer, func_name, static_func) diff --git a/python/paddle/distributed/auto_parallel/static/operators/common.py b/python/paddle/distributed/auto_parallel/static/operators/common.py index b134599179260b..7366d65c0ea895 100644 --- a/python/paddle/distributed/auto_parallel/static/operators/common.py +++ b/python/paddle/distributed/auto_parallel/static/operators/common.py @@ -25,6 +25,7 @@ _get_corresponding_rank, compute_compatible_dims_mapping, is_optimize_op, + set_dist_op_desc_original_id, ) _logger = get_logger( @@ -622,12 +623,13 @@ def merge_forward_backward_dims_mapping(fw_results, bw_results): def update_op_dims_mapping( - dist_op, - input_arg_names, - infered_input_dims_mappings, - output_arg_names, - infered_output_dims_mappings, + dist_op, input_arg_names, output_arg_names, fw_results, bw_results ): + ( + infered_input_dims_mappings, + infered_output_dims_mappings, + ) = merge_forward_backward_dims_mapping(fw_results, bw_results) + op_dist_attr = dist_op.dist_attr changed = False assert len(input_arg_names) == len( @@ -661,6 +663,7 @@ def update_op_dims_mapping( op_dist_attr.set_input_dims_mapping( input_arg_names[i], infered_dims_mapping ) + # TODO support partial for inputs for i in range(len(output_arg_names)): original_dims_mapping = op_dist_attr.get_output_dims_mapping( @@ -683,6 +686,29 @@ def update_op_dims_mapping( output_arg_names[i], infered_dims_mapping ) + # NOTE in partial stage-I, we infer partial for output in infer_forward only + output_dist_attr = op_dist_attr.get_output_dist_attr( + output_arg_names[i] + ) + output_idx = output_arg_names.index(output_arg_names[i]) + if ( + fw_results[1][output_idx]._partial_dims() + != output_dist_attr._partial_dims() + ): + _logger.info( + "Changed: Op [{}], tensor name [{}], Original partial on [{}], Infered partial on [{}]".format( + 
dist_op.serial_op.type, + output_arg_names[i], + output_dist_attr._partial_dims(), + fw_results[1][output_idx]._partial_dims(), + ) + ) + output_dist_attr._clean_partial_status() + output_dist_attr._set_partial_dims( + list(fw_results[1][0]._partial_dims()) + ) + changed = True + return changed @@ -693,3 +719,15 @@ def get_default_distributed_operator_impl(): num_impls = len(dist_op_default_impl_container.impls) assert num_impls == 1, f"Default dist op has [{num_impls}] impls" return dist_op_default_impl_container.get_impl(0) + + +def copy_op_without_infer_shape(src_op, block, ctx, varname_kwargs): + new_op = block.append_op(type='nop') + new_op_desc = new_op.desc + new_op_desc.copy_from(src_op.desc) + set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx) + for input_name in src_op.desc.input_names(): + new_op_desc.set_input(input_name, varname_kwargs[input_name]) + for output_name in src_op.desc.output_names(): + new_op_desc.set_output(output_name, varname_kwargs[output_name]) + return new_op diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_default.py b/python/paddle/distributed/auto_parallel/static/operators/dist_default.py index 04bca9c95ddbef..bec7869a615f36 100644 --- a/python/paddle/distributed/auto_parallel/static/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/static/operators/dist_default.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License + from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole +from ..completion import get_phi_spmd_rule from ..cost import ( _g_op_cost_factory, build_comp_costs_from_descs, @@ -26,16 +28,19 @@ _get_comm_group, _get_corresponding_rank, compute_compatible_dim_mapping, + get_dist_tensor_spec, is_prim_op, set_dist_op_desc_original_id, ) from .common import ( DistributedOperatorImpl, DistributedOperatorImplContainer, + get_default_distributed_operator_impl, gradient_synchronization, is_parameter_related, register_distributed_operator_impl, register_distributed_operator_impl_container, + update_op_dims_mapping, ) __op_not_need_param_init__ = ["while", "cond"] @@ -97,6 +102,61 @@ class DistributedDefault(DistributedOperatorImplContainer): def __init__(self, op_type): super().__init__(op_type) + @staticmethod + def update_dims_mapping(dist_op): + # step1: prepare inputs need for rule (order args as PHI definition and filter out unnecessary args) + + op_desc = dist_op.serial_op.desc + input_arg_names = op_desc.input_arg_names() + output_arg_names = op_desc.output_arg_names() + + num_inputs = len(input_arg_names) + input_specs = [] + for i in range(num_inputs): + assert not is_parameter_related( + input_arg_names[i] + ), "input {} of op {} is parameter, op should not use default rule.".format( + input_arg_names[i], str(dist_op.serial_op) + ) + input_specs.append( + get_dist_tensor_spec(dist_op, input_arg_names[i]) + ) + num_outputs = len(output_arg_names) + output_specs = [] + for i in range(num_outputs): + assert not is_parameter_related( + output_arg_names[i] + ), "output {} of op {} is parameter, op should not use default rule.".format( + output_arg_names[i], str(dist_op.serial_op) + ) + output_specs.append( + get_dist_tensor_spec(dist_op, output_arg_names[i], False) + ) + + # step2: infer spmd + rule = get_phi_spmd_rule("default_") + # tensor order following order in PHI defition + fw_results = rule.infer_forward(input_specs, output_specs) + bw_results = rule.infer_backward(input_specs, output_specs) + 
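+        # As the new update_op_dims_mapping in common.py shows, fw_results and
+        # bw_results each carry [input_dist_attrs, output_dist_attrs] and are
+        # merged inside the helper itself, roughly:
+        #     (
+        #         infered_input_dims_mappings,
+        #         infered_output_dims_mappings,
+        #     ) = merge_forward_backward_dims_mapping(fw_results, bw_results)
+        # so callers no longer pre-merge before the dist_attr update.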
+        # step3: update dist_attr
+        # tensor order following order in PHI definition
+        changed = update_op_dims_mapping(
+            dist_op, input_arg_names, output_arg_names, fw_results, bw_results
+        )
+
+        return changed
+
+    @staticmethod
+    def mapping_to_dist_operator_impl(dist_op, original_op_dist_attr):
+        # all ops use the default dist operator impl.
+        op_dist_attr = dist_op.dist_attr
+        default_impl = get_default_distributed_operator_impl()
+        op_dist_attr.impl_type = default_impl.type
+        op_dist_attr.impl_idx = default_impl.idx
+
+        return False
+

 register_distributed_operator_impl_container(DistributedDefault("default"))
diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_dropout.py b/python/paddle/distributed/auto_parallel/static/operators/dist_dropout.py
index ca8a2a0bcd80de..71f72defcd462e 100644
--- a/python/paddle/distributed/auto_parallel/static/operators/dist_dropout.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_dropout.py
@@ -62,24 +62,18 @@ def update_dims_mapping(dist_op):
         fw_results = rule.infer_forward(x_spec)
         bw_results = rule.infer_backward(x_spec, output_spec)
 
-        # step3: merge fw & bw results
-        (
-            infered_input_dims_mappings,
-            infered_output_dims_mappings,
-        ) = merge_forward_backward_dims_mapping(fw_results, bw_results)
-
-        # step4: update dist_attr
+        # step3: update dist_attr
         # tensor order following order in PHI definition
         changed = update_op_dims_mapping(
-            dist_op,
-            [x_name],
-            infered_input_dims_mappings,
-            [out_name],
-            infered_output_dims_mappings,
+            dist_op, [x_name], [out_name], fw_results, bw_results
         )
 
         # step4: update mask and seed (dropout special case)
         if changed:
+            (
+                _,
+                infered_output_dims_mappings,
+            ) = merge_forward_backward_dims_mapping(fw_results, bw_results)
             dist_op.dist_attr.set_output_dims_mapping(
                 mask_name, infered_output_dims_mappings[0]
             )
diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_eltwise.py b/python/paddle/distributed/auto_parallel/static/operators/dist_eltwise.py
index 857eda7c79aad9..18fb13b92e2b35 100644
--- a/python/paddle/distributed/auto_parallel/static/operators/dist_eltwise.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_eltwise.py
@@ -32,7 +32,6 @@
     get_default_distributed_operator_impl,
     is_elementwise_op,
     is_parameter_related,
-    merge_forward_backward_dims_mapping,
     register_distributed_operator_impl,
     register_distributed_operator_impl_container,
     update_op_dims_mapping,
@@ -77,20 +76,10 @@ def update_dims_mapping(dist_op):
         fw_results = rule.infer_forward(*input_specs)
         bw_results = rule.infer_backward(*input_specs, output_spec)
 
-        # step3: merge fw & bw results
-        (
-            infered_input_dims_mappings,
-            infered_output_dims_mappings,
-        ) = merge_forward_backward_dims_mapping(fw_results, bw_results)
-
-        # step4: update dist_attr
+        # step3: update dist_attr
         # tensor order following order in PHI definition
         changed = update_op_dims_mapping(
-            dist_op,
-            input_arg_names,
-            infered_input_dims_mappings,
-            [output_arg_name],
-            infered_output_dims_mappings,
+            dist_op, input_arg_names, [output_arg_name], fw_results, bw_results
        )
 
         return changed
diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/static/operators/dist_embedding.py
index 42ddfc4b0d4b31..12bea4573e1a4f 100644
--- a/python/paddle/distributed/auto_parallel/static/operators/dist_embedding.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_embedding.py
@@ -21,6 +21,7 @@
 from paddle.framework import core
 from paddle.utils
import unique_name +from ..completion import get_phi_spmd_rule from ..cost import ( EmbeddingGradOpCost, EmbeddingOpCost, @@ -37,6 +38,7 @@ _get_corresponding_rank, _get_idx_in_axis, compute_compatible_and_update_dim_mapping, + get_dist_tensor_spec, is_dim_replicate, is_dim_shard, set_var_dist_attr, @@ -44,11 +46,13 @@ from .common import ( DistributedOperatorImpl, DistributedOperatorImplContainer, + get_default_distributed_operator_impl, gradient_synchronization, infer_shape, naive_copy_op_dist_attr_for_program, register_distributed_operator_impl, register_distributed_operator_impl_container, + update_op_dims_mapping, ) @@ -56,6 +60,60 @@ class DistributedEmbedding(DistributedOperatorImplContainer): def __init__(self, op_type): super().__init__(op_type) + @staticmethod + def update_dims_mapping(dist_op): + # step1: prepare inputs need for rule (order args as PHI definition and filter out unnecessary args) + op_desc = dist_op.serial_op.desc + assert ( + dist_op.serial_op.type == "lookup_table_v2" + ), f"{dist_op.serial_op.type} is not supported by dist embedding yet." + + x_name = op_desc.input('Ids')[0] + w_name = op_desc.input('W')[0] + out_name = op_desc.output('Out')[0] + padding_idx = op_desc.attr('padding_idx') + is_sparse = op_desc.attr('is_sparse') + + x_spec = get_dist_tensor_spec(dist_op, x_name) + w_spec = get_dist_tensor_spec(dist_op, w_name) + output_spec = get_dist_tensor_spec(dist_op, out_name, False) + + # step2: infer spmd + rule = get_phi_spmd_rule("embedding") + # tensor order following order in PHI defition + fw_results = rule.infer_forward(x_spec, w_spec, padding_idx, is_sparse) + bw_results = rule.infer_backward( + x_spec, w_spec, output_spec, padding_idx, is_sparse + ) + + # step3: update dist_attr + # tensor order following order in PHI defition + changed = update_op_dims_mapping( + dist_op, [x_name, w_name], [out_name], fw_results, bw_results + ) + + return changed + + @staticmethod + def mapping_to_dist_operator_impl(dist_op, original_op_dist_attr): + reverted = False + op_dist_attr = dist_op.dist_attr + op_desc = dist_op.serial_op.desc + out_name = op_desc.output('Out')[0] + out_dist_attr = op_dist_attr.get_output_dist_attr(out_name) + + # vocab parallel embedding + if out_dist_attr._is_partial(): + op_dist_attr.impl_type = op_desc.type() + op_dist_attr.impl_idx = 0 + # data parallel or col parallel of weight + else: + default_impl = get_default_distributed_operator_impl() + op_dist_attr.impl_type = default_impl.type + op_dist_attr.impl_idx = default_impl.idx + + return reverted + register_distributed_operator_impl_container( DistributedEmbedding("lookup_table_v2") diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_flash_attn.py b/python/paddle/distributed/auto_parallel/static/operators/dist_flash_attn.py index d83beb82cd12a1..841dc0a5870444 100644 --- a/python/paddle/distributed/auto_parallel/static/operators/dist_flash_attn.py +++ b/python/paddle/distributed/auto_parallel/static/operators/dist_flash_attn.py @@ -18,7 +18,7 @@ register_distributed_operator_impl, register_distributed_operator_impl_container, ) -from .dist_eltwise import DistributedDefaultImpl0, DistributedElementwiseImpl0 +from .dist_eltwise import DistributedElementwiseImpl0 class DistributedFlashAttn(DistributedOperatorImplContainer): @@ -30,6 +30,7 @@ def __init__(self, op_type): # Dist FlashAttn with Random Control +# NOTE(zhiqiu): trick implementation, copy dist_attr of q,k,v to out class DistributedFlashAttnImpl0(DistributedElementwiseImpl0): def __init__(self, name): 
super().__init__(name) @@ -83,12 +84,12 @@ def forward(ctx, *args, **kwargs): src_op._set_attr('rng_name', rng_name) - DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + DistributedElementwiseImpl0.forward(ctx, *args, **kwargs) @staticmethod def backward(ctx, *args, **kwargs): # dropout backward is deterministic by mask, and not need for random state control - DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + DistributedElementwiseImpl0.backward(ctx, *args, **kwargs) register_distributed_operator_impl( diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_layer_norm.py b/python/paddle/distributed/auto_parallel/static/operators/dist_layer_norm.py index dcd1518dcd13d1..4f1ad6600c50c7 100644 --- a/python/paddle/distributed/auto_parallel/static/operators/dist_layer_norm.py +++ b/python/paddle/distributed/auto_parallel/static/operators/dist_layer_norm.py @@ -22,7 +22,6 @@ from .common import ( DistributedOperatorImplContainer, get_default_distributed_operator_impl, - merge_forward_backward_dims_mapping, register_distributed_operator_impl_container, update_op_dims_mapping, ) @@ -73,20 +72,14 @@ def update_dims_mapping(dist_op): begin_norm_axis, ) - # step3: merge fw & bw results - ( - infered_input_dims_mappings, - infered_output_dims_mappings, - ) = merge_forward_backward_dims_mapping(fw_results, bw_results) - - # step4: update dist_attr + # step3: update dist_attr # tensor order following order in PHI defition changed = update_op_dims_mapping( dist_op, [x_name, scale_name, bias_name], - infered_input_dims_mappings, [y_name, var_name, mean_name], - infered_output_dims_mappings, + fw_results, + bw_results, ) return changed diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/static/operators/dist_matmul.py index 3568c928c16e72..40b0f109c78037 100644 --- a/python/paddle/distributed/auto_parallel/static/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/static/operators/dist_matmul.py @@ -14,7 +14,7 @@ import copy -from paddle.common_ops_import import check_dtype, check_variable_and_dtype +from paddle.common_ops_import import check_variable_and_dtype from paddle.distributed.auto_parallel.static.cost.comm_op_cost import ( AllreduceSumOpCost, IdentityOpCost, @@ -53,10 +53,9 @@ from .common import ( DistributedOperatorImpl, DistributedOperatorImplContainer, + copy_op_without_infer_shape, gradient_synchronization, - infer_shape, is_parameter_related, - merge_forward_backward_dims_mapping, register_distributed_operator_impl, register_distributed_operator_impl_container, set_comm_op_dist_attr_for_program, @@ -79,8 +78,6 @@ def trans_x_y_dims_mapping(trans_x, trans_y, x_dims_mapping, y_dims_mapping): def copy_op_with_new_input_output(ctx, block, src_op, **kwargs): - pass - src_dist_attr = ctx.get_op_dist_attr_for_program(src_op) dist_attr = copy.deepcopy(src_dist_attr) dist_op = block.append_op(type='nop') @@ -93,12 +90,15 @@ def copy_op_with_new_input_output(ctx, block, src_op, **kwargs): dist_attr.rename_input( src_op.desc.input(input_name)[0], kwargs[input_name][0] ) + for output_name in src_op.desc.output_names(): - assert output_name in kwargs - dist_op_desc.set_output(output_name, kwargs[output_name]) - dist_attr.rename_output( - src_op.desc.output(output_name)[0], kwargs[output_name][0] - ) + # NOTE if stop_gradient is set, some of the output of grad_op should be empty. 
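+        # e.g. (hypothetical) for matmul_v2_grad with Y.stop_gradient=True,
+        # src_op.desc.output('Y@GRAD') is an empty list, so indexing [0] in
+        # rename_output would fail; such outputs are skipped below.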
+ if len(src_op.desc.output(output_name)) > 0: + assert output_name in kwargs + dist_op_desc.set_output(output_name, kwargs[output_name]) + dist_attr.rename_output( + src_op.desc.output(output_name)[0], kwargs[output_name][0] + ) # TODO: this call leads to a deepcopy when we init the dist op ctx.set_op_dist_attr_for_program(dist_op, dist_attr) @@ -385,7 +385,6 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): if dim >= 0 and process_mesh_shape[dim] > 0: Y_var_partitioned = True break - if is_parameter_related(Y_var.name, main_block) and Y_var_partitioned: if Y_var_dim_mapping[0] >= 0: # row parallel: c_identity + matmul @@ -550,22 +549,12 @@ def update_dims_mapping_matmul(dist_op): fw_results = rule.infer_forward(x_spec, y_spec, trans_x, trans_y) bw_results = rule.infer_backward(x_spec, y_spec, out_spec, trans_x, trans_y) - # step3: merge fw & bw results - ( - infered_input_dims_mappings, - infered_output_dims_mappings, - ) = merge_forward_backward_dims_mapping(fw_results, bw_results) - - # step4: update dist_attr + # step3: update dist_attr # tensor order following order in PHI defition input_arg_names = [x_name, y_name] output_arg_names = [out_name] changed = update_op_dims_mapping( - dist_op, - input_arg_names, - infered_input_dims_mappings, - output_arg_names, - infered_output_dims_mappings, + dist_op, input_arg_names, output_arg_names, fw_results, bw_results ) return changed @@ -869,37 +858,16 @@ def forward(ctx, *args, **kwargs): assert x_tensor_dist_attr is not None identity_var_dist_attr = op_dist_attr.get_input_dist_attr(X_var.name) assert identity_var_dist_attr is not None - ref_shape_x = infer_shape( - main_block, X_var, x_tensor_dist_attr, identity_var_dist_attr - ) + # infer out var shape with op dist attr out_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(Out_var) assert out_tensor_dist_attr is not None out_var_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name) assert out_var_dist_attr is not None - ref_shape_out = infer_shape( - main_block, Out_var, out_tensor_dist_attr, out_var_dist_attr - ) - - check_variable_and_dtype( - X_var, - 'tensor', - ['float16', 'float32', 'float64', 'int32', 'int64', 'uint16'], - '_c_identity', - ) - attrs = { - 'transpose_X': trans_x, - 'transpose_Y': trans_y, - 'alpha': 1, - OP_ROLE_KEY: src_op.attr('op_role'), - } - inputs = {'X': [X_var], 'Y': [Weight_var]} - matmul_op = main_block.append_op( - type='matmul', inputs=inputs, outputs={'Out': Out_var}, attrs=attrs - ) - if Out_var.shape != ref_shape_out: - Out_var.desc.set_shape(ref_shape_out) + # copy op + matmul_op = copy_op_without_infer_shape(src_op, main_block, ctx, kwargs) + matmul_op._set_attr('alpha', 1) # matmul matmul_op_dist_attr = OperatorDistAttr() @@ -1177,61 +1145,19 @@ def forward(ctx, *args, **kwargs): ) group = new_process_group(group_ranks) - check_variable_and_dtype( - X_var, 'x', ['float16', 'float32', 'float64', 'uint16'], 'linear' - ) - check_dtype( - X_var.dtype, - 'dtype', - ['float16', 'float32', 'float64', 'uint16'], - 'linear', - ) - attrs = { - 'transpose_X': trans_x, - 'transpose_Y': trans_y, - 'alpha': 1, - OP_ROLE_KEY: src_op.attr('op_role'), - } - inputs = {'X': X_var, 'Y': Weight_var} - # infer out var shape with op dist attr out_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(Out_var) assert out_tensor_dist_attr is not None out_var_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name) assert out_var_dist_attr is not None - ref_shape = infer_shape( - main_block, Out_var, out_tensor_dist_attr, out_var_dist_attr 
- ) - - intermediate_var_0 = main_block.create_var( - name=unique_name.generate_with_ignorable_key( - ".".join(["c_allreduce_sum", 'tmp']) - ), - shape=Out_var.shape, - dtype=Out_var.dtype, - type=Out_var.type, - lod_level=Out_var.lod_level, - persistable=False, - is_data=False, - need_check_feed=Out_var.desc.need_check_feed(), - ) - # set intermediate_var_0's dist_attr with Out_var's dist_attr - ctx.set_tensor_dist_attr_for_program( - intermediate_var_0, out_var_dist_attr - ) - - matmul_op = main_block.append_op( - type='matmul', - inputs=inputs, - outputs={'Out': intermediate_var_0}, - attrs=attrs, - ) - if intermediate_var_0.shape != ref_shape: - intermediate_var_0.desc.set_shape(ref_shape) + # copy op + matmul_op = copy_op_without_infer_shape(src_op, main_block, ctx, kwargs) + + # add allreduce (inplace) c_allreduce_sum_op = main_block.append_op( type='c_allreduce_sum', - inputs={'X': intermediate_var_0}, + inputs={'X': Out_var}, outputs={'Out': Out_var}, attrs={ 'ring_id': group.id, @@ -1240,8 +1166,6 @@ def forward(ctx, *args, **kwargs): OP_ROLE_KEY: src_op.attr('op_role'), }, ) - if Out_var.shape != ref_shape: - Out_var.desc.set_shape(ref_shape) # set dist op's dist_attr with serial op's dist_attr # matmul @@ -1684,55 +1608,25 @@ def forward(ctx, *args, **kwargs): ), "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( matmul_col_dim_mapping ) - process_mesh_shape = op_dist_attr.process_mesh.shape - process_mesh_group = op_dist_attr.process_mesh.process_ids - - parallel_axis = matmul_col_dim_mapping - group_ranks = _get_comm_group( - process_mesh_group, process_mesh_shape, parallel_axis, rank_id - ) - group = new_process_group(group_ranks) # infer new var shape with op dist attr x_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(X_var) assert x_tensor_dist_attr is not None identity_var_dist_attr = op_dist_attr.get_input_dist_attr(X_var.name) assert identity_var_dist_attr is not None - ref_shape_x = infer_shape( - main_block, X_var, x_tensor_dist_attr, identity_var_dist_attr - ) + # infer out var shape with op dist attr out_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(Out_var) assert out_tensor_dist_attr is not None out_var_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name) assert out_var_dist_attr is not None - ref_shape_out = infer_shape( - main_block, Out_var, out_tensor_dist_attr, out_var_dist_attr - ) - - check_variable_and_dtype( - X_var, - 'tensor', - ['float16', 'float32', 'float64', 'int32', 'int64', 'uint16'], - '_c_identity', - ) - - attrs = { - 'trans_x': trans_x, - 'trans_y': trans_y, - OP_ROLE_KEY: src_op.attr('op_role'), - } - inputs = {'X': [X_var], 'Y': [Weight_var]} - matmul_v2_op = main_block.append_op( - type='matmul_v2', - inputs=inputs, - outputs={'Out': Out_var}, - attrs=attrs, + + # copy op + matmul_v2_op = copy_op_without_infer_shape( + src_op, main_block, ctx, kwargs ) - if Out_var.shape != ref_shape_out: - Out_var.desc.set_shape(ref_shape_out) - # matmulv2 + # set distattr matmulv2_op_dist_attr = OperatorDistAttr() matmulv2_op_dist_attr.process_mesh = op_dist_attr.process_mesh matmulv2_op_dist_attr.impl_type = op_dist_attr.impl_type @@ -2006,60 +1900,20 @@ def forward(ctx, *args, **kwargs): ) group = new_process_group(group_ranks) - check_variable_and_dtype( - X_var, 'x', ['float16', 'float32', 'float64', 'uint16'], 'linear' - ) - check_dtype( - X_var.dtype, - 'dtype', - ['float16', 'float32', 'float64', 'uint16'], - 'linear', - ) - attrs = { - 'trans_x': trans_x, - 'trans_y': trans_y, - 
OP_ROLE_KEY: src_op.attr('op_role'), - } - inputs = {'X': X_var, 'Y': Weight_var} - # infer out var shape with op dist attr out_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(Out_var) assert out_tensor_dist_attr is not None out_var_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name) assert out_var_dist_attr is not None - ref_shape = infer_shape( - main_block, Out_var, out_tensor_dist_attr, out_var_dist_attr - ) - - intermediate_var_0 = main_block.create_var( - name=unique_name.generate_with_ignorable_key( - ".".join(["c_allreduce_sum", 'tmp']) - ), - shape=Out_var.shape, - dtype=Out_var.dtype, - type=Out_var.type, - lod_level=Out_var.lod_level, - persistable=False, - is_data=False, - need_check_feed=Out_var.desc.need_check_feed(), - ) - # set intermediate_var_0's dist_attr with Out_var's dist_attr - ctx.set_tensor_dist_attr_for_program( - intermediate_var_0, out_var_dist_attr - ) - - matmul_v2_op = main_block.append_op( - type='matmul_v2', - inputs=inputs, - outputs={'Out': intermediate_var_0}, - attrs=attrs, + + # copy op + matmul_v2_op = copy_op_without_infer_shape( + src_op, main_block, ctx, kwargs ) - if intermediate_var_0.shape != ref_shape: - intermediate_var_0.desc.set_shape(ref_shape) c_allreduce_sum_op = main_block.append_op( type='c_allreduce_sum', - inputs={'X': intermediate_var_0}, + inputs={'X': Out_var}, outputs={'Out': Out_var}, attrs={ 'ring_id': group.id, @@ -2068,11 +1922,8 @@ def forward(ctx, *args, **kwargs): OP_ROLE_KEY: src_op.attr('op_role'), }, ) - if Out_var.shape != ref_shape: - Out_var.desc.set_shape(ref_shape) # set dist op's dist_attr with serial op's dist_attr - # matmulv2 matmulv2_op_dist_attr = OperatorDistAttr() matmulv2_op_dist_attr.process_mesh = op_dist_attr.process_mesh matmulv2_op_dist_attr.impl_type = op_dist_attr.impl_type @@ -2512,60 +2363,17 @@ def forward(ctx, *args, **kwargs): assert x_tensor_dist_attr is not None identity_var_dist_attr = op_dist_attr.get_input_dist_attr(X_var.name) assert identity_var_dist_attr is not None - ref_shape_x = infer_shape( - main_block, X_var, x_tensor_dist_attr, identity_var_dist_attr - ) + # infer out var shape with op dist attr out_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(Out_var) assert out_tensor_dist_attr is not None out_var_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name) assert out_var_dist_attr is not None - ref_shape_out = infer_shape( - main_block, Out_var, out_tensor_dist_attr, out_var_dist_attr - ) - check_variable_and_dtype( - X_var, - 'tensor', - ['float16', 'float32', 'float64', 'int32', 'int64', 'uint16'], - '_c_identity', - ) + # copy op + mul_op = copy_op_without_infer_shape(src_op, main_block, ctx, kwargs) - attrs = { - "x_num_col_dims": src_op.desc.attr("x_num_col_dims"), - "y_num_col_dims": src_op.desc.attr("y_num_col_dims"), - OP_ROLE_KEY: src_op.attr('op_role'), - } - inputs = {'X': X_var, 'Y': Weight_var} - - inputs_ref_shape = {} - inputs_original_shape = {} - for var_name in inputs: - if var_name == "X": - var = X_var - else: - var = inputs[var_name] - inputs_original_shape[var_name] = var.shape - input_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(var) - input_var_dist_attr = op_dist_attr.get_input_dist_attr(var.name) - input_ref_shape = infer_shape( - main_block, var, input_tensor_dist_attr, input_var_dist_attr - ) - inputs_ref_shape[var_name] = input_ref_shape - var.desc.set_shape(input_ref_shape) - - mul_op = main_block.append_op( - type='mul', inputs=inputs, outputs={'Out': Out_var}, attrs=attrs - ) - if Out_var.shape != 
ref_shape_out: - Out_var.desc.set_shape(ref_shape_out) - - for var_name in inputs: - var = inputs[var_name] - original_shape = inputs_original_shape[var_name] - var.desc.set_shape(original_shape) - - # matmulv2 + # set distattr matmulv2_op_dist_attr = OperatorDistAttr() matmulv2_op_dist_attr.process_mesh = op_dist_attr.process_mesh matmulv2_op_dist_attr.impl_type = op_dist_attr.impl_type @@ -2827,80 +2635,18 @@ def forward(ctx, *args, **kwargs): ) group = new_process_group(group_ranks) - check_variable_and_dtype( - X_var, 'x', ['float16', 'float32', 'float64', 'uint16'], 'linear' - ) - check_dtype( - X_var.dtype, - 'dtype', - ['float16', 'float32', 'float64', 'uint16'], - 'linear', - ) - # attrs = {'trans_x': False, 'trans_y': False} - attrs = { - "x_num_col_dims": src_op.desc.attr("x_num_col_dims"), - "y_num_col_dims": src_op.desc.attr("y_num_col_dims"), - OP_ROLE_KEY: src_op.attr('op_role'), - } - inputs = {'X': X_var, 'Y': Weight_var} - # infer out var shape with op dist attr out_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(Out_var) assert out_tensor_dist_attr is not None out_var_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name) assert out_var_dist_attr is not None - ref_shape = infer_shape( - main_block, Out_var, out_tensor_dist_attr, out_var_dist_attr - ) - - intermediate_var_0 = main_block.create_var( - name=unique_name.generate_with_ignorable_key( - ".".join(["c_allreduce_sum", 'tmp']) - ), - shape=Out_var.shape, - dtype=Out_var.dtype, - type=Out_var.type, - lod_level=Out_var.lod_level, - persistable=False, - is_data=False, - need_check_feed=Out_var.desc.need_check_feed(), - ) - # set intermediate_var_0's dist_attr with Out_var's dist_attr - ctx.set_tensor_dist_attr_for_program( - intermediate_var_0, out_var_dist_attr - ) - - inputs_ref_shape = {} - inputs_original_shape = {} - for var_name in inputs: - var = inputs[var_name] - inputs_original_shape[var_name] = var.shape - input_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(var) - input_var_dist_attr = op_dist_attr.get_input_dist_attr(var.name) - input_ref_shape = infer_shape( - main_block, var, input_tensor_dist_attr, input_var_dist_attr - ) - inputs_ref_shape[var_name] = input_ref_shape - var.desc.set_shape(input_ref_shape) - mul_op = main_block.append_op( - type='mul', - inputs=inputs, - outputs={'Out': intermediate_var_0}, - attrs=attrs, - ) - - if intermediate_var_0.shape != ref_shape: - intermediate_var_0.desc.set_shape(ref_shape) - - for var_name in inputs: - var = inputs[var_name] - original_shape = inputs_original_shape[var_name] - var.desc.set_shape(original_shape) + # copy op + mul_op = copy_op_without_infer_shape(src_op, main_block, ctx, kwargs) c_allreduce_sum_op = main_block.append_op( type='c_allreduce_sum', - inputs={'X': intermediate_var_0}, + inputs={'X': Out_var}, outputs={'Out': Out_var}, attrs={ 'ring_id': group.id, @@ -2910,9 +2656,6 @@ def forward(ctx, *args, **kwargs): }, ) - if Out_var.shape != ref_shape: - Out_var.desc.set_shape(ref_shape) - # set dist op's dist_attr with serial op's dist_attr # matmulv2 matmulv2_op_dist_attr = OperatorDistAttr() diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_reduce_sum_p.py b/python/paddle/distributed/auto_parallel/static/operators/dist_reduce_sum_p.py index 85abed9558f4e6..e1481d30643eb0 100644 --- a/python/paddle/distributed/auto_parallel/static/operators/dist_reduce_sum_p.py +++ b/python/paddle/distributed/auto_parallel/static/operators/dist_reduce_sum_p.py @@ -28,7 +28,6 @@ DistributedOperatorImpl, 
     DistributedOperatorImplContainer,
     get_default_distributed_operator_impl,
-    merge_forward_backward_dims_mapping,
     register_distributed_operator_impl,
     register_distributed_operator_impl_container,
     update_op_dims_mapping,
@@ -72,20 +71,11 @@ def update_dims_mapping(dist_op):
         bw_results = rule.infer_backward(
             input_spec, output_spec, dims, keep_dim
         )
-        # step3: merge fw & bw results
-        (
-            infered_input_dims_mappings,
-            infered_output_dims_mappings,
-        ) = merge_forward_backward_dims_mapping(fw_results, bw_results)
-        # step4: update dist_attr
+        # step3: update dist_attr
         # tensor order following order in PHI definition
         changed = update_op_dims_mapping(
-            dist_op,
-            [input_arg_name],
-            infered_input_dims_mappings,
-            [output_arg_name],
-            infered_output_dims_mappings,
+            dist_op, [input_arg_name], [output_arg_name], fw_results, bw_results
         )
 
         return changed
diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_reshape.py b/python/paddle/distributed/auto_parallel/static/operators/dist_reshape.py
index e89caba2dd68d3..cd1e6bf10f8144 100644
--- a/python/paddle/distributed/auto_parallel/static/operators/dist_reshape.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_reshape.py
@@ -14,6 +14,7 @@
 from paddle.distributed.fleet.meta_optimizers.common import OpRole
 
+from ..completion import get_phi_spmd_rule
 from ..cost import (
     Reshape2GradOpCost,
     Reshape2OpCost,
@@ -23,6 +24,7 @@
 )
 from ..utils import (
     compute_compatible_and_update_dim_mapping,
+    get_dist_tensor_spec,
     is_dim_shard,
     set_dist_op_desc_original_id,
 )
@@ -30,8 +32,10 @@
     DistributedOperatorImpl,
     DistributedOperatorImplContainer,
     is_parameter_related,
+    merge_forward_backward_dims_mapping,
     register_distributed_operator_impl,
     register_distributed_operator_impl_container,
+    update_op_dims_mapping,
 )
 from .dist_default import DistributedDefaultImpl0
 
@@ -40,6 +44,55 @@ class DistributedReshape2(DistributedOperatorImplContainer):
     def __init__(self, op_type):
         super().__init__(op_type)
 
+    @staticmethod
+    def update_dims_mapping(dist_op):
+        # step1: prepare inputs needed for the rule (order args as PHI definition and filter out unnecessary args)
+        op_desc = dist_op.serial_op.desc
+        assert (
+            dist_op.serial_op.type == "reshape2"
+        ), f"{dist_op.serial_op.type} is not supported by dist reshape yet."
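+        # illustrative example (not part of this patch): for x with
+        # dims_mapping [0, -1, -1] and shape=[0, -1], the "reshape" spmd rule
+        # can keep the batch axis sharded, while XShape (which stores the
+        # pre-reshape shape with a leading 1) takes [-1] + x's mapping in
+        # step4 below.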
+
+        x_name = op_desc.input('X')[0]
+        out_name = op_desc.output('Out')[0]
+        xshape_name = op_desc.output('XShape')[0]
+        shape = op_desc.attr('shape')
+
+        x_spec = get_dist_tensor_spec(dist_op, x_name)
+        output_spec = get_dist_tensor_spec(dist_op, out_name, False)
+
+        # step2: infer spmd
+        rule = get_phi_spmd_rule("reshape")
+        # tensor order following order in PHI definition
+        fw_results = rule.infer_forward(x_spec, shape)
+        bw_results = rule.infer_backward(x_spec, output_spec, shape)
+
+        # step3: update dist_attr
+        # tensor order following order in PHI definition
+        changed = update_op_dims_mapping(
+            dist_op, [x_name], [out_name], fw_results, bw_results
+        )
+
+        # step4: update xshape
+        infered_input_dims_mappings, _ = merge_forward_backward_dims_mapping(
+            fw_results, bw_results
+        )
+        dist_op.dist_attr.set_output_dims_mapping(
+            xshape_name, [-1] + infered_input_dims_mappings[0]
+        )
+
+        return changed
+
+    @staticmethod
+    def mapping_to_dist_operator_impl(dist_op, original_op_dist_attr):
+        reverted = False
+        op_dist_attr = dist_op.dist_attr
+
+        # all reshape ops map to impl0
+        op_dist_attr.impl_type = "reshape2"
+        op_dist_attr.impl_idx = 0
+
+        return reverted
+

 register_distributed_operator_impl_container(DistributedReshape2("reshape2"))
diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_split.py b/python/paddle/distributed/auto_parallel/static/operators/dist_split.py
index e2df5428882722..9045a53e241249 100644
--- a/python/paddle/distributed/auto_parallel/static/operators/dist_split.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_split.py
@@ -12,12 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ..utils import compute_compatible_and_update_dim_mapping, is_dim_shard
+from ..completion import get_phi_spmd_rule
+from ..utils import (
+    compute_compatible_and_update_dim_mapping,
+    get_dist_tensor_spec,
+    is_dim_shard,
+)
 from .common import (
     DistributedOperatorImpl,
     DistributedOperatorImplContainer,
+    get_default_distributed_operator_impl,
     register_distributed_operator_impl,
     register_distributed_operator_impl_container,
+    update_op_dims_mapping,
 )
 from .dist_default import DistributedDefaultImpl0
 
@@ -26,8 +33,71 @@ class DistributedSplit(DistributedOperatorImplContainer):
     def __init__(self, op_type):
         super().__init__(op_type)
 
+    @staticmethod
+    def update_dims_mapping(dist_op):
+        # step1: prepare inputs needed for the rule (order args as PHI definition and filter out unnecessary args)
+        op_desc = dist_op.serial_op.desc
+
+        x_name = op_desc.input('X')[0]
+        assert (
+            len(op_desc.input('AxisTensor')) == 0
+        ), "Attribute AxisTensor is not supported by dist split."
+        assert (
+            len(op_desc.input('SectionsTensorList')) == 0
+        ), "Attribute SectionsTensorList is not supported by dist split."
+        output_arg_names = op_desc.output('Out')
+
+        num = op_desc.attr('num')
+        sections = op_desc.attr('sections')
+        if num is not None:
+            assert (sections is None) or (
+                len(sections) == 0
+            ), f"Both Attributes of num: {num} and sections: {sections} are specified."
+            first_attr = num
+            rule_type = "split_with_num"
+        else:
+            assert (sections is not None) and (
+                len(sections) > 0
+            ), f"Neither num nor sections is specified: num: {num}, sections: {sections}."
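+            # e.g. (illustrative) splitting x.shape=[6, h] with num=3 uses
+            # the "split_with_num" rule (three [2, h] outputs), while
+            # sections=[1, 2, 3] uses the "split" rule with uneven outputs.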
+            first_attr = sections
+            rule_type = "split"
+        axis = op_desc.attr('axis')
+
+        x_spec = get_dist_tensor_spec(dist_op, x_name)
+        num_outputs = len(output_arg_names)
+        output_specs = []
+        for i in range(num_outputs):
+            output_specs.append(
+                get_dist_tensor_spec(dist_op, output_arg_names[i], False)
+            )
+
+        # step2: infer spmd
+        rule = get_phi_spmd_rule(rule_type)
+        # tensor order following order in PHI definition
+        fw_results = rule.infer_forward(x_spec, first_attr, axis)
+        bw_results = rule.infer_backward(x_spec, output_specs, first_attr, axis)
+
+        # step3: update dist_attr
+        # tensor order following order in PHI definition
+        changed = update_op_dims_mapping(
+            dist_op, [x_name], output_arg_names, fw_results, bw_results
+        )
+
+        return changed
+
+    @staticmethod
+    def mapping_to_dist_operator_impl(dist_op, original_op_dist_attr):
+        # all split ops use the default dist operator impl.
+        op_dist_attr = dist_op.dist_attr
+        default_impl = get_default_distributed_operator_impl()
+        op_dist_attr.impl_type = default_impl.type
+        op_dist_attr.impl_idx = default_impl.idx
+
+        return False
+

 register_distributed_operator_impl_container(DistributedSplit("split"))
+register_distributed_operator_impl_container(DistributedSplit("split_with_num"))
 
 
 class DistributedSplitImpl(DistributedOperatorImpl):
diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_transpose.py b/python/paddle/distributed/auto_parallel/static/operators/dist_transpose.py
index 762c5b9209cd50..a0b062480b5fb4 100644
--- a/python/paddle/distributed/auto_parallel/static/operators/dist_transpose.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_transpose.py
@@ -14,6 +14,7 @@
 from paddle.distributed.fleet.meta_optimizers.common import OpRole
 
+from ..completion import get_phi_spmd_rule
 from ..cost import (
     Transpose2GradOpCost,
     Transpose2OpCost,
@@ -21,13 +22,19 @@
     build_comp_desc_from_dist_op,
     build_dp_costs,
 )
-from ..utils import compute_compatible_and_update_dim_mapping
+from ..utils import (
+    compute_compatible_and_update_dim_mapping,
+    get_dist_tensor_spec,
+)
 from .common import (
     DistributedOperatorImpl,
     DistributedOperatorImplContainer,
+    get_default_distributed_operator_impl,
     is_parameter_related,
+    merge_forward_backward_dims_mapping,
     register_distributed_operator_impl,
     register_distributed_operator_impl_container,
+    update_op_dims_mapping,
 )
 from .dist_default import DistributedDefaultImpl0
 
@@ -36,6 +43,54 @@ class DistributedTranspose2(DistributedOperatorImplContainer):
     def __init__(self, op_type):
         super().__init__(op_type)
 
+    @staticmethod
+    def update_dims_mapping(dist_op):
+        # step1: prepare inputs needed for the rule (order args as PHI definition and filter out unnecessary args)
+        op_desc = dist_op.serial_op.desc
+        assert (
+            dist_op.serial_op.type == "transpose2"
+        ), f"{dist_op.serial_op.type} is not supported by dist transpose yet."
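+        # illustrative example (not part of this patch): with
+        # axes=[0, 2, 1, 3] and x dims_mapping [0, -1, 1, -1], the
+        # "transpose" spmd rule permutes the mapping to [0, 1, -1, -1] for
+        # Out, and XShape again takes [-1] + x's mapping, as in reshape.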
+
+        x_name = op_desc.input('X')[0]
+        out_name = op_desc.output('Out')[0]
+        xshape_name = op_desc.output('XShape')[0]
+        axes = op_desc.attr('axis')
+
+        x_spec = get_dist_tensor_spec(dist_op, x_name)
+        output_spec = get_dist_tensor_spec(dist_op, out_name, False)
+
+        # step2: infer spmd
+        rule = get_phi_spmd_rule("transpose")
+        # tensor order following order in PHI definition
+        fw_results = rule.infer_forward(x_spec, axes)
+        bw_results = rule.infer_backward(x_spec, output_spec, axes)
+
+        # step3: update dist_attr
+        # tensor order following order in PHI definition
+        changed = update_op_dims_mapping(
+            dist_op, [x_name], [out_name], fw_results, bw_results
+        )
+
+        # step4: update xshape
+        infered_input_dims_mappings, _ = merge_forward_backward_dims_mapping(
+            fw_results, bw_results
+        )
+        dist_op.dist_attr.set_output_dims_mapping(
+            xshape_name, [-1] + infered_input_dims_mappings[0]
+        )
+
+        return changed
+
+    @staticmethod
+    def mapping_to_dist_operator_impl(dist_op, original_op_dist_attr):
+        # all transpose ops use the default dist operator impl.
+        op_dist_attr = dist_op.dist_attr
+        default_impl = get_default_distributed_operator_impl()
+        op_dist_attr.impl_type = default_impl.type
+        op_dist_attr.impl_idx = default_impl.idx
+
+        return False
+

 register_distributed_operator_impl_container(
     DistributedTranspose2("transpose2")
diff --git a/python/paddle/distributed/auto_parallel/static/parallelizer.py b/python/paddle/distributed/auto_parallel/static/parallelizer.py
index 6e4eecf89fc5e8..06d2f4a995b750 100644
--- a/python/paddle/distributed/auto_parallel/static/parallelizer.py
+++ b/python/paddle/distributed/auto_parallel/static/parallelizer.py
@@ -45,7 +45,7 @@
     get_world_process_group,
 )
 from .reshard import Resharder
-from .utils import SerialProgramInfo, make_data_unshard, set_grad_var_shape
+from .utils import SerialProgramInfo, make_data_unshard
 
 _logger = get_logger(logging.INFO)
 
@@ -260,8 +260,6 @@ def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False):
             dist_main_prog, dist_startup_prog, dist_params_grads
         )
 
-        set_grad_var_shape(dist_main_prog, self._dist_context)
-
         make_data_unshard(dist_main_prog, dist_startup_prog, self._dist_context)
 
         resharder = Resharder(
diff --git a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py
index 38b9ae8dcda59a..6f0a1db1a3bff9 100644
--- a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py
+++ b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py
@@ -26,12 +26,7 @@
 from .partitioner import Partitioner
 from .process_group import get_world_process_group
 from .reshard import Resharder
-from .utils import (
-    get_pp_stage,
-    is_sequential_run,
-    set_grad_var_shape,
-    use_new_executor,
-)
+from .utils import get_pp_stage, is_sequential_run, use_new_executor
 
 
 class Parallelizer:
@@ -122,7 +117,7 @@ def parallel(self, rank, parameter_list=None):
                     time.time() - time0, self._mode
                 )
             )
-            set_grad_var_shape(dist_main_prog, self._dist_context)
+
             resharder = Resharder(
                 dist_main_prog,
                 dist_startup_prog,
diff --git a/python/paddle/distributed/auto_parallel/static/process_group.py b/python/paddle/distributed/auto_parallel/static/process_group.py
index df881be1a31e3a..6bf7b18cabcb0f 100644
--- a/python/paddle/distributed/auto_parallel/static/process_group.py
+++ b/python/paddle/distributed/auto_parallel/static/process_group.py
@@ -13,7 +13,6 @@
 # limitations under the License
 
 import hashlib
-import os
 from collections import OrderedDict
 
 import paddle
@@ 
-158,10 +157,10 @@ def instantiate(self): strategy.nrings = 1 if core.is_compiled_with_cuda(): place = core.CUDAPlace(genv.device_id) - use_new_comm = os.getenv( - "FLAGS_dynamic_static_unified_comm", "0" - ) - if use_new_comm in ["1", "True", "true"]: + use_new_comm = paddle.get_flags( + "FLAGS_dynamic_static_unified_comm" + )["FLAGS_dynamic_static_unified_comm"] + if use_new_comm: store = core.create_or_get_global_tcp_store() endpoints_str = "" for endpoint in strategy.trainer_endpoints: diff --git a/python/paddle/distributed/auto_parallel/static/reshard.py b/python/paddle/distributed/auto_parallel/static/reshard.py index facfe183c5d9ab..9cc1a61610d808 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard.py +++ b/python/paddle/distributed/auto_parallel/static/reshard.py @@ -33,7 +33,7 @@ from .dist_attribute import TensorDistAttr from .dist_context import DistributedContext from .process_group import new_process_group -from .utils import is_gradient_clip_op +from .utils import is_gradient_clip_op, is_optimize_op # NOTE: If op in _g_special_ops or _g_gradient_clip_ops, it will not be resharded. _g_special_ops = ['check_finite_and_unscale', 'update_loss_scaling'] @@ -1786,8 +1786,19 @@ def parse_op_desc( source_tensor = get_var_with_recursion( var_name, block, self.auto_parallel_main_prog ) + + def is_grad(name): + return name.endswith('GRAD') + + # all op that generate grad is marked as OpRole.Backward + op_role = ( + OpRole.Backward + if is_optimize_op(reshard_op) and is_grad(var_name) + else reshard_op.attr('op_role') + ) + for op_desc in op_desc_list: - if isinstance(op_desc, AllGatherOpDesc): # noqa: F401 + if isinstance(op_desc, AllGatherOpDesc): if var_name not in self.has_allgather.keys(): self.has_allgather[var_name] = [] if not self.has_allgather[var_name] or op_desc.group not in [ @@ -1799,7 +1810,7 @@ def parse_op_desc( block, idx, source_tensor, - reshard_op.attr('op_role'), + op_role, paddle.int64, ) tensor_list, idx_offset = Inserter.insert_allgather_op( @@ -1807,7 +1818,7 @@ def parse_op_desc( idx + 1, out_cast, op_desc.group, - reshard_op.attr('op_role'), + op_role, ) idx += idx_offset tensor_name_list = [] @@ -1816,7 +1827,7 @@ def parse_op_desc( block, idx, var, - reshard_op.attr('op_role'), + op_role, paddle.bool, ) tensor_name_list.append(out_cast.name) @@ -1830,7 +1841,7 @@ def parse_op_desc( idx, source_tensor, op_desc.group, - reshard_op.attr('op_role'), + op_role, ) idx += idx_offset tensor_name_list = [var.name for var in tensor_list] @@ -1862,7 +1873,7 @@ def parse_op_desc( block, idx, source_tensor, - reshard_op.attr('op_role'), + op_role, paddle.int64, ) Inserter.insert_send_op( @@ -1871,7 +1882,7 @@ def parse_op_desc( out_cast, op_desc.src, op_desc.dst, - reshard_op.attr('op_role'), + op_role, ) idx += 2 else: @@ -1881,7 +1892,7 @@ def parse_op_desc( source_tensor, op_desc.src, op_desc.dst, - reshard_op.attr('op_role'), + op_role, ) idx += 1 self.has_sent[var_name].append(op_desc.dst) @@ -1909,13 +1920,13 @@ def parse_op_desc( recv_tensor, op_desc.src, op_desc.dst, - reshard_op.attr('op_role'), + op_role, ) out_cast = Inserter.insert_cast_op( block, idx + 1, recv_tensor, - reshard_op.attr('op_role'), + op_role, paddle.bool, ) tensor_list.append(out_cast) @@ -1935,7 +1946,7 @@ def parse_op_desc( recv_tensor, op_desc.src, op_desc.dst, - reshard_op.attr('op_role'), + op_role, ) # for lod tensor, need reset lod after received @@ -1958,7 +1969,7 @@ def parse_op_desc( idx + 1, recv_tensor, tmp_var, - reshard_op.attr('op_role'), + op_role, ) ) 
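+                    # (illustrative) a recv/cast inserted for a var such as
+                    # loss@GRAD while resharding an optimize-role op is now
+                    # tagged OpRole.Backward, so passes that split the program
+                    # by role keep gradient communication on the backward side.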
tensor_list.append(reset_lod_out) @@ -1988,7 +1999,7 @@ def parse_op_desc( partition_index_list[index], block, idx_list, - reshard_op.attr('op_role'), + op_role, ) idx = idx_list[0] @@ -2013,7 +2024,7 @@ def parse_op_desc( ends=op_desc.ends, axes=op_desc.axes, new_var_name=new_name, - op_role=reshard_op.attr('op_role'), + op_role=op_role, ) else: target_tensor = Inserter.insert_c_concat_op( @@ -2021,7 +2032,7 @@ def parse_op_desc( idx, source_tensor, op_desc.group, - reshard_op.attr('op_role'), + op_role, ) assert target_tensor is not None diff --git a/python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py b/python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py index 6a3365eff018b4..6f1c26e5f235c4 100644 --- a/python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py +++ b/python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py @@ -38,10 +38,7 @@ new_process_group, ) from paddle.distributed.auto_parallel.static.reshard import Resharder -from paddle.distributed.auto_parallel.static.utils import ( - debug_program, - set_grad_var_shape, -) +from paddle.distributed.auto_parallel.static.utils import debug_program from paddle.distributed.passes import PassContext, new_pass from paddle.static import append_backward, program_guard from paddle.utils import unique_name @@ -353,8 +350,6 @@ def _apply_optimization(self, trial): ) completer.complete_update_annotation(dist_main_prog) - # Do reshard process - set_grad_var_shape(dist_main_prog, dist_context) resharder = Resharder( dist_main_prog, dist_startup_prog, diff --git a/python/paddle/distributed/auto_parallel/static/utils.py b/python/paddle/distributed/auto_parallel/static/utils.py index fac4df3d451446..da57e126058a56 100644 --- a/python/paddle/distributed/auto_parallel/static/utils.py +++ b/python/paddle/distributed/auto_parallel/static/utils.py @@ -1205,149 +1205,6 @@ def _get_split_indices( return split_indices_list -def set_grad_var_shape(program, dist_context): - from paddle.distributed.fleet.meta_optimizers.common import OpRole - - from .operators.common import infer_shape - - block = program.global_block() - vars = block.vars - appended_grad_times = 0 - grad_var_to_var = dist_context.dist_op_context.grad_var_to_var - - for idx, op in enumerate(block.ops): - if int(op.attr('op_role')) != int(OpRole.Backward): - continue - - if ( - int(block.ops[idx - 1].attr('op_role')) == int(OpRole.Forward) - or int(block.ops[idx - 1].attr('op_role')) == 257 - ): - appended_grad_times += 1 - - if op.type in ["check_finite_and_unscale", "update_loss_scaling"]: - break - - if op.type in ["sum", "concat", "shape"]: - continue - - op_dist_attr = dist_context.get_op_dist_attr_for_program(op) - assert op_dist_attr is not None - - for var_name in op.output_arg_names: - if "@GRAD" not in var_name: - continue - if var_name in grad_var_to_var[appended_grad_times]: - forward_var_name = grad_var_to_var[appended_grad_times][ - var_name - ] - else: - forward_var_name = var_name[: var_name.find("@GRAD")] - - if op.type in [ - "c_allreduce_sum", - "c_identity", - "scale", - "cast", - "fill_any_like", - ]: - forward_var_name = op.input_arg_names[0] - elif ( - op.type == "matmul_v2_grad" - or op.type == "matmul_grad" - or op.type == "mul_grad" - ): - forward_var_name = None - for output_name in op.output_names: - if var_name in op.output(output_name): - assert "@GRAD" in output_name - input_name = output_name[: output_name.find("@GRAD")] - assert len(op.input(input_name)) == 1 - forward_var_name = 
op.input(input_name)[0] - assert forward_var_name is not None - - need_set_shape_list = [ - "reshape2_grad", - "softmax_with_cross_entropy_grad", - "transpose2_grad", - "softmax_grad", - "cross_entropy_grad2", - "dropout_grad", - "tanh_grad", - "slice", - "assign", - "matmul_v2_triple_grad", - "elementwise_add_triple_grad", - "fill_constant", - "sqrt_grad", - "fused_softmax_mask_upper_triangle_grad", - "flatten_contiguous_range_grad", - "relu_grad", - "exp_grad", - "sigmoid_grad", - "unsqueeze2_grad", - "fused_dropout_add_grad", - ] - forward_list = [ - "reshape2", - "softmax_with_cross_entropy", - "transpose2", - "softmax", - "cross_entropy2", - "dropout", - "tanh", - ["slice_grad", "c_allgather"], - "assign", - "matmul_v2_grad_grad", - "elementwise_add_grad_grad", - "shape", - "sqrt", - "fused_softmax_mask_upper_triangle", - "flatten_contiguous_range", - "relu", - "exp", - "sigmoid", - "unsqueeze2", - "fused_dropout_add", - ] - if op.type in need_set_shape_list: - for forward_op in block.ops: - idx = need_set_shape_list.index(op.type) - forward_op_name = forward_list[idx] - if ( - forward_op.type in forward_op_name - and forward_var_name in forward_op.input_arg_names - ): - op_dist_attr = ( - dist_context.get_op_dist_attr_for_program( - forward_op - ) - ) - break - - forward_input_dist_attr = op_dist_attr.get_input_dist_attr( - forward_var_name - ) - assert ( - forward_input_dist_attr is not None - ), f"{forward_var_name, str(op)}" - forward_var = vars[forward_var_name] - forward_var_dist_attr = ( - dist_context.get_tensor_dist_attr_for_program(forward_var) - ) - assert forward_var_dist_attr is not None - grad_var = vars[var_name] - ref_shape = infer_shape( - block, - forward_var, - forward_var_dist_attr, - forward_input_dist_attr, - ) - - if list(grad_var.shape) != ref_shape: - grad_var.desc.set_shape(ref_shape) - - def is_forward_op(op): op_role = int(op.attr('op_role')) return OP_ROLE_KEY in op.attr_names and ( diff --git a/python/paddle/distributed/auto_tuner/cost_model.py b/python/paddle/distributed/auto_tuner/cost_model.py new file mode 100644 index 00000000000000..53a4fdae793fab --- /dev/null +++ b/python/paddle/distributed/auto_tuner/cost_model.py @@ -0,0 +1,143 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def all_params(mp, pp, sharding, h, l, V): + # TODO: TBD - add some fixed structure models. + return 1 + + +def full_recompute_acts(mp, pp, s, b, h, l): + # TODO: TBD - add some fixed structure models. + return 1 + + +def all_acts(mp, pp, s, b, h, l, a): + # TODO: TBD - add some fixed structure models. 
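+    # (illustrative sketch, not the shipped estimate) a common fp16
+    # activation-memory model, after Korthikanti et al. 2022, is roughly
+    #     bytes_per_layer = s * b * h * (34 + 5 * a * s / h)
+    # summed over the l // pp local layers and split across mp ranks, e.g.
+    #     return s * b * h * (34 + 5 * a * s / h) * (l // pp) / mp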
+    return 1
+
+
+def to_gb(p):
+    return p / (2**30)
+
+
+def get_mem(total_cards, parallel_cfg, l, h, a, V, s, gbs):
+    """Estimate the memory of the model under a parallel strategy."""
+    sharding = parallel_cfg["sharding_degree"]
+    mp = parallel_cfg["mp_degree"]
+    b = parallel_cfg["micro_batch_size"]
+    pp = parallel_cfg["pp_degree"]
+    vpp = parallel_cfg["vpp_degree"]
+    use_recompute = parallel_cfg["use_recompute"]
+
+    sep = 1
+
+    lbs = int(gbs / sharding / s)
+    lbs = int(lbs / pp) * pp
+    assert s % sep == 0
+    s_sep = s // sep
+    assert a % (sep * mp) == 0, f'{a} vs {sep * mp}'
+
+    vpp_ratio = 1
+    if vpp > 1:
+        assert l % (pp * vpp) == 0
+        vpp_ratio = 1 + (pp - 1) / (pp * vpp)
+
+    params = to_gb(all_params(mp, pp, sharding, h, l, V))
+
+    acts = 0
+    assert l % pp == 0
+
+    if use_recompute:
+        acts = to_gb(full_recompute_acts(mp, pp, s_sep, b, h, l)) * vpp_ratio
+    else:
+        acts = to_gb(all_acts(mp, pp, s, b, h, l, a)) * vpp_ratio
+    assert acts > 0
+
+    peak_mem = params + acts
+    return peak_mem
+
+
+def divisor(num, reverse=False):
+    """Get all divisors of a given number."""
+    results = set()
+    i = 1
+    mid = num // 2 + 1
+    while i < mid:
+        if num % i == 0:
+            results.add(i)
+            results.add(num // i)
+        i += 1
+    results = list(results)
+    return sorted(results, reverse=reverse)
+
+
+def get_not_oom_cfgs(cfgs, tuner_cfg):
+    """Get the parallel strategies that are not expected to OOM."""
+    total_cards, l, h, a, V, s, gbs, per_card_memory = (
+        tuner_cfg["estimated_num_gpus"],
+        tuner_cfg["model_cfg"]["num_layers"],
+        tuner_cfg["model_cfg"]["hidden_size"],
+        tuner_cfg["model_cfg"]["num_attention_heads"],
+        tuner_cfg["model_cfg"]["vocab_size"],
+        tuner_cfg["model_cfg"]["seq_length"],
+        tuner_cfg["model_cfg"]["global_batch_size"],
+        tuner_cfg.get("per_card_memory", 80),
+    )
+    pruned_cfgs = []
+    for cfg in cfgs:
+        mp = cfg["mp_degree"]
+        sharding = cfg["sharding_degree"]
+        mbs = cfg["micro_batch_size"]
+        pp = cfg["pp_degree"]
+        vpp = cfg["vpp_degree"]
+        dp = cfg["dp_degree"]
+        use_recompute = cfg["use_recompute"]
+
+        if mp * sharding * pp * dp != total_cards:
+            continue
+        if gbs % sharding != 0:
+            continue
+        if gbs // sharding % dp != 0:
+            continue
+        if gbs // sharding // dp % mbs != 0:
+            continue
+        if l % pp != 0:
+            continue
+        if l // pp % vpp != 0:
+            continue
+        if vpp != 1 and pp <= 2:
+            continue
+        if a % mp != 0 or V % mp != 0 or h % mp != 0:
+            continue
+
+        pruned_cfgs.append(cfg)
+    valid_cfgs = []
+    for cfg in pruned_cfgs:
+        mem = get_mem(total_cards, cfg, l, h, a, V, s, gbs)
+        # TODO: Uncomment when it is actually implemented.
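+        # The disabled filter below would keep only configs whose estimated
+        # memory lies between threshold * per_card_memory and per_card_memory.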
+ # if ( + # mem < per_card_memory + # and mem + # > tuner_cfg.get( + # "search_algo", {"name": "dp_estimation", "threshold": 0.7} + # ).get("threshold", 0.7) + # * per_card_memory + # ): + # cfg["memory_cost"] = mem + # valid_cfgs.append(cfg) + cfg["memory_cost"] = mem + valid_cfgs.append(cfg) + assert valid_cfgs + return valid_cfgs diff --git a/python/paddle/distributed/auto_tuner/search.py b/python/paddle/distributed/auto_tuner/search.py index 0e0114a5249f08..b788e538581f12 100644 --- a/python/paddle/distributed/auto_tuner/search.py +++ b/python/paddle/distributed/auto_tuner/search.py @@ -16,7 +16,7 @@ from abc import ABC, abstractmethod from .prune import _PRUNE_FUNC -from .utils import gbs_search_all, search_all +from .utils import gbs_search_all, search_all, search_by_dp_estimation class SearchAlgo(ABC): @@ -54,6 +54,34 @@ def search_once(self, history_cfgs): return new_cfg +class DpEstimationSearch(SearchAlgo): + def __init__(self, tuner_cfg): + super().__init__(tuner_cfg) + self.idx = 0 + self.all_tasks = search_by_dp_estimation(tuner_cfg) + assert len(self.all_tasks) > 0, "Unable to perform this search." + # change global_batch_size and dp_degree + tuner_cfg["model_cfg"]["global_batch_size"] = ( + tuner_cfg["model_cfg"]["global_batch_size"] + // self.all_tasks[0]["dp_degree"] + ) + for task in self.all_tasks: + task["estimated_dp_degree"] = task["dp_degree"] + task["dp_degree"] = 1 + + def search_once(self, history_cfgs): + new_cfg = None + stop = False + while not stop: + if self.idx < len(self.all_tasks): + new_cfg = self.all_tasks[self.idx] + self.idx += 1 + stop = not self.prune(self.tuner_cfg, new_cfg, history_cfgs) + else: + return None + return new_cfg + + class GBSSearch(SearchAlgo): def __init__(self, tuner_cfg): super().__init__(tuner_cfg) diff --git a/python/paddle/distributed/auto_tuner/tuner.py b/python/paddle/distributed/auto_tuner/tuner.py index bdc6bed5c6a085..9e693fcc3874f9 100644 --- a/python/paddle/distributed/auto_tuner/tuner.py +++ b/python/paddle/distributed/auto_tuner/tuner.py @@ -29,13 +29,18 @@ def __init__(self, tuner_cfg): self.cur_task_id = 1 self.task_limit = tuner_cfg.get("task_limit", 100) - search_algo = tuner_cfg.get("search_algo", "grid") + search_algo = tuner_cfg.get("search_algo", {"name": "grid"})["name"] if search_algo == "grid": from .search import GridSearch tuner_cfg["candidates"] = default_candidates(tuner_cfg) self.algo = GridSearch(tuner_cfg) + elif search_algo == "dp_estimation": + from .search import DpEstimationSearch + + tuner_cfg["candidates"] = default_candidates(tuner_cfg) + self.algo = DpEstimationSearch(tuner_cfg) elif search_algo == "gbs": from .search import GBSSearch diff --git a/python/paddle/distributed/auto_tuner/utils.py b/python/paddle/distributed/auto_tuner/utils.py index 3f2dcf45fcd854..2928908750d923 100644 --- a/python/paddle/distributed/auto_tuner/utils.py +++ b/python/paddle/distributed/auto_tuner/utils.py @@ -112,8 +112,16 @@ def dist_degree(mode, num_gpus, num_nodes, tuner_cfg=None): def default_candidates(tuner_cfg): """Return the default candidates of every hyper param which user defined auto""" candidates = {} - num_gpus = tuner_cfg["num_gpus"] - num_nodes = tuner_cfg["nodes"] + num_gpus = ( + tuner_cfg["num_gpus"] + if "estimated_num_gpus" not in tuner_cfg + else tuner_cfg["estimated_num_gpus"] + ) + num_nodes = ( + tuner_cfg["nodes"] + if "estimated_num_gpus" not in tuner_cfg + else tuner_cfg["estimated_num_gpus"] // 8 + ) assert num_gpus > 0 if tuner_cfg.get("dp_degree", None) == "auto": @@ -210,7 +218,11 
@@ def search_all(tuner_cfg): use_recompute_candidates = candidates["use_recompute"] recompute_granularity_candidates = candidates["recompute_granularity"] - num_gpus = tuner_cfg["num_gpus"] + num_gpus = ( + tuner_cfg["num_gpus"] + if "estimated_num_gpus" not in tuner_cfg + else tuner_cfg["estimated_num_gpus"] + ) valid_degrees = [] for mp_degree in mp_degree_candidates: @@ -294,6 +306,22 @@ def search_all(tuner_cfg): return new_all_cfgs +def search_by_dp_estimation(tuner_cfg): + from .cost_model import get_not_oom_cfgs + + all_cfgs = search_all(tuner_cfg) + not_oom_cfgs = get_not_oom_cfgs(all_cfgs, tuner_cfg) + num_gpus_per_dp_degree = tuner_cfg["num_gpus"] + estimated_dp_degree = ( + tuner_cfg["estimated_num_gpus"] // num_gpus_per_dp_degree + ) + result_cfgs = [] + for cfg in not_oom_cfgs: + if cfg["dp_degree"] == estimated_dp_degree: + result_cfgs.append(cfg) + return result_cfgs + + def gen_new_args(raw_args, cfg, tuner_cfg, run_best=False): """Generate new script args.""" @@ -309,6 +337,9 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): import json file_path = cmd[arg][0] + prefix = "" + if len(cmd[arg]) >= 3: + prefix = cmd[arg][2] try: with open(file_path, "r") as f: cmd_cfg = json.load(f) @@ -317,14 +348,28 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): "Please check your auto tuner json whether valid." ) keys = cmd[arg][1].split(".") + value = None for key in keys[: len(keys) - 1]: - cmd_cfg = cmd_cfg[key] - cmd_cfg[keys[-1]] = cfg[arg] + if not value: + value = cmd_cfg[key] + else: + value = value[key] + if value: + value[keys[-1]] = ( + prefix + str(cfg[arg]) if prefix else cfg[arg] + ) + else: + cmd_cfg[keys[-1]] = ( + prefix + str(cfg[arg]) if prefix else cfg[arg] + ) json.dump(cmd_cfg, open(cmd[arg][0], "w")) elif ".yaml" in cmd[arg][0]: import yaml file_path = cmd[arg][0] + prefix = "" + if len(cmd[arg]) >= 3: + prefix = cmd[arg][2] try: with open(file_path, "r") as f: cmd_cfg = yaml.safe_load(f) @@ -333,9 +378,20 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): "Please check your auto tuner json whether valid." ) keys = cmd[arg][1].split(".") + value = None for key in keys[: len(keys) - 1]: - cmd_cfg = cmd_cfg[key] - cmd_cfg[keys[-1]] = cfg[arg] + if not value: + value = cmd_cfg[key] + else: + value = value[key] + if value: + value[keys[-1]] = ( + prefix + str(cfg[arg]) if prefix else cfg[arg] + ) + else: + cmd_cfg[keys[-1]] = ( + prefix + str(cfg[arg]) if prefix else cfg[arg] + ) yaml.dump(cmd_cfg, open(cmd[arg][0], "w")) elif arg == "local_batch_size" and arg in cmd: local_batch_size = ( @@ -357,6 +413,9 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): import json file_path = cmd[arg][0] + prefix = "" + if len(cmd[arg]) >= 3: + prefix = cmd[arg][2] try: with open(file_path, "r") as f: cmd_cfg = json.load(f) @@ -365,14 +424,32 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): "Please check your auto tuner json whether valid." 
) keys = cmd[arg][1].split(".") + value = None for key in keys[: len(keys) - 1]: - cmd_cfg = cmd_cfg[key] - cmd_cfg[keys[-1]] = local_batch_size + if not value: + value = cmd_cfg[key] + else: + value = value[key] + if value: + value[keys[-1]] = ( + prefix + str(local_batch_size) + if prefix + else local_batch_size + ) + else: + cmd_cfg[keys[-1]] = ( + prefix + str(local_batch_size) + if prefix + else local_batch_size + ) json.dump(cmd_cfg, open(cmd[arg][0], "w")) elif ".yaml" in cmd[arg][0]: import yaml file_path = cmd[arg][0] + prefix = "" + if len(cmd[arg]) >= 3: + prefix = cmd[arg][2] try: with open(file_path, "r") as f: cmd_cfg = yaml.safe_load(f) @@ -381,9 +458,24 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): "Please check your auto tuner json whether valid." ) keys = cmd[arg][1].split(".") + value = None for key in keys[: len(keys) - 1]: - cmd_cfg = cmd_cfg[key] - cmd_cfg[keys[-1]] = local_batch_size + if not value: + value = cmd_cfg[key] + else: + value = value[key] + if value: + value[keys[-1]] = ( + prefix + str(local_batch_size) + if prefix + else local_batch_size + ) + else: + cmd_cfg[keys[-1]] = ( + prefix + str(local_batch_size) + if prefix + else local_batch_size + ) yaml.dump(cmd_cfg, open(cmd[arg][0], "w")) elif arg == "gradient_accumulation_steps" and arg in cmd: @@ -413,6 +505,9 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): import json file_path = cmd[arg][0] + prefix = "" + if len(cmd[arg]) >= 3: + prefix = cmd[arg][2] try: with open(file_path, "r") as f: cmd_cfg = json.load(f) @@ -421,14 +516,32 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): "Please check your auto tuner json whether valid." ) keys = cmd[arg][1].split(".") + value = None for key in keys[: len(keys) - 1]: - cmd_cfg = cmd_cfg[key] - cmd_cfg[keys[-1]] = gradient_accumulation_steps + if not value: + value = cmd_cfg[key] + else: + value = value[key] + if value: + value[keys[-1]] = ( + prefix + str(gradient_accumulation_steps) + if prefix + else gradient_accumulation_steps + ) + else: + cmd_cfg[keys[-1]] = ( + prefix + str(gradient_accumulation_steps) + if prefix + else gradient_accumulation_steps + ) json.dump(cmd_cfg, open(cmd[arg][0], "w")) elif ".yaml" in cmd[arg][0]: import yaml file_path = cmd[arg][0] + prefix = "" + if len(cmd[arg]) >= 3: + prefix = cmd[arg][2] try: with open(file_path, "r") as f: cmd_cfg = yaml.safe_load(f) @@ -437,9 +550,24 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): "Please check your auto tuner json whether valid." ) keys = cmd[arg][1].split(".") + value = None for key in keys[: len(keys) - 1]: - cmd_cfg = cmd_cfg[key] - cmd_cfg[keys[-1]] = gradient_accumulation_steps + if not value: + value = cmd_cfg[key] + else: + value = value[key] + if value: + value[keys[-1]] = ( + prefix + str(gradient_accumulation_steps) + if prefix + else gradient_accumulation_steps + ) + else: + cmd_cfg[keys[-1]] = ( + prefix + str(gradient_accumulation_steps) + if prefix + else gradient_accumulation_steps + ) yaml.dump(cmd_cfg, open(cmd[arg][0], "w")) assert "run_cmd" in tuner_cfg @@ -477,9 +605,16 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): "Please check your auto tuner json whether valid." 
) keys = cmd[arg][1].split(".") + value = None for key in keys[: len(keys) - 1]: - cmd_cfg = cmd_cfg[key] - cmd_cfg[keys[-1]] = cmd[arg][2] + if value: + value = value[key] + else: + value = cmd_cfg[key] + if value: + value[keys[-1]] = cmd[arg][2] + else: + cmd_cfg[keys[-1]] = cmd[arg][2] json.dump(cmd_cfg, open(cmd[arg][0], "w")) elif ".yaml" in cmd[arg][0]: import yaml @@ -493,9 +628,16 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): "Please check your auto tuner json whether valid." ) keys = cmd[arg][1].split(".") + value = None for key in keys[: len(keys) - 1]: - cmd_cfg = cmd_cfg[key] - cmd_cfg[keys[-1]] = cmd[arg][2] + if value: + value = cmd_cfg[key] + else: + value = value[key] + if value: + value[keys[-1]] = cmd[arg][2] + else: + cmd_cfg[keys[-1]] = cmd[arg][2] yaml.dump(cmd_cfg, open(cmd[arg][0], "w")) if tuner_cfg["run_cmd"].get("run_best_stage", None) and run_best: @@ -517,9 +659,16 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): "Please check your auto tuner json whether valid." ) keys = cmd[arg][1].split(".") + value = None for key in keys[: len(keys) - 1]: - cmd_cfg = cmd_cfg[key] - cmd_cfg[keys[-1]] = cmd[arg][2] + if value: + value = value[key] + else: + value = cmd_cfg[key] + if value: + value[keys[-1]] = cmd[arg][2] + else: + cmd_cfg[keys[-1]] = cmd[arg][2] json.dump(cmd_cfg, open(cmd[arg][0], "w")) elif ".yaml" in cmd[arg][0]: import yaml @@ -533,9 +682,16 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): "Please check your auto tuner json whether valid." ) keys = cmd[arg][1].split(".") + value = None for key in keys[: len(keys) - 1]: - cmd_cfg = cmd_cfg[key] - cmd_cfg[keys[-1]] = cmd[arg][2] + if value: + value = value[key] + else: + value = cmd_cfg[key] + if value: + value[keys[-1]] = cmd[arg][2] + else: + cmd_cfg[keys[-1]] = cmd[arg][2] yaml.dump(cmd_cfg, open(cmd[arg][0], "w")) return res_args diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 04f92558bdcdae..a2bac699bb5421 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -22,18 +22,20 @@ from paddle.framework import in_dynamic_mode from .communication.group import Group, _add_new_group, is_initialized -from .fleet.layers.mpu.mp_ops import _c_concat # noqa: F401 -from .fleet.layers.mpu.mp_ops import _c_identity # noqa: F401 -from .fleet.layers.mpu.mp_ops import _c_lookup_table # noqa: F401 -from .fleet.layers.mpu.mp_ops import _c_softmax_with_cross_entropy # noqa: F401 -from .fleet.layers.mpu.mp_ops import _c_split # noqa: F401 -from .fleet.layers.mpu.mp_ops import _Linear # noqa: F401 -from .fleet.layers.mpu.mp_ops import _linear # noqa: F401 -from .fleet.layers.mpu.mp_ops import _mp_allreduce # noqa: F401 -from .fleet.layers.mpu.mp_ops import _parallel_embedding # noqa: F401 -from .fleet.layers.mpu.mp_ops import _parallel_linear # noqa: F401 -from .fleet.layers.mpu.mp_ops import _set_var_distributed # noqa: F401 -from .fleet.layers.mpu.mp_ops import split # noqa: F401 +from .fleet.layers.mpu.mp_ops import ( # noqa: F401 + _c_concat, + _c_identity, + _c_lookup_table, + _c_softmax_with_cross_entropy, + _c_split, + _Linear, + _linear, + _mp_allreduce, + _parallel_embedding, + _parallel_linear, + _set_var_distributed, + split, +) __all__ = [] diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 0cda5198ab3c9f..a77e61bc2b4019 100755 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ 
-13,30 +13,34 @@ # limitations under the License. # TODO: define distributed api under this directory, -from .base.role_maker import Role # noqa: F401 -from .base.role_maker import UserDefinedRoleMaker # noqa: F401 -from .base.role_maker import PaddleCloudRoleMaker # noqa: F401 -from .base.distributed_strategy import DistributedStrategy # noqa: F401 -from .base.util_factory import UtilBase # noqa: F401 -from .dataset import DatasetBase # noqa: F401 -from .dataset import InMemoryDataset # noqa: F401 -from .dataset import QueueDataset # noqa: F401 -from .dataset import FileInstantDataset # noqa: F401 -from .dataset import BoxPSDataset # noqa: F401 -from .data_generator.data_generator import MultiSlotDataGenerator # noqa: F401 +from .base.role_maker import ( + Role, + UserDefinedRoleMaker, + PaddleCloudRoleMaker, +) +from .base.distributed_strategy import DistributedStrategy +from .base.util_factory import UtilBase +from .dataset import ( # noqa: F401 + DatasetBase, + InMemoryDataset, + QueueDataset, + FileInstantDataset, + BoxPSDataset, +) +from .data_generator.data_generator import MultiSlotDataGenerator from .data_generator.data_generator import ( MultiSlotStringDataGenerator, -) # noqa: F401 +) from . import metrics # noqa: F401 from .base.topology import CommunicateTopology -from .base.topology import HybridCommunicateGroup # noqa: F401 +from .base.topology import HybridCommunicateGroup from .fleet import Fleet from .model import distributed_model from .optimizer import distributed_optimizer from .scaler import distributed_scaler from .utils import log_util -__all__ = [ # noqa +__all__ = [ "CommunicateTopology", "UtilBase", "HybridCommunicateGroup", @@ -103,4 +107,4 @@ get_log_level_code = log_util.get_log_level_code get_log_level_name = log_util.get_log_level_name save_cache_table = fleet.save_cache_table -from .. import auto_parallel as auto +from .. import auto_parallel as auto # noqa: F401 diff --git a/python/paddle/distributed/fleet/base/private_helper_function.py b/python/paddle/distributed/fleet/base/private_helper_function.py index c5199eb46a7475..0da733c0f24c65 100644 --- a/python/paddle/distributed/fleet/base/private_helper_function.py +++ b/python/paddle/distributed/fleet/base/private_helper_function.py @@ -16,6 +16,8 @@ import time from contextlib import closing +import paddle + __all__ = [] @@ -33,6 +35,15 @@ def wait_server_ready(endpoints): >>> wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) """ + try: + use_new_comm = paddle.get_flags("FLAGS_dynamic_static_unified_comm")[ + "FLAGS_dynamic_static_unified_comm" + ] + except: + use_new_comm = False + + if use_new_comm: + return assert not isinstance(endpoints, str) while True: all_ok = True diff --git a/python/paddle/distributed/fleet/dataset/__init__.py b/python/paddle/distributed/fleet/dataset/__init__.py index 55b944abccd51c..589d315c5b2345 100644 --- a/python/paddle/distributed/fleet/dataset/__init__.py +++ b/python/paddle/distributed/fleet/dataset/__init__.py @@ -11,11 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and -from .dataset import DatasetBase # noqa: F401 -from .dataset import InMemoryDataset # noqa: F401 -from .dataset import QueueDataset # noqa: F401 -from .dataset import FileInstantDataset # noqa: F401 -from .dataset import BoxPSDataset # noqa: F401 +from .dataset import ( # noqa: F401 + BoxPSDataset, + DatasetBase, + FileInstantDataset, + InMemoryDataset, + QueueDataset, +) from .index_dataset import TreeIndex # noqa: F401 __all__ = [] diff --git a/python/paddle/distributed/fleet/elastic/manager.py b/python/paddle/distributed/fleet/elastic/manager.py index 6c3810f7aae746..153d2447abe1d5 100644 --- a/python/paddle/distributed/fleet/elastic/manager.py +++ b/python/paddle/distributed/fleet/elastic/manager.py @@ -494,9 +494,9 @@ def _update_elastic_scale_out(self): if curr_host_port not in host_endpoints: host_endpoints.append(curr_host_port) - os.environ[ - 'PADDLE_TRAINER_ID' - ] = f'{host_endpoints.index(self.curr_host)}' + os.environ['PADDLE_TRAINER_ID'] = str( + host_endpoints.index(self.curr_host) + ) hosts = ','.join( [host_port.split(":")[0] for host_port in host_endpoints] ) @@ -547,9 +547,9 @@ def _update_elastic_scale_in(self): ) self.args.ips = hosts - os.environ[ - 'PADDLE_TRAINER_ID' - ] = f'{sorted_endpoints.index(self.curr_host)}' + os.environ['PADDLE_TRAINER_ID'] = str( + sorted_endpoints.index(self.curr_host) + ) os.environ['PADDLE_TRAINERS'] = hosts self.np = len(sorted_endpoints) os.environ['PADDLE_TRAINER_ENDPOINTS'] = ','.join(sorted_endpoints) diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py index eee2ae02c9c88b..f18f7aeb068761 100755 --- a/python/paddle/distributed/fleet/fleet.py +++ b/python/paddle/distributed/fleet/fleet.py @@ -105,54 +105,55 @@ class Fleet: Returns: Fleet: A Fleet instance - + Examples: .. code-block:: python :name: code-example1 - # Example1: for collective training - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet + >>> # Example1: for collective training + >>> import paddle + >>> paddle.enable_static() + >>> import paddle.distributed.fleet as fleet - fleet.init(is_collective=True) + >>> fleet.init(is_collective=True) - strategy = fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + >>> strategy = fleet.DistributedStrategy() + >>> linear = paddle.nn.Linear(10, 10) + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001, parameters=linear.parameters()) + >>> optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - # do distributed training + >>> # do distributed training .. code-block:: python :name: code-example2 - # Example2: for parameter server training - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - fleet.init(strategy=strategy) + >>> # Example2: for parameter server training + >>> import paddle + >>> paddle.enable_static() + >>> import paddle.distributed.fleet as fleet + >>> strategy = fleet.DistributedStrategy() + >>> fleet.init(strategy=strategy) - optimizer = paddle.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer) + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001) + >>> optimizer = fleet.distributed_optimizer(optimizer) - if fleet.is_first_worker(): - print("this is first worker") + >>> if fleet.is_first_worker(): + ... 
print("this is first worker") - print("current node index: {}".format(fleet.worker_index())) - print("total number of worker num: {}".format(fleet.worker_num())) + >>> print("current node index: {}".format(fleet.worker_index())) + >>> print("total number of worker num: {}".format(fleet.worker_num())) - if fleet.is_worker(): - print("this is worker") - print("worker endpoints: {}".format(fleet.worker_endpoints(to_string=True))) + >>> if fleet.is_worker(): + ... print("this is worker") + >>> print("worker endpoints: {}".format(fleet.worker_endpoints(to_string=True))) - print("server num: {}".format(fleet.server_num())) - print("server endpoints: {}".format(fleet.server_endpoints(to_string=True))) + >>> print("server num: {}".format(fleet.server_num())) + >>> print("server endpoints: {}".format(fleet.server_endpoints(to_string=True))) - if fleet.is_server(): - print("this is server") - fleet.stop_worker() + >>> if fleet.is_server(): + ... print("this is server") + >>> fleet.stop_worker() """ @@ -202,37 +203,37 @@ def init( .. code-block:: python :name: code-example1 - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() .. code-block:: python :name: code-example2 - import paddle.distributed.fleet as fleet - fleet.init(is_collective=True) + >>> import paddle.distributed.fleet as fleet + >>> fleet.init(is_collective=True) .. code-block:: python :name: code-example3 - import paddle.distributed.fleet as fleet - role = fleet.PaddleCloudRoleMaker() - fleet.init(role) + >>> import paddle.distributed.fleet as fleet + >>> role = fleet.PaddleCloudRoleMaker() + >>> fleet.init(role) .. code-block:: python :name: code-example4 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - fleet.init(strategy=strategy) + >>> import paddle.distributed.fleet as fleet + >>> strategy = fleet.DistributedStrategy() + >>> fleet.init(strategy=strategy) .. code-block:: python :name: code-example5 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - fleet.init(log_level = "DEBUG") + >>> import paddle.distributed.fleet as fleet + >>> strategy = fleet.DistributedStrategy() + >>> fleet.init(log_level = "DEBUG") """ from paddle.distributed import parallel_helper @@ -454,9 +455,9 @@ def is_first_worker(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.is_first_worker() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.is_first_worker() """ return self._role_maker._is_first_worker() @@ -472,9 +473,9 @@ def worker_index(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.worker_index() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.worker_index() """ return self._role_maker._worker_index() @@ -490,9 +491,9 @@ def worker_num(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.worker_num() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.worker_num() """ return self._role_maker._worker_num() @@ -521,9 +522,9 @@ def is_worker(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.is_worker() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.is_worker() """ return self._role_maker._is_worker() @@ -542,9 +543,9 @@ def worker_endpoints(self, to_string=False): .. 
code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.worker_endpoints() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.worker_endpoints() """ if to_string: @@ -563,9 +564,9 @@ def server_num(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.server_num() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.server_num() """ return len(self._role_maker._get_pserver_endpoints()) @@ -580,9 +581,9 @@ def server_index(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.server_index() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.server_index() """ return self._role_maker._server_index() @@ -598,9 +599,9 @@ def server_endpoints(self, to_string=False): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.server_endpoints() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.server_endpoints() """ @@ -621,9 +622,9 @@ def is_server(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.is_server() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.is_server() """ return self._role_maker._is_server() @@ -639,9 +640,9 @@ def barrier_worker(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.barrier_worker() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.barrier_worker() """ self._role_maker._barrier("worker") @@ -659,13 +660,13 @@ def init_worker(self, scopes=None): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.init_worker() + >>> fleet.init_worker() """ self._runtime_handle._init_worker(scopes) @@ -704,13 +705,13 @@ def init_server(self, *args, **kwargs): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.init_server() + >>> fleet.init_server() """ self._runtime_handle._init_server(*args, **kwargs) @@ -729,13 +730,13 @@ def load_model(self, path, mode): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.load_model("path", mode=0) + >>> fleet.load_model("path", mode=0) """ self._runtime_handle._load_persistables(path, mode) @@ -754,13 +755,13 @@ def load_one_table(self, table_id, path, mode): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.load_one_table(0, "path", mode=0) + >>> fleet.load_one_table(0, "path", mode=0) """ self._runtime_handle._load_one_table(table_id, path, mode) @@ -779,13 +780,13 @@ def load_inference_model(self, path, mode): .. 
code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.load_inference_model("path", mode=1) + >>> fleet.load_inference_model("path", mode=1) """ self._runtime_handle._load_inference_model(path, mode) @@ -803,14 +804,14 @@ def run_server(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - if fleet.is_server(): - fleet.init_server() + >>> if fleet.is_server(): + ... fleet.init_server() """ self._runtime_handle._run_server() @@ -828,13 +829,13 @@ def stop_worker(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.init_server() + >>> fleet.init_server() """ self._runtime_handle._stop_worker() @@ -908,13 +909,13 @@ def save_inference_model( .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.init_server() + >>> fleet.init_server() """ @@ -958,17 +959,17 @@ def save_persistables(self, executor, dirname, main_program=None, mode=0): .. code-block:: text - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet + >>> import paddle + >>> paddle.enable_static() + >>> import paddle.distributed.fleet as fleet - fleet.init() + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - exe = paddle.static.Executor(paddle.CPUPlace()) - fleet.save_persistables(exe, "dirname", paddle.static.default_main_program()) + >>> exe = paddle.static.Executor(paddle.CPUPlace()) + >>> fleet.save_persistables(exe, "dirname", paddle.static.default_main_program()) """ self._runtime_handle._save_persistables( @@ -1008,13 +1009,13 @@ def save_one_table(self, table_id, path, mode): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.save_one_table(0, "path", mode=0) + >>> fleet.save_one_table(0, "path", mode=0) """ self._runtime_handle._save_one_table(table_id, path, mode) @@ -1035,16 +1036,16 @@ def save_dense_params( .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - import paddle - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> import paddle + >>> place = paddle.CPUPlace() + >>> exe = paddle.static.Executor(place) - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) 
- fleet.save_dense_params(exe, "path", scope=paddle.static.global_scope(), program=paddle.static.default_main_program()) + >>> fleet.save_dense_params(exe, "path", scope=paddle.static.global_scope(), program=paddle.static.default_main_program()) """ self._runtime_handle._save_dense_params( @@ -1078,12 +1079,13 @@ def distributed_optimizer(self, optimizer, strategy=None): .. code-block:: python - import paddle - import paddle.distributed.fleet as fleet - fleet.init(is_collective=True) - strategy = fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + >>> import paddle + >>> import paddle.distributed.fleet as fleet + >>> fleet.init(is_collective=True) + >>> linear = paddle.nn.Linear(10, 10) + >>> strategy = fleet.DistributedStrategy() + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001, parameters=linear.parameters()) + >>> optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) """ self.user_defined_optimizer = optimizer @@ -1141,46 +1143,46 @@ def amp_init( Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - paddle.enable_static() - - def run_example_code(): - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32') - conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3) - # 1) Use fp16_guard to control the range of fp16 kernels used. - with paddle.static.amp.fp16_guard(): - bn = paddle.static.nn.batch_norm(input=conv2d, act="relu") - pool = F.max_pool2d(bn, kernel_size=2, stride=2) - hidden = paddle.static.nn.fc(pool, size=10) - loss = paddle.mean(hidden) - # 2) Create the optimizer and set `multi_precision` to True. - # Setting `multi_precision` to True can avoid the poor accuracy - # or the slow convergence in a way. - optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True) - # 3) These ops in `custom_black_list` will keep in the float32 computation type. - amp_list = paddle.static.amp.CustomOpLists( - custom_black_list=['pool2d']) - # 4) The entry of Paddle AMP. - # Enable pure fp16 training by setting `use_pure_fp16` to True. - optimizer = paddle.static.amp.decorate( - optimizer, - amp_list, - init_loss_scaling=128.0, - use_dynamic_loss_scaling=True, - use_pure_fp16=True) - # If you don't use the default_startup_program(), you sholud pass - # your defined `startup_program` into `minimize`. - optimizer.minimize(loss) - exe.run(paddle.static.default_startup_program()) - # 5) Use `amp_init` after FP32 parameters initialization(such as `exe.run(startup_program)`). - # If you want to perform the testing process, you should pass `test_program` into `amp_init`. - optimizer.amp_init(place, scope=paddle.static.global_scope()) - - if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0: - run_example_code() + >>> import paddle + >>> import paddle.nn.functional as F + >>> paddle.enable_static() + + >>> def run_example_code(): + ... place = paddle.CUDAPlace(0) + ... exe = paddle.static.Executor(place) + ... data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32') + ... conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3) + ... # 1) Use fp16_guard to control the range of fp16 kernels used. + ... with paddle.static.amp.fp16_guard(): + ... bn = paddle.static.nn.batch_norm(input=conv2d, act="relu") + ... 
pool = F.max_pool2d(bn, kernel_size=2, stride=2)
+            ...         hidden = paddle.static.nn.fc(pool, size=10)
+            ...         loss = paddle.mean(hidden)
+            ...     # 2) Create the optimizer and set `multi_precision` to True.
+            ...     # Setting `multi_precision` to True can help avoid poor accuracy
+            ...     # or slow convergence.
+            ...     optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
+            ...     # 3) These ops in `custom_black_list` will be kept in the float32 computation type.
+            ...     amp_list = paddle.static.amp.CustomOpLists(
+            ...         custom_black_list=['pool2d'])
+            ...     # 4) The entry point of Paddle AMP.
+            ...     # Enable pure fp16 training by setting `use_pure_fp16` to True.
+            ...     optimizer = paddle.static.amp.decorate(
+            ...         optimizer,
+            ...         amp_list,
+            ...         init_loss_scaling=128.0,
+            ...         use_dynamic_loss_scaling=True,
+            ...         use_pure_fp16=True)
+            ...     # If you don't use the default_startup_program(), you should pass
+            ...     # your defined `startup_program` into `minimize`.
+            ...     optimizer.minimize(loss)
+            ...     exe.run(paddle.static.default_startup_program())
+            ...     # 5) Use `amp_init` after FP32 parameters initialization (such as `exe.run(startup_program)`).
+            ...     # If you want to perform the testing process, you should pass `test_program` into `amp_init`.
+            ...     optimizer.amp_init(place, scope=paddle.static.global_scope())
+
+            >>> if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
+            ...     run_example_code()
        """
        amp_optimizer = self._get_amp_optimizer()
        return amp_optimizer.amp_init(place, scope, test_program, use_fp16_test)
@@ -1254,7 +1256,7 @@ def minimize(
             loss (Tensor): A ``Tensor`` containing the value to minimize.
             startup_program (Program, optional): :ref:`api_paddle_static_Program` for
                 initializing parameters in ``parameter_list``. The default value
-                is None, at this time :ref:`api_base_default_startup_program` will be used.
+                is None, at this time :ref:`api_paddle_static_default_startup_program` will be used.
             parameter_list (Iterable, optional): Iterable of ``Tensor`` or ``Tensor.name`` to
                 update to minimize ``loss``. The default value is None, at this time all parameters
                 will be updated.
@@ -1273,28 +1275,29 @@
        Examples:
            ..
code-block:: python - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - import paddle.nn.functional as F - - hid_dim = 10 - label_dim = 2 - input_x = paddle.static.data(name='x', shape=[None, 13], dtype='float32') - input_y = paddle.static.data(name='y', shape=[None, 1], dtype='int64') - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim, activation='tanh') - fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim, activation='tanh') - prediction = paddle.static.nn.fc(x=[fc_2], size=label_dim, activation='softmax') - cost = F.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.mean(x=cost) - - fleet.init(is_collective=True) - strategy = fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) - - # for more examples, please reference https://github.com/PaddlePaddle/PaddleFleetX + >>> import paddle + >>> paddle.enable_static() + >>> import paddle.distributed.fleet as fleet + >>> import paddle.nn.functional as F + + >>> hid_dim = 10 + >>> label_dim = 2 + >>> input_x = paddle.static.data(name='x', shape=[None, 13], dtype='float32') + >>> input_y = paddle.static.data(name='y', shape=[None, 1], dtype='int64') + >>> fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim, activation='tanh') + >>> fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim, activation='tanh') + >>> prediction = paddle.static.nn.fc(x=[fc_2], size=label_dim, activation='softmax') + >>> cost = F.cross_entropy(input=prediction, label=input_y) + >>> avg_cost = paddle.mean(x=cost) + + >>> fleet.init(is_collective=True) + >>> strategy = fleet.DistributedStrategy() + >>> linear = paddle.nn.Linear(10, 10) + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001, parameters=linear.parameters()) + >>> optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + >>> optimizer.minimize(avg_cost) + + >>> # for more examples, please reference https://github.com/PaddlePaddle/PaddleFleetX """ if not isinstance(loss, list): diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_layers.py b/python/paddle/distributed/fleet/layers/mpu/mp_layers.py index 7b59a6d5946403..d7febc350a5b52 100644 --- a/python/paddle/distributed/fleet/layers/mpu/mp_layers.py +++ b/python/paddle/distributed/fleet/layers/mpu/mp_layers.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os + import paddle from paddle.autograd import PyLayer from paddle.base import core @@ -20,6 +22,7 @@ from ....communication.reduce import ReduceOp, _get_reduce_op from ...base import topology as tp +from ...utils.log_util import logger from . import mp_ops from .random import get_rng_state_tracker @@ -177,6 +180,9 @@ def forward(self, x): return output +_raise_cuda_env_unset_warning = True + + class InnerOverlapLinear(paddle.autograd.PyLayer): @staticmethod def forward( @@ -216,8 +222,17 @@ def backward(ctx, dy): task = ctx.model_parallel_group.process_group.all_reduce( dx, op_type, sync_op=False ) - # TODO(GhostScreaming): remove it in future. - tmp = paddle.ones([512]) + # Using small operation to preempt GPU SMs for all_reduce to achieve overlap. 
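+        # CUDA_DEVICE_MAX_CONNECTIONS=1 collapses the GPU's hardware work queues
+        # into one, so the async all_reduce issued above is scheduled ahead of
+        # later compute kernels and the overlap can actually happen.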
+        if int(os.getenv("CUDA_DEVICE_MAX_CONNECTIONS", "0")) != 1:
+            global _raise_cuda_env_unset_warning
+            if _raise_cuda_env_unset_warning:
+                logger.warning(
+                    "You set mp_async_allreduce=True, but you forgot to set the "
+                    "environment variable CUDA_DEVICE_MAX_CONNECTIONS=1, which "
+                    "may lead to performance loss. Export "
+                    "CUDA_DEVICE_MAX_CONNECTIONS=1 for better performance."
+                )
+            _raise_cuda_env_unset_warning = False
+            tmp = paddle.ones([512])

        if ctx.mp_fused_linear_param_grad_add:
            if not is_fused_linear_param_grad_add_supported():
@@ -263,7 +278,7 @@ def backward(ctx, dy):
                     weight.main_grad,
                     bias.main_grad,
                 ) = paddle._C_ops.fused_linear_param_grad_add(
-                    input,
+                    x,
                     dy,
                     weight.main_grad,
                     bias.main_grad,
@@ -293,9 +308,10 @@ def backward(ctx, dy):
             task.wait()
             return dx, dw, dbias
         else:
+            dy = dy.reshape([-1, dy.shape[-1]])
             dw = paddle.matmul(
                 x.reshape([-1, x.shape[-1]]),
-                dy.reshape([-1, dy.shape[-1]]),
+                dy,
                 transpose_x=True,
             )
             if bias is None:
diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py
index 9625e2481d4002..75be5f621d4124 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/common.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/common.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import os

 import paddle
 from paddle.framework import core
@@ -99,8 +98,10 @@ def _init_communicator(
                 other_endpoints.remove(current_endpoint)

             if rank == 0 and wait_port:
-                use_new_comm = os.getenv("FLAGS_dynamic_static_unified_comm", "0")
-                if use_new_comm not in [1, "1", "True", "true"]:
+                use_new_comm = paddle.get_flags(
+                    "FLAGS_dynamic_static_unified_comm"
+                )["FLAGS_dynamic_static_unified_comm"]
+                if not use_new_comm:
                     wait_server_ready(other_endpoints)

     def _add_sync_by_allreduce(block):
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
index 1ee99b10854b9f..ab8ec3a67b145f 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
@@ -14,6 +14,7 @@

 import os

+import paddle
 from paddle.base import core
 from paddle.incubate.optimizer import PipelineOptimizer
 from paddle.static import (
@@ -714,8 +715,10 @@ def minimize_impl(
         self._recreate_not_persist_param_as_var()
         self._dump_program_for_debug()

-        use_new_comm = os.getenv("FLAGS_dynamic_static_unified_comm", "0")
-        if use_new_comm not in ["1", "True", "true"]:
+        use_new_comm = paddle.get_flags("FLAGS_dynamic_static_unified_comm")[
+            "FLAGS_dynamic_static_unified_comm"
+        ]
+        if not use_new_comm:
             self._wait()

         return optimize_ops, params_grads
diff --git a/python/paddle/distributed/fleet/meta_parallel/__init__.py b/python/paddle/distributed/fleet/meta_parallel/__init__.py
index d50eb940b72bb1..7b1f668f421da9 100644
--- a/python/paddle/distributed/fleet/meta_parallel/__init__.py
+++ b/python/paddle/distributed/fleet/meta_parallel/__init__.py
@@ -12,20 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .parallel_layers import VocabParallelEmbedding # noqa: F401 -from .parallel_layers import ColumnParallelLinear # noqa: F401 -from .parallel_layers import RowParallelLinear # noqa: F401 -from .parallel_layers import ParallelCrossEntropy # noqa: F401 -from .parallel_layers import LayerDesc # noqa: F401 -from .parallel_layers import SharedLayerDesc # noqa: F401 -from .parallel_layers import PipelineLayer # noqa: F401 -from .parallel_layers import RNGStatesTracker # noqa: F401 -from .parallel_layers import model_parallel_random_seed # noqa: F401 -from .parallel_layers import get_rng_state_tracker # noqa: F401 -from .tensor_parallel import TensorParallel # noqa: F401 -from .pipeline_parallel import PipelineParallel # noqa: F401 -from .pipeline_parallel import PipelineParallelWithInterleave # noqa: F401 -from .sharding_parallel import ShardingParallel # noqa: F401 +from .parallel_layers import ( # noqa: F401 + ColumnParallelLinear, + LayerDesc, + ParallelCrossEntropy, + PipelineLayer, + RNGStatesTracker, + RowParallelLinear, + SharedLayerDesc, + VocabParallelEmbedding, + get_rng_state_tracker, + model_parallel_random_seed, +) +from .pipeline_parallel import ( # noqa: F401 + PipelineParallel, + PipelineParallelWithInterleave, +) from .segment_parallel import SegmentParallel # noqa: F401 +from .sharding_parallel import ShardingParallel # noqa: F401 +from .tensor_parallel import TensorParallel # noqa: F401 __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py index fd977857490737..cf6f26c989e571 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py @@ -12,15 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .mp_layers import VocabParallelEmbedding # noqa: F401 -from .mp_layers import ColumnParallelLinear # noqa: F401 -from .mp_layers import RowParallelLinear # noqa: F401 -from .mp_layers import ParallelCrossEntropy # noqa: F401 -from .pp_layers import LayerDesc # noqa: F401 -from .pp_layers import SharedLayerDesc # noqa: F401 -from .pp_layers import PipelineLayer # noqa: F401 -from .random import RNGStatesTracker # noqa: F401 -from .random import model_parallel_random_seed # noqa: F401 -from .random import get_rng_state_tracker # noqa: F401 +from .mp_layers import ( # noqa: F401 + ColumnParallelLinear, + ParallelCrossEntropy, + RowParallelLinear, + VocabParallelEmbedding, +) +from .pp_layers import LayerDesc, PipelineLayer, SharedLayerDesc # noqa: F401 +from .random import ( # noqa: F401 + RNGStatesTracker, + get_rng_state_tracker, + model_parallel_random_seed, +) __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py index f0d7ca602feb77..7a6d5adc83163f 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from ...layers.mpu.mp_layers import ColumnParallelLinear # noqa: F401 -from ...layers.mpu.mp_layers import ParallelCrossEntropy # noqa: F401 -from ...layers.mpu.mp_layers import RowParallelLinear # noqa: F401 -from ...layers.mpu.mp_layers import VocabParallelEmbedding # noqa: F401 +from ...layers.mpu.mp_layers import ( # noqa: F401 + ColumnParallelLinear, + ParallelCrossEntropy, + RowParallelLinear, + VocabParallelEmbedding, +) __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py index 7139d40adc7583..1e043d18a94eb9 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ...layers.mpu.random import RNGStatesTracker # noqa: F401 -from ...layers.mpu.random import determinate_seed # noqa: F401 -from ...layers.mpu.random import dropout # noqa: F401 -from ...layers.mpu.random import get_rng_state_tracker # noqa: F401 -from ...layers.mpu.random import model_parallel_random_seed # noqa: F401 +from ...layers.mpu.random import ( # noqa: F401 + RNGStatesTracker, + determinate_seed, + dropout, + get_rng_state_tracker, + model_parallel_random_seed, +) __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py index c4e0d54b99a12d..44b799b00c91d7 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py @@ -382,7 +382,12 @@ def scale(grad): if hasattr(param, "main_grad"): param.main_grad.scale_(self._world_size_scaling) else: - grad.scale_(self._world_size_scaling) + if grad is not None and grad._is_initialized(): + grad.scale_(self._world_size_scaling) + else: + assert param.grad is not None + assert param.grad._is_initialized() + param.grad.scale_(self._world_size_scaling) return scale diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index 12c5ac37c8b10b..8a61ab904cb304 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -176,6 +176,9 @@ def __init__( if "grad_clip" in item.keys(): item["grad_clip"] = self._optim._grad_clip + # check main_grad + self._check_main_grad() + # Synchronous all ranks models if pertrain_sync_models: self._sync_params_and_buffers() @@ -203,6 +206,16 @@ def __init__( self._redefine_opt_step() self._redefine_opt_clear() + def _check_main_grad(self): + self.use_main_grad = None + for param in self._layer.parameters(): + if self.use_main_grad is None and hasattr(param, "main_grad"): + self.use_main_grad = True + if self.use_main_grad: + assert hasattr( + param, "main_grad" + ), "Params have different main grad attributes." + @paddle.autograd.no_grad() def _sync_params_and_buffers(self): """ @@ -235,8 +248,11 @@ def _clear_gradients(self): assert hasattr( param, "fw_storage" ), f"Find {param.name} don't have fw_storage attribute." 
- - param.fw_storage.clear_gradient(False) + if self.use_main_grad: + param.fw_storage.main_grad._clear() + param.fw_storage.main_grad = None + else: + param.fw_storage.clear_gradient(False) param.bw_storage._clear() param.bw_storage = None # 2.Handle unslice param @@ -245,7 +261,12 @@ def _clear_gradients(self): grad_storage.buffer.zero_() else: for param in list(self._unslice_params): - param.clear_gradient(False) + if self.use_main_grad: + param.main_grad._clear() + param.main_grad = None + else: + param.clear_gradient(False) + if ( self._default_device in paddle.device.get_all_custom_device_type() @@ -350,7 +371,9 @@ def _handle_unslice_params(self): if param.dtype not in self._grad_storages.keys(): self._grad_storages[param.dtype] = GradStorage( buffer_size[param.dtype], - dtype=param.dtype, + dtype=param.dtype + if not self.use_main_grad + else paddle.float32, device=self._default_device, destination=self._rank, parm2align=self._unslice_params2align, @@ -596,8 +619,11 @@ def _update_params(self): ), f"Find {param.name} don't have fw_storage attribute" param.fw_storage = _TensorWrapper(param) - assert param.fw_storage.grad is None - param.fw_storage._copy_gradient_from(param.bw_storage) + if self.use_main_grad: + param.fw_storage.main_grad = param.bw_storage + else: + assert param.fw_storage.grad is None + param.fw_storage._copy_gradient_from(param.bw_storage) update_list.append(param) # 2.Handle unslice param @@ -617,9 +643,13 @@ def _update_params(self): for grad_storage in self._grad_storages.values(): for p in grad_storage._params: - tmp_g = _device2cpu(p.grad, convert_dtype=True) - p.clear_gradient(False) - p._copy_gradient_from(tmp_g) + if self.use_main_grad: + tmp_g = _device2cpu(p.main_grad, convert_dtype=True) + p.main_grad = tmp_g + else: + tmp_g = _device2cpu(p.grad, convert_dtype=True) + p.clear_gradient(False) + p._copy_gradient_from(tmp_g) del tmp_g grad_storage.buffer._clear() @@ -650,6 +680,7 @@ def get_all_parameters(self, convert2cpu=False): if convert2cpu: for param in trainable_params: t_flow.full_param[param.name][0]._share_buffer_to(param) + del t_flow.full_param[param.name] # a _allgather_buffer call should be matched with a _release_param call later, # but the _allgather_buffer call here has no match. 
@@ -708,7 +739,11 @@ def allreduce_(*_):
                     param.bw_storage,
                     full_grad._slice(start, end).detach().clone(),
                 )
-            param.clear_gradient(False)
+
+            if self.use_main_grad:
+                param.main_grad = None
+            else:
+                param.clear_gradient(False)
             del self._task_flow.full_grad[param.name]

         if param.name in self._task_flow.full_param.keys():
@@ -726,6 +761,7 @@ def allreduce_(*_):
                 del self._task_flow.full_param[param.name]

                 if self._offload:
+                    # move back to cpu for the offloaded optimizer update
                     param.fw_storage._clear_data()
                     param.master_weight._share_buffer_to(param.fw_storage)

@@ -929,11 +965,14 @@ class TaskFlow:

     def __init__(
         self,
+        full_param=None,
+        full_grad=None,
+        use_calc=None,
         callback=None,
     ):
-        self.full_param = {}
-        self.full_grad = {}
-        self.use_calc = {}
+        self.full_param = full_param if full_param is not None else {}
+        self.full_grad = full_grad if full_grad is not None else {}
+        self.use_calc = use_calc if use_calc is not None else {}
         self.callback = callback

@@ -1014,6 +1053,7 @@ def _allgather_buffer(
             continue

         if offload:
+            # convert to device for collective comm
             param.fw_storage = _cpu2device(param)

         buffer_size = param2buffer_size[param.name]
@@ -1046,17 +1086,22 @@ def _allgather_buffer(
 @paddle.autograd.no_grad()
 def _create_params_grad(trainable_params, param2buffer_size, task_flow):
     for param in trainable_params:
+        use_main_grad = hasattr(param, "main_grad")
         if not param.trainable:
             continue
         if param.name in task_flow.full_grad.keys():
             continue
         assert isinstance(param2buffer_size[param.name], int)
         temp_grad = paddle.zeros(
-            [param2buffer_size[param.name]], dtype=param.dtype
+            [param2buffer_size[param.name]],
+            dtype=param.dtype if not use_main_grad else paddle.float32,
         )
         temp_tensor = temp_grad._slice(0, param._numel())
         temp_tensor.get_tensor()._set_dims(param.shape)
-        param._copy_gradient_from(temp_tensor)
+        if use_main_grad:
+            param.main_grad = temp_tensor
+        else:
+            param._copy_gradient_from(temp_tensor)
         del temp_tensor
         task_flow.full_grad[param.name] = temp_grad
     return task_flow
diff --git a/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py b/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py
index 6da9dae096d46c..ff9ff2ee2a9c3e 100755
--- a/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py
+++ b/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py
@@ -41,11 +41,17 @@ def _prepare_for_model(self):
         logger.info("start broadcast sharding parameters")
         broadcast_sharding_parameters(self._layers, self._hcg)

-        logger.info("start broadcast dp parameters")
-        broadcast_dp_parameters(self._layers, self._hcg)
+        if self._hcg.get_data_parallel_world_size() > 1:
+            logger.info("start broadcast dp parameters")
+            broadcast_dp_parameters(self._layers, self._hcg)

         logger.info("mp's parameters is ready")

     def _pre_forward(self, *inputs, **kwargs):
-        logger.debug("mp start broadcast input data")
-        return broadcast_input_data(self._hcg, *inputs, **kwargs)
+        need_broadcast_data = True
+        if self._strategy is not None:
+            mp_configs = self._strategy.hybrid_configs["mp_configs"]
+            need_broadcast_data = mp_configs.need_broadcast_data
+        if need_broadcast_data:
+            logger.debug("mp start broadcast input data")
+            return broadcast_input_data(self._hcg, *inputs, **kwargs)
diff --git a/python/paddle/distributed/fleet/metrics/__init__.py b/python/paddle/distributed/fleet/metrics/__init__.py
index abcb90afb23c43..160039e26dd199 100644
--- a/python/paddle/distributed/fleet/metrics/__init__.py
+++ b/python/paddle/distributed/fleet/metrics/__init__.py
@@ -12,13 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .metric import acc # noqa: F401 -from .metric import auc # noqa: F401 -from .metric import mae # noqa: F401 -from .metric import max # noqa: F401 -from .metric import min # noqa: F401 -from .metric import mse # noqa: F401 -from .metric import rmse # noqa: F401 -from .metric import sum # noqa: F401 +from .metric import acc, auc, mae, max, min, mse, rmse, sum # noqa: F401 __all__ = [] diff --git a/python/paddle/distributed/fleet/recompute/recompute.py b/python/paddle/distributed/fleet/recompute/recompute.py old mode 100755 new mode 100644 index 43e4dde69440a1..6a8202965d5be2 --- a/python/paddle/distributed/fleet/recompute/recompute.py +++ b/python/paddle/distributed/fleet/recompute/recompute.py @@ -13,11 +13,13 @@ # limitations under the License. import contextlib +import copy import weakref import paddle from paddle import framework from paddle.autograd import PyLayer +from paddle.base.framework import EagerParamBase from paddle.distributed.fleet.meta_parallel.parallel_layers.random import ( get_rng_state_tracker, ) @@ -28,6 +30,15 @@ __all__ = [] +def _varbase_help(param): + state = copy.deepcopy(param.__dict__) + new_param = EagerParamBase( + shape=param.shape, dtype=param.dtype, name=param.name, **state + ) + param._share_buffer_to(new_param) + return new_param + + def detach_variable(inputs): out = [] for inp in inputs: @@ -38,14 +49,23 @@ def detach_variable(inputs): out.append(inp) continue + if isinstance(inp, EagerParamBase): + out.append(_varbase_help(inp)) + continue + if type(inp) is tuple: detach_inp = [] for i in inp: # detach all tensors in the tuple assert isinstance(i, core.eager.Tensor) - tmp_i = i.detach() - tmp_i.stop_gradient = i.stop_gradient - detach_inp.append(tmp_i) + + if isinstance(i, EagerParamBase): + detach_inp.append(_varbase_help(i)) + else: + tmp_i = i.detach() + tmp_i.stop_gradient = i.stop_gradient + detach_inp.append(tmp_i) + out.append(tuple(detach_inp)) continue @@ -403,87 +423,94 @@ def recompute(function, *args, **kwargs): Examples: .. 
code-block:: python - import paddle - from paddle.distributed.fleet.utils import recompute - import random - # required: gpu - def get_fc_block(block_idx, input_size, is_last=False): - block_name = "block_" + str(block_idx) - block = paddle.nn.Sequential( - (block_name + "_fc_0", paddle.nn.Linear(input_size, input_size, bias_attr=False)), - (block_name + "_dropout", paddle.nn.Dropout(p=0.5)), - (block_name + "_relu_1", paddle.nn.ReLU()), - (block_name + "_fc_1", paddle.nn.Linear(input_size, input_size, bias_attr=False)), - (block_name + "_relu_2", paddle.nn.ReLU()), - ) - if is_last: - block.add_sublayer( - block_name + "_fc_2", - paddle.nn.Linear( - input_size, 1, bias_attr=False - ) - ) - else: - block.add_sublayer( - block_name + "_fc_2", - paddle.nn.Linear(input_size, input_size, bias_attr=False) - ) - return block - class Naive_fc_net(paddle.nn.Layer): - def __init__(self, input_size=10, - recompute_blocks=[1, 3], - recompute_kwargs={}): - super().__init__() - self.recompute_blocks = recompute_blocks - self.recompute_kwargs = recompute_kwargs - self.runfunc0 = get_fc_block(0, input_size, is_last=False) - self.runfunc1 = get_fc_block(1, input_size, is_last=False) - self.runfunc2 = get_fc_block(2, input_size, is_last=False) - self.runfunc3 = get_fc_block(3, input_size, is_last=False) - self.runfunc4 = get_fc_block(4, input_size, is_last=True) - self.total_func = [self.runfunc0, self.runfunc1, self.runfunc2, self.runfunc3, self.runfunc4] - def forward(self, inputs): - nums = len(self.total_func) - for i in range(nums): - if i in self.recompute_blocks: - inputs = recompute(self.total_func[i], inputs, **{"preserve_rng_state": True}) - else: - inputs = self.total_func[i](inputs) - return inputs - def run_model(cuda_state, recompute_block=[], recompute_kwargs={}): - gen = paddle.seed(10) - gen.manual_seed(10) - random.seed(10) - if cuda_state: - paddle.set_cuda_rng_state(cuda_state) - batch_size, input_size = 1, 10 - model = Naive_fc_net( - input_size, - recompute_blocks=recompute_block, - recompute_kwargs=recompute_kwargs) - optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) - loss_ = [] - param_ = [] - grad_ = [] - for _ in range(5): - x = paddle.rand(shape=[batch_size, input_size], dtype="float32") - y_pred = model(x) - loss = y_pred.mean() - loss_.append(loss.item()) - loss.backward() - optimizer.step() - param_.append(model.parameters()[9]) - grad_.append(model.parameters()[3]._grad_ivar()) - optimizer.clear_grad() - return loss_, param_, grad_ - cuda_state = paddle.get_cuda_rng_state() - # without recompute - loss_ref, param_ref, grad_ref = run_model( - cuda_state, recompute_block=[] - ) - loss, param, grad = run_model(cuda_state, recompute_block=[1, 2]) - print("normal_loss: {}, recompute_loss: {}".format(loss_ref, loss)) - # The result of the recompute_loss should be the same as the normal_loss. + >>> # doctest: +REQUIRES(env:DISTRIBUTED, env:GPU) + >>> import paddle + >>> from paddle.distributed.fleet.utils import recompute + >>> import random + >>> paddle.seed(2023) + >>> def get_fc_block(block_idx, input_size, is_last=False): + ... block_name = "block_" + str(block_idx) + ... block = paddle.nn.Sequential( + ... (block_name + "_fc_0", paddle.nn.Linear(input_size, input_size, bias_attr=False)), + ... (block_name + "_dropout", paddle.nn.Dropout(p=0.5)), + ... (block_name + "_relu_1", paddle.nn.ReLU()), + ... (block_name + "_fc_1", paddle.nn.Linear(input_size, input_size, bias_attr=False)), + ... (block_name + "_relu_2", paddle.nn.ReLU()), + ... 
) + ... if is_last: + ... block.add_sublayer( + ... block_name + "_fc_2", + ... paddle.nn.Linear( + ... input_size, 1, bias_attr=False + ... ) + ... ) + ... else: + ... block.add_sublayer( + ... block_name + "_fc_2", + ... paddle.nn.Linear(input_size, input_size, bias_attr=False) + ... ) + ... return block + + >>> class Naive_fc_net(paddle.nn.Layer): + ... def __init__(self, input_size=10, + ... recompute_blocks=[1, 3], + ... recompute_kwargs={}): + ... super().__init__() + ... self.recompute_blocks = recompute_blocks + ... self.recompute_kwargs = recompute_kwargs + ... self.runfunc0 = get_fc_block(0, input_size, is_last=False) + ... self.runfunc1 = get_fc_block(1, input_size, is_last=False) + ... self.runfunc2 = get_fc_block(2, input_size, is_last=False) + ... self.runfunc3 = get_fc_block(3, input_size, is_last=False) + ... self.runfunc4 = get_fc_block(4, input_size, is_last=True) + ... self.total_func = [self.runfunc0, self.runfunc1, self.runfunc2, self.runfunc3, self.runfunc4] + ... def forward(self, inputs): + ... nums = len(self.total_func) + ... for i in range(nums): + ... if i in self.recompute_blocks: + ... inputs = recompute(self.total_func[i], inputs, **{"preserve_rng_state": True}) + ... else: + ... inputs = self.total_func[i](inputs) + ... return inputs + + >>> def run_model(cuda_state, recompute_block=[], recompute_kwargs={}): + ... gen = paddle.seed(10) + ... gen.manual_seed(10) + ... random.seed(10) + ... if cuda_state: + ... paddle.set_cuda_rng_state(cuda_state) + ... batch_size, input_size = 1, 10 + ... model = Naive_fc_net( + ... input_size, + ... recompute_blocks=recompute_block, + ... recompute_kwargs=recompute_kwargs) + ... optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + ... loss_ = [] + ... param_ = [] + ... grad_ = [] + ... for _ in range(5): + ... x = paddle.rand(shape=[batch_size, input_size], dtype="float32") + ... y_pred = model(x) + ... loss = y_pred.mean() + ... loss_.append(loss.item()) + ... loss.backward() + ... optimizer.step() + ... param_.append(model.parameters()[9]) + ... grad_.append(model.parameters()[3]._grad_ivar()) + ... optimizer.clear_grad() + ... return loss_, param_, grad_ + + >>> cuda_state = paddle.get_cuda_rng_state() + >>> # without recompute + >>> loss_ref, param_ref, grad_ref = run_model( + ... cuda_state, recompute_block=[] + ... ) + + >>> loss, param, grad = run_model(cuda_state, recompute_block=[1, 2]) + >>> print("normal_loss: {}, recompute_loss: {}".format(loss_ref, loss)) + >>> # The result of the recompute_loss should be the same as the normal_loss. + normal_loss: [0.0018744759727269411, 0.0, 0.035971127450466156, 0.0, 0.0], recompute_loss: [0.0018744759727269411, 0.0, 0.035971127450466156, 0.0, 0.0] + """ # Hack to mix *args with **kwargs in a python 2.7-compliant way preserve = kwargs.pop('preserve_rng_state', True) @@ -524,11 +551,14 @@ def recompute_sequential(ctx, functions, *args, **kwargs): Examples: .. 
code-block:: python - import paddle - from paddle.incubate.distributed.fleet import recompute_sequential - input = paddle.ones(shape=[8, 10]) - model = paddle.nn.Sequential(paddle.nn.Linear(10, 10), paddle.nn.Linear(10, 2)) - output = recompute_sequential({'segments' : 1}, model, input) + + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle + >>> from paddle.incubate.distributed.fleet import recompute_sequential + >>> input = paddle.ones(shape=[8, 10]) + >>> model = paddle.nn.Sequential(paddle.nn.Linear(10, 10), paddle.nn.Linear(10, 2)) + >>> output = recompute_sequential({'segments' : 1}, model, input) + """ segments = ctx.get('segments', 1) preserve_rng_state = ctx.get('preserve_rng_state', True) diff --git a/python/paddle/distributed/fleet/scaler.py b/python/paddle/distributed/fleet/scaler.py index bf0d7363b05251..463674c9587413 100755 --- a/python/paddle/distributed/fleet/scaler.py +++ b/python/paddle/distributed/fleet/scaler.py @@ -29,23 +29,32 @@ def distributed_scaler(scaler): def unscale_method(self, optimizer): if not self._enable: return + + param_grads = [] + param_grads_fp16 = [] + param_grads_fp32 = [] if getattr(optimizer, '_param_groups', None) and isinstance( optimizer._param_groups[0], dict ): - param_grads = [] - param_grads_fp16 = [] - param_grads_fp32 = [] for group in optimizer._param_groups: for param in group['params']: - if param._grad_ivar() is not None: - param_grads.append(param._grad_ivar()) - if ( - param._grad_ivar().dtype - == core.VarDesc.VarType.FP16 - ): - param_grads_fp16.append(param._grad_ivar()) + tgt_grad = None + if ( + hasattr(param, "main_grad") + and param.main_grad is not None + ): + tgt_grad = param.main_grad + elif param.grad is not None: + tgt_grad = param.grad + if tgt_grad is not None: + param_grads.append(tgt_grad) + if tgt_grad.dtype in [ + core.VarDesc.VarType.FP16, + paddle.float16, + ]: + param_grads_fp16.append(tgt_grad) else: - param_grads_fp32.append(param._grad_ivar()) + param_grads_fp32.append(tgt_grad) else: strategy = fleet.fleet._user_defined_strategy sharding_stage_1_overlap = strategy.hybrid_configs[ @@ -67,18 +76,23 @@ def unscale_method(self, optimizer): parameters = optimizer._local_parameter_list else: parameters = optimizer._parameter_list - param_grads_fp16 = [ - param._grad_ivar() - for param in parameters - if (param._grad_ivar() is not None) - and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16) - ] - param_grads_fp32 = [ - param._grad_ivar() - for param in parameters - if (param._grad_ivar() is not None) - and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32) - ] + + for param in parameters: + tgt_grad = None + if hasattr(param, "main_grad") and param.main_grad is not None: + tgt_grad = param.main_grad + elif param.grad is not None: + tgt_grad = param.grad + if tgt_grad is not None: + param_grads.append(tgt_grad) + if tgt_grad.dtype in [ + core.VarDesc.VarType.FP16, + paddle.float16, + ]: + param_grads_fp16.append(tgt_grad) + else: + param_grads_fp32.append(tgt_grad) + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_)) temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_)) self._found_inf = self._temp_found_inf_value_false diff --git a/python/paddle/distributed/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py index 0ad0d6256ab88d..2d7c44e77b6623 100644 --- a/python/paddle/distributed/fleet/utils/__init__.py +++ b/python/paddle/distributed/fleet/utils/__init__.py @@ -12,21 +12,19 @@ # See the License for the specific language governing 
permissions and # limitations under the License. -from .fs import LocalFS # noqa: F401 -from .fs import HDFSClient # noqa: F401 -from .ps_util import DistributedInfer # noqa: F401 -from paddle.utils import deprecated from paddle.distributed import fleet -import paddle -from . import log_util # noqa: F401 -from . import hybrid_parallel_util # noqa: F401 -from . import tensor_parallel_utils # noqa: F401 -from . import mix_precision_utils # noqa: F401 -from . import sequence_parallel_utils +from . import ( # noqa: F401 + hybrid_parallel_util, + log_util, + mix_precision_utils, + sequence_parallel_utils, + tensor_parallel_utils, +) +from .fs import HDFSClient, LocalFS +from .ps_util import DistributedInfer - -__all__ = ["LocalFS", "recompute", "DistributedInfer", "HDFSClient"] # noqa +__all__ = ["LocalFS", "recompute", "DistributedInfer", "HDFSClient"] def recompute(function, *args, **kwargs): @@ -51,87 +49,94 @@ def recompute(function, *args, **kwargs): Examples: .. code-block:: python - import paddle - from paddle.distributed.fleet.utils import recompute - import random - # required: gpu - def get_fc_block(block_idx, input_size, is_last=False): - block_name = "block_" + str(block_idx) - block = paddle.nn.Sequential( - (block_name + "_fc_0", paddle.nn.Linear(input_size, input_size, bias_attr=False)), - (block_name + "_dropout", paddle.nn.Dropout(p=0.5)), - (block_name + "_relu_1", paddle.nn.ReLU()), - (block_name + "_fc_1", paddle.nn.Linear(input_size, input_size, bias_attr=False)), - (block_name + "_relu_2", paddle.nn.ReLU()), - ) - if is_last: - block.add_sublayer( - block_name + "_fc_2", - paddle.nn.Linear( - input_size, 1, bias_attr=False - ) - ) - else: - block.add_sublayer( - block_name + "_fc_2", - paddle.nn.Linear(input_size, input_size, bias_attr=False) - ) - return block - class Naive_fc_net(paddle.nn.Layer): - def __init__(self, input_size=10, - recompute_blocks=[1, 3], - recompute_kwargs={}): - super().__init__() - self.recompute_blocks = recompute_blocks - self.recompute_kwargs = recompute_kwargs - self.runfunc0 = get_fc_block(0, input_size, is_last=False) - self.runfunc1 = get_fc_block(1, input_size, is_last=False) - self.runfunc2 = get_fc_block(2, input_size, is_last=False) - self.runfunc3 = get_fc_block(3, input_size, is_last=False) - self.runfunc4 = get_fc_block(4, input_size, is_last=True) - self.total_func = [self.runfunc0, self.runfunc1, self.runfunc2, self.runfunc3, self.runfunc4] - def forward(self, inputs): - nums = len(self.total_func) - for i in range(nums): - if i in self.recompute_blocks: - inputs = recompute(self.total_func[i], inputs, **{"preserve_rng_state": True}) - else: - inputs = self.total_func[i](inputs) - return inputs - def run_model(cuda_state, recompute_block=[], recompute_kwargs={}): - gen = paddle.seed(10) - gen.manual_seed(10) - random.seed(10) - if cuda_state: - paddle.set_cuda_rng_state(cuda_state) - batch_size, input_size = 1, 10 - model = Naive_fc_net( - input_size, - recompute_blocks=recompute_block, - recompute_kwargs=recompute_kwargs) - optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) - loss_ = [] - param_ = [] - grad_ = [] - for _ in range(5): - x = paddle.rand(shape=[batch_size, input_size], dtype="float32") - y_pred = model(x) - loss = y_pred.mean() - loss_.append(loss.item()) - loss.backward() - optimizer.step() - param_.append(model.parameters()[9]) - grad_.append(model.parameters()[3]._grad_ivar()) - optimizer.clear_grad() - return loss_, param_, grad_ - cuda_state = paddle.get_cuda_rng_state() - # 
without recompute - loss_ref, param_ref, grad_ref = run_model( - cuda_state, recompute_block=[] - ) - loss, param, grad = run_model(cuda_state, recompute_block=[1, 2]) - print("normal_loss: {}, recompute_loss: {}".format(loss_ref, loss)) - # The result of the recompute_loss should be the same as the normal_loss. + >>> # doctest: +REQUIRES(env:DISTRIBUTED, env:GPU) + >>> import paddle + >>> from paddle.distributed.fleet.utils import recompute + >>> import random + >>> paddle.seed(2023) + >>> def get_fc_block(block_idx, input_size, is_last=False): + ... block_name = "block_" + str(block_idx) + ... block = paddle.nn.Sequential( + ... (block_name + "_fc_0", paddle.nn.Linear(input_size, input_size, bias_attr=False)), + ... (block_name + "_dropout", paddle.nn.Dropout(p=0.5)), + ... (block_name + "_relu_1", paddle.nn.ReLU()), + ... (block_name + "_fc_1", paddle.nn.Linear(input_size, input_size, bias_attr=False)), + ... (block_name + "_relu_2", paddle.nn.ReLU()), + ... ) + ... if is_last: + ... block.add_sublayer( + ... block_name + "_fc_2", + ... paddle.nn.Linear( + ... input_size, 1, bias_attr=False + ... ) + ... ) + ... else: + ... block.add_sublayer( + ... block_name + "_fc_2", + ... paddle.nn.Linear(input_size, input_size, bias_attr=False) + ... ) + ... return block + + >>> class Naive_fc_net(paddle.nn.Layer): + ... def __init__(self, input_size=10, + ... recompute_blocks=[1, 3], + ... recompute_kwargs={}): + ... super().__init__() + ... self.recompute_blocks = recompute_blocks + ... self.recompute_kwargs = recompute_kwargs + ... self.runfunc0 = get_fc_block(0, input_size, is_last=False) + ... self.runfunc1 = get_fc_block(1, input_size, is_last=False) + ... self.runfunc2 = get_fc_block(2, input_size, is_last=False) + ... self.runfunc3 = get_fc_block(3, input_size, is_last=False) + ... self.runfunc4 = get_fc_block(4, input_size, is_last=True) + ... self.total_func = [self.runfunc0, self.runfunc1, self.runfunc2, self.runfunc3, self.runfunc4] + ... def forward(self, inputs): + ... nums = len(self.total_func) + ... for i in range(nums): + ... if i in self.recompute_blocks: + ... inputs = recompute(self.total_func[i], inputs, **{"preserve_rng_state": True}) + ... else: + ... inputs = self.total_func[i](inputs) + ... return inputs + + >>> def run_model(cuda_state, recompute_block=[], recompute_kwargs={}): + ... gen = paddle.seed(10) + ... gen.manual_seed(10) + ... random.seed(10) + ... if cuda_state: + ... paddle.set_cuda_rng_state(cuda_state) + ... batch_size, input_size = 1, 10 + ... model = Naive_fc_net( + ... input_size, + ... recompute_blocks=recompute_block, + ... recompute_kwargs=recompute_kwargs) + ... optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + ... loss_ = [] + ... param_ = [] + ... grad_ = [] + ... for _ in range(5): + ... x = paddle.rand(shape=[batch_size, input_size], dtype="float32") + ... y_pred = model(x) + ... loss = y_pred.mean() + ... loss_.append(loss.item()) + ... loss.backward() + ... optimizer.step() + ... param_.append(model.parameters()[9]) + ... grad_.append(model.parameters()[3]._grad_ivar()) + ... optimizer.clear_grad() + ... return loss_, param_, grad_ + + >>> cuda_state = paddle.get_cuda_rng_state() + >>> # without recompute + >>> loss_ref, param_ref, grad_ref = run_model( + ... cuda_state, recompute_block=[] + ... 
) + + >>> loss, param, grad = run_model(cuda_state, recompute_block=[1, 2]) + >>> print("normal_loss: {}, recompute_loss: {}".format(loss_ref, loss)) + >>> # The result of the recompute_loss should be the same as the normal_loss. + normal_loss: [0.0018744759727269411, 0.0, 0.035971127450466156, 0.0, 0.0], recompute_loss: [0.0018744759727269411, 0.0, 0.035971127450466156, 0.0, 0.0] + """ return fleet.recompute.recompute(function, *args, **kwargs) diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index 11617981d9d4b0..743ceac3e296cc 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -117,10 +117,12 @@ class LocalFS(FS): Examples: .. code-block:: python - from paddle.distributed.fleet.utils import LocalFS + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import LocalFS + + >>> client = LocalFS() + >>> subdirs, files = client.ls_dir("./") - client = LocalFS() - subdirs, files = client.ls_dir("./") """ def ls_dir(self, fs_path): @@ -137,10 +139,12 @@ def ls_dir(self, fs_path): Examples: .. code-block:: python - from paddle.distributed.fleet.utils import LocalFS + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import LocalFS + + >>> client = LocalFS() + >>> subdirs, files = client.ls_dir("./") - client = LocalFS() - subdirs, files = client.ls_dir("./") """ if not self.is_exist(fs_path): return [], [] @@ -165,11 +169,13 @@ def mkdirs(self, fs_path): Examples: .. code-block:: python - from paddle.distributed.fleet.utils import LocalFS + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import LocalFS + + >>> client = LocalFS() + >>> client.mkdirs("test_mkdirs") + >>> client.delete("test_mkdirs") - client = LocalFS() - client.mkdirs("test_mkdirs") - client.delete("test_mkdirs") """ assert not os.path.isfile(fs_path), f"{fs_path} is already a file" os.makedirs(fs_path, exist_ok=True) @@ -185,15 +191,20 @@ def rename(self, fs_src_path, fs_dst_path): Examples: .. code-block:: python - from paddle.distributed.fleet.utils import LocalFS + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import LocalFS + + >>> client = LocalFS() + >>> client.touch("test_rename_src") + >>> print(client.is_exist("test_rename_src")) + True + >>> client.rename("test_rename_src", "test_rename_dst") + >>> print(client.is_exist("test_rename_src")) + False + >>> print(client.is_exist("test_rename_dst")) + True + >>> client.delete("test_rename_dst") - client = LocalFS() - client.touch("test_rename_src") - print(client.is_exists("test_rename_src")) # True - client.rename("test_rename_src", "test_rename_dst") - print(client.is_exists("test_rename_src")) # False - print(client.is_exists("test_rename_dst")) # True - client.delete("test_rename_dst") """ os.rename(fs_src_path, fs_dst_path) @@ -213,11 +224,13 @@ def delete(self, fs_path): Examples: .. code-block:: python - from paddle.distributed.fleet.utils import LocalFS + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import LocalFS + + >>> client = LocalFS() + >>> client.mkdirs("test_localFS_mkdirs") + >>> client.delete("test_localFS_mkdirs") - client = LocalFS() - client.mkdirs("test_localFS_mkdirs") - client.delete("test_localFS_mkdirs") """ if not self.is_exist(fs_path): return @@ -243,12 +256,15 @@ def is_file(self, fs_path): Examples: .. 
code-block:: python - from paddle.distributed.fleet.utils import LocalFS + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import LocalFS + + >>> client = LocalFS() + >>> client.touch("test_is_file") + >>> print(client.is_file("test_is_file")) + True + >>> client.delete("test_is_file") - client = LocalFS() - client.touch("test_is_file") - print(client.is_file("test_is_file")) # True - client.delete("test_is_file") """ return os.path.isfile(fs_path) @@ -265,12 +281,15 @@ def is_dir(self, fs_path): Examples: .. code-block:: python - from paddle.distributed.fleet.utils import LocalFS + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import LocalFS + + >>> client = LocalFS() + >>> client.mkdirs("test_is_dir") + >>> print(client.is_dir("test_is_dir")) + True + >>> client.delete("test_is_dir") - client = LocalFS() - client.mkdirs("test_is_dir") - print(client.is_dir("test_is_file")) # True - client.delete("test_is_dir") """ return os.path.isdir(fs_path) @@ -288,10 +307,12 @@ def is_exist(self, fs_path): Examples: .. code-block:: python - from paddle.distributed.fleet.utils import LocalFS + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import LocalFS + + >>> local_fs = LocalFS() + >>> ret = local_fs.is_exist("test_is_exist") - client = LocalFS() - ret = local_fs.is_exist("test_is_exist") """ return os.path.exists(fs_path) @@ -307,11 +328,13 @@ def touch(self, fs_path, exist_ok=True): Examples: .. code-block:: python - from paddle.distributed.fleet.utils import LocalFS + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import LocalFS + + >>> client = LocalFS() + >>> client.touch("test_touch") + >>> client.delete("test_touch") - client = LocalFS() - client.touch("test_touch") - client.delete("test_touch") """ if self.is_exist(fs_path): if exist_ok: @@ -332,12 +355,14 @@ def mv(self, src_path, dst_path, overwrite=False, test_exists=False): Examples: .. code-block:: python - from paddle.distributed.fleet.utils import LocalFS + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import LocalFS + + >>> client = LocalFS() + >>> client.touch("test_mv_src") + >>> client.mv("test_mv_src", "test_mv_dst") + >>> client.delete("test_mv_dst") - client = LocalFS() - client.touch("test_mv_src") - client.mv("test_mv_src", "test_mv_dst") - client.delete("test_mv_dst") """ if not self.is_exist(src_path): raise FSFileNotExistsError @@ -363,10 +388,12 @@ def list_dirs(self, fs_path): Examples: .. code-block:: python - from paddle.distributed.fleet.utils import LocalFS + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import LocalFS + + >>> client = LocalFS() + >>> subdirs = client.list_dirs("./") - client = LocalFS() - subdirs = client.list_dirs("./") """ if not self.is_exist(fs_path): return [] @@ -428,18 +455,21 @@ class HDFSClient(FS): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import HDFSClient + >>> hadoop_home = "/home/client/hadoop-client/hadoop/" - from paddle.distributed.fleet.utils import HDFSClient - hadoop_home = "/home/client/hadoop-client/hadoop/" + >>> configs = { + ... "fs.default.name": "hdfs://xxx.hadoop.com:54310", + ... "hadoop.job.ugi": "hello,hello123" + ... 
} - configs = { - "fs.default.name": "hdfs://xxx.hadoop.com:54310", - "hadoop.job.ugi": "hello,hello123" - } + >>> client = HDFSClient(hadoop_home, configs) + >>> client.ls_dir("hdfs:/test_hdfs_client") + ([], []) - client = HDFSClient(hadoop_home, configs) - client.ls_dir("hdfs:/test_hdfs_client") """ def __init__( @@ -496,18 +526,20 @@ def list_dirs(self, fs_path): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import HDFSClient - from paddle.distributed.fleet.utils import HDFSClient + >>> hadoop_home = "/home/client/hadoop-client/hadoop/" + >>> configs = { + ... "fs.default.name": "hdfs://xxx.hadoop.com:54310", + ... "hadoop.job.ugi": "hello,hello123" + ... } - hadoop_home = "/home/client/hadoop-client/hadoop/" - configs = { - "fs.default.name": "hdfs://xxx.hadoop.com:54310", - "hadoop.job.ugi": "hello,hello123" - } + >>> client = HDFSClient(hadoop_home, configs) + >>> subdirs = client.list_dirs("hdfs:/test_hdfs_client") - client = HDFSClient(hadoop_home, configs) - subdirs = client.list_dirs("hdfs:/test_hdfs_client") """ if not self.is_exist(fs_path): return [] @@ -529,18 +561,20 @@ def ls_dir(self, fs_path): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import HDFSClient - from paddle.distributed.fleet.utils import HDFSClient + >>> hadoop_home = "/home/client/hadoop-client/hadoop/" + >>> configs = { + ... "fs.default.name": "hdfs://xxx.hadoop.com:54310", + ... "hadoop.job.ugi": "hello,hello123" + ... } - hadoop_home = "/home/client/hadoop-client/hadoop/" - configs = { - "fs.default.name": "hdfs://xxx.hadoop.com:54310", - "hadoop.job.ugi": "hello,hello123" - } + >>> client = HDFSClient(hadoop_home, configs) + >>> subdirs, files = client.ls_dir("hdfs:/test_hdfs_client") - client = HDFSClient(hadoop_home, configs) - subdirs, files = client.ls_dir("hdfs:/test_hdfs_client") """ if not self.is_exist(fs_path): return [], [] @@ -590,18 +624,20 @@ def is_dir(self, fs_path): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import HDFSClient - from paddle.distributed.fleet.utils import HDFSClient + >>> hadoop_home = "/home/client/hadoop-client/hadoop/" + >>> configs = { + ... "fs.default.name": "hdfs://xxx.hadoop.com:54310", + ... "hadoop.job.ugi": "hello,hello123" + ... } - hadoop_home = "/home/client/hadoop-client/hadoop/" - configs = { - "fs.default.name": "hdfs://xxx.hadoop.com:54310", - "hadoop.job.ugi": "hello,hello123" - } + >>> client = HDFSClient(hadoop_home, configs) + >>> ret = client.is_dir("hdfs:/test_hdfs_client") - client = HDFSClient(hadoop_home, configs) - ret = client.is_file("hdfs:/test_hdfs_client") """ if not self.is_exist(fs_path): return False @@ -634,18 +670,20 @@ def is_file(self, fs_path): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import HDFSClient - from paddle.distributed.fleet.utils import HDFSClient + >>> hadoop_home = "/home/client/hadoop-client/hadoop/" + >>> configs = { + ... "fs.default.name": "hdfs://xxx.hadoop.com:54310", + ... "hadoop.job.ugi": "hello,hello123" + ... 
} - hadoop_home = "/home/client/hadoop-client/hadoop/" - configs = { - "fs.default.name": "hdfs://xxx.hadoop.com:54310", - "hadoop.job.ugi": "hello,hello123" - } + >>> client = HDFSClient(hadoop_home, configs) + >>> ret = client.is_file("hdfs:/test_hdfs_client") - client = HDFSClient(hadoop_home, configs) - ret = client.is_file("hdfs:/test_hdfs_client") """ if not self.is_exist(fs_path): return False @@ -666,18 +704,20 @@ def is_exist(self, fs_path): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +REQUIRES(env:DITSTRIBUTED) + >>> from paddle.distributed.fleet.utils import HDFSClient - from paddle.distributed.fleet.utils import HDFSClient + >>> hadoop_home = "/home/client/hadoop-client/hadoop/" + >>> configs = { + ... "fs.default.name": "hdfs://xxx.hadoop.com:54310", + ... "hadoop.job.ugi": "hello,hello123" + ... } - hadoop_home = "/home/client/hadoop-client/hadoop/" - configs = { - "fs.default.name": "hdfs://xxx.hadoop.com:54310", - "hadoop.job.ugi": "hello,hello123" - } + >>> client = HDFSClient(hadoop_home, configs) + >>> ret = client.is_exist("hdfs:/test_hdfs_client") - client = HDFSClient(hadoop_home, configs) - ret = client.is_exist("hdfs:/test_hdfs_client") """ cmd = f"test -e {fs_path} " ret, out = self._run_cmd(cmd, redirect_stderr=True, retry_times=1) @@ -718,18 +758,20 @@ def upload(self, local_path, fs_path, multi_processes=5, overwrite=False): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +SKIP('depend on external file') + >>> from paddle.distributed.fleet.utils import HDFSClient - from paddle.distributed.fleet.utils import HDFSClient + >>> hadoop_home = "/home/client/hadoop-client/hadoop/" + >>> configs = { + ... "fs.default.name": "hdfs://xxx.hadoop.com:54310", + ... "hadoop.job.ugi": "hello,hello123" + ... } - hadoop_home = "/home/client/hadoop-client/hadoop/" - configs = { - "fs.default.name": "hdfs://xxx.hadoop.com:54310", - "hadoop.job.ugi": "hello,hello123" - } + >>> client = HDFSClient(hadoop_home, configs) + >>> client.upload("test_hdfs_client", "hdfs:/test_hdfs_client") - client = HDFSClient(hadoop_home, configs) - client.upload("test_hdfs_client", "hdfs:/test_hdfs_client") """ def __subprocess_upload(hdfs_path_single, datas): @@ -808,18 +850,20 @@ def download(self, fs_path, local_path, multi_processes=5, overwrite=False): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +SKIP('depend on external file') + >>> from paddle.distributed.fleet.utils import HDFSClient - from paddle.distributed.fleet.utils import HDFSClient + >>> hadoop_home = "/home/client/hadoop-client/hadoop/" + >>> configs = { + ... "fs.default.name": "hdfs://xxx.hadoop.com:54310", + ... "hadoop.job.ugi": "hello,hello123" + ... } - hadoop_home = "/home/client/hadoop-client/hadoop/" - configs = { - "fs.default.name": "hdfs://xxx.hadoop.com:54310", - "hadoop.job.ugi": "hello,hello123" - } + >>> client = HDFSClient(hadoop_home, configs) + >>> client.download("hdfs:/test_hdfs_client", "./") - client = HDFSClient(hadoop_home, configs) - client.download("hdfs:/test_hdfs_client", "./") """ def __subprocess_download(local_path, datas): @@ -877,18 +921,20 @@ def mkdirs(self, fs_path): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +SKIP('depend on external file') + >>> from paddle.distributed.fleet.utils import HDFSClient - from paddle.distributed.fleet.utils import HDFSClient + >>> hadoop_home = "/home/client/hadoop-client/hadoop/" + >>> configs = { + ... 
"fs.default.name": "hdfs://xxx.hadoop.com:54310", + ... "hadoop.job.ugi": "hello,hello123" + ... } - hadoop_home = "/home/client/hadoop-client/hadoop/" - configs = { - "fs.default.name": "hdfs://xxx.hadoop.com:54310", - "hadoop.job.ugi": "hello,hello123" - } + >>> client = HDFSClient(hadoop_home, configs) + >>> client.mkdirs("hdfs:/test_hdfs_client") - client = HDFSClient(hadoop_home, configs) - client.mkdirs("hdfs:/test_hdfs_client") """ if self.is_exist(fs_path): return @@ -923,18 +969,20 @@ def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +SKIP('depend on external file') + >>> from paddle.distributed.fleet.utils import HDFSClient - from paddle.distributed.fleet.utils import HDFSClient + >>> hadoop_home = "/home/client/hadoop-client/hadoop/" + >>> configs = { + ... "fs.default.name": "hdfs://xxx.hadoop.com:54310", + ... "hadoop.job.ugi": "hello,hello123" + ... } - hadoop_home = "/home/client/hadoop-client/hadoop/" - configs = { - "fs.default.name": "hdfs://xxx.hadoop.com:54310", - "hadoop.job.ugi": "hello,hello123" - } + >>> client = HDFSClient(hadoop_home, configs) + >>> client.mv("hdfs:/test_hdfs_client", "hdfs:/test_hdfs_client2") - client = HDFSClient(hadoop_home, configs) - client.mv("hdfs:/test_hdfs_client", "hdfs:/test_hdfs_client2") """ if overwrite and self.is_exist(fs_dst_path): self.delete(fs_dst_path) @@ -983,18 +1031,20 @@ def delete(self, fs_path): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import HDFSClient - from paddle.distributed.fleet.utils import HDFSClient + >>> hadoop_home = "/home/client/hadoop-client/hadoop/" + >>> configs = { + ... "fs.default.name": "hdfs://xxx.hadoop.com:54310", + ... "hadoop.job.ugi": "hello,hello123" + ... } - hadoop_home = "/home/client/hadoop-client/hadoop/" - configs = { - "fs.default.name": "hdfs://xxx.hadoop.com:54310", - "hadoop.job.ugi": "hello,hello123" - } + >>> client = HDFSClient(hadoop_home, configs) + >>> client.delete("hdfs:/test_hdfs_client") - client = HDFSClient(hadoop_home, configs) - client.delete("hdfs:/test_hdfs_client") """ if not self.is_exist(fs_path): return @@ -1016,18 +1066,20 @@ def touch(self, fs_path, exist_ok=True): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +SKIP('depend on external file') + >>> from paddle.distributed.fleet.utils import HDFSClient - from paddle.distributed.fleet.utils import HDFSClient + >>> hadoop_home = "/home/client/hadoop-client/hadoop/" + >>> configs = { + ... "fs.default.name": "hdfs://xxx.hadoop.com:54310", + ... "hadoop.job.ugi": "hello,hello123" + ... } - hadoop_home = "/home/client/hadoop-client/hadoop/" - configs = { - "fs.default.name": "hdfs://xxx.hadoop.com:54310", - "hadoop.job.ugi": "hello,hello123" - } + >>> client = HDFSClient(hadoop_home, configs) + >>> client.touch("hdfs:/test_hdfs_client") - client = HDFSClient(hadoop_home, configs) - client.touch("hdfs:/test_hdfs_client") """ if self.is_exist(fs_path): if exist_ok: @@ -1058,18 +1110,21 @@ def cat(self, fs_path=None): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> from paddle.distributed.fleet.utils import HDFSClient - from paddle.distributed.fleet.utils import HDFSClient + >>> hadoop_home = "/home/client/hadoop-client/hadoop/" + >>> configs = { + ... "fs.default.name": "hdfs://xxx.hadoop.com:54310", + ... 
"hadoop.job.ugi": "hello,hello123" + ... } - hadoop_home = "/home/client/hadoop-client/hadoop/" - configs = { - "fs.default.name": "hdfs://xxx.hadoop.com:54310", - "hadoop.job.ugi": "hello,hello123" - } + >>> client = HDFSClient(hadoop_home, configs) + >>> client.cat("hdfs:/test_hdfs_client") + '' - client = HDFSClient(hadoop_home, configs) - client.cat("hdfs:/test_hdfs_client") """ if self.is_file(fs_path): output = self._try_cat(fs_path) @@ -1151,12 +1206,15 @@ class AFSClient(FS): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +SKIP('depend on WITH_PSLIB') + >>> from paddle.distributed.fleet.utils.fs import AFSClient + + >>> client = AFSClient() + >>> client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + >>> client.ls_dir("hdfs:/test_hdfs_client") - from paddle.distributed.fleet.utils import AFSClient - client = AFSClient() - client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") - client.ls_dir("hdfs:/test_hdfs_client") """ def __init__(self, time_out=5 * 60 * 1000, sleep_inter=1000): # ms # ms @@ -1178,13 +1236,15 @@ def list_dirs(self, fs_path): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +SKIP('depend on WITH_PSLIB') + >>> from paddle.distributed.fleet.utils.fs import AFSClient - from paddle.distributed.fleet.utils import AFSClient + >>> client = AFSClient() + >>> client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + >>> subdirs = client.list_dirs("hdfs:/test_hdfs_client") - client = AFSClient() - client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") - subdirs = client.list_dirs("hdfs:/test_hdfs_client") """ if not self.is_exist(fs_path): return [] @@ -1205,13 +1265,15 @@ def ls_dir(self, fs_path): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +SKIP('depend on WITH_PSLIB') + >>> from paddle.distributed.fleet.utils.fs import AFSClient - from paddle.distributed.fleet.utils import AFSClient + >>> client = AFSClient() + >>> client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + >>> subdirs, files = client.ls_dir("hdfs:/test_hdfs_client") - client = AFSClient() - client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") - subdirs, files = client.ls_dir("hdfs:/test_hdfs_client") """ if not self.is_exist(fs_path): return [], [] @@ -1235,13 +1297,15 @@ def is_dir(self, fs_path): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +SKIP('depend on WITH_PSLIB') + >>> from paddle.distributed.fleet.utils.fs import AFSClient - from paddle.distributed.fleet.utils import AFSClient + >>> client = AFSClient() + >>> client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + >>> ret = client.is_dir("hdfs:/test_hdfs_client") - client = AFSClient() - client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") - ret = client.is_file("hdfs:/test_hdfs_client") """ if not self.is_exist(fs_path): return False @@ -1267,13 +1331,15 @@ def is_file(self, fs_path): Examples: - .. code-block:: text + .. 
code-block:: python + + >>> # doctest: +SKIP('depend on WITH_PSLIB') + >>> from paddle.distributed.fleet.utils.fs import AFSClient - from paddle.distributed.fleet.utils import AFSClient + >>> client = AFSClient() + >>> client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + >>> ret = client.is_file("hdfs:/test_hdfs_client") - client = AFSClient() - client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") - ret = client.is_file("hdfs:/test_hdfs_client") """ if not self.is_exist(fs_path): return False @@ -1293,13 +1359,15 @@ def is_exist(self, fs_path): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +SKIP('depend on WITH_PSLIB') + >>> from paddle.distributed.fleet.utils.fs import AFSClient - from paddle.distributed.fleet.utils import AFSClient + >>> client = AFSClient() + >>> client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + >>> ret = client.is_exist("hdfs:/test_hdfs_client") - client = AFSClient() - client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") - ret = client.is_exist("hdfs:/test_hdfs_client") """ return self._fs.exist(fs_path) @@ -1335,13 +1403,15 @@ def upload(self, local_path, fs_path, multi_processes=1, overwrite=False): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +SKIP('depend on WITH_PSLIB') + >>> from paddle.distributed.fleet.utils.fs import AFSClient - from paddle.distributed.fleet.utils import AFSClient + >>> client = AFSClient() + >>> client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + >>> client.upload("test_hdfs_client", "hdfs:/test_hdfs_client") - client = AFSClient() - client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") - client.upload("test_hdfs_client", "hdfs:/test_hdfs_client") """ local = LocalFS() @@ -1362,13 +1432,15 @@ def download(self, fs_path, local_path, multi_processes=1, overwrite=False): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +SKIP('depend on WITH_PSLIB') + >>> from paddle.distributed.fleet.utils.fs import AFSClient - from paddle.distributed.fleet.utils import AFSClient + >>> client = AFSClient() + >>> client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + >>> client.download("hdfs:/test_hdfs_client", "./") - client = AFSClient() - client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") - client.download("hdfs:/test_hdfs_client", "./") """ def __subprocess_download(local_path, datas): @@ -1411,13 +1483,15 @@ def mkdirs(self, fs_path): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +SKIP('depend on WITH_PSLIB') + >>> from paddle.distributed.fleet.utils.fs import AFSClient - from paddle.distributed.fleet.utils import AFSClient + >>> client = AFSClient() + >>> client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + >>> client.mkdirs("hdfs:/test_hdfs_client") - client = AFSClient() - client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") - client.mkdirs("hdfs:/test_hdfs_client") """ if self.is_exist(fs_path): return @@ -1435,13 +1509,15 @@ def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True): Examples: - .. code-block:: text + .. 
code-block:: python + + >>> # doctest: +SKIP('depend on WITH_PSLIB') + >>> from paddle.distributed.fleet.utils.fs import AFSClient - from paddle.distributed.fleet.utils import AFSClient + >>> client = AFSClient() + >>> client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + >>> client.mv("hdfs:/test_hdfs_client", "hdfs:/test_hdfs_client2") - client = AFSClient() - client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") - client.mv("hdfs:/test_hdfs_client", "hdfs:/test_hdfs_client2") """ if overwrite and self.is_exist(fs_dst_path): self.delete(fs_dst_path) @@ -1464,15 +1540,16 @@ def delete(self, fs_path): Examples: - .. code-block:: text + .. code-block:: python + - from paddle.distributed.fleet.utils import HDFSClient + >>> # doctest: +SKIP('depend on WITH_PSLIB') + >>> from paddle.distributed.fleet.utils.fs import AFSClient - from paddle.distributed.fleet.utils import AFSClient + >>> client = AFSClient() + >>> client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + >>> client.delete("hdfs:/test_hdfs_client") - client = AFSClient() - client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") - client.delete("hdfs:/test_hdfs_client") """ if not self.is_exist(fs_path): return @@ -1489,13 +1566,15 @@ def touch(self, fs_path, exist_ok=True): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +SKIP('depend on WITH_PSLIB') + >>> from paddle.distributed.fleet.utils.fs import AFSClient - from paddle.distributed.fleet.utils import AFSClient + >>> client = AFSClient() + >>> client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + >>> client.touch("hdfs:/test_hdfs_client") - client = AFSClient() - client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") - client.touch("hdfs:/test_hdfs_client") """ if self.is_exist(fs_path): if exist_ok: @@ -1519,13 +1598,15 @@ def cat(self, fs_path=None): Examples: - .. code-block:: text + .. code-block:: python + + >>> # doctest: +SKIP('depend on WITH_PSLIB') + >>> from paddle.distributed.fleet.utils.fs import AFSClient - from paddle.distributed.fleet.utils import AFSClient + >>> client = AFSClient() + >>> client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + >>> client.cat("hdfs:/test_hdfs_client") - client = AFSClient() - client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") - client.cat("hdfs:/test_hdfs_client") """ if self.is_file(fs_path): return self._fs.cat(fs_path) diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py index 9c44fc49fff672..9170754bb78ff8 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py @@ -42,145 +42,123 @@ class HybridParallelInferenceHelper: Write Paradigm: - .. 
code-block:: bash - :name: bash-example1 - - # while op pattern - with paddle.base.device_guard(f'{device}:all'): - # init global cond - max_len = paddle.full(shape=[1], dtype="int64", fill_value=10) - step_idx = paddle.full(shape=[1], dtype="int64", fill_value=0) - cond_int = paddle.full(shape=[1], dtype="int64", fill_value=0, name="cond_int") - cond = layers.cast(step_idx < max_len, dtype="bool") - while_op = layers.While(cond, is_test=True) - - # init global lod_tensor_array for generation task - arr = paddle.tensor.array_write(data, step_idx) - - with while_op.block(): - with paddle.base.device_guard(f'{device}:all'): - # read data from global lod_tensor_array - element_in_arr = paddle.tensor.array_read(array=arr, i=step_idx) - # write placehold data to global lod_tensor_array, - # it need for send_v2 of lod_tensor_array - paddle.increment(x=step_idx, value=1.0) - paddle.tensor.array_write(element_in_arr, i=step_idx, array=arr) - - with paddle.base.device_guard(f'{device}:0'): - ... some code - - with paddle.base.device_guard(f'{device}:1'): - ... some code - - with paddle.base.device_guard(f'{device}:{num_pp-1}'): - # generate some data in while block and write to global lod_tensor_array - # that they are read in next while step. - # we will using send_v2 to send global lod_tensor_array to other pipeline and sync - paddle.tensor.array_write(other_var, i=step_idx, array=arr) - - # update cond and assign to cond_int, we will sync cond_int - layers.assign(layers.cast(cond, dtype="int32"), cond_int) - - with paddle.base.device_guard(f'{model._device}:all'): - # the code below must at end of while block and exists in device:all - layers.assign(layers.cast(cond_int, dtype='bool'), cond) - - with paddle.base.device_guard(f'{model._device}:all'): - # use a empty lod_tensor_array to clear lod_tensor_array - layers.assign(layers.create_array(data.dtype), arr) - + .. code-block:: text + :name: text-example1 + + >>> # doctest: +REQUIRES(env:DISTRIBUTED, env:GPU) + >>> import paddle + >>> # while op pattern + >>> with paddle.base.device_guard(f'{device}:all'): + ... # init global cond + ... max_len = paddle.full(shape=[1], dtype="int64", fill_value=10) + ... step_idx = paddle.full(shape=[1], dtype="int64", fill_value=0) + ... cond_int = paddle.full(shape=[1], dtype="int64", fill_value=0, name="cond_int") + ... cond = layers.cast(step_idx < max_len, dtype="bool") + ... while_op = layers.While(cond, is_test=True) + + ... # init global lod_tensor_array for generation task + ... arr = paddle.tensor.array_write(data, step_idx) + + >>> with while_op.block(): + ... with paddle.base.device_guard(f'{device}:all'): + ... # read data from global lod_tensor_array + ... element_in_arr = paddle.tensor.array_read(array=arr, i=step_idx) + ... # write placeholder data to global lod_tensor_array, + ... # it is needed for send_v2 of lod_tensor_array + ... paddle.increment(x=step_idx, value=1.0) + ... paddle.tensor.array_write(element_in_arr, i=step_idx, array=arr) + ... with paddle.base.device_guard(f'{device}:0'): + ... pass # some code + ... with paddle.base.device_guard(f'{device}:1'): + ... pass # some code + ... with paddle.base.device_guard(f'{device}:{num_pp-1}'): + ... # generate some data in the while block and write it to the global lod_tensor_array + ... # so that it can be read in the next while step. + ... # we will use send_v2 to send the global lod_tensor_array to the other pipeline stages and sync + ... paddle.tensor.array_write(other_var, i=step_idx, array=arr) + ... # update cond and assign to cond_int, we will sync cond_int + ... 
layers.assign(layers.cast(cond, dtype="int32"), cond_int) + ... with paddle.base.device_guard(f'{model._device}:all'): + ... # the code below must be at the end of the while block and exist in device:all + ... layers.assign(layers.cast(cond_int, dtype='bool'), cond) + + >>> with paddle.base.device_guard(f'{model._device}:all'): + ... # use an empty lod_tensor_array to clear lod_tensor_array + ... layers.assign(layers.create_array(data.dtype), arr) Examples: - .. code-block:: python - :name: code-example1 - - # required: distributed - import os - import numpy as np - import paddle - import paddle.base.layers as layers - import paddle.distributed.fleet as fleet - paddle.enable_static() - - nranks = int(os.getenv("PADDLE_TRAINERS_NUM", 1)) - rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) - dev_id = int(os.getenv("FLAGS_selected_gpus", 0)) - - main_program = paddle.static.Program() - startup_program = paddle.static.Program() - - if nranks > 1: - dist_strategy = fleet.DistributedStrategy() - dist_strategy.without_graph_optimization = True - fleet.init(is_collective=True, strategy=dist_strategy) - - device = "gpu" - - with paddle.static.program_guard(main_program, startup_program): - with paddle.base.device_guard(f'{device}:0'): - X = paddle.static.data(name='X', shape=[None, 2], dtype='float32') - - with paddle.base.device_guard(f'{device}:all'): - max_len = paddle.full( - shape=[1], dtype="int64", fill_value=5, name="n") - step_idx = paddle.full( - shape=[1], dtype="int64", fill_value=0, name="i") - - data = paddle.tensor.array_write(X, step_idx) - - cond_int = paddle.full(shape=[1], dtype="int64", fill_value=0, name="cond_int") - cond = paddle.less_than(x=step_idx, y=max_len) - while_op = layers.While(cond, is_test=True) - - with while_op.block(): - with paddle.base.device_guard(f'{device}:all'): - input = paddle.tensor.array_read(array=data, i=step_idx) - paddle.increment(x=step_idx, value=1.0) - paddle.tensor.array_write(input, i=step_idx, array=data) - - with paddle.base.device_guard(f'{device}:0'): - param_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) - weight1 = paddle.static.create_parameter( - shape=[2, 5], dtype='float32', attr=param_attr, is_bias=False) - hidden1 = paddle.matmul(input, weight1) - - with paddle.base.device_guard(f'{device}:1'): - param_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(2.0)) - weight2 = paddle.static.create_parameter( - shape=[5, 2], dtype='float32', attr=param_attr, is_bias=False) - hidden2 = paddle.matmul(hidden1, weight2) - - paddle.tensor.array_write(hidden2, i=step_idx, array=data) - - # update cond and assign to cond_int, we will sync cond_int - paddle.assign(paddle.less_than(x=step_idx, y=max_len), cond) - layers.assign(layers.cast(cond, dtype="int32"), cond_int) - - with paddle.base.device_guard(f'{device}:all'): - # the code below must at end of while block and exists in device:all - layers.assign(layers.cast(cond_int, dtype='bool'), cond) - - with paddle.base.device_guard(f'{device}:all'): - out = layers.create_array(data.dtype) - layers.assign(data, out) - - with paddle.base.device_guard(f'{device}:all'): - # use a empty lod_tensor_array to clear lod_tensor_array - layers.assign(layers.create_array(data.dtype), data) - - helper = fleet.HybridParallelInferenceHelper(startup_program, main_program, micro_batch_size=2, num_pp=2, init_comm=nranks>1) - helper.gen_infer_program(['array_write_0.out'], ['cond_int.tmp_0']) - - exe = paddle.static.Executor(paddle.CUDAPlace(dev_id)) - exe.run(startup_program) - 
np.random.seed(2333) - for step in range(5): - init_data = np.random.uniform(low=0.0, high=1.0, size=[2, 2]).astype('float32') - [res] = exe.run(main_program, feed={"X": init_data}, fetch_list=[out]) - print('-------- step', step, ' --------') - print(res) + .. code-block:: python + :name: code-example1 + + >>> # doctest: +REQUIRES(env:DISTRIBUTED, env:GPU) + >>> import os + >>> import numpy as np + >>> import paddle + >>> import paddle.distributed.fleet as fleet + >>> from paddle.distributed.fleet.utils import hybrid_parallel_inference + >>> paddle.enable_static() + >>> nranks = int(os.getenv("PADDLE_TRAINERS_NUM", 1)) + >>> rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) + >>> dev_id = int(os.getenv("FLAGS_selected_gpus", 0)) + >>> main_program = paddle.static.Program() + >>> startup_program = paddle.static.Program() + >>> if nranks > 1: + ... dist_strategy = fleet.DistributedStrategy() + ... dist_strategy.without_graph_optimization = True + ... fleet.init(is_collective=True, strategy=dist_strategy) + >>> device = "gpu" + >>> with paddle.static.program_guard(main_program, startup_program): + ... with paddle.base.device_guard(f'{device}:0'): + ... X = paddle.static.data(name='X', shape=[None, 2], dtype='float32') + ... with paddle.base.device_guard(f'{device}:all'): + ... max_len = paddle.full( + ... shape=[1], dtype="int64", fill_value=5, name="n") + ... step_idx = paddle.full( + ... shape=[1], dtype="int64", fill_value=0, name="i") + ... data = paddle.tensor.array_write(X, step_idx) + ... cond_int = paddle.full(shape=[1], dtype="int64", fill_value=0, name="cond_int") + ... cond = paddle.less_than(x=step_idx, y=max_len) + ... while_op = paddle.static.nn.control_flow.While(cond, is_test=True) + ... with while_op.block(): + ... with paddle.base.device_guard(f'{device}:all'): + ... input = paddle.tensor.array_read(array=data, i=step_idx) + ... paddle.increment(x=step_idx, value=1.0) + ... paddle.tensor.array_write(input, i=step_idx, array=data) + ... with paddle.base.device_guard(f'{device}:0'): + ... param_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + ... weight1 = paddle.static.create_parameter( + ... shape=[2, 5], dtype='float32', attr=param_attr, is_bias=False) + ... hidden1 = paddle.matmul(input, weight1) + ... with paddle.base.device_guard(f'{device}:1'): + ... param_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(2.0)) + ... weight2 = paddle.static.create_parameter( + ... shape=[5, 2], dtype='float32', attr=param_attr, is_bias=False) + ... hidden2 = paddle.matmul(hidden1, weight2) + ... paddle.tensor.array_write(hidden2, i=step_idx, array=data) + ... # update cond and assign to cond_int, we will sync cond_int + ... paddle.assign(paddle.less_than(x=step_idx, y=max_len), cond) + ... paddle.assign(paddle.cast(cond, dtype="int32"), cond_int) + ... with paddle.base.device_guard(f'{device}:all'): + ... # the code below must be at the end of the while block and exist in device:all + ... paddle.assign(paddle.cast(cond_int, dtype='bool'), cond) + ... with paddle.base.device_guard(f'{device}:all'): + ... out = paddle.tensor.create_array(data.dtype) + ... paddle.assign(data, out) + ... with paddle.base.device_guard(f'{device}:all'): + ... # use an empty lod_tensor_array to clear lod_tensor_array + ... 
paddle.assign(paddle.tensor.create_array(data.dtype), data) + >>> helper = hybrid_parallel_inference.HybridParallelInferenceHelper(startup_program, main_program, micro_batch_size=2, num_pp=2, init_comm=nranks>1) + >>> helper.gen_infer_program(['array_write_0.out'], ['cond_int.tmp_0']) + >>> exe = paddle.static.Executor(paddle.CUDAPlace(dev_id)) + >>> exe.run(startup_program) + >>> np.random.seed(2333) + >>> for step in range(5): + ... init_data = np.random.uniform(low=0.0, high=1.0, size=[2, 2]).astype('float32') + ... [res] = exe.run(main_program, feed={"X": init_data}, fetch_list=[out]) + ... print('-------- step', step, ' --------') + ... print(res) + """ def __init__( diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index c68dfeefd2c600..86194c66016b29 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -252,7 +252,7 @@ def fused_allreduce_gradients(parameter_list, hcg): scale = 1.0 if dp_enabled: group = hcg.get_data_parallel_group() - scale = group.nranks + scale = scale / group.nranks if sep_enabled: sep_group = hcg.get_sep_parallel_group() dp_sep_group = hcg.get_dp_sep_parallel_group() diff --git a/python/paddle/distributed/fleet/utils/mix_precision_utils.py b/python/paddle/distributed/fleet/utils/mix_precision_utils.py index f6b04bbfda011e..e779d41b8f3faa 100644 --- a/python/paddle/distributed/fleet/utils/mix_precision_utils.py +++ b/python/paddle/distributed/fleet/utils/mix_precision_utils.py @@ -28,6 +28,7 @@ obtain_optimizer_parameters_list, ) from paddle.framework import core +from paddle.utils import deprecated class MixPrecisionLayer(nn.Layer): @@ -232,6 +233,11 @@ def unscale_method(self, optimizer): self._found_inf = int(is_found_inf) +@deprecated( + since="2.5.0", + update_to="paddle.distributed_scaler", + level=1, +) class MixPrecisionScaler: def __init__(self, scaler): self._inner_scaler = scaler diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index 0a7dff06dc227c..1fc2e6713e1b63 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -527,6 +527,9 @@ def launch(): # build AutoTuner to get new config auto_tuner = AutoTuner(tuner_cfg) + logger.info( + f"Launch {len(auto_tuner.algo.all_tasks)} tasks by auto tuner: " + ) cur_cfg = auto_tuner.search_once() auto_tuner.add_cfg(cur_cfg) assert cur_cfg is not None, "No config can run." @@ -557,7 +560,9 @@ def launch(): cur_cfg["acc_steps"], ) - ctx.args.log_dir = log_dir + ctx.args.log_dir = os.path.join( + os.path.dirname(ctx.args.auto_tuner_json), log_dir + ) # every task has own job id job_id += 1 @@ -693,6 +698,7 @@ def launch(): # record history cur_cfg['job_id'] = job_id recorder.add_cfg(**cur_cfg) + recorder.store_history(history_file_path) cur_best_cfgs, err = recorder.get_best( metric=tuner_cfg['metric_cfg']['name'], direction=tuner_cfg['metric_cfg']['OptimizationDirection'], @@ -700,7 +706,6 @@ def launch(): if not err: ctx.logger.info(f"Current best config: {cur_best_cfgs}") logger.info(f"Current best config: {cur_best_cfgs}") - recorder.store_history(history_file_path) else: ctx.logger.info( "Get best config failed. Currently there are no appropriate configs." 
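The one-line `fused_allreduce_gradients` fix above is easy to miss: the old `scale = group.nranks` both inverted the intended factor and clobbered any scaling accumulated before it, while `scale = scale / group.nranks` lets each enabled parallel group contribute a multiplicative `1/nranks`. A toy sketch of the intended composition; the group sizes are illustrative numbers rather than real `hcg` queries, and the sep branch is assumed to divide the running scale the same way:

    def gradient_scale(dp_nranks=None, sep_nranks=None):
        # Each enabled group contributes a 1/nranks factor, so an all-reduce
        # (sum) over the combined group yields an averaged gradient.
        scale = 1.0
        if dp_nranks:
            scale = scale / dp_nranks
        if sep_nranks:
            scale = scale / sep_nranks
        return scale

    assert gradient_scale(dp_nranks=4) == 0.25
    assert gradient_scale(dp_nranks=4, sep_nranks=2) == 0.125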
@@ -795,7 +800,9 @@ def launch():
ctx.args.job_id = "best_cfg"
ctx.logger.info(f"Launch best cfg from auto tuner: {best_cfg}")
logger.info(f"Launch best cfg from auto tuner: {best_cfg}")
- ctx.args.log_dir = "best_cfg"
+ ctx.args.log_dir = os.path.join(
+ os.path.dirname(ctx.args.auto_tuner_json), "best_cfg"
+ )
# run best cfg
c = controllers.init(ctx)
c.run()
diff --git a/python/paddle/distributed/launch/utils/nvsmi.py b/python/paddle/distributed/launch/utils/nvsmi.py
index da446006154587..0c51456bf1204f 100644
--- a/python/paddle/distributed/launch/utils/nvsmi.py
+++ b/python/paddle/distributed/launch/utils/nvsmi.py
@@ -16,6 +16,9 @@
import os
import shutil
import subprocess
+import time
+
+import paddle
class Info:
@@ -73,6 +76,39 @@ def query_smi(query=None, query_type="gpu", index=None, dtype=None):
return ret
+
+def query_rocm_smi(query=None, index=None, dtype=None, mem=32150):
+ if not has_rocm_smi():
+ return []
+
+ cmd = ["rocm-smi"]
+
+ if not isinstance(dtype, list) or len(dtype) != len(query):
+ dtype = [str] * len(query)
+
+ output = subprocess.check_output(cmd, timeout=3)
+ lines = output.decode("utf-8").split(os.linesep)
+ ret = []
+ for line in lines:
+ if not line:
+ continue
+ if len(line.split()) != 8 or "DCU" in line.split():
+ continue
+ info = Info()
+ line = line.split()
+ line = [
+ line[0],
+ line[7][: len(line[7]) - 1],
+ mem,
+ mem * float(line[6][: len(line[6]) - 1]) / 100,
+ mem - mem * float(line[6][: len(line[6]) - 1]) / 100,
+ time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
+ ]
+ for k, v, d in zip(query, line, dtype):
+ setattr(info, k.replace(".", "_"), d(v))
+ ret.append(info)
+ return ret
+
+
def get_gpu_info(index=None):
q = "index,uuid,driver_version,name,gpu_serial,display_active,display_mode".split(
","
)
@@ -97,7 +133,8 @@ def get_gpu_util(index=None):
if index is None or isinstance(index, list)
else str(index).split(",")
)
-
+ if paddle.device.is_compiled_with_rocm():
+ return query_rocm_smi(q, index=index, dtype=d)
return query_smi(q, index=index, dtype=d)
@@ -117,6 +154,10 @@ def has_nvidia_smi():
return shutil.which("nvidia-smi")
+
+def has_rocm_smi():
+ return shutil.which("rocm-smi")
+
+
if __name__ == '__main__':
print(get_gpu_info(0))
print(get_gpu_util(0))
diff --git a/python/paddle/distributed/metric/__init__.py b/python/paddle/distributed/metric/__init__.py
index f87fe885824b80..e2d49af681961d 100644
--- a/python/paddle/distributed/metric/__init__.py
+++ b/python/paddle/distributed/metric/__init__.py
@@ -12,5 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from .metrics import init_metric # noqa: F401 -from .metrics import print_auc # noqa: F401 +from .metrics import init_metric, print_auc # noqa: F401 diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 3815d0f475fbed..8890ab0bd179ae 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -19,8 +19,7 @@ import warnings from collections import OrderedDict, namedtuple from contextlib import contextmanager -from multiprocessing import Manager # noqa: F401 -from multiprocessing import Process # noqa: F401 +from multiprocessing import Manager, Process import numpy as np @@ -43,7 +42,7 @@ _get_global_group, is_initialized, ) -from paddle.distributed.fleet.base.private_helper_function import ( # noqa: F401 +from paddle.distributed.fleet.base.private_helper_function import ( wait_server_ready, ) from paddle.distributed.fleet.launch_utils import check_backend @@ -248,50 +247,43 @@ class DataParallel(layers.Layer): .. code-block:: python :name: dp-example - # required: distributed - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - import paddle.distributed as dist - - class LinearNet(nn.Layer): - def __init__(self): - super().__init__() - self._linear1 = nn.Linear(10, 10) - self._linear2 = nn.Linear(10, 1) - - def forward(self, x): - return self._linear2(self._linear1(x)) - - def train(): - # 1. initialize parallel environment - dist.init_parallel_env() - - # 2. create data parallel layer & optimizer - layer = LinearNet() - dp_layer = paddle.DataParallel(layer) - - loss_fn = nn.MSELoss() - adam = opt.Adam( - learning_rate=0.001, parameters=dp_layer.parameters()) - - # 3. run layer - inputs = paddle.randn([10, 10], 'float32') - outputs = dp_layer(inputs) - labels = paddle.randn([10, 1], 'float32') - loss = loss_fn(outputs, labels) - - loss.backward() - - adam.step() - adam.clear_grad() - - if __name__ == '__main__': - # 1. start by ``paddle.distributed.spawn`` (default) - dist.spawn(train, nprocs=2) - # 2. start by ``paddle.distributed.launch`` - # train() - + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle + >>> import paddle.nn as nn + >>> import paddle.optimizer as opt + >>> import paddle.distributed as dist + + >>> class LinearNet(nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self._linear1 = nn.Linear(10, 10) + ... self._linear2 = nn.Linear(10, 1) + ... def forward(self, x): + ... return self._linear2(self._linear1(x)) + + >>> def train(): + ... # 1. initialize parallel environment + ... dist.init_parallel_env() + ... # 2. create data parallel layer & optimizer + ... layer = LinearNet() + ... dp_layer = paddle.DataParallel(layer) + ... loss_fn = nn.MSELoss() + ... adam = opt.Adam( + ... learning_rate=0.001, parameters=dp_layer.parameters()) + ... # 3. run layer + ... inputs = paddle.randn([10, 10], 'float32') + ... outputs = dp_layer(inputs) + ... labels = paddle.randn([10, 1], 'float32') + ... loss = loss_fn(outputs, labels) + ... loss.backward() + ... adam.step() + ... adam.clear_grad() + + >>> if __name__ == '__main__': + ... # 1. start by ``paddle.distributed.spawn`` (default) + ... dist.spawn(train, nprocs=2) + ... # 2. start by ``paddle.distributed.launch`` + ... # train() .. note:: ``PyLayer`` is not supported in DataParallel. To solve problems of this kind, @@ -304,58 +296,51 @@ def train(): .. 
code-block:: python :name: dp-pylayer-example - # required: distributed - import numpy - import paddle - import paddle.distributed as dist - from paddle.autograd import PyLayer - from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients - - class cus_tanh(PyLayer): - @staticmethod - def forward(ctx, x): - y = paddle.tanh(x) - ctx.save_for_backward(y) - return y - - @staticmethod - def backward(ctx, dy): - y, = ctx.saved_tensor() - grad = dy * (1 - paddle.square(y)) - return grad - - class SimpleNet(paddle.nn.Layer): - def __init__(self): - super().__init__() - self.linear = paddle.nn.Linear(2, 2) - - def forward(self, inputs): - inputs = cus_tanh.apply(inputs) - return self.linear(inputs) - - if __name__ == '__main__': - dist.init_parallel_env() - - model = SimpleNet() - model = paddle.DataParallel(model) - opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) - - for step in range(10): - x_data = numpy.random.randn(2, 2).astype(numpy.float32) - x = paddle.to_tensor(x_data) - x.stop_gradient = False - - # step 1 : skip gradient synchronization by 'no_sync' - with model.no_sync(): - y_pred = model(x) - loss = y_pred.mean() - loss.backward() - - # step 2 : fuse + allreduce manually before optimization - fused_allreduce_gradients(list(model.parameters()), None) - - opt.step() - opt.clear_grad() + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import numpy + >>> import paddle + >>> import paddle.distributed as dist + >>> from paddle.autograd import PyLayer + >>> from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients + + >>> class cus_tanh(PyLayer): + ... @staticmethod + ... def forward(ctx, x): + ... y = paddle.tanh(x) + ... ctx.save_for_backward(y) + ... return y + ... @staticmethod + ... def backward(ctx, dy): + ... y, = ctx.saved_tensor() + ... grad = dy * (1 - paddle.square(y)) + ... return grad + + >>> class SimpleNet(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.linear = paddle.nn.Linear(2, 2) + ... def forward(self, inputs): + ... inputs = cus_tanh.apply(inputs) + ... return self.linear(inputs) + + >>> if __name__ == '__main__': + ... dist.init_parallel_env() + ... model = SimpleNet() + ... model = paddle.DataParallel(model) + ... opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + ... for step in range(10): + ... x_data = numpy.random.randn(2, 2).astype(numpy.float32) + ... x = paddle.to_tensor(x_data) + ... x.stop_gradient = False + ... # step 1 : skip gradient synchronization by 'no_sync' + ... with model.no_sync(): + ... y_pred = model(x) + ... loss = y_pred.mean() + ... loss.backward() + ... # step 2 : fuse + allreduce manually before optimization + ... fused_allreduce_gradients(list(model.parameters()), None) + ... opt.step() + ... opt.clear_grad() """ @@ -503,32 +488,31 @@ def no_sync(self): Examples: .. code-block:: python - # required: distributed - import paddle - import paddle.nn as nn - import paddle.distributed as dist + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle + >>> import paddle.nn as nn + >>> import paddle.distributed as dist - class SimpleNet(nn.Layer): - def __init__(self): - super().__init__() - self._linear = nn.Linear(10, 1) + >>> class SimpleNet(nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self._linear = nn.Linear(10, 1) + ... def forward(self, x): + ... 
return self._linear(x) - def forward(self, x): - return self._linear(x) + >>> dist.init_parallel_env() + >>> model = SimpleNet() + >>> dp_model = paddle.DataParallel(model) - dist.init_parallel_env() - model = SimpleNet() - dp_model = paddle.DataParallel(model) + >>> inputs_1 = paddle.randn([10, 10], 'float32') + >>> inputs_2 = paddle.ones([10, 10], 'float32') - inputs_1 = paddle.randn([10, 10], 'float32') - inputs_2 = paddle.ones([10, 10], 'float32') + >>> with dp_model.no_sync(): + ... # gradients will not be synchronized + ... dp_model(inputs_1).backward() - with dp_model.no_sync(): - # gradients will not be synchronized - dp_model(inputs_1).backward() - - # synchronization happens here - dp_model(inputs_2).backward() + >>> # synchronization happens here + >>> dp_model(inputs_2).backward() """ tmp_grad_need_sync = self.grad_need_sync @@ -587,16 +571,17 @@ def state_dict( Examples: .. code-block:: python - import paddle - import paddle.distributed as dist + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle + >>> import paddle.distributed as dist - dist.init_parallel_env() + >>> dist.init_parallel_env() - emb = paddle.nn.Embedding(10, 10) - emb = paddle.DataParallel(emb) + >>> emb = paddle.nn.Embedding(10, 10) + >>> emb = paddle.DataParallel(emb) - state_dict = emb.state_dict() - paddle.save(state_dict, "paddle_dy.pdparams") + >>> state_dict = emb.state_dict() + >>> paddle.save(state_dict, "paddle_dy.pdparams") ''' @@ -621,19 +606,20 @@ def set_state_dict(self, state_dict, use_structured_name=True): Examples: .. code-block:: python - import paddle - import paddle.distributed as dist + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle + >>> import paddle.distributed as dist - dist.init_parallel_env() + >>> dist.init_parallel_env() - emb = paddle.nn.Embedding(10, 10) - emb = paddle.DataParallel(emb) + >>> emb = paddle.nn.Embedding(10, 10) + >>> emb = paddle.DataParallel(emb) - state_dict = emb.state_dict() - paddle.save(state_dict, "paddle_dy.pdparams") + >>> state_dict = emb.state_dict() + >>> paddle.save(state_dict, "paddle_dy.pdparams") - para_state_dict = paddle.load("paddle_dy.pdparams") - emb.set_state_dict(para_state_dict) + >>> para_state_dict = paddle.load("paddle_dy.pdparams") + >>> emb.set_state_dict(para_state_dict) ''' @@ -665,32 +651,34 @@ class ParallelEnv: or ``paddle.distributed.spawn`` . Examples: - .. code-block:: python - - import paddle - import paddle.distributed as dist - - def train(): - # 1. initialize parallel environment - dist.init_parallel_env() - - # 2. get current ParallelEnv - parallel_env = dist.ParallelEnv() - print("rank: ", parallel_env.rank) - print("world_size: ", parallel_env.world_size) - - # print result in process 1: - # rank: 1 - # world_size: 2 - # print result in process 2: - # rank: 2 - # world_size: 2 - - if __name__ == '__main__': - # 1. start by ``paddle.distributed.spawn`` (default) - dist.spawn(train, nprocs=2) - # 2. start by ``paddle.distributed.launch`` - # train() + .. code-block:: python + + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle + >>> import paddle.distributed as dist + + >>> def train(): + ... # 1. initialize parallel environment + ... dist.init_parallel_env() + ... # 2. get current ParallelEnv + ... parallel_env = dist.ParallelEnv() + ... print("rank: ", parallel_env.rank) + ... print("world_size: ", parallel_env.world_size) + + >>> if __name__ == '__main__': + ... # 1. start by ``paddle.distributed.spawn`` (default) + ... dist.spawn(train, nprocs=2) + ... # 2. 
start by ``paddle.distributed.launch``
+ ... # train()
+
+ # Print result in process 1:
+ rank: 0
+ world_size: 2
+
+ # Print result in process 2:
+ rank: 1
+ world_size: 2
+
"""
def __init__(self):
@@ -735,14 +723,16 @@ def rank(self):
Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ID`` . The default value is 0.
Examples:
- .. code-block:: python
+ .. code-block:: python
- # execute this command in terminal: export PADDLE_TRAINER_ID=0
- import paddle.distributed as dist
+ >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+ >>> # execute this command in terminal: export PADDLE_TRAINER_ID=0
+ >>> import paddle.distributed as dist
+
+ >>> env = dist.ParallelEnv()
+ >>> print("The rank is %d" % env.rank)
+ The rank is 0
- env = dist.ParallelEnv()
- print("The rank is %d" % env.rank)
- # The rank is 0
"""
return self._rank
@@ -754,14 +744,16 @@ def world_size(self):
Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` . The default value is 1.
Examples:
- .. code-block:: python
+ .. code-block:: python
+
+ >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+ >>> # execute this command in terminal: export PADDLE_TRAINERS_NUM=4
+ >>> import paddle.distributed as dist
- # execute this command in terminal: export PADDLE_TRAINERS_NUM=4
- import paddle.distributed as dist
+ >>> env = dist.ParallelEnv()
+ >>> print("The world_size is %d" % env.world_size)
+ The world_size is 4
- env = dist.ParallelEnv()
- print("The world_size is %d" % env.world_size)
- # The world_size is 4
"""
return self._world_size
@@ -773,14 +765,15 @@ def device_id(self):
Its value is equal to the value of the environment variable ``FLAGS_selected_gpus`` . The default value is 0.
Examples:
- .. code-block:: python
+ .. code-block:: python
- # execute this command in terminal: export FLAGS_selected_gpus=1
- import paddle.distributed as dist
+ >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+ >>> # execute this command in terminal: export FLAGS_selected_gpus=1
+ >>> import paddle.distributed as dist
- env = dist.ParallelEnv()
- print("The device id are %d" % env.device_id)
- # The device id are 1
+ >>> env = dist.ParallelEnv()
+ >>> print("The device id is %d" % env.device_id)
+ The device id is 1
"""
return self._device_id
@@ -802,14 +795,15 @@ def current_endpoint(self):
Its value is equal to the value of the environment variable ``PADDLE_CURRENT_ENDPOINT`` . The default value is "".
Examples:
- .. code-block:: python
+ .. code-block:: python
- # execute this command in terminal: export PADDLE_CURRENT_ENDPOINT=127.0.0.1:6170
- import paddle.distributed as dist
+ >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+ >>> # execute this command in terminal: export PADDLE_CURRENT_ENDPOINT=127.0.0.1:6170
+ >>> import paddle.distributed as dist
- env = dist.ParallelEnv()
- print("The current endpoint are %s" % env.current_endpoint)
- # The current endpoint are 127.0.0.1:6170
+ >>> env = dist.ParallelEnv()
+ >>> print("The current endpoint is %s" % env.current_endpoint)
+ The current endpoint is 127.0.0.1:6170
"""
return self._current_endpoint
@@ -822,14 +816,16 @@ def trainer_endpoints(self):
Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ENDPOINTS`` . The default value is "".
Examples:
- .. code-block:: python
+ ..
code-block:: python + + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> # execute this command in terminal: export PADDLE_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171 + >>> import paddle.distributed as dist - # execute this command in terminal: export PADDLE_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171 - import paddle.distributed as dist + >>> env = dist.ParallelEnv() + >>> print("The trainer endpoints are %s" % env.trainer_endpoints) + The trainer endpoints are ['127.0.0.1:6170', '127.0.0.1:6171'] - env = dist.ParallelEnv() - print("The trainer endpoints are %s" % env.trainer_endpoints) - # The trainer endpoints are ['127.0.0.1:6170', '127.0.0.1:6171'] """ return self._trainer_endpoints @@ -841,14 +837,15 @@ def nrings(self): Its value is equal to the value of the environment variable ``FLAGS_nccl_nrings`` . The default value is 1. Examples: - .. code-block:: python + .. code-block:: python - # execute this command in terminal: export FLAGS_nccl_nrings=1 - import paddle.distributed as dist + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> # execute this command in terminal: export FLAGS_nccl_nrings=1 + >>> import paddle.distributed as dist - env = dist.ParallelEnv() - print("The nrings is %d" % env.nrings) - # the number of ring is 1 + >>> env = dist.ParallelEnv() + >>> print("The nrings is %d" % env.nrings) + The nrings is 1 """ return self._nrings @@ -941,46 +938,40 @@ def init_parallel_env(): Examples: .. code-block:: python - # required: gpu - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - import paddle.distributed as dist - - class LinearNet(nn.Layer): - def __init__(self): - super().__init__() - self._linear1 = nn.Linear(10, 10) - self._linear2 = nn.Linear(10, 1) - - def forward(self, x): - return self._linear2(self._linear1(x)) - - def train(): - # 1. initialize parallel environment - dist.init_parallel_env() - - # 2. create data parallel layer & optimizer - layer = LinearNet() - dp_layer = paddle.DataParallel(layer) - - loss_fn = nn.MSELoss() - adam = opt.Adam( - learning_rate=0.001, parameters=dp_layer.parameters()) - - # 3. run layer - inputs = paddle.randn([10, 10], 'float32') - outputs = dp_layer(inputs) - labels = paddle.randn([10, 1], 'float32') - loss = loss_fn(outputs, labels) - - loss.backward() - - adam.step() - adam.clear_grad() - - if __name__ == '__main__': - dist.spawn(train) + >>> # doctest: +REQUIRES(env:GPU, env:DISTRIBUTED) + >>> import paddle + >>> import paddle.nn as nn + >>> import paddle.optimizer as opt + >>> import paddle.distributed as dist + + >>> class LinearNet(nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self._linear1 = nn.Linear(10, 10) + ... self._linear2 = nn.Linear(10, 1) + ... def forward(self, x): + ... return self._linear2(self._linear1(x)) + + >>> def train(): + ... # 1. initialize parallel environment + ... dist.init_parallel_env() + ... # 2. create data parallel layer & optimizer + ... layer = LinearNet() + ... dp_layer = paddle.DataParallel(layer) + ... loss_fn = nn.MSELoss() + ... adam = opt.Adam( + ... learning_rate=0.001, parameters=dp_layer.parameters()) + ... # 3. run layer + ... inputs = paddle.randn([10, 10], 'float32') + ... outputs = dp_layer(inputs) + ... labels = paddle.randn([10, 1], 'float32') + ... loss = loss_fn(outputs, labels) + ... loss.backward() + ... adam.step() + ... adam.clear_grad() + + >>> if __name__ == '__main__': + ... dist.spawn(train) """ @@ -1214,13 +1205,15 @@ def get_rank(group=None): Examples: .. 
code-block:: python - # Execute this script using distributed launch with one card configs. - import paddle - import paddle.distributed as dist + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> # Execute this script using distributed launch with one card configs. + >>> import paddle + >>> import paddle.distributed as dist + + >>> dist.init_parallel_env() + >>> print("The rank is %d" % dist.get_rank()) + The rank is 0 - dist.init_parallel_env() - print("The rank is %d" % dist.get_rank()) - # The rank is 0 """ if in_dynamic_mode() and group: return group.rank @@ -1246,13 +1239,15 @@ def get_world_size(group=None): Examples: .. code-block:: python - # Execute this script using distributed launch with one card configs. - import paddle - import paddle.distributed as dist + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> # Execute this script using distributed launch with one card configs. + >>> import paddle + >>> import paddle.distributed as dist + + >>> dist.init_parallel_env() + >>> print("The world_size is %d" % dist.get_world_size()) + The world_size is 1 - dist.init_parallel_env() - print("The world_size is %d" % dist.get_world_size()) - # The world_size is 1 """ if in_dynamic_mode() and (group is None): if is_initialized(): diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py index 53bdca47c48a53..b8cd7a6b8d5d73 100644 --- a/python/paddle/distributed/passes/auto_parallel_amp.py +++ b/python/paddle/distributed/passes/auto_parallel_amp.py @@ -215,7 +215,7 @@ def build_state(self): fwd_op_id = self.grad_op_to_op_map[ op.desc.original_id() ] - assert fwd_op_id in self._op_fp16_dict, f"{str(op)}" + assert fwd_op_id in self._op_fp16_dict, str(op) self._op_fp16_dict[ op.desc.original_id() ] = self._is_fp16_op(fwd_op_id) diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py index f804b59a2db2c6..8f9b83f04c5fcb 100644 --- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py +++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py @@ -48,8 +48,8 @@ def _remove_and_get_optimizer_op(main_program, dist_context): removed_op_idx.append(idx) # del op from dist_context - if dist_context: - dist_context.del_dist_op_for_program(op) + # if dist_context: + # dist_context.del_dist_op_for_program(op) for idx in removed_op_idx[::-1]: main_block._remove_op(idx, sync=False) @@ -166,58 +166,97 @@ def _append_gradient_merge_backward_op( grad_to_gradient_merge = {} # {param: gradient_merge_var} to insert scale op and fill_constant op new_params_to_grads = [] + # step2: create gradient_merge var and init with 0 + main_program_clone = main_program.clone() + main_block_clone = main_program_clone.global_block() + grad_to_param_names = {} for param, grad in params_grads: - param_name = param.name - param_var = main_block.var(param_name) - assert param_var is not None - ref_dist_attr = dist_context.get_tensor_dist_attr_for_program(param_var) - assert ref_dist_attr is not None - gradient_merge_var = main_block.create_var( - name=param_name + "@GRAD@GradientMerge", - shape=param_var.shape, - dtype=param_var.dtype, - persistable=True, - ) - ref_process_mesh = ref_dist_attr.process_mesh - ref_dims_mapping = ref_dist_attr.dims_mapping + grad_to_param_names[grad.name] = param.name + + for index, op in reversed(list(enumerate(main_block_clone.ops))): + output_var_names = op.desc.output_arg_names() + if len(grad_to_param_names) == 0: + break + 
for output_var_name in output_var_names: + if len(grad_to_param_names) == 0: + break + if output_var_name in grad_to_param_names: + param_var = main_block.var(grad_to_param_names[output_var_name]) + assert param_var is not None + ref_dist_attr = dist_context.get_tensor_dist_attr_for_program( + param_var + ) + assert ref_dist_attr is not None + # Add persistable gradient variables in main_program + gradient_merge_var = main_block.create_var( + name=param_var.name + "@GRAD@GradientMerge", + shape=param_var.shape, + dtype=param_var.dtype, + persistable=True, + ) + ref_process_mesh = ref_dist_attr.process_mesh + ref_dims_mapping = ref_dist_attr.dims_mapping + + set_var_dist_attr( + dist_context, + gradient_merge_var, + ref_dims_mapping, + ref_process_mesh, + ) - set_var_dist_attr( - dist_context, gradient_merge_var, ref_dims_mapping, ref_process_mesh - ) + # Add persistable gradient variables in startup_program + startup_gradient_merge_var = startup_block.create_var( + name=param_var.name + "@GRAD@GradientMerge", + shape=param_var.shape, + dtype=param_var.dtype, + persistable=True, + ) + # Initial persistable gradient variables in startup_program + startup_block.append_op( + type="fill_constant", + outputs={"Out": startup_gradient_merge_var}, + attrs={ + "shape": param_var.shape, + "dtype": param_var.dtype, + "value": float(0), + }, + ) - startup_gradient_merge_var = startup_block.create_var( - name=param_name + "@GRAD@GradientMerge", - shape=param_var.shape, - dtype=param_var.dtype, - persistable=True, - ) - startup_block.append_op( - type="fill_constant", - outputs={"Out": startup_gradient_merge_var}, - attrs={ - "shape": param_var.shape, - "dtype": param_var.dtype, - "value": float(0), - }, - ) + # Accumulate persistable gradient variables in main_program + grad_var = main_block.var(output_var_name) + assert grad_var is not None + new_grad_op = main_block._insert_op_without_sync( + index + 1, + type="elementwise_add", + inputs={'X': grad_var, 'Y': gradient_merge_var}, + outputs={'Out': gradient_merge_var}, + attrs={ + 'axis': -1, + 'use_mkldnn': False, + OP_ROLE_KEY: OpRole.Backward, + }, + ) + + # Construct new_params_to_grads and grad_to_gradient_merge + new_params_to_grads.append([param_var, gradient_merge_var]) + grad_to_gradient_merge[grad_var.name] = gradient_merge_var.name + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + new_grad_op, + ref_process_mesh, + ref_dims_mapping, + dist_context, + ) + + del grad_to_param_names[output_var_name] + + assert ( + len(grad_to_param_names) == 0 + ), "grad_to_param_names must be empty right now, but it has {} items".format( + len(grad_to_param_names) + ) + main_block._sync_with_cpp() - # grad_merge += grad - new_grad_op = main_block.append_op( - type="elementwise_add", - inputs={'X': grad, 'Y': gradient_merge_var}, - outputs={'Out': gradient_merge_var}, - attrs={ - 'axis': -1, - 'use_mkldnn': False, - OP_ROLE_KEY: OpRole.Backward, - }, - ) - new_params_to_grads.append([param, gradient_merge_var]) - grad_to_gradient_merge[grad.name] = gradient_merge_var.name - naive_set_dist_op_attr_for_program_by_mesh_and_mapping( - new_grad_op, ref_process_mesh, ref_dims_mapping, dist_context - ) return new_params_to_grads, grad_to_gradient_merge @@ -229,6 +268,7 @@ def _create_cond_block_and_update_optimizer( optimize_ops_block, k_steps, avg, + dist_context, ): def true_apply_gradient(): cur_block_idx = main_program.current_block_idx @@ -285,6 +325,14 @@ def true_apply_gradient(): main_program.global_block()._sync_with_cpp() cur_block._sync_with_cpp() 
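+ # The optimizer ops were copied into this cond block above, so the
+ # DistributedOperator wrappers cached in dist_context may still point
+ # at the original serial ops; the loop below re-binds them.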
+ # update serial op + for idx, op in enumerate(cur_block.ops): + if is_optimize_op(op): + dist_op = dist_context.get_dist_op_for_program(op) + if dist_op: + # dist_op.set_input_dist_attr + dist_op._serial_op = op + # clear gradient_merge_vars for param, new_grad in new_params_to_grads: paddle.tensor.fill_constant( @@ -331,6 +379,7 @@ def parse_program( optimize_ops_block, k_steps, avg, + dist_context, ) diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py b/python/paddle/distributed/passes/auto_parallel_sharding.py index f7b211fdc4ba41..6c3ee4d8d8e951 100644 --- a/python/paddle/distributed/passes/auto_parallel_sharding.py +++ b/python/paddle/distributed/passes/auto_parallel_sharding.py @@ -1691,11 +1691,10 @@ def re_order_program(block, param_grads, dist_context): if is_optimize_op(last_op) and last_op.type in _supported_optimizer_type: # record optimizer for idx, op in reversed(list(enumerate(block.ops))): - if op.type not in _supported_optimizer_type: - break - assert len(op.input("Param")) == 1 - pname_to_op[op.input("Param")[0]] = op - remove_op_indices.append(idx) + if op.type in _supported_optimizer_type: + assert len(op.input("Param")) == 1 + pname_to_op[op.input("Param")[0]] = op + remove_op_indices.append(idx) assert len(use_order) == len(pname_to_op) # append new opts diff --git a/python/paddle/distributed/rpc/rpc.py b/python/paddle/distributed/rpc/rpc.py index ebe6bc54623d6f..0d88c8fef1ce51 100644 --- a/python/paddle/distributed/rpc/rpc.py +++ b/python/paddle/distributed/rpc/rpc.py @@ -87,11 +87,13 @@ def init_rpc(name, rank=None, world_size=None, master_endpoint=None): Examples: .. code-block:: python - import paddle.distributed.rpc as rpc + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.rpc as rpc - rpc.init_rpc("worker0", rank=0, world_size=1, - master_endpoint="127.0.0.1:8001") - rpc.shutdown() + >>> rpc.init_rpc("worker0", rank=0, world_size=1, + ... master_endpoint="127.0.0.1:8001") + + >>> rpc.shutdown() """ rank = int(os.environ["PADDLE_TRAINER_ID"]) if rank is None else rank @@ -161,15 +163,17 @@ def rpc_sync(to, fn, args=None, kwargs=None, timeout=_DEFAULT_RPC_TIMEOUT): Examples: .. code-block:: python - import paddle.distributed.rpc as rpc + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.rpc as rpc + + >>> def add(a, b): + ... return a + b - def add(a, b): - return a + b + >>> rpc.init_rpc("worker0", rank=0, world_size=1, + ... master_endpoint="127.0.0.1:8002") - rpc.init_rpc("worker0", rank=0, world_size=1, - master_endpoint="127.0.0.1:8002") - ret = rpc.rpc_sync("worker0", add, args=(2, 3)) - rpc.shutdown() + >>> ret = rpc.rpc_sync("worker0", add, args=(2, 3)) + >>> rpc.shutdown() """ fut = _invoke_rpc(to, fn, args, kwargs, timeout) @@ -201,16 +205,20 @@ def rpc_async(to, fn, args=None, kwargs=None, timeout=_DEFAULT_RPC_TIMEOUT): Examples: .. code-block:: python - import paddle.distributed.rpc as rpc + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.rpc as rpc + + >>> def add(a, b): + ... return a + b - def add(a, b): - return a + b + >>> rpc.init_rpc("worker0", rank=0, world_size=1, + ... 
master_endpoint="127.0.0.1:8003") - rpc.init_rpc("worker0", rank=0, world_size=1, - master_endpoint="127.0.0.1:8003") - fut = rpc.rpc_async("worker0", add, args=(2, 3)) - print(fut.wait()) - rpc.shutdown() + >>> fut = rpc.rpc_async("worker0", add, args=(2, 3)) + >>> print(fut.wait()) + 5 + + >>> rpc.shutdown() """ return _invoke_rpc(to, fn, args, kwargs, timeout) @@ -279,11 +287,13 @@ def shutdown(): Examples: .. code-block:: python - import paddle.distributed.rpc as rpc + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.rpc as rpc + + >>> rpc.init_rpc("worker0", rank=0, world_size=1, + ... master_endpoint="127.0.0.1:8004") - rpc.init_rpc("worker0", rank=0, world_size=1, - master_endpoint="127.0.0.1:8004") - rpc.shutdown() + >>> rpc.shutdown() """ info = get_current_worker_info() @@ -309,17 +319,18 @@ class `WorkerInfo` with attribute `name`, `rank`, `ip` and `port`. Examples: .. code-block:: python - import paddle.distributed.rpc as rpc - import os + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.rpc as rpc + >>> import os - os.environ["PADDLE_WORKER_ENDPOINT"] = "127.0.0.1:9002" - rpc.init_rpc("worker0", rank=0, world_size=1, - master_endpoint="127.0.0.1:8005") + >>> os.environ["PADDLE_WORKER_ENDPOINT"] = "127.0.0.1:9002" + >>> rpc.init_rpc("worker0", rank=0, world_size=1, + ... master_endpoint="127.0.0.1:8005") - print(rpc.get_worker_info("worker0")) - # {name: worker0, rank: 0, ip: 127.0.0.1, port: 9002} + >>> print(rpc.get_worker_info("worker0")) + {name: worker0, rank: 0, ip: 127.0.0.1, port: 9002} - rpc.shutdown() + >>> rpc.shutdown() """ return core.rpc_get_worker_info(name) @@ -335,17 +346,18 @@ def get_all_worker_infos(): Examples: .. code-block:: python - import paddle.distributed.rpc as rpc - import os + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.rpc as rpc + >>> import os - os.environ["PADDLE_WORKER_ENDPOINT"] = "127.0.0.1:9003" - rpc.init_rpc("worker0", rank=0, world_size=1, - master_endpoint="127.0.0.1:8006") + >>> os.environ["PADDLE_WORKER_ENDPOINT"] = "127.0.0.1:9003" + >>> rpc.init_rpc("worker0", rank=0, world_size=1, + ... master_endpoint="127.0.0.1:8006") - print(rpc.get_all_worker_infos()) - # [{name: worker0, rank: 0, ip: 127.0.0.1, port: 9003}] + >>> print(rpc.get_all_worker_infos()) + [{name: worker0, rank: 0, ip: 127.0.0.1, port: 9003}] - rpc.shutdown() + >>> rpc.shutdown() """ return core.rpc_get_all_worker_infos() @@ -361,17 +373,18 @@ class `WorkerInfo` with attribute `name`, `rank`, `ip` and `port`. Examples: .. code-block:: python - import paddle.distributed.rpc as rpc - import os + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.rpc as rpc + >>> import os - os.environ["PADDLE_WORKER_ENDPOINT"] = "127.0.0.1:9004" - rpc.init_rpc("worker0", rank=0, world_size=1, - master_endpoint="127.0.0.1:8007") + >>> os.environ["PADDLE_WORKER_ENDPOINT"] = "127.0.0.1:9004" + >>> rpc.init_rpc("worker0", rank=0, world_size=1, + ... 
master_endpoint="127.0.0.1:8007") - print(rpc.get_current_worker_info()) - # {name: worker0, rank: 0, ip: 127.0.0.1, port: 9004} + >>> print(rpc.get_current_worker_info()) + {name: worker0, rank: 0, ip: 127.0.0.1, port: 9004} - rpc.shutdown() + >>> rpc.shutdown() """ return core.rpc_get_current_worker_info() diff --git a/python/paddle/distributed/sharding/__init__.py b/python/paddle/distributed/sharding/__init__.py index 3a710ca8059424..008100d506f934 100644 --- a/python/paddle/distributed/sharding/__init__.py +++ b/python/paddle/distributed/sharding/__init__.py @@ -12,9 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .group_sharded import ( - group_sharded_parallel, - save_group_sharded_model, -) # noqa: F401 +from .group_sharded import group_sharded_parallel, save_group_sharded_model __all__ = ['group_sharded_parallel', 'save_group_sharded_model'] diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py index 350f6eff4d001f..b0f5ab0b629cab 100644 --- a/python/paddle/distributed/sharding/group_sharded.py +++ b/python/paddle/distributed/sharding/group_sharded.py @@ -77,32 +77,33 @@ def group_sharded_parallel( Examples: .. code-block:: python - # required: distributed - import paddle - from paddle.nn import Linear - from paddle.distributed import fleet - from paddle.distributed.sharding import group_sharded_parallel + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle + >>> from paddle.nn import Linear + >>> from paddle.distributed import fleet + >>> from paddle.distributed.sharding import group_sharded_parallel - fleet.init(is_collective=True) - group = paddle.distributed.new_group([0, 1]) - model = Linear(1000, 1000) + >>> fleet.init(is_collective=True) + >>> group = paddle.distributed.new_group([0, 1]) + >>> model = Linear(1000, 1000) - clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) - optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters(), weight_decay=0.00001, grad_clip=clip) + >>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + >>> optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters(), weight_decay=0.00001, grad_clip=clip) - # wrap sharding model, optimizer and scaler - model, optimizer, scaler = group_sharded_parallel(model, optimizer, "p_g", scaler=scaler) + >>> # wrap sharding model, optimizer and scaler + >>> model, optimizer, scaler = group_sharded_parallel(model, optimizer, "p_g", scaler=scaler) - img, label = data - label.stop_gradient = True - img.stop_gradient = True + >>> img, label = data + >>> label.stop_gradient = True + >>> img.stop_gradient = True - out = model(img) - loss = paddle.nn.functional.cross_entropy(input=out, label=label) + >>> out = model(img) + >>> loss = paddle.nn.functional.cross_entropy(input=out, label=label) + + >>> loss.backward() + >>> optimizer.step() + >>> optimizer.clear_grad() - loss.backward() - optimizer.step() - optimizer.clear_grad() """ device = paddle.get_device().split(":")[0] @@ -195,35 +196,36 @@ def save_group_sharded_model(model, output, optimizer=None): Examples: .. 
code-block:: python - # required: distributed - import paddle - from paddle.nn import Linear - from paddle.distributed import fleet - from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle + >>> from paddle.nn import Linear + >>> from paddle.distributed import fleet + >>> from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model + + >>> fleet.init(is_collective=True) + >>> group = paddle.distributed.new_group([0, 1]) + >>> model = Linear(1000, 1000) - fleet.init(is_collective=True) - group = paddle.distributed.new_group([0, 1]) - model = Linear(1000, 1000) + >>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + >>> optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters(), weight_decay=0.00001, grad_clip=clip) - clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) - optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters(), weight_decay=0.00001, grad_clip=clip) + >>> # wrap sharding model, optimizer and scaler + >>> model, optimizer, scaler = group_sharded_parallel(model, optimizer, "p_g", scaler=scaler) - # wrap sharding model, optimizer and scaler - model, optimizer, scaler = group_sharded_parallel(model, optimizer, "p_g", scaler=scaler) + >>> img, label = data + >>> label.stop_gradient = True + >>> img.stop_gradient = True - img, label = data - label.stop_gradient = True - img.stop_gradient = True + >>> out = model(img) + >>> loss = paddle.nn.functional.cross_entropy(input=out, label=label) - out = model(img) - loss = paddle.nn.functional.cross_entropy(input=out, label=label) + >>> loss.backward() + >>> optimizer.step() + >>> optimizer.clear_grad() - loss.backward() - optimizer.step() - optimizer.clear_grad() + >>> # save model and optimizer state_dict + >>> save_group_sharded_model(model, optimizer, output=output_dir) - # save model and optimizer state_dict - save_group_sharded_model(model, optimizer, output=output_dir) """ logger_.info( "==========Begin to save group sharded model and optimizer==========" diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 91039b3b3bac3b..970afae464030a 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -494,79 +494,74 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): Examples: .. code-block:: python - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - import paddle.distributed as dist - - class LinearNet(nn.Layer): - def __init__(self): - super().__init__() - self._linear1 = nn.Linear(10, 10) - self._linear2 = nn.Linear(10, 1) - - def forward(self, x): - return self._linear2(self._linear1(x)) - - def train(print_result=False): - # 1. initialize parallel environment - group = dist.init_parallel_env() - process_group = group.process_group if group else None - - # 2. create data parallel layer & optimizer - layer = LinearNet() - dp_layer = paddle.DataParallel(layer, group = process_group) - - loss_fn = nn.MSELoss() - adam = opt.Adam( - learning_rate=0.001, parameters=dp_layer.parameters()) - - # 3. run layer - inputs = paddle.randn([10, 10], 'float32') - outputs = dp_layer(inputs) - labels = paddle.randn([10, 1], 'float32') - loss = loss_fn(outputs, labels) - - if print_result is True: - print("loss:", loss.numpy()) - - loss.backward() - - adam.step() - adam.clear_grad() - - # Usage 1: only pass function. 
- # If your training method no need any argument, and
- # use all visible devices for parallel training.
- if __name__ == '__main__':
- dist.spawn(train)
-
- # Usage 2: pass function and arguments.
- # If your training method need some arguments, and
- # use all visible devices for parallel training.
- if __name__ == '__main__':
- dist.spawn(train, args=(True,))
-
- # Usage 3: pass function, arguments and nprocs.
- # If your training method need some arguments, and
- # only use part of visible devices for parallel training.
- # If your machine hold 8 cards {0,1,2,3,4,5,6,7},
- # this case will use cards {0,1}; If you set
- # CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use
- # cards {4,5}
- if __name__ == '__main__':
- dist.spawn(train, args=(True,), nprocs=2)
-
- # Usage 4: pass function, arguments, nprocs and gpus.
- # If your training method need some arguments, and
- # only use part of visible devices for parallel training,
- # but you can't set your machine's environment variable
- # CUDA_VISIBLE_DEVICES, such as it is None or all cards
- # {0,1,2,3,4,5,6,7}, you can pass `gpus` to
- # select the GPU cards you want to use. For example,
- # this case will use cards {4,5} if your machine hold 8 cards.
- if __name__ == '__main__':
- dist.spawn(train, args=(True,), nprocs=2, gpus='4,5')
+ >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+ >>> import paddle
+ >>> import paddle.nn as nn
+ >>> import paddle.optimizer as opt
+ >>> import paddle.distributed as dist
+
+ >>> class LinearNet(nn.Layer):
+ ... def __init__(self):
+ ... super().__init__()
+ ... self._linear1 = nn.Linear(10, 10)
+ ... self._linear2 = nn.Linear(10, 1)
+ ... def forward(self, x):
+ ... return self._linear2(self._linear1(x))
+
+ >>> def train(print_result=False):
+ ... # 1. initialize parallel environment
+ ... group = dist.init_parallel_env()
+ ... process_group = group.process_group if group else None
+ ... # 2. create data parallel layer & optimizer
+ ... layer = LinearNet()
+ ... dp_layer = paddle.DataParallel(layer, group = process_group)
+ ... loss_fn = nn.MSELoss()
+ ... adam = opt.Adam(
+ ... learning_rate=0.001, parameters=dp_layer.parameters())
+ ... # 3. run layer
+ ... inputs = paddle.randn([10, 10], 'float32')
+ ... outputs = dp_layer(inputs)
+ ... labels = paddle.randn([10, 1], 'float32')
+ ... loss = loss_fn(outputs, labels)
+ ... if print_result is True:
+ ... print("loss:", loss.numpy())
+ ... loss.backward()
+ ... adam.step()
+ ... adam.clear_grad()
+
+ >>> # Usage 1: only pass function.
+ >>> # If your training method does not need any arguments, and
+ >>> # uses all visible devices for parallel training.
+ >>> if __name__ == '__main__':
+ ... dist.spawn(train)
+
+ >>> # Usage 2: pass function and arguments.
+ >>> # If your training method needs some arguments, and
+ >>> # uses all visible devices for parallel training.
+ >>> if __name__ == '__main__':
+ ... dist.spawn(train, args=(True,))
+
+ >>> # Usage 3: pass function, arguments and nprocs.
+ >>> # If your training method needs some arguments, and
+ >>> # only uses part of the visible devices for parallel training.
+ >>> # If your machine holds 8 cards {0,1,2,3,4,5,6,7},
+ >>> # this case will use cards {0,1}; if you set
+ >>> # CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use
+ >>> # cards {4,5}
+ >>> if __name__ == '__main__':
+ ... dist.spawn(train, args=(True,), nprocs=2)
+
+ >>> # Usage 4: pass function, arguments, nprocs and gpus.
+ >>> # If your training method needs some arguments, and
+ >>> # only uses part of the visible devices for parallel training,
+ >>> # but you can't set your machine's environment variable
+ >>> # CUDA_VISIBLE_DEVICES, such as it is None or all cards
+ >>> # {0,1,2,3,4,5,6,7}, you can pass `gpus` to
+ >>> # select the GPU cards you want to use. For example,
+ >>> # this case will use cards {4,5} if your machine holds 8 cards.
+ >>> if __name__ == '__main__':
+ ... dist.spawn(train, args=(True,), nprocs=2, gpus='4,5')
+
"""
# Give an error hint when the users enter a configuration option
# that does not exist
diff --git a/python/paddle/distributed/transpiler/distribute_transpiler.py b/python/paddle/distributed/transpiler/distribute_transpiler.py
index 47929406ecde9a..3d86d6dd9afcef 100644
--- a/python/paddle/distributed/transpiler/distribute_transpiler.py
+++ b/python/paddle/distributed/transpiler/distribute_transpiler.py
@@ -175,13 +175,14 @@ class DistributeTranspilerConfig:
Examples:
.. code-block:: python
- from paddle.distributed.transpiler.ps_dispatcher import RoundRobin
- import paddle.distributed.transpiler as transpiler
+ >>> from paddle.distributed.transpiler.distribute_transpiler import RoundRobin
+ >>> import paddle.distributed.transpiler as transpiler
+
+ >>> config = transpiler.DistributeTranspilerConfig()
+ >>> config.slice_var_up = True
+ >>> config.split_method = RoundRobin
+ >>> config.min_block_size = 81920
- config = transpiler.DistributeTranspilerConfig()
- config.slice_var_up = True
- config.split_method = RoundRobin
- config.min_block_size = 81920
"""
slice_var_up = True
@@ -282,53 +283,57 @@ class DistributeTranspiler:
Examples:
.. code-block:: python
- import paddle
- import paddle.base as base
- import paddle.distributed.transpiler as transpiler
-
- paddle.enable_static()
-
- x = paddle.static.data(name='x', shape=[1,13], dtype='float32')
- y = paddle.static.data(name='y', shape=[1], dtype='float32')
- y_predict = paddle.static.nn.fc(x, size=1, activation=None)
-
- cost =paddle.nn.functional.square_error_cost(input=y_predict, label=y)
- avg_loss = paddle.mean(cost)
-
- sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001)
- sgd_optimizer.minimize(avg_loss)
-
- # for pserver mode
- pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
- trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
- current_endpoint = "192.168.0.1:6174"
- trainer_id = 0
- trainers = 4
- role = "PSERVER"
- t = transpiler.DistributeTranspiler()
- t.transpile(
- trainer_id, pservers=pserver_endpoints, trainers=trainers)
- if role == "PSERVER":
- pserver_program = t.get_pserver_program(current_endpoint)
- pserver_startup_program = t.get_startup_program(current_endpoint,
- pserver_program)
- elif role == "TRAINER":
- trainer_program = t.get_trainer_program()
-
- # for nccl2 mode
- trainer_num = 2
- trainer_id = 0
- config = transpiler.DistributeTranspilerConfig()
- config.mode = "nccl2"
- trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
- t = transpiler.DistributeTranspiler(config=config)
- t.transpile(trainer_id=trainer_id, trainers=trainer_endpoints, current_endpoint="192.168.0.1:6174")
- exe = paddle.static.ParallelExecutor(
- use_cuda=True,
- loss_name=avg_loss.name,
- num_trainers=trainer_num,
- trainer_id=trainer_id
- )
+ >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+ >>> import paddle
+ >>> import paddle.base as base
+ >>> import paddle.distributed.transpiler as transpiler
+
+ >>> paddle.enable_static()
+
+ >>> x = paddle.static.data(name='x', shape=[1,13], dtype='float32')
+ >>> y =
paddle.static.data(name='y', shape=[1], dtype='float32') + >>> y_predict = paddle.static.nn.fc(x, size=1, activation=None) + + >>> cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) + >>> avg_loss = paddle.mean(cost) + + >>> sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) + >>> sgd_optimizer.minimize(avg_loss) + + >>> # for pserver mode + >>> pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> current_endpoint = "192.168.0.1:6174" + >>> trainer_id = 0 + >>> trainers = 4 + >>> role = "PSERVER" + + >>> t = transpiler.DistributeTranspiler() + >>> t.transpile( + ... trainer_id, pservers=pserver_endpoints, trainers=trainers) + + >>> if role == "PSERVER": + ... pserver_program = t.get_pserver_program(current_endpoint) + ... pserver_startup_program = t.get_startup_program(current_endpoint, + ... pserver_program) + ... elif role == "TRAINER": + ... trainer_program = t.get_trainer_program() + + >>> # for nccl2 mode + >>> trainer_num = 2 + >>> trainer_id = 0 + >>> config = transpiler.DistributeTranspilerConfig() + >>> config.mode = "nccl2" + >>> trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> t = transpiler.DistributeTranspiler(config=config) + >>> t.transpile(trainer_id=trainer_id, trainers=trainer_endpoints, current_endpoint="192.168.0.1:6174") + >>> exe = paddle.static.ParallelExecutor( + ... use_cuda=True, + ... loss_name=avg_loss.name, + ... num_trainers=trainer_num, + ... trainer_id=trainer_id + ... ) + """ def __init__(self, config=None): @@ -609,13 +614,15 @@ def transpile( Examples: .. code-block:: python - transpiler = paddle.distributed.transpiler.DistributeTranspiler() - t.transpile( - trainer_id=0, - pservers="127.0.0.1:7000,127.0.0.1:7001", - trainers=2, - sync_mode=False, - current_endpoint="127.0.0.1:7000") + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> t = paddle.distributed.transpiler.DistributeTranspiler() + >>> t.transpile( + ... trainer_id=0, + ... pservers="127.0.0.1:7000,127.0.0.1:7001", + ... trainers=2, + ... sync_mode=False, + ... current_endpoint="127.0.0.1:7000") + """ from paddle.distributed.distribute_lookup_table import ( find_distributed_lookup_table, @@ -1127,14 +1134,17 @@ def get_trainer_program(self, wait_port=True): Examples: .. code-block:: python - import paddle.distributed.transpiler as transpiler - #this is an example, find available endpoints in your case - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - trainer_id = 0 - trainers = 4 - t = transpiler.DistributeTranspiler() - t.transpile(trainer_id, trainers=trainers, pservers=pserver_endpoints) - trainer_program = t.get_trainer_program() + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.transpiler as transpiler + >>> # this is an example, find available endpoints in your case + >>> pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> trainer_id = 0 + >>> trainers = 4 + + >>> t = transpiler.DistributeTranspiler() + >>> t.transpile(trainer_id, trainers=trainers, pservers=pserver_endpoints) + >>> trainer_program = t.get_trainer_program() + """ # remove optimize ops and add a send op to main_program # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? @@ -1273,16 +1283,20 @@ def get_pserver_program(self, endpoint): Examples: .. 
code-block:: python - import paddle.distributed.transpiler as transpiler - #this is an example, find available endpoints in your case - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - current_endpoint = "192.168.0.1:6174" - trainer_id = 0 - trainers = 4 - t = transpiler.DistributeTranspiler() - t.transpile( - trainer_id, pservers=pserver_endpoints, trainers=trainers) - pserver_program = t.get_pserver_program(current_endpoint) + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.transpiler as transpiler + >>> # this is an example, find available endpoints in your case + >>> pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> current_endpoint = "192.168.0.1:6174" + >>> trainer_id = 0 + >>> trainers = 4 + + >>> t = transpiler.DistributeTranspiler() + >>> t.transpile( + ... trainer_id, pservers=pserver_endpoints, trainers=trainers) + + >>> pserver_program = t.get_pserver_program(current_endpoint) + """ # TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers. # NOTE: assume blocks of the same variable is not distributed @@ -1582,16 +1596,19 @@ def get_pserver_programs(self, endpoint): Examples: .. code-block:: python - import paddle.distributed.transpiler as transpiler - #this is an example, find available endpoints in your case - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - current_endpoint = "192.168.0.1:6174" - trainer_id = 0 - trainers = 4 - t = transpiler.DistributeTranspiler() - t.transpile( - trainer_id, pservers=pserver_endpoints, trainers=trainers) - pserver_program, pserver_startup_program = t.get_pserver_programs(current_endpoint) + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.transpiler as transpiler + >>> # this is an example, find available endpoints in your case + >>> pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> current_endpoint = "192.168.0.1:6174" + >>> trainer_id = 0 + >>> trainers = 4 + + >>> t = transpiler.DistributeTranspiler() + >>> t.transpile( + ... trainer_id, pservers=pserver_endpoints, trainers=trainers) + >>> pserver_program, pserver_startup_program = t.get_pserver_programs(current_endpoint) + """ pserver_prog = self.get_pserver_program(endpoint) pserver_startup = self.get_startup_program( @@ -1621,17 +1638,19 @@ def get_startup_program( Examples: .. code-block:: python - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - current_endpoint = "192.168.0.1:6174" - trainer_id = 0 - trainers = 4 - - t = paddle.distributed.transpiler.DistributeTranspiler() - t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) - pserver_program = t.get_pserver_program(current_endpoint) - pserver_startup_program = t.get_startup_program(current_endpoint, - pserver_program) + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> current_endpoint = "192.168.0.1:6174" + >>> trainer_id = 0 + >>> trainers = 4 + + >>> t = paddle.distributed.transpiler.DistributeTranspiler() + >>> t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + >>> pserver_program = t.get_pserver_program(current_endpoint) + >>> pserver_startup_program = t.get_startup_program(current_endpoint, + ... 
pserver_program) + """ s_prog = Program() orig_s_prog = self.startup_program diff --git a/python/paddle/distribution/__init__.py b/python/paddle/distribution/__init__.py index 82a2e6ce87a055..68f4820da994d1 100644 --- a/python/paddle/distribution/__init__.py +++ b/python/paddle/distribution/__init__.py @@ -32,7 +32,7 @@ from paddle.distribution.laplace import Laplace from paddle.distribution.geometric import Geometric -__all__ = [ # noqa +__all__ = [ 'Bernoulli', 'Beta', 'Categorical', diff --git a/python/paddle/distribution/bernoulli.py b/python/paddle/distribution/bernoulli.py index 7d4849fab48e7c..152306aea31f7c 100644 --- a/python/paddle/distribution/bernoulli.py +++ b/python/paddle/distribution/bernoulli.py @@ -212,6 +212,7 @@ def rsample(self, shape, temperature=1.0): .. code-block:: python >>> import paddle + >>> paddle.seed(1) >>> from paddle.distribution import Bernoulli >>> rv = Bernoulli(paddle.full((1), 0.3)) @@ -231,28 +232,26 @@ def rsample(self, shape, temperature=1.0): [100, 2, 2] >>> # `rsample` has to be followed by a `sigmoid` - >>> # doctest: +SKIP >>> rv = Bernoulli(0.3) >>> rsample = rv.rsample([3, ]) >>> rsample_sigmoid = paddle.nn.functional.sigmoid(rsample) - >>> print(rsample, rsample_sigmoid) - Tensor(shape=[3, 1], dtype=float32, place=Place(cpu), stop_gradient=True, - [[-0.88315082], - [-0.62347704], - [-0.31513220]]) - Tensor(shape=[3, 1], dtype=float32, place=Place(cpu), stop_gradient=True, - [[0.29252526], - [0.34899110], - [0.42186251]]) + >>> print(rsample) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [-1.46112013, -0.01239836, -1.32765460]) + >>> print(rsample_sigmoid) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.18829606, 0.49690047, 0.20954758]) >>> # The smaller the `temperature`, the distribution of `rsample` closer to `sample`, with `probs` of 0.3. 
>>> print(paddle.nn.functional.sigmoid(rv.rsample([1000, ], temperature=1.0)).sum())
+ >>> # doctest: +SKIP('output will be different')
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
- 361.06829834)
+ 365.63122559)
+ >>> # doctest: -SKIP
>>> print(paddle.nn.functional.sigmoid(rv.rsample([1000, ], temperature=0.1)).sum())
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
- 288.66418457)
+ 320.15057373)
"""
name = self.name + '_rsample'
if not in_dynamic_mode():
diff --git a/python/paddle/distribution/categorical.py b/python/paddle/distribution/categorical.py
index b6484e3f21d563..9d5664dc28f4d3 100644
--- a/python/paddle/distribution/categorical.py
+++ b/python/paddle/distribution/categorical.py
@@ -64,14 +64,12 @@ class Categorical(distribution.Distribution):
>>> cat = Categorical(x)
>>> cat2 = Categorical(y)
- >>> # doctest: +SKIP
>>> paddle.seed(1000) # on CPU device
>>> print(cat.sample([2,3]))
Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True,
[[0, 1, 5],
[3, 4, 5]])
- >>> # doctest: -SKIP
>>> print(cat.entropy())
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
1.77528250)
diff --git a/python/paddle/distribution/geometric.py b/python/paddle/distribution/geometric.py
index bfcd734bb17850..6df855b168143f 100644
--- a/python/paddle/distribution/geometric.py
+++ b/python/paddle/distribution/geometric.py
@@ -18,7 +18,7 @@
import paddle
from paddle.base import framework
-from paddle.distribution import distribution, uniform
+from paddle.distribution import distribution
class Geometric(distribution.Distribution):
@@ -27,8 +27,8 @@ class Geometric(distribution.Distribution):
In probability theory and statistics, the geometric distribution is one of discrete probability distributions,
parameterized by one positive shape parameter, denoted by probs.
- In n Bernoulli trials, it takes k trials to get the probability of success for the first time.
- In detail, it is: the probability that the first k-1 times failed and the kth time succeeded.
+ In n Bernoulli trials, it takes k+1 trials to get the first success.
+ In detail, it is: the probability that the first k trials failed and the (k+1)-th trial succeeded.
The geometric distribution is a special case of the Pascal distribution when r=1.
The probability mass function (pmf) is
@@ -36,7 +36,7 @@
.. math::
Pr(Y=k)=(1-p)^kp
- where k is number of trials performed and p is probability of success for each trial and k=0,1,2,3,4..., p belong to (0,1].
+ where k is the number of failed trials before the first success, p is the probability of success in each trial, k=0,1,2,3,4..., and p belongs to (0,1].
Args:
probs (Real|Tensor): Probability parameter.
@@ -56,7 +56,7 @@ class Geometric(distribution.Distribution):
>>> print(geom.mean)
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
- 2.)
+ 1.)
>>> print(geom.variance)
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
@@ -108,7 +108,7 @@ def __init__(self, probs):
@property
def mean(self):
"""Mean of geometric distribution."""
- return 1.0 / self.probs
+ return 1.0 / self.probs - 1.0
@property
def variance(self):
@@ -128,7 +128,7 @@ def pmf(self, k):
.. math::
- P(X=k) = (1-p)^{k-1} p, \quad k=1,2,3,\ldots
+ P(X=k) = (1-p)^{k} p, \quad k=0,1,2,3,\ldots
Args:
k (int): Value to be evaluated.
@@ -146,10 +146,10 @@ def pmf(self, k): >>> geom = Geometric(0.5) >>> print(geom.pmf(2)) Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, - 0.25000000) + 0.12500000) """ if isinstance(k, (numbers.Integral, framework.Variable)): - return paddle.pow((1.0 - self.probs), k - 1.0) * self.probs + return paddle.pow((1.0 - self.probs), k) * self.probs else: raise TypeError( f"Expected type of k is number.Real|framework.Variable, but got {type(k)}" @@ -177,7 +177,7 @@ def log_pmf(self, k): >>> geom = Geometric(0.5) >>> print(geom.log_pmf(2)) Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, - -1.38629436) + -2.07944131) """ if isinstance(k, (numbers.Integral, framework.Variable)): return paddle.log(self.pmf(k)) @@ -206,8 +206,8 @@ def sample(self, shape=()): >>> geom = Geometric(0.5) >>> print(geom.sample((2,2))) Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, - [[0.20783406, 0.94300812], - [1.94558561, 0.14360668]]) + [[0., 0.], + [1., 0.]]) """ with paddle.no_grad(): return self.rsample(shape) @@ -232,19 +232,22 @@ def rsample(self, shape=()): >>> geom = Geometric(0.5) >>> print(geom.rsample((2,2))) Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, - [[0.20783406, 0.94300812], - [1.94558561, 0.14360668]]) + [[0., 0.], + [1., 0.]]) """ shape = distribution.Distribution._extend_shape( self, sample_shape=shape ) - tiny = np.finfo(dtype='float32').tiny - sample_uniform = uniform.Uniform(low=float(tiny), high=float(1)) + uniform = paddle.uniform( + shape=shape, + min=float(np.finfo(dtype='float32').tiny), + max=1.0, + dtype=self.probs.dtype, + ) - new_t = sample_uniform.sample(list(shape)) - return paddle.log(new_t) / paddle.log1p(-(self.probs)) + return paddle.floor(paddle.log(uniform) / paddle.log1p(-(self.probs))) def entropy(self): r"""Entropy of dirichlet distribution. @@ -266,7 +269,7 @@ def entropy(self): >>> geom = Geometric(0.5) >>> print(geom.entropy()) Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, - 1.38629436) + 1.38629425) """ x = (1.0 - self.probs) * paddle.log(1.0 - self.probs) y = self.probs * paddle.log(self.probs) @@ -278,7 +281,7 @@ def cdf(self, k): .. math:: - F(X \leq k) = 1 - (1-p)^k, \quad k=0,1,2,\ldots + F(X \leq k) = 1 - (1-p)^{k+1}, \quad k=0,1,2,\ldots Args: k: The number of trials performed. @@ -296,10 +299,10 @@ def cdf(self, k): >>> geom = Geometric(0.5) >>> print(geom.cdf(4)) Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, - 0.93750000) + 0.96875000) """ if isinstance(k, (numbers.Integral, framework.Variable)): - return 1.0 - paddle.pow((1.0 - self.probs), k) + return 1.0 - paddle.pow((1.0 - self.probs), k + 1) else: raise TypeError( f"Expected type of k is number.Real|framework.Variable, but got {type(k)}" diff --git a/python/paddle/distribution/transform.py b/python/paddle/distribution/transform.py index 92313c9bec58a9..39e98a910499bd 100644 --- a/python/paddle/distribution/transform.py +++ b/python/paddle/distribution/transform.py @@ -25,7 +25,7 @@ variable, ) -__all__ = [ # noqa +__all__ = [ 'Transform', 'AbsTransform', 'AffineTransform', diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 3708048e56d4a5..ecddb82c9a3752 100755 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -14,75 +14,81 @@ # TODO: import framework api under this directory -from .
import random # noqa: F401 -from .random import seed # noqa: F401 -from .framework import get_default_dtype # noqa: F401 -from .framework import set_default_dtype # noqa: F401 - -from ..base.param_attr import ParamAttr # noqa: F401 -from ..base.core import CPUPlace # noqa: F401 -from ..base.core import IPUPlace # noqa: F401 -from ..base.core import CUDAPlace # noqa: F401 -from ..base.core import CUDAPinnedPlace # noqa: F401 -from ..base.core import CustomPlace # noqa: F401 -from ..base.core import XPUPlace # noqa: F401 - from ..base import core # noqa: F401 -from ..base.dygraph import base, to_variable -from ..base.dygraph.base import no_grad_ as no_grad # noqa: F401 +from ..base.core import ( # noqa: F401 + CPUPlace, + CUDAPinnedPlace, + CUDAPlace, + CustomPlace, + IPUPlace, + XPUPlace, +) +from ..base.dygraph import base, to_variable # noqa: F401 +from ..base.dygraph.base import disable_dygraph as enable_static # noqa: F401 +from ..base.dygraph.base import enable_dygraph as disable_static # noqa: F401 from ..base.dygraph.base import grad # noqa: F401 -from .io import save # noqa: F401 -from .io import load # noqa: F401 - -from .io_utils import _open_file_buffer # noqa: F401 -from .io_utils import is_parameter # noqa: F401 -from .io_utils import is_persistable # noqa: F401 -from .io_utils import is_belong_to_optimizer # noqa: F401 -from .io_utils import _clone_var_in_block_ # noqa: F401 -from .io_utils import _pickle_loads_mac -from .io_utils import _pack_loaded_dict -from .io_utils import _unpack_saved_dict -from .io_utils import _load_program_scope +from ..base.dygraph.base import no_grad_ as no_grad # noqa: F401 +from ..base.framework import ( # noqa: F401 + Block, + IrGraph, + OpProtoHolder, + Parameter, + Program, + _apply_pass, + _create_tensor, + _current_expected_place, + _current_expected_place_, + _dygraph_tracer, + _get_paddle_place, + _global_flags, + _set_expected_place, +) +from ..base.framework import ( # noqa: F401 + _stride_in_no_check_dy2st_diff as _no_check_dy2st_diff, +) +from ..base.framework import ( # noqa: F401 + convert_np_dtype_to_dtype_, + deprecate_stat_dict, + disable_signal_handler, + dygraph_not_support, + dygraph_only, + generate_control_dev_var_name, + get_flags, +) +from ..base.framework import in_dygraph_mode as in_dynamic_mode # noqa: F401 +from ..base.framework import ( # noqa: F401 + in_dynamic_or_pir_mode, + in_pir_mode, + set_flags, + switch_main_program, + switch_startup_program, + use_pir_api, +) +from ..base.layer_helper import LayerHelper # noqa: F401 +# isort: off # Do the *DUPLICATED* monkey-patch for the tensor object. # We need remove the duplicated code here once we fix # the illogical implement in the monkey-patch methods later. 
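The `# isort: off` marker added above (closed by the `# isort: on` further below) tells isort to leave the enclosed imports exactly as written, which the comment needs: the two monkey-patch imports are order-sensitive. A minimal sketch of the directive, using standard-library imports for illustration:

.. code-block:: python

    # isort: off
    # isort leaves everything between the off/on markers untouched, so an
    # intentionally ordered import sequence survives automated formatting.
    import sys
    import os
    # isort: on
    import collections  # normal isort sorting resumes here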
-from ..base.layers.math_op_patch import monkey_patch_variable -from ..base.dygraph.math_op_patch import monkey_patch_math_tensor -from ..base.framework import disable_signal_handler # noqa: F401 -from ..base.framework import get_flags # noqa: F401 -from ..base.framework import set_flags # noqa: F401 -from ..base.framework import Parameter -from ..base.dygraph.base import enable_dygraph as disable_static # noqa: F401 -from ..base.dygraph.base import disable_dygraph as enable_static # noqa: F401 -from ..base.framework import in_dygraph_mode as in_dynamic_mode # noqa: F401 -from ..base.framework import in_pir_mode, use_pir_api # noqa: F401 -from ..base.framework import in_dynamic_or_pir_mode # noqa: F401 -from ..base.framework import ( - _current_expected_place, - _get_paddle_place, -) # noqa: F401 -from ..base.framework import dygraph_only # noqa: F401 -from ..base.framework import dygraph_not_support # noqa: F401 -from ..base.framework import ( - convert_np_dtype_to_dtype_, - _create_tensor, - OpProtoHolder, -) # noqa: F401 -from ..base.framework import _dygraph_tracer # noqa: F401 -from ..base.framework import generate_control_dev_var_name # noqa: F401 +from ..base.dygraph.math_op_patch import monkey_patch_math_tensor # noqa: F401 +from ..base.layers.math_op_patch import monkey_patch_variable # noqa: F401 -from ..base.layer_helper import LayerHelper # noqa: F401 -from ..base.framework import _global_flags # noqa: F401 -from ..base.framework import _apply_pass # noqa: F401 -from ..base.framework import switch_main_program -from ..base.framework import switch_startup_program -from ..base.framework import _set_expected_place # noqa: F401 -from ..base.framework import Block, Program # noqa: F401 -from ..base.framework import IrGraph # noqa: F401 -from ..base.framework import deprecate_stat_dict -from ..base.framework import ( - _stride_in_no_check_dy2st_diff as _no_check_dy2st_diff, -) # noqa: F401 +# isort: on +from ..base.param_attr import ParamAttr # noqa: F401 +from . import random # noqa: F401 +from .framework import get_default_dtype, set_default_dtype # noqa: F401 +from .io import load, save # noqa: F401 +from .io_utils import ( # noqa: F401 + _clone_var_in_block_, + _load_program_scope, + _open_file_buffer, + _pack_loaded_dict, + _pickle_loads_mac, + _unpack_saved_dict, + is_belong_to_optimizer, + is_parameter, + is_persistable, +) +from .random import seed # noqa: F401 __all__ = [] diff --git a/python/paddle/geometric/__init__.py b/python/paddle/geometric/__init__.py index 6c132a529bc37f..ab86fdcd23603c 100644 --- a/python/paddle/geometric/__init__.py +++ b/python/paddle/geometric/__init__.py @@ -12,17 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
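One detail of the grouped imports introduced above: a single `# noqa: F401` on the first physical line of a parenthesized import suppresses the unused-import warning for every name in the group, because flake8 reports F401 against the import statement itself. A minimal sketch with standard-library names (illustrative only):

.. code-block:: python

    # One noqa comment on the statement line covers all grouped names:
    from os.path import (  # noqa: F401
        basename,
        dirname,
        join,
    )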
-from .message_passing import send_u_recv # noqa: F401 -from .message_passing import send_ue_recv # noqa: F401 -from .message_passing import send_uv # noqa: F401 -from .math import segment_sum # noqa: F401 -from .math import segment_mean # noqa: F401 -from .math import segment_min # noqa: F401 -from .math import segment_max # noqa: F401 -from .reindex import reindex_graph # noqa: F401 -from .reindex import reindex_heter_graph # noqa: F401 -from .sampling import sample_neighbors # noqa: F401 -from .sampling import weighted_sample_neighbors # noqa: F401 +from .math import segment_max, segment_mean, segment_min, segment_sum +from .message_passing import send_u_recv, send_ue_recv, send_uv +from .reindex import reindex_graph, reindex_heter_graph +from .sampling import sample_neighbors, weighted_sample_neighbors __all__ = [ 'send_u_recv', diff --git a/python/paddle/geometric/message_passing/__init__.py b/python/paddle/geometric/message_passing/__init__.py index c07f9bc40c6b39..0c6d1151b3c857 100644 --- a/python/paddle/geometric/message_passing/__init__.py +++ b/python/paddle/geometric/message_passing/__init__.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .send_recv import send_u_recv # noqa: F401 -from .send_recv import send_ue_recv # noqa: F401 -from .send_recv import send_uv # noqa: F401 +from .send_recv import send_u_recv, send_ue_recv, send_uv # noqa: F401 __all__ = [] diff --git a/python/paddle/geometric/sampling/__init__.py b/python/paddle/geometric/sampling/__init__.py index ee7bacfc9047f6..f3eb55603246f9 100644 --- a/python/paddle/geometric/sampling/__init__.py +++ b/python/paddle/geometric/sampling/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .neighbors import sample_neighbors # noqa: F401 -from .neighbors import weighted_sample_neighbors # noqa: F401 +from .neighbors import sample_neighbors, weighted_sample_neighbors # noqa: F401 __all__ = [] diff --git a/python/paddle/hapi/__init__.py b/python/paddle/hapi/__init__.py index 2829bbe9470898..a1bab472fa7913 100644 --- a/python/paddle/hapi/__init__.py +++ b/python/paddle/hapi/__init__.py @@ -12,15 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import logger # noqa: F401 -from . import callbacks # noqa: F401 -from . import hub # noqa: F401 -from . import progressbar # noqa: F401 -from . import static_flops # noqa: F401 - +from . import callbacks, hub, logger, progressbar, static_flops # noqa: F401 +from .dynamic_flops import flops # noqa: F401 from .model import Model # noqa: F401 from .model_summary import summary # noqa: F401 -from .dynamic_flops import flops # noqa: F401 logger.setup_logger() diff --git a/python/paddle/hapi/dynamic_flops.py b/python/paddle/hapi/dynamic_flops.py index fcae6e4120ac8e..d02610f6e51848 100644 --- a/python/paddle/hapi/dynamic_flops.py +++ b/python/paddle/hapi/dynamic_flops.py @@ -85,7 +85,6 @@ def flops(net, input_size, custom_ops=None, print_detail=False): ... [1, 1, 28, 28], ... custom_ops= {nn.LeakyReLU: count_leaky_relu}, ... 
print_detail=True) - >>> # doctest: +SKIP >>> print(FLOPs) <class 'paddle.nn.layer.conv.Conv2D'>'s flops has been counted <class 'paddle.nn.layer.activation.ReLU'>'s flops has been counted @@ -106,7 +105,6 @@ def flops(net, input_size, custom_ops=None, print_detail=False): +--------------+-----------------+-----------------+--------+--------+ Total Flops: 347560 Total Params: 61610 347560 - >>> # doctest: -SKIP """ if isinstance(net, nn.Layer): # If net is a dy2stat model, net.forward is StaticFunction instance, diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 8ca5712a3036c2..55814227385989 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -2399,7 +2399,6 @@ def summary(self, input_size=None, dtype=None): >>> optim = paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()) >>> model.prepare(optim, paddle.nn.CrossEntropyLoss()) >>> params_info = model.summary() - >>> # doctest: +SKIP >>> print(params_info) --------------------------------------------------------------------------- Layer (type) Input Shape Output Shape Param # @@ -2424,7 +2423,6 @@ def summary(self, input_size=None, dtype=None): Estimated Total Size (MB): 0.35 --------------------------------------------------------------------------- {'total_params': 61610, 'trainable_params': 61610} - >>> # doctest: -SKIP """ assert ( diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py index df5791a5fd70d8..bedd109b0a532b 100644 --- a/python/paddle/hapi/model_summary.py +++ b/python/paddle/hapi/model_summary.py @@ -78,7 +78,6 @@ def summary(net, input_size=None, dtypes=None, input=None): >>> lenet = LeNet() >>> params_info = paddle.summary(lenet, (1, 1, 28, 28)) - >>> # doctest: +SKIP >>> print(params_info) --------------------------------------------------------------------------- Layer (type) Input Shape Output Shape Param # @@ -103,7 +102,6 @@ def summary(net, input_size=None, dtypes=None, input=None): Estimated Total Size (MB): 0.35 --------------------------------------------------------------------------- {'total_params': 61610, 'trainable_params': 61610} - >>> # doctest: -SKIP >>> # multi input demo >>> class LeNetMultiInput(LeNet): ... def forward(self, inputs, y): @@ -119,7 +117,6 @@ def summary(net, input_size=None, dtypes=None, input=None): >>> params_info = paddle.summary(lenet_multi_input, ... [(1, 1, 28, 28), (1, 400)], ... dtypes=['float32', 'float32']) - >>> # doctest: +SKIP >>> print(params_info) --------------------------------------------------------------------------- Layer (type) Input Shape Output Shape Param # @@ -144,7 +141,6 @@ def summary(net, input_size=None, dtypes=None, input=None): Estimated Total Size (MB): 0.35 --------------------------------------------------------------------------- {'total_params': 61610, 'trainable_params': 61610} - >>> # doctest: -SKIP >>> # list input demo >>> class LeNetListInput(LeNet): ... 
def forward(self, inputs): @@ -158,7 +154,6 @@ def summary(net, input_size=None, dtypes=None, input=None): >>> lenet_list_input = LeNetListInput() >>> input_data = [paddle.rand([1, 1, 28, 28]), paddle.rand([1, 400])] >>> params_info = paddle.summary(lenet_list_input, input=input_data) - >>> # doctest: +SKIP >>> print(params_info) --------------------------------------------------------------------------- Layer (type) Input Shape Output Shape Param # @@ -183,7 +178,6 @@ def summary(net, input_size=None, dtypes=None, input=None): Estimated Total Size (MB): 0.35 --------------------------------------------------------------------------- {'total_params': 61610, 'trainable_params': 61610} - >>> # doctest: -SKIP >>> # dict input demo >>> class LeNetDictInput(LeNet): ... def forward(self, inputs): @@ -198,7 +192,6 @@ def summary(net, input_size=None, dtypes=None, input=None): >>> input_data = {'x1': paddle.rand([1, 1, 28, 28]), ... 'x2': paddle.rand([1, 400])} >>> params_info = paddle.summary(lenet_dict_input, input=input_data) - >>> # doctest: +SKIP >>> print(params_info) --------------------------------------------------------------------------- Layer (type) Input Shape Output Shape Param # @@ -223,7 +216,6 @@ def summary(net, input_size=None, dtypes=None, input=None): Estimated Total Size (MB): 0.35 --------------------------------------------------------------------------- {'total_params': 61610, 'trainable_params': 61610} - >>> # doctest: -SKIP """ if input_size is None and input is None: diff --git a/python/paddle/hub.py b/python/paddle/hub.py index 1960d98e95b670..3e5da36cd93bbe 100644 --- a/python/paddle/hub.py +++ b/python/paddle/hub.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .hapi.hub import help # noqa: F401 -from .hapi.hub import list # noqa: F401 -from .hapi.hub import load # noqa: F401 +from .hapi.hub import help, list, load -__all__ = ['list', 'help', 'load'] # noqa +__all__ = ['list', 'help', 'load'] diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index f6eb4377f60334..da64c62fdbf05f 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -12,35 +12,37 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .optimizer import LookAhead # noqa: F401 -from .optimizer import ModelAverage # noqa: F401 -from .optimizer import DistributedFusedLamb # noqa: F401 -from .checkpoint import auto_checkpoint # noqa: F401 from ..base.layer_helper import LayerHelper # noqa: F401 -from .operators import softmax_mask_fuse_upper_triangle # noqa: F401 -from .operators import softmax_mask_fuse # noqa: F401 -from .operators import graph_send_recv -from .operators import graph_khop_sampler -from .operators import graph_sample_neighbors -from .operators import graph_reindex -from .tensor import segment_sum -from .tensor import segment_mean -from .tensor import segment_max -from .tensor import segment_min -from .tensor import _npu_identity -from .passes import fuse_resnet_unit_pass - -from . import autograd # noqa: F401 -from . import autotune # noqa: F401 -from . import nn # noqa: F401 -from . import asp # noqa: F401 -from . import multiprocessing # noqa: F401 -from . import layers - +from ..distributed import fleet # noqa: F401 +from . 
import ( # noqa: F401 + asp, + autograd, + autotune, + layers, + multiprocessing, + nn, + xpu, +) +from .checkpoint import auto_checkpoint # noqa: F401 from .nn.loss import identity_loss - -from ..distributed import fleet -from . import xpu +from .operators import ( + graph_khop_sampler, + graph_reindex, + graph_sample_neighbors, + graph_send_recv, + softmax_mask_fuse, + softmax_mask_fuse_upper_triangle, +) +from .optimizer import DistributedFusedLamb # noqa: F401 +from .optimizer import LookAhead, ModelAverage +from .passes import fuse_resnet_unit_pass # noqa: F401 +from .tensor import ( # noqa: F401 + _npu_identity, + segment_max, + segment_mean, + segment_min, + segment_sum, +) __all__ = [ 'LookAhead', diff --git a/python/paddle/incubate/asp/__init__.py b/python/paddle/incubate/asp/__init__.py index 9e6af7e94c139a..af703c83df96a5 100644 --- a/python/paddle/incubate/asp/__init__.py +++ b/python/paddle/incubate/asp/__init__.py @@ -13,28 +13,33 @@ # See the License for the specific language governing permissions and # limitations under the License. +# isort: off +# NOTE(gouzil): MaskAlgo can cause circular references, so sorting is disabled +from .utils import ( # noqa: F401 + CheckMethod, + MaskAlgo, + calculate_density, + check_mask_1d, + check_mask_2d, + check_sparsity, + create_mask, + get_mask_1d, + get_mask_2d_best, + get_mask_2d_greedy, +) -from .utils import check_mask_1d # noqa: F401 -from .utils import get_mask_1d # noqa: F401 -from .utils import check_mask_2d # noqa: F401 -from .utils import get_mask_2d_greedy # noqa: F401 -from .utils import get_mask_2d_best # noqa: F401 -from .utils import create_mask # noqa: F401 -from .utils import check_sparsity # noqa: F401 -from .utils import MaskAlgo # noqa: F401 -from .utils import CheckMethod # noqa: F401 -from .utils import calculate_density # noqa: F401 +# isort: on -from .asp import decorate # noqa: F401 -from .asp import prune_model # noqa: F401 -from .asp import set_excluded_layers # noqa: F401 -from .asp import reset_excluded_layers # noqa: F401 from .asp import ASPHelper # noqa: F401 +from .asp import ( + decorate, + prune_model, + reset_excluded_layers, + set_excluded_layers, +) +from .supported_layer_list import add_supported_layer -from .supported_layer_list import add_supported_layer # noqa: F401 - - -__all__ = [ # noqa +__all__ = [ 'calculate_density', 'decorate', 'prune_model', diff --git a/python/paddle/incubate/asp/asp.py b/python/paddle/incubate/asp/asp.py index 041132047dc718..9ffaee1c2b5048 100644 --- a/python/paddle/incubate/asp/asp.py +++ b/python/paddle/incubate/asp/asp.py @@ -47,75 +47,75 @@ def set_excluded_layers(param_names, main_program=None): If None is given, then it would be set as `paddle.static.default_main_program(). Default is None. Examples: - 1. Usage of Dynamic Graph - - .. code-block:: python - - >>> import paddle - - >>> class MyLayer(paddle.nn.Layer): - ... def __init__(self): - ... super().__init__() - ... self.conv1 = paddle.nn.Conv2D( - ... in_channels=3, out_channels=4, kernel_size=3, padding=2) - ... self.linear1 = paddle.nn.Linear(4624, 100) - ... - ... def forward(self, img): - ... hidden = self.conv1(img) - ... hidden = paddle.flatten(hidden, start_axis=1) - ... prediction = self.linear1(hidden) - ... return prediction - - >>> my_layer = MyLayer() - >>> optimizer = paddle.optimizer.SGD( - ... 
learning_rate=0.01, parameters=my_layer.parameters()) - - >>> # Need to set excluded layers before calling decorate - >>> paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()]) - - >>> optimizer = paddle.incubate.asp.decorate(optimizer) - - 2. Usage of Static Graph - - .. code-block:: python - - >>> import paddle - - >>> paddle.enable_static() - - >>> class MyLayer(paddle.nn.Layer): - ... def __init__(self): - ... super().__init__() - ... self.conv1 = paddle.nn.Conv2D( - ... in_channels=3, out_channels=4, kernel_size=3, padding=2) - ... self.linear1 = paddle.nn.Linear(4624, 100) - ... - ... def forward(self, img): - ... hidden = self.conv1(img) - ... hidden = paddle.flatten(hidden, start_axis=1) - ... prediction = self.linear1(hidden) - ... return prediction - - >>> main_program = paddle.static.Program() - >>> startup_program = paddle.static.Program() - - >>> with paddle.static.program_guard(main_program, startup_program): - ... input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) - ... label = paddle.static.data(name='label', shape=[None, 100]) - ... my_layer = MyLayer() - ... prob = my_layer(input_data) - ... loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) - ... - ... # Setup exluded layers out from ASP workflow. - ... # Please note, excluded_layers must be set before calling optimizer.minimize(). - ... paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) - ... - ... optimizer = paddle.optimizer.SGD(learning_rate=0.1) - ... optimizer = paddle.static.amp.decorate(optimizer ) - ... # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which - ... # will insert necessary masking operations for ASP workflow. - ... optimizer = paddle.incubate.asp.decorate(optimizer) - ... optimizer.minimize(loss, startup_program) + .. code-block:: python + :name: dynamic-graph + + >>> # Example1: Usage of Dynamic Graph + >>> import paddle + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 100) + ... + ... def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... prediction = self.linear1(hidden) + ... return prediction + + >>> my_layer = MyLayer() + >>> optimizer = paddle.optimizer.SGD( + ... learning_rate=0.01, parameters=my_layer.parameters()) + + >>> # Need to set excluded layers before calling decorate + >>> paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()]) + + >>> optimizer = paddle.incubate.asp.decorate(optimizer) + + .. code-block:: python + :name: static-graph + + >>> # Example2: Usage of Static Graph + >>> import paddle + + >>> paddle.enable_static() + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 100) + ... + ... def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... prediction = self.linear1(hidden) + ... return prediction + + >>> main_program = paddle.static.Program() + >>> startup_program = paddle.static.Program() + + >>> with paddle.static.program_guard(main_program, startup_program): + ... input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) + ... 
label = paddle.static.data(name='label', shape=[None, 100]) + ... my_layer = MyLayer() + ... prob = my_layer(input_data) + ... loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + ... + ... # Set up layers to be excluded from the ASP workflow. + ... # Please note, excluded_layers must be set before calling optimizer.minimize(). + ... paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) + ... + ... optimizer = paddle.optimizer.SGD(learning_rate=0.1) + ... optimizer = paddle.static.amp.decorate(optimizer) + ... # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + ... # will insert necessary masking operations for ASP workflow. + ... optimizer = paddle.incubate.asp.decorate(optimizer) + ... optimizer.minimize(loss, startup_program) """ if main_program is None: main_program = paddle.static.default_main_program() @@ -134,81 +134,81 @@ def reset_excluded_layers(main_program=None): If None is given, then this function would reset all excluded_layers. Default is None. Examples: - 1. Usage of Dynamic Graph - - .. code-block:: python - - >>> import paddle - - >>> class MyLayer(paddle.nn.Layer): - ... def __init__(self): - ... super().__init__() - ... self.conv1 = paddle.nn.Conv2D( - ... in_channels=3, out_channels=4, kernel_size=3, padding=2) - ... self.linear1 = paddle.nn.Linear(4624, 100) - ... - ... def forward(self, img): - ... hidden = self.conv1(img) - ... hidden = paddle.flatten(hidden, start_axis=1) - ... prediction = self.linear1(hidden) - ... return prediction - - >>> my_layer = MyLayer() - >>> optimizer = paddle.optimizer.SGD( - ... learning_rate=0.01, parameters=my_layer.parameters()) - - >>> # Need to set excluded layers before calling decorate - >>> paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()]) - >>> # Reset excluded_layers, all supported layers would be included into Automatic SParsity's workflow. - >>> # Please note, reset_excluded_layers also must be called before calling asp.decorate(). - >>> paddle.incubate.asp.reset_excluded_layers() - - >>> optimizer = paddle.incubate.asp.decorate(optimizer) - - 2. Usage of Static Graph - - .. code-block:: python - - >>> import paddle - - >>> paddle.enable_static() - - >>> class MyLayer(paddle.nn.Layer): - ... def __init__(self): - ... super().__init__() - ... self.conv1 = paddle.nn.Conv2D( - ... in_channels=3, out_channels=4, kernel_size=3, padding=2) - ... self.linear1 = paddle.nn.Linear(4624, 100) - ... - ... def forward(self, img): - ... hidden = self.conv1(img) - ... hidden = paddle.flatten(hidden, start_axis=1) - ... prediction = self.linear1(hidden) - ... return prediction - - >>> main_program = paddle.static.Program() - >>> startup_program = paddle.static.Program() - - >>> with paddle.static.program_guard(main_program, startup_program): - ... input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) - ... label = paddle.static.data(name='label', shape=[None, 100]) - ... my_layer = MyLayer() - ... prob = my_layer(input_data) - ... loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) - ... - ... # Setup exluded layers out from ASP workflow. - ... # Please note, excluded_layers must be set before calling optimizer.minimize(). - ... paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) - ... # Reset excluded_layers, all supported layers would be included into Automatic SParsity's workflow. - ...
# Please note, reset_excluded_layers also must be called before calling optimizer.minimize(). - ... paddle.incubate.asp.reset_excluded_layers(main_program) - ... - ... optimizer = paddle.optimizer.SGD(learning_rate=0.1) - ... optimizer = paddle.static.amp.decorate(optimizer ) - ... # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which - ... # will insert necessary masking operations for ASP workflow. - ... optimizer = paddle.incubate.asp.decorate(optimizer) - ... optimizer.minimize(loss, startup_program) + .. code-block:: python + :name: dynamic-graph + + >>> # Example1: Usage of Dynamic Graph + >>> import paddle + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 100) + ... + ... def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... prediction = self.linear1(hidden) + ... return prediction + + >>> my_layer = MyLayer() + >>> optimizer = paddle.optimizer.SGD( + ... learning_rate=0.01, parameters=my_layer.parameters()) + + >>> # Need to set excluded layers before calling decorate + >>> paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()]) + >>> # Reset excluded_layers, all supported layers would be included into Automatic SParsity's workflow. + >>> # Please note, reset_excluded_layers also must be called before calling asp.decorate(). + >>> paddle.incubate.asp.reset_excluded_layers() + + >>> optimizer = paddle.incubate.asp.decorate(optimizer) + + .. code-block:: python + :name: static-graph + + >>> # Example2: Usage of Static Graph + >>> import paddle + + >>> paddle.enable_static() + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 100) + ... + ... def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... prediction = self.linear1(hidden) + ... return prediction + + >>> main_program = paddle.static.Program() + >>> startup_program = paddle.static.Program() + + >>> with paddle.static.program_guard(main_program, startup_program): + ... input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) + ... label = paddle.static.data(name='label', shape=[None, 100]) + ... my_layer = MyLayer() + ... prob = my_layer(input_data) + ... loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + ... + ... # Set up layers to be excluded from the ASP workflow. + ... # Please note, excluded_layers must be set before calling optimizer.minimize(). + ... paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) + ... # Reset excluded_layers, all supported layers would be included into Automatic SParsity's workflow. + ... # Please note, reset_excluded_layers also must be called before calling optimizer.minimize(). + ... paddle.incubate.asp.reset_excluded_layers(main_program) + ... + ... optimizer = paddle.optimizer.SGD(learning_rate=0.1) + ... optimizer = paddle.static.amp.decorate(optimizer) + ... # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + ... # will insert necessary masking operations for ASP workflow. + ... optimizer = paddle.incubate.asp.decorate(optimizer) + ...
optimizer.minimize(loss, startup_program) """ ASPHelper.reset_excluded_layers(main_program=main_program) @@ -225,76 +225,76 @@ def decorate(optimizer): Returns: OptimizerWithSparsityGuarantee: A wrapper for ASP to decorate `minimize` function of the given optimizer. Examples: - 1. Usage of Dynamic Graph - - .. code-block:: python - - >>> import paddle - - >>> class MyLayer(paddle.nn.Layer): - ... def __init__(self): - ... super().__init__() - ... self.conv1 = paddle.nn.Conv2D( - ... in_channels=3, out_channels=4, kernel_size=3, padding=2) - ... self.linear1 = paddle.nn.Linear(4624, 32) - ... self.linear2 = paddle.nn.Linear(32, 32) - ... self.linear3 = paddle.nn.Linear(32, 10) - ... - ... def forward(self, img): - ... hidden = self.conv1(img) - ... hidden = paddle.flatten(hidden, start_axis=1) - ... hidden = self.linear1(hidden) - ... hidden = self.linear2(hidden) - ... prediction = self.linear3(hidden) - ... return prediction - - >>> my_layer = MyLayer() - >>> optimizer = paddle.optimizer.SGD( - ... learning_rate=0.01, parameters=my_layer.parameters()) - - >>> # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which - >>> # will apply necessary masking operations for ASP workflow. - >>> # In dynamic graph mode, ASP would create related mask variables during decoration. - >>> optimizer = paddle.incubate.asp.decorate(optimizer) - - 2. Usage of Static Graph - - .. code-block:: python - - >>> import paddle - - >>> paddle.enable_static() - - >>> class MyLayer(paddle.nn.Layer): - ... def __init__(self): - ... super().__init__() - ... self.conv1 = paddle.nn.Conv2D( - ... in_channels=3, out_channels=4, kernel_size=3, padding=2) - ... self.linear1 = paddle.nn.Linear(4624, 100) - ... - ... def forward(self, img): - ... hidden = self.conv1(img) - ... hidden = paddle.flatten(hidden, start_axis=1) - ... prediction = self.linear1(hidden) - ... return prediction - - >>> main_program = paddle.static.Program() - >>> startup_program = paddle.static.Program() - - >>> with paddle.static.program_guard(main_program, startup_program): - ... input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) - ... label = paddle.static.data(name='label', shape=[None, 100]) - ... my_layer = MyLayer() - ... prob = my_layer(input_data) - ... loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) - ... - ... optimizer = paddle.optimizer.SGD(learning_rate=0.1) - ... # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which - ... # will insert necessary masking operations for ASP workflow. - ... # In static graph mode, ASP creates related mask variables - ... # during minimize(). - ... optimizer = paddle.incubate.asp.decorate(optimizer) - ... optimizer.minimize(loss, startup_program) + .. code-block:: python + :name: dynamic-graph + + >>> # Example1: Usage of Dynamic Graph + >>> import paddle + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 32) + ... self.linear2 = paddle.nn.Linear(32, 32) + ... self.linear3 = paddle.nn.Linear(32, 10) + ... + ... def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... hidden = self.linear1(hidden) + ... hidden = self.linear2(hidden) + ... prediction = self.linear3(hidden) + ... return prediction + + >>> my_layer = MyLayer() + >>> optimizer = paddle.optimizer.SGD( + ... 
learning_rate=0.01, parameters=my_layer.parameters()) + + >>> # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which + >>> # will apply necessary masking operations for ASP workflow. + >>> # In dynamic graph mode, ASP would create related mask variables during decoration. + >>> optimizer = paddle.incubate.asp.decorate(optimizer) + + .. code-block:: python + :name: static-graph + + >>> # Example2: Usage of Static Graph + >>> import paddle + + >>> paddle.enable_static() + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 100) + ... + ... def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... prediction = self.linear1(hidden) + ... return prediction + + >>> main_program = paddle.static.Program() + >>> startup_program = paddle.static.Program() + + >>> with paddle.static.program_guard(main_program, startup_program): + ... input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) + ... label = paddle.static.data(name='label', shape=[None, 100]) + ... my_layer = MyLayer() + ... prob = my_layer(input_data) + ... loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + ... + ... optimizer = paddle.optimizer.SGD(learning_rate=0.1) + ... # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + ... # will insert necessary masking operations for ASP workflow. + ... # In static graph mode, ASP creates related mask variables + ... # during minimize(). + ... optimizer = paddle.incubate.asp.decorate(optimizer) + ... optimizer.minimize(loss, startup_program) """ return ASPHelper.decorate(optimizer) @@ -322,116 +322,116 @@ def prune_model(model, n=2, m=4, mask_algo='mask_1d', with_mask=True): Returns: dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable. Examples: - 1. Usage of Dynamic Graph - - .. code-block:: python - - >>> import paddle - >>> import numpy as np - - >>> class MyLayer(paddle.nn.Layer): - ... def __init__(self): - ... super().__init__() - ... self.conv1 = paddle.nn.Conv2D( - ... in_channels=3, out_channels=4, kernel_size=3, padding=2) - ... self.linear1 = paddle.nn.Linear(4624, 32) - ... self.linear2 = paddle.nn.Linear(32, 32) - ... self.linear3 = paddle.nn.Linear(32, 10) - ... - ... def forward(self, img): - ... hidden = self.conv1(img) - ... hidden = paddle.flatten(hidden, start_axis=1) - ... hidden = self.linear1(hidden) - ... hidden = self.linear2(hidden) - ... prediction = self.linear3(hidden) - ... return prediction - - >>> my_layer = MyLayer() - >>> loss_fn = paddle.nn.MSELoss(reduction='mean') - - >>> optimizer = paddle.optimizer.SGD( - ... learning_rate=0.01, parameters=my_layer.parameters()) - - >>> # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which - >>> # will apply necessary masking operations for ASP workflow. - >>> # In dynamic graph mode, ASP would create related mask variables during decoration. - >>> optimizer = paddle.incubate.asp.decorate(optimizer) - - >>> # Must call paddle.incubate.asp.decorate() first before calling paddle.incubate.asp.prune_model() - >>> paddle.incubate.asp.prune_model(my_layer, mask_algo='mask_2d_best') - - >>> for i in range(10): - ... imgs = paddle.to_tensor( - ... np.random.randn(64, 3, 32, 32), - ... dtype='float32', stop_gradient=False) - ... 
labels = paddle.to_tensor( - ... np.random.randint(10, size=(64, 1)), - ... dtype='float32', stop_gradient=False) - ... output = my_layer(imgs) - ... loss = loss_fn(output, labels) - ... loss.backward() - ... optimizer.step() - ... optimizer.clear_grad() - - 2. Usage of Static Graph - - .. code-block:: python - - >>> import paddle - >>> import numpy as np - - >>> paddle.enable_static() - - >>> class MyLayer(paddle.nn.Layer): - ... def __init__(self): - ... super().__init__() - ... self.conv1 = paddle.nn.Conv2D( - ... in_channels=3, out_channels=4, kernel_size=3, padding=2) - ... self.linear1 = paddle.nn.Linear(4624, 32) - ... self.linear2 = paddle.nn.Linear(32, 32) - ... self.linear3 = paddle.nn.Linear(32, 10) - ... - ... def forward(self, img): - ... hidden = self.conv1(img) - ... hidden = paddle.flatten(hidden, start_axis=1) - ... hidden = self.linear1(hidden) - ... hidden = self.linear2(hidden) - ... prediction = self.linear3(hidden) - ... return prediction - - >>> main_program = paddle.static.Program() - >>> startup_program = paddle.static.Program() - - >>> with paddle.static.program_guard(main_program, startup_program): - ... input_data = paddle.static.data(name='data', shape=[None, 3, 32, 32]) - ... label = paddle.static.data(name='label', shape=[None, 1]) - ... my_layer = MyLayer() - ... prob = my_layer(input_data) - ... loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) - ... - ... optimizer = paddle.optimizer.SGD(learning_rate=0.1) - ... # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which - ... # will insert necessary masking operations for ASP workflow. - ... # In static graph mode, ASP creates related mask variables - ... # during minimize(). - ... optimizer = paddle.incubate.asp.decorate(optimizer) - ... optimizer.minimize(loss, startup_program) - - >>> device = paddle.device.get_device() - >>> place = paddle.set_device(device) - - >>> exe = paddle.static.Executor(place) - >>> exe.run(startup_program) - - >>> # Must call exe.run(startup_program) first before calling paddle.asp.prune_model() - >>> paddle.incubate.asp.prune_model(my_layer, mask_algo='mask_2d_best') - >>> # it also be accepted to call - >>> # paddle.incubate.asp.prune_model(main_program, mask_algo='mask_2d_best') - - >>> for i in range(10): - ... imgs = np.random.randn(64, 3, 32, 32).astype('float32') - ... labels = np.random.randint(10, size=(64, 1)).astype('float32') - ... exe.run(main_program, feed={'data':imgs, 'label':labels}) + .. code-block:: python + :name: dynamic-graph + + >>> # Example1: Usage of Dynamic Graph + >>> import paddle + >>> import numpy as np + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 32) + ... self.linear2 = paddle.nn.Linear(32, 32) + ... self.linear3 = paddle.nn.Linear(32, 10) + ... + ... def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... hidden = self.linear1(hidden) + ... hidden = self.linear2(hidden) + ... prediction = self.linear3(hidden) + ... return prediction + + >>> my_layer = MyLayer() + >>> loss_fn = paddle.nn.MSELoss(reduction='mean') + + >>> optimizer = paddle.optimizer.SGD( + ... learning_rate=0.01, parameters=my_layer.parameters()) + + >>> # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which + >>> # will apply necessary masking operations for ASP workflow. 
+ >>> # In dynamic graph mode, ASP would create related mask variables during decoration. + >>> optimizer = paddle.incubate.asp.decorate(optimizer) + + >>> # Must call paddle.incubate.asp.decorate() first before calling paddle.incubate.asp.prune_model() + >>> paddle.incubate.asp.prune_model(my_layer, mask_algo='mask_2d_best') + + >>> for i in range(10): + ... imgs = paddle.to_tensor( + ... np.random.randn(64, 3, 32, 32), + ... dtype='float32', stop_gradient=False) + ... labels = paddle.to_tensor( + ... np.random.randint(10, size=(64, 1)), + ... dtype='float32', stop_gradient=False) + ... output = my_layer(imgs) + ... loss = loss_fn(output, labels) + ... loss.backward() + ... optimizer.step() + ... optimizer.clear_grad() + + .. code-block:: python + :name: static-graph + + >>> # Example2: Usage of Static Graph + >>> import paddle + >>> import numpy as np + + >>> paddle.enable_static() + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 32) + ... self.linear2 = paddle.nn.Linear(32, 32) + ... self.linear3 = paddle.nn.Linear(32, 10) + ... + ... def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... hidden = self.linear1(hidden) + ... hidden = self.linear2(hidden) + ... prediction = self.linear3(hidden) + ... return prediction + + >>> main_program = paddle.static.Program() + >>> startup_program = paddle.static.Program() + + >>> with paddle.static.program_guard(main_program, startup_program): + ... input_data = paddle.static.data(name='data', shape=[None, 3, 32, 32]) + ... label = paddle.static.data(name='label', shape=[None, 1]) + ... my_layer = MyLayer() + ... prob = my_layer(input_data) + ... loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + ... + ... optimizer = paddle.optimizer.SGD(learning_rate=0.1) + ... # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + ... # will insert necessary masking operations for ASP workflow. + ... # In static graph mode, ASP creates related mask variables + ... # during minimize(). + ... optimizer = paddle.incubate.asp.decorate(optimizer) + ... optimizer.minimize(loss, startup_program) + + >>> device = paddle.device.get_device() + >>> place = paddle.set_device(device) + + >>> exe = paddle.static.Executor(place) + >>> exe.run(startup_program) + + >>> # Must call exe.run(startup_program) first before calling paddle.incubate.asp.prune_model() + >>> paddle.incubate.asp.prune_model(my_layer, mask_algo='mask_2d_best') + >>> # it is also acceptable to call + >>> # paddle.incubate.asp.prune_model(main_program, mask_algo='mask_2d_best') + + >>> for i in range(10): + ... imgs = np.random.randn(64, 3, 32, 32).astype('float32') + ... labels = np.random.randint(10, size=(64, 1)).astype('float32') + ...
exe.run(main_program, feed={'data':imgs, 'label':labels}) """ device = paddle.device.get_device() place = paddle.set_device(device) diff --git a/python/paddle/incubate/autograd/__init__.py b/python/paddle/incubate/autograd/__init__.py index d9b9e417819175..41f53878641129 100644 --- a/python/paddle/incubate/autograd/__init__.py +++ b/python/paddle/incubate/autograd/__init__.py @@ -16,7 +16,7 @@ from .primx import prim2orig from .utils import disable_prim, enable_prim, prim_enabled -__all__ = [ # noqa +__all__ = [ 'vjp', 'jvp', 'Jacobian', diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py index cc57f930de4a71..723c8b11d6fd89 100644 --- a/python/paddle/incubate/autograd/primapi.py +++ b/python/paddle/incubate/autograd/primapi.py @@ -266,7 +266,7 @@ def to_prim( blacklist = prim_config["forward_blacklist"] | blacklist with framework.program_guard(main_program): - print("Lowering composite forward ops begin...", flush=True) + logging.info("Lowering composite forward ops begin...") if len(blacklist) > 0 and len(whitelist) > 0: filter_ = lambda x: x.type in whitelist and x.type not in blacklist @@ -283,6 +283,4 @@ def to_prim( backward_length=backward_length, ) replace_ops = prim_config["composite_ops_record"] - print( - f"Lowering composite forward ops finish: {replace_ops}", flush=True - ) + logging.info(f"Lowering composite forward ops finish: {replace_ops}") diff --git a/python/paddle/incubate/autograd/primitives.py b/python/paddle/incubate/autograd/primitives.py index 9f52d9d69ac233..bdaeb1f06d9f8f 100644 --- a/python/paddle/incubate/autograd/primitives.py +++ b/python/paddle/incubate/autograd/primitives.py @@ -11,62 +11,62 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
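Since primapi.py now reports progress via `logging.info` rather than `print`, the messages are hidden by default: the root logger starts at the WARNING level. A short sketch of how a caller could surface them again (standard library only; illustrative, not part of the patch):

.. code-block:: python

    import logging

    # Module-level logging.info(...) goes to the root logger, which filters
    # INFO by default; configuring a level/handler makes it visible again:
    logging.basicConfig(level=logging.INFO)
    logging.info("Lowering composite forward ops begin...")  # now printed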
-from paddle.tensor import abs # noqa: F401 -from paddle.tensor import acos # noqa: F401 -from paddle.tensor import acosh # noqa: F401 -from paddle.tensor import add # noqa: F401 -from paddle.tensor import asin # noqa: F401 -from paddle.tensor import asinh # noqa: F401 -from paddle.tensor import atan # noqa: F401 -from paddle.tensor import atanh # noqa: F401 -from paddle.tensor import broadcast_shape # noqa: F401 -from paddle.tensor import broadcast_to # noqa: F401 -from paddle.tensor import concat # noqa: F401 -from paddle.tensor import cos # noqa: F401 -from paddle.tensor import cosh # noqa: F401 -from paddle.tensor import cumprod # noqa: F401 -from paddle.tensor import cumsum # noqa: F401 -from paddle.tensor import digamma # noqa: F401 -from paddle.tensor import divide # noqa: F401 -from paddle.tensor import erf # noqa: F401 -from paddle.tensor import erfinv # noqa: F401 -from paddle.tensor import exp # noqa: F401 -from paddle.tensor import expm1 # noqa: F401 -from paddle.tensor import fill_constant # noqa: F401 -from paddle.tensor import full # noqa: F401 -from paddle.tensor import gather # noqa: F401 -from paddle.tensor import greater_equal # noqa: F401 -from paddle.tensor import lgamma # noqa: F401 -from paddle.tensor import log # noqa: F401 -from paddle.tensor import log1p # noqa: F401 -from paddle.tensor import logcumsumexp # noqa: F401 -from paddle.tensor import logit # noqa: F401 -from paddle.tensor import logsumexp # noqa: F401 -from paddle.tensor import max # noqa: F401 -from paddle.tensor import mean # noqa: F401 -from paddle.tensor import min # noqa: F401 -from paddle.tensor import multiply # noqa: F401 -from paddle.tensor import ones # noqa: F401 -from paddle.tensor import pow # noqa: F401 -from paddle.tensor import prod # noqa: F401 -from paddle.tensor import reshape # noqa: F401 -from paddle.tensor import rsqrt # noqa: F401 -from paddle.tensor import sign # noqa: F401 -from paddle.tensor import sin # noqa: F401 -from paddle.tensor import sinh # noqa: F401 -from paddle.tensor import sqrt # noqa: F401 -from paddle.tensor import subtract # noqa: F401 -from paddle.tensor import sum # noqa: F401 -from paddle.tensor import tan # noqa: F401 -from paddle.tensor import tanh # noqa: F401 -from paddle.tensor import tile # noqa: F401 -from paddle.tensor import uniform # noqa: F401 -from paddle.tensor import zeros # noqa: F401 -from paddle.tensor.creation import assign # noqa: F401 -from paddle.tensor.creation import zeros_like # noqa: F401 +from paddle.tensor import ( # noqa: F401 + abs, + acos, + acosh, + add, + asin, + asinh, + atan, + atanh, + broadcast_shape, + broadcast_to, + concat, + cos, + cosh, + cumprod, + cumsum, + digamma, + divide, + erf, + erfinv, + exp, + expm1, + fill_constant, + full, + gather, + greater_equal, + lgamma, + log, + log1p, + logcumsumexp, + logit, + logsumexp, + max, + mean, + min, + multiply, + ones, + pow, + prod, + reshape, + rsqrt, + sign, + sin, + sinh, + sqrt, + subtract, + sum, + tan, + tanh, + tile, + uniform, + zeros, +) +from paddle.tensor.creation import assign, zeros_like # noqa: F401 from paddle.tensor.manipulation import cast # noqa: F401 -from paddle.tensor.math import maximum # noqa: F401 -from paddle.tensor.math import minimum # noqa: F401 +from paddle.tensor.math import maximum, minimum # noqa: F401 """ math_op = [ diff --git a/python/paddle/incubate/distributed/fleet/parameter_server/pslib/__init__.py b/python/paddle/incubate/distributed/fleet/parameter_server/pslib/__init__.py index 5dded6c9b81835..b5cd746e35890d 100644 --- 
a/python/paddle/incubate/distributed/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/incubate/distributed/fleet/parameter_server/pslib/__init__.py @@ -14,18 +14,24 @@ import os import sys -from .optimizer_factory import FLEET_GLOBAL_DICT # noqa: F403 -from .optimizer_factory import DistributedAdam # noqa: F403 + from google.protobuf import text_format -from paddle.framework import core -from paddle.incubate.distributed.fleet.base import Fleet -from paddle.incubate.distributed.fleet.base import Mode -from paddle.incubate.distributed.fleet.base import DistributedOptimizer -from paddle.incubate.distributed.fleet.role_maker import MPISymetricRoleMaker -from paddle.incubate.distributed.fleet.role_maker import HeterRoleMaker -from paddle.common_ops_import import LayerHelper import paddle +from paddle.common_ops_import import LayerHelper +from paddle.framework import core +from paddle.incubate.distributed.fleet.base import ( + DistributedOptimizer, + Fleet, + Mode, +) +from paddle.incubate.distributed.fleet.role_maker import ( + HeterRoleMaker, + MPISymetricRoleMaker, +) + +from .optimizer_factory import DistributedAdam # noqa: F401 +from .optimizer_factory import FLEET_GLOBAL_DICT class PSLib(Fleet): diff --git a/python/paddle/incubate/multiprocessing/__init__.py b/python/paddle/incubate/multiprocessing/__init__.py index df0f98f74d58bc..2498a04014d954 100644 --- a/python/paddle/incubate/multiprocessing/__init__.py +++ b/python/paddle/incubate/multiprocessing/__init__.py @@ -13,7 +13,6 @@ # limitations under the License. from .reductions import init_reductions -import multiprocessing __all__ = [] diff --git a/python/paddle/incubate/nn/__init__.py b/python/paddle/incubate/nn/__init__.py index c663d6248feb0f..cf110540025d92 100644 --- a/python/paddle/incubate/nn/__init__.py +++ b/python/paddle/incubate/nn/__init__.py @@ -12,19 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .layer.fused_transformer import FusedMultiHeadAttention # noqa: F401 -from .layer.fused_transformer import FusedFeedForward # noqa: F401 -from .layer.fused_transformer import FusedTransformerEncoderLayer # noqa: F401 -from .layer.fused_transformer import FusedMultiTransformer # noqa: F401 -from .layer.fused_linear import FusedLinear # noqa: F401 +from .layer.fused_dropout_add import FusedDropoutAdd +from .layer.fused_dropout_nd import FusedDropout # noqa: F401 +from .layer.fused_ec_moe import FusedEcMoe +from .layer.fused_linear import FusedLinear from .layer.fused_transformer import ( FusedBiasDropoutResidualLayerNorm, -) # noqa: F401 -from .layer.fused_ec_moe import FusedEcMoe # noqa: F401 -from .layer.fused_dropout_add import FusedDropoutAdd # noqa: F401 -from .layer.fused_dropout_nd import FusedDropout # noqa: F401 + FusedFeedForward, + FusedMultiHeadAttention, + FusedMultiTransformer, + FusedTransformerEncoderLayer, +) -__all__ = [ # noqa +__all__ = [ 'FusedMultiHeadAttention', 'FusedFeedForward', 'FusedTransformerEncoderLayer', diff --git a/python/paddle/incubate/nn/functional/fused_rms_norm.py b/python/paddle/incubate/nn/functional/fused_rms_norm.py index 3995cd4a4087d0..99f9c4e72e77d0 100644 --- a/python/paddle/incubate/nn/functional/fused_rms_norm.py +++ b/python/paddle/incubate/nn/functional/fused_rms_norm.py @@ -54,14 +54,15 @@ def fused_rms_norm( Examples: .. 
code-block:: python - # required: gpu - import paddle + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') - paddle_x = paddle.cast(paddle.randn(shape=[32, 256]), dtype=paddle.float16) - paddle_weight = paddle.cast(paddle.randn(shape=[256]), dtype=paddle.float16) - paddle_bias = paddle.cast(paddle.randn(shape=[256]), dtype=paddle.float16) - epsilon = 1e-6 - paddle_rmsnorm = paddle.incubate.nn.functional.fused_rms_norm(paddle_x, paddle_weight, paddle_bias, epsilon, 1) + >>> paddle_x = paddle.cast(paddle.randn(shape=[32, 256]), dtype=paddle.float16) + >>> paddle_weight = paddle.cast(paddle.randn(shape=[256]), dtype=paddle.float16) + >>> paddle_bias = paddle.cast(paddle.randn(shape=[256]), dtype=paddle.float16) + >>> epsilon = 1e-6 + >>> paddle_rmsnorm = paddle.incubate.nn.functional.fused_rms_norm(paddle_x, paddle_weight, paddle_bias, epsilon, 1) """ if in_dynamic_or_pir_mode(): return _C_ops.rms_norm( diff --git a/python/paddle/incubate/nn/functional/fused_rotary_position_embedding.py b/python/paddle/incubate/nn/functional/fused_rotary_position_embedding.py index 0b667687c114bf..adfcdc233fe567 100644 --- a/python/paddle/incubate/nn/functional/fused_rotary_position_embedding.py +++ b/python/paddle/incubate/nn/functional/fused_rotary_position_embedding.py @@ -14,6 +14,7 @@ from paddle import _C_ops +from paddle.base.layer_helper import LayerHelper from paddle.framework import in_dynamic_mode @@ -91,6 +92,28 @@ def fused_rotary_position_embedding( q, k, v, sin, cos, position_ids, use_neox_rotary_style ) - raise RuntimeError( - "This feature is currently supported only in dynamic mode and with CUDAPlace." + helper = LayerHelper('fused_rotary_position_embedding', **locals()) + out_q = helper.create_variable_for_type_inference(dtype=q.dtype) + out_k = ( + helper.create_variable_for_type_inference(dtype=k.dtype) if k else None ) + out_v = ( + helper.create_variable_for_type_inference(dtype=v.dtype) if v else None + ) + helper.append_op( + type='fused_rotary_position_embedding', + inputs={ + 'q': q, + 'k': k, + 'v': v, + 'sin': sin, + 'cos': cos, + 'position_ids': position_ids, + }, + outputs={'out_q': out_q, 'out_k': out_k, 'out_v': out_v}, + attrs={ + 'use_neox_rotary_style': use_neox_rotary_style, + }, + ) + + return out_q, out_k, out_v diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index 355b5916b5ddb2..c4cf8abfdb3546 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -56,20 +56,20 @@ def fused_feedforward( This operator only supports running on GPU. The function of the operator is consistent with the following pseudo code: - .. code-block:: python - - residual = x - if pre_layer_norm: - out = layer_norm1(x) - else: - out = x - out = linear2(dropout1(activation(linear1(src)))) - if add_residual: - out = residual + dropout2(out) - else: - out = dropout2(out) - if not pre_layer_norm: - out = layer_norm2(out) + .. code-block:: text + + >>> residual = x + >>> if pre_layer_norm: + ... out = layer_norm1(x) + ... else: + ... out = x + >>> out = linear2(dropout1(activation(linear1(src)))) + >>> if add_residual: + ... out = residual + dropout2(out) + ... else: + ... out = dropout2(out) + >>> if not pre_layer_norm: + ... out = layer_norm2(out) Args: @@ -110,16 +110,17 @@ def fused_feedforward( Examples: .. 
code-block:: python - # required: gpu - import paddle - import paddle.incubate.nn.functional as F - - x = paddle.randn(shape=(1, 8, 8), dtype="float32") - linear1_weight = paddle.randn(shape=(8, 8), dtype="float32") - linear2_weight = paddle.randn(shape=(8, 8), dtype="float32") - out = F.fused_feedforward(x, linear1_weight, linear2_weight) - print(out.shape) - # (1, 8, 8) + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> import paddle.incubate.nn.functional as F + + >>> x = paddle.randn(shape=(1, 8, 8), dtype="float32") + >>> linear1_weight = paddle.randn(shape=(8, 8), dtype="float32") + >>> linear2_weight = paddle.randn(shape=(8, 8), dtype="float32") + >>> out = F.fused_feedforward(x, linear1_weight, linear2_weight) + >>> print(out.shape) + [1, 8, 8] """ _verify_dropout_rate(dropout1_rate) _verify_dropout_rate(dropout2_rate) @@ -288,9 +289,9 @@ def fused_bias_dropout_residual_layer_norm( The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows: - .. code-block:: python + .. code-block:: text - y = layer_norm(residual + dropout(bias + x)) + >>> y = layer_norm(residual + dropout(bias + x)) Parameters: x (Tensor): The input tensor. The shape is `[*, embed\_dim]`. @@ -323,21 +324,22 @@ def fused_bias_dropout_residual_layer_norm( Examples: .. code-block:: python - # required: gpu - import paddle - import paddle.incubate.nn.functional as F - - # input: [batch_size, seq_len, embed_dim] - x = paddle.rand(shape=(2, 4, 128), dtype="float32") - # residual: [batch_size, seq_len, embed_dim] - residual = paddle.rand(shape=(2, 4, 128), dtype="float32") - # linear bias: [embed_dim] - bias = paddle.rand(shape=[128], dtype="float32") - # output: [batch_size, seq_len, embed_dim] - output = F.fused_bias_dropout_residual_layer_norm( - x, residual, bias) - # [2, 4, 128] - print(output.shape) + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> import paddle.incubate.nn.functional as F + + >>> # input: [batch_size, seq_len, embed_dim] + >>> x = paddle.rand(shape=(2, 4, 128), dtype="float32") + >>> # residual: [batch_size, seq_len, embed_dim] + >>> residual = paddle.rand(shape=(2, 4, 128), dtype="float32") + >>> # linear bias: [embed_dim] + >>> bias = paddle.rand(shape=[128], dtype="float32") + >>> # output: [batch_size, seq_len, embed_dim] + >>> output = F.fused_bias_dropout_residual_layer_norm( + ... x, residual, bias) + >>> print(output.shape) + [2, 4, 128] """ seed = None @@ -493,35 +495,35 @@ def fused_multi_head_attention( to information from different representation subspaces. This API only support self_attention. The pseudo code is as follows: - .. code-block:: python - - residual = x - if pre_layer_norm: - out = layer_norm(x) - else: - out = x - # compute q, k, v - out = matmul(out, qkv_weight) + qkv_bias - out = transpose(out, perm=[2, 0, 3, 1, 4]) - # extract q, k and v from out - q = out[0:1,::] * (head_dim ** -0.5) - k = out[1:2,::] - v = out[2:3,::] - out = matmul(q, k, transpose_y=True) - out = out + attn_mask - out = softmax(out) - out = dropout(out) - out = matmul(out, v) - # combine heads - out = transpose(out, perm=[0, 2, 1, 3]) - # project to output - out = linear(out) - if add_residual: - out = residual + dropout(out) - else: - out = dropout(out) - if not pre_layer_norm: - out = layer_norm(out) + .. code-block:: text + + >>> residual = x + >>> if pre_layer_norm: + ... out = layer_norm(x) + ... else: + ... 
out = x + >>> # compute q, k, v + >>> out = matmul(out, qkv_weight) + qkv_bias + >>> out = transpose(out, perm=[2, 0, 3, 1, 4]) + >>> # extract q, k and v from out + >>> q = out[0:1,::] * (head_dim ** -0.5) + >>> k = out[1:2,::] + >>> v = out[2:3,::] + >>> out = matmul(q, k, transpose_y=True) + >>> out = out + attn_mask + >>> out = softmax(out) + >>> out = dropout(out) + >>> out = matmul(out, v) + >>> # combine heads + >>> out = transpose(out, perm=[0, 2, 1, 3]) + >>> # project to output + >>> out = linear(out) + >>> if add_residual: + ... out = residual + dropout(out) + ... else: + ... out = dropout(out) + >>> if not pre_layer_norm: + ... out = layer_norm(out) Parameters: @@ -581,30 +583,31 @@ def fused_multi_head_attention( .. code-block:: python - # required: gpu - import paddle - import paddle.incubate.nn.functional as F - - # input: [batch_size, seq_len, embed_dim] - x = paddle.rand(shape=(2, 4, 128), dtype="float32") - # qkv_weight: [3, num_head, head_dim, embed_dim] - qkv_weight = paddle.rand(shape=(3, 4, 32, 128), dtype="float32") - # qkv_bias: [3, num_head, head_dim] - qkv_bias = paddle.rand(shape=(3, 4, 32), dtype="float32") - # linear_weight: [embed_dim, embed_dim] - linear_weight = paddle.rand(shape=(128, 128), dtype="float32") - # linear_bias: [embed_dim] - linear_bias = paddle.rand(shape=[128], dtype="float32") - # self attention mask: [batch_size, num_heads, seq_len, seq_len] - attn_mask = paddle.rand(shape=(2, 4, 4, 4), dtype="float32") - - # output: [batch_size, seq_len, embed_dim] - output = F.fused_multi_head_attention( - x, qkv_weight, linear_weight, False, - None, None, None, None, 1e-5, qkv_bias, - linear_bias, None, attn_mask) - # [2, 4, 128] - print(output.shape) + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> import paddle.incubate.nn.functional as F + + >>> # input: [batch_size, seq_len, embed_dim] + >>> x = paddle.rand(shape=(2, 4, 128), dtype="float32") + >>> # qkv_weight: [3, num_head, head_dim, embed_dim] + >>> qkv_weight = paddle.rand(shape=(3, 4, 32, 128), dtype="float32") + >>> # qkv_bias: [3, num_head, head_dim] + >>> qkv_bias = paddle.rand(shape=(3, 4, 32), dtype="float32") + >>> # linear_weight: [embed_dim, embed_dim] + >>> linear_weight = paddle.rand(shape=(128, 128), dtype="float32") + >>> # linear_bias: [embed_dim] + >>> linear_bias = paddle.rand(shape=[128], dtype="float32") + >>> # self attention mask: [batch_size, num_heads, seq_len, seq_len] + >>> attn_mask = paddle.rand(shape=(2, 4, 4, 4), dtype="float32") + + >>> # output: [batch_size, seq_len, embed_dim] + >>> output = F.fused_multi_head_attention( + ... x, qkv_weight, linear_weight, False, + ... None, None, None, None, 1e-5, qkv_bias, + ... linear_bias, None, attn_mask) + >>> print(output.shape) + [2, 4, 128] """ seed = None @@ -906,39 +909,39 @@ def fused_multi_transformer( This operator only supports running on GPU. The function of the transformer layer is consistent with the following pseudo code: - .. code-block:: python - - if pre_layer_norm: - out = layer_norm(x) - out = qkv_linear(out) + qkv_bias - else: - out = qkv_linear(x) + qkv_bias - out = transpose(out, perm=[2, 0, 3, 1, 4]) - # extract q, k and v from out. 
- q = out[0:1, ::] - k = out[1:2, ::] - v = out[2:3, ::] - out = q * k^t - out = attn_mask + out - out = softmax(out) - out = dropout(out) - out = out * v - out = transpose(out, perm=[0, 2, 1, 3]) - out = linear(out) - if pre_layer_norm: - out = x + dropout(out + bias) - else: - out = layer_norm(x + dropout(out + bias)) - - residual = out; - if pre_layer_norm: - out = ffn_layer_norm(out) - out = ffn1_linear(out) - out = dropout(activation(out + ffn1_bias)) - out = ffn2_linear(out) - out = residual + dropout(out + ffn2_bias) - if not pre_layer_norm: - out = ffn_layer_norm(out) + .. code-block:: text + + >>> if pre_layer_norm: + ... out = layer_norm(x) + ... out = qkv_linear(out) + qkv_bias + ... else: + ... out = qkv_linear(x) + qkv_bias + >>> out = transpose(out, perm=[2, 0, 3, 1, 4]) + >>> # extract q, k and v from out. + >>> q = out[0:1, ::] + >>> k = out[1:2, ::] + >>> v = out[2:3, ::] + >>> out = q * k^t + >>> out = attn_mask + out + >>> out = softmax(out) + >>> out = dropout(out) + >>> out = out * v + >>> out = transpose(out, perm=[0, 2, 1, 3]) + >>> out = linear(out) + >>> if pre_layer_norm: + ... out = x + dropout(out + bias) + ... else: + ... out = layer_norm(x + dropout(out + bias)) + + >>> residual = out; + >>> if pre_layer_norm: + ... out = ffn_layer_norm(out) + >>> out = ffn1_linear(out) + >>> out = dropout(activation(out + ffn1_bias)) + >>> out = ffn2_linear(out) + >>> out = residual + dropout(out + ffn2_bias) + >>> if not pre_layer_norm: + ... out = ffn_layer_norm(out) Args: x (Tensor): the input tensor could be 3-D tensor, the input data type could be float16 or float32, the shape is `[batch\_size, sequence\_length, d\_model]`. @@ -996,48 +999,49 @@ def fused_multi_transformer( Examples: .. code-block:: python - # required: gpu - import paddle - import paddle.incubate.nn.functional as F - - # input: [batch_size, seq_len, embed_dim] - x = paddle.rand(shape=(2, 4, 128), dtype="float32") - - # ln_scale: [embed_dim], ln_bias: [embed_dim] - ln_scale = paddle.rand(shape=(128,), dtype="float32") - ln_bias = paddle.rand(shape=(128,), dtype="float32") - - # qkv_weight: [3, num_head, head_dim, embed_dim], qkv_bias: [3, num_head, head_dim] - qkv_weight = paddle.rand(shape=(3, 4, 32, 128), dtype="float32") - qkv_bias = paddle.rand(shape=(3, 4, 32), dtype="float32") - - # linear_weight: [embed_dim, embed_dim], linear_bias: [embed_dim] - linear_weight = paddle.rand(shape=(128, 128), dtype="float32") - linear_bias = paddle.rand(shape=(128,), dtype="float32") - - # ffn_ln_scale: [embed_dim], ffn_ln_bias: [embed_dim] - ffn_ln_scale = paddle.rand(shape=(128,), dtype="float32") - ffn_ln_bias = paddle.rand(shape=(128,), dtype="float32") - - # ffn1_weight: [embed_dim, 4*embed_dim], ffn1_bias: [4*embed_dim] - ffn1_weight = paddle.rand(shape=(128, 4*128), dtype="float32") - ffn1_bias = paddle.rand(shape=(4*128,), dtype="float32") - - # ffn2_weight: [4*embed_dim, embed_dim], ffn2_bias: [embed_dim] - ffn2_weight = paddle.rand(shape=(4*128, 128), dtype="float32") - ffn2_bias = paddle.rand(shape=(128,), dtype="float32") - - # self attention mask: [batch_size, 1, seq_len, seq_len] - attn_mask = paddle.rand(shape=(2, 1, 4, 4), dtype="float32") - - # output: [batch_size, seq_len, embed_dim] - output = F.fused_multi_transformer( - x, [ln_scale], [ln_bias], [qkv_weight], [qkv_bias], - [linear_weight], [linear_bias], [ffn_ln_scale], [ffn_ln_bias], - [ffn1_weight], [ffn1_bias], [ffn2_weight], [ffn2_bias], - attn_mask=attn_mask) - # [2, 4, 128] - print(output.shape) + >>> # doctest: +REQUIRES(env:GPU) + >>> 
import paddle + >>> paddle.device.set_device('gpu') + >>> import paddle.incubate.nn.functional as F + + >>> # input: [batch_size, seq_len, embed_dim] + >>> x = paddle.rand(shape=(2, 4, 128), dtype="float32") + + >>> # ln_scale: [embed_dim], ln_bias: [embed_dim] + >>> ln_scale = paddle.rand(shape=(128,), dtype="float32") + >>> ln_bias = paddle.rand(shape=(128,), dtype="float32") + + >>> # qkv_weight: [3, num_head, head_dim, embed_dim], qkv_bias: [3, num_head, head_dim] + >>> qkv_weight = paddle.rand(shape=(3, 4, 32, 128), dtype="float32") + >>> qkv_bias = paddle.rand(shape=(3, 4, 32), dtype="float32") + + >>> # linear_weight: [embed_dim, embed_dim], linear_bias: [embed_dim] + >>> linear_weight = paddle.rand(shape=(128, 128), dtype="float32") + >>> linear_bias = paddle.rand(shape=(128,), dtype="float32") + + >>> # ffn_ln_scale: [embed_dim], ffn_ln_bias: [embed_dim] + >>> ffn_ln_scale = paddle.rand(shape=(128,), dtype="float32") + >>> ffn_ln_bias = paddle.rand(shape=(128,), dtype="float32") + + >>> # ffn1_weight: [embed_dim, 4*embed_dim], ffn1_bias: [4*embed_dim] + >>> ffn1_weight = paddle.rand(shape=(128, 4*128), dtype="float32") + >>> ffn1_bias = paddle.rand(shape=(4*128,), dtype="float32") + + >>> # ffn2_weight: [4*embed_dim, embed_dim], ffn2_bias: [embed_dim] + >>> ffn2_weight = paddle.rand(shape=(4*128, 128), dtype="float32") + >>> ffn2_bias = paddle.rand(shape=(128,), dtype="float32") + + >>> # self attention mask: [batch_size, 1, seq_len, seq_len] + >>> attn_mask = paddle.rand(shape=(2, 1, 4, 4), dtype="float32") + + >>> # output: [batch_size, seq_len, embed_dim] + >>> output = F.fused_multi_transformer( + ... x, [ln_scale], [ln_bias], [qkv_weight], [qkv_bias], + ... [linear_weight], [linear_bias], [ffn_ln_scale], [ffn_ln_bias], + ... [ffn1_weight], [ffn1_bias], [ffn2_weight], [ffn2_bias], + ... attn_mask=attn_mask) + >>> print(output.shape) + [2, 4, 128] """ if mode not in ('downscale_in_infer', 'upscale_in_train'): raise ValueError( diff --git a/python/paddle/incubate/nn/layer/fused_dropout_nd.py b/python/paddle/incubate/nn/layer/fused_dropout_nd.py index ded171158fe3dc..09f083da88c741 100644 --- a/python/paddle/incubate/nn/layer/fused_dropout_nd.py +++ b/python/paddle/incubate/nn/layer/fused_dropout_nd.py @@ -54,6 +54,7 @@ class FusedDropout(paddle.nn.Layer): .. code-block:: python >>> import paddle + >>> paddle.seed(2023) >>> x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype="float32") >>> m = paddle.incubate.nn.FusedDropout(p=0.5) @@ -61,15 +62,15 @@ class FusedDropout(paddle.nn.Layer): >>> y_train = m(x) >>> print(y_train) Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - [[2., 0., 6.], - [0., 0., 0.]]) + [[0., 0., 6.], + [0., 0., 0.]]) >>> m.eval() # switch the model to test phase >>> y_test = m(x) >>> print(y_test) Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - [[1., 2., 3.], - [4., 5., 6.]]) + [[1., 2., 3.], + [4., 5., 6.]]) """ def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None): diff --git a/python/paddle/incubate/operators/__init__.py b/python/paddle/incubate/operators/__init__.py index e96c3641196574..653dc97ed61939 100644 --- a/python/paddle/incubate/operators/__init__.py +++ b/python/paddle/incubate/operators/__init__.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .softmax_mask_fuse_upper_triangle import ( - softmax_mask_fuse_upper_triangle, -) # noqa: F401 -from .softmax_mask_fuse import softmax_mask_fuse # noqa: F401 -from .resnet_unit import ResNetUnit # noqa: F401 -from .graph_send_recv import graph_send_recv # noqa: F401 from .graph_khop_sampler import graph_khop_sampler # noqa: F401 -from .graph_sample_neighbors import graph_sample_neighbors # noqa: F401 from .graph_reindex import graph_reindex # noqa: F401 +from .graph_sample_neighbors import graph_sample_neighbors # noqa: F401 +from .graph_send_recv import graph_send_recv # noqa: F401 +from .resnet_unit import ResNetUnit # noqa: F401 +from .softmax_mask_fuse import softmax_mask_fuse # noqa: F401 +from .softmax_mask_fuse_upper_triangle import ( # noqa: F401 + softmax_mask_fuse_upper_triangle, +) from .unzip import unzip # noqa: F401 diff --git a/python/paddle/incubate/optimizer/__init__.py b/python/paddle/incubate/optimizer/__init__.py index 4c22c410add055..ed7935eac04b35 100644 --- a/python/paddle/incubate/optimizer/__init__.py +++ b/python/paddle/incubate/optimizer/__init__.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from . import functional # noqa: F401 +from .distributed_fused_lamb import DistributedFusedLamb # noqa: F401 +from .gradient_merge import GradientMergeOptimizer # noqa: F401 +from .lars_momentum import LarsMomentumOptimizer # noqa: F401 +from .lbfgs import LBFGS from .lookahead import LookAhead # noqa: F401 from .modelaverage import ModelAverage # noqa: F401 -from .lars_momentum import LarsMomentumOptimizer # noqa: F401 -from .recompute import RecomputeOptimizer # noqa: F401 from .pipeline import PipelineOptimizer # noqa: F401 -from .gradient_merge import GradientMergeOptimizer # noqa: F401 -from .distributed_fused_lamb import DistributedFusedLamb # noqa: F401 -from .lbfgs import LBFGS # noqa: F401 -from . import functional # noqa: F401 +from .recompute import RecomputeOptimizer # noqa: F401 __all__ = ['LBFGS'] diff --git a/python/paddle/incubate/optimizer/functional/__init__.py b/python/paddle/incubate/optimizer/functional/__init__.py index fc863a923d88ff..a631b85363c092 100644 --- a/python/paddle/incubate/optimizer/functional/__init__.py +++ b/python/paddle/incubate/optimizer/functional/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .bfgs import minimize_bfgs # noqa: F401 -from .lbfgs import minimize_lbfgs # noqa: F401 +from .bfgs import minimize_bfgs +from .lbfgs import minimize_lbfgs __all__ = ['minimize_bfgs', 'minimize_lbfgs'] diff --git a/python/paddle/incubate/optimizer/lookahead.py b/python/paddle/incubate/optimizer/lookahead.py index cb6e10fefc61e1..821b5c3ce036c6 100644 --- a/python/paddle/incubate/optimizer/lookahead.py +++ b/python/paddle/incubate/optimizer/lookahead.py @@ -252,7 +252,7 @@ def minimize( loss (Tensor): A ``Tensor`` containing the value to minimize. startup_program (Program, optional): :ref:`api_paddle_static_Program` for initializing parameters in ``parameters``. The default value - is None, at this time :ref:`api_base_default_startup_program` will be used. + is None, at this time :ref:`api_paddle_static_default_startup_program` will be used. parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update to minimize ``loss``. The default value is None, at this time all parameters will be updated. 
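For context on the `minimize` interface documented in the lookahead.py hunk above, it is normally driven from dygraph as in the minimal sketch below; the toy `paddle.nn.Linear` model, the random batch, and the slow-weight settings `alpha=0.5, k=5` are illustrative assumptions, not part of this patch:

import paddle
from paddle.incubate.optimizer import LookAhead

# Toy model plus an inner SGD optimizer; LookAhead keeps a second set of
# "slow" weights that is synced every k steps by interpolating with alpha.
layer = paddle.nn.Linear(4, 1)
inner = paddle.optimizer.SGD(learning_rate=0.1, parameters=layer.parameters())
lookahead = LookAhead(inner, alpha=0.5, k=5)

x = paddle.rand([8, 4])
loss = paddle.mean(layer(x))
loss.backward()
# In dygraph mode minimize() consumes the gradients produced by backward();
# the startup_program argument is only meaningful under static graph.
lookahead.minimize(loss)
lookahead.clear_grad()

Under static graph, the same call would instead take the documented startup_program/parameters arguments and run through an Executor.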
diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py index ecab0f307304d3..8de533f9f0a4b2 100644 --- a/python/paddle/incubate/optimizer/modelaverage.py +++ b/python/paddle/incubate/optimizer/modelaverage.py @@ -302,7 +302,7 @@ def minimize( loss (Tensor): A ``Tensor`` containing the value to minimize. startup_program (Program, optional): :ref:`api_paddle_static_Program` for initializing parameters in ``parameters``. The default value - is None, at this time :ref:`api_base_default_startup_program` will be used. + is None, at this time :ref:`api_paddle_static_default_startup_program` will be used. parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update to minimize ``loss``. The default value is None, at this time all parameters will be updated. diff --git a/python/paddle/incubate/optimizer/pipeline.py b/python/paddle/incubate/optimizer/pipeline.py index 6c0e80b1f57104..b7ae315576d791 100644 --- a/python/paddle/incubate/optimizer/pipeline.py +++ b/python/paddle/incubate/optimizer/pipeline.py @@ -48,47 +48,47 @@ class PipelineOptimizer: Examples: .. code-block:: python - import paddle - import paddle.base as base - import paddle.base.layers as layers - import numpy as np - - paddle.enable_static() - with base.device_guard("gpu:0"): - x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=0) - y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=0) - data_loader = base.io.DataLoader.from_generator( - feed_list=[x, y], - capacity=64, - use_double_buffer=True, - iterable=False) - - emb_x = layers.embedding(input=x, param_attr=base.ParamAttr(name="embx"), size=[10,2], is_sparse=False) - emb_y = layers.embedding(input=y, param_attr=base.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False) - - with base.device_guard("gpu:1"): - concat = layers.concat([emb_x, emb_y], axis=1) - fc = paddle.static.nn.fc(x=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False) - loss = paddle.mean(fc) - optimizer = paddle.optimizer.SGD(learning_rate=0.5) - optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer) - optimizer.minimize(loss) - - def train_reader(): - for _ in range(4): - x = np.random.random(size=[1]).astype('int64') - y = np.random.random(size=[1]).astype('int64') - yield x, y - data_loader.set_sample_generator(train_reader, batch_size=1) - - place = base.CUDAPlace(0) - exe = base.Executor(place) - exe.run(base.default_startup_program()) - batch_size = 1 - data_loader.start() - exe.train_from_dataset( - base.default_main_program()) - data_loader.reset() + >>> import paddle + >>> import paddle.base as base + >>> import paddle.base.layers as layers + >>> import numpy as np + + >>> paddle.enable_static() + >>> with base.device_guard("gpu:0"): + ... x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=0) + ... y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=0) + ... data_loader = base.io.DataLoader.from_generator( + ... feed_list=[x, y], + ... capacity=64, + ... use_double_buffer=True, + ... iterable=False) + ... emb_x = layers.embedding(input=x, param_attr=base.ParamAttr(name="embx"), size=[10,2], is_sparse=False) + ... emb_y = layers.embedding(input=y, param_attr=base.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False) + + >>> with base.device_guard("gpu:1"): + ... concat = layers.concat([emb_x, emb_y], axis=1) + ...
fc = paddle.static.nn.fc(x=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False) + ... loss = paddle.mean(fc) + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.5) + >>> optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer) + >>> optimizer.minimize(loss) + + >>> def train_reader(): + ... for _ in range(4): + ... x = np.random.random(size=[1]).astype('int64') + ... y = np.random.random(size=[1]).astype('int64') + ... yield x, y + >>> data_loader.set_sample_generator(train_reader, batch_size=1) + + >>> place = paddle.CUDAPlace(0) + >>> exe = paddle.static.Executor(place) + >>> exe.run(paddle.static.default_startup_program()) + >>> batch_size = 1 + >>> data_loader.start() + >>> exe.train_from_dataset( + ... paddle.static.default_main_program()) + >>> data_loader.reset() """ def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): diff --git a/python/paddle/incubate/optimizer/recompute.py b/python/paddle/incubate/optimizer/recompute.py index 9cbd8894f18897..2545115fa0d015 100644 --- a/python/paddle/incubate/optimizer/recompute.py +++ b/python/paddle/incubate/optimizer/recompute.py @@ -49,45 +49,57 @@ class RecomputeOptimizer(Optimizer): Examples: .. code-block:: python - import paddle - import paddle.base as base - import numpy as np - - paddle.enable_static() - - def gen_data(): - return {"x": np.random.random(size=(32, 32)).astype('float32'), - "y": np.random.randint(2, size=(32, 1)).astype('int64')} - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - print(input_x) - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - - sgd = paddle.optimizer.Adam(learning_rate=0.01) - sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - sgd.minimize(cost) - - print("Finished optimize") - place = base.CPUPlace() - exe = base.Executor(place) - exe.run(base.default_startup_program()) - step = 10 - - for i in range(step): - cost_val = exe.run(feed=gen_data(), - program=base.default_main_program(), - fetch_list=[cost.name]) - print("step=%d cost=%f" % (i, cost_val[0])) + >>> import paddle + >>> import numpy as np + + >>> paddle.enable_static() + + >>> def gen_data(): + ... return {"x": np.random.random(size=(32, 32)).astype('float32'), + ... "y": np.random.randint(2, size=(32, 1)).astype('int64')} + >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): + ... print(input_x) + ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... 
return sum_cost, fc_1, prediction + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> sgd.minimize(cost) + + >>> print("Finished optimize") + Finished optimize + >>> place = paddle.CPUPlace() + >>> exe = paddle.static.Executor(place) + >>> exe.run(paddle.static.default_startup_program()) + >>> step = 10 + + >>> for i in range(step): + ... cost_val = exe.run(feed=gen_data(), + ... program=paddle.static.default_main_program(), + ... fetch_list=[cost.name]) + ... print("step=%d cost=%f" % (i, cost_val[0])) + var x : LOD_TENSOR.shape(-1, 32).dtype(float32).stop_gradient(True) + Finished optimize + step=0 cost=0.737203 + step=1 cost=1.308077 + step=2 cost=0.768422 + step=3 cost=1.239475 + step=4 cost=0.882643 + step=5 cost=0.738027 + step=6 cost=0.819374 + step=7 cost=0.818534 + step=8 cost=0.753692 + step=9 cost=0.787448 """ @@ -132,33 +144,34 @@ def load(self, state_dict): Examples: .. code-block:: python - import paddle - import paddle.base as base - - paddle.enable_static() - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(learning_rate=0.01) - sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - try: - state_dict = {} - sgd.load(state_dict) - except NotImplementedError as e: - print(e) + >>> import paddle + + >>> paddle.enable_static() + >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): + ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... return sum_cost, fc_1, prediction + + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + >>> print("Finished FF") + Finished FF + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> try: + ... state_dict = {} + ... sgd.load(state_dict) + ... except NotImplementedError as e: + ... print(e) + load function is not supported by Recompute Optimizer for now """ raise NotImplementedError( "load function is not supported by Recompute Optimizer for now" @@ -177,42 +190,42 @@ def apply_gradients(self, params_grads): Examples: ..
code-block:: python - import paddle - import paddle.base as base - import paddle.base.framework as framework - - paddle.enable_static() - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(learning_rate=0.01) - sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - params_grads = sgd.backward( - cost, - startup_program=None, - parameter_list=None, - no_grad_set=None) - - program = cost.block.program - with framework.program_guard(program, None): - optimize_ops = sgd.apply_gradients(params_grads) - - print("Finished apply gradients") + >>> import paddle + >>> import paddle.base.framework as framework + + >>> paddle.enable_static() + + >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): + ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... return sum_cost, fc_1, prediction + + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + >>> print("Finished FF") + Finished FF + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> params_grads = sgd.backward( + ... cost, + ... startup_program=None, + ... parameter_list=None, + ... no_grad_set=None) + + >>> program = cost.block.program + >>> with framework.program_guard(program, None): + ... optimize_ops = sgd.apply_gradients(params_grads) + + >>> print("Finished apply gradients") + Finished apply gradients """ return self._optimizer.apply_gradients(params_grads=params_grads) @@ -651,36 +664,36 @@ def backward( Examples: .. 
code-block:: python - import paddle - import paddle.base as base - - paddle.enable_static() - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(learning_rate=0.01) - sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - params_grads = sgd.backward( - cost, - startup_program=None, - parameter_list=None, - no_grad_set=None) - print("Finished backward") + >>> import paddle + + >>> paddle.enable_static() + + >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): + ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... return sum_cost, fc_1, prediction + + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + >>> print("Finished FF") + Finished FF + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> params_grads = sgd.backward( + ... cost, + ... startup_program=None, + ... parameter_list=None, + ... no_grad_set=None) + >>> print("Finished backward") + Finished backward """ assert ( self._checkpoints is not None @@ -733,39 +746,41 @@ def apply_optimize(self, loss, startup_program, params_grads): params_grads (list): list of (param, grad) pair to do optimization. Examples: .. code-block:: python - import paddle - import paddle.base as base - paddle.enable_static() - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(learning_rate=0.01) - sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - params_grads = sgd.backward( - cost, - startup_program=None, - parameter_list=None, - no_grad_set=None) - - optimize_ops = sgd.apply_optimize( - cost, startup_program=None, params_grads=params_grads) - - print("Finished apply_optimize") + >>> import paddle + + >>> paddle.enable_static() + + >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): + ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... 
prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... return sum_cost, fc_1, prediction + + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + >>> print("Finished FF") + Finished FF + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> params_grads = sgd.backward( + ... cost, + ... startup_program=None, + ... parameter_list=None, + ... no_grad_set=None) + + >>> optimize_ops = sgd.apply_optimize( + ... cost, startup_program=None, params_grads=params_grads) + + >>> print("Finished apply_optimize") + Finished apply_optimize """ func = ( diff --git a/python/paddle/inference/__init__.py b/python/paddle/inference/__init__.py index f59c5990573dbf..09e1a37e8c1013 100644 --- a/python/paddle/inference/__init__.py +++ b/python/paddle/inference/__init__.py @@ -31,9 +31,10 @@ get_num_bytes_of_data_type, PredictorPool, XpuConfig, + InternalUtils, ) -__all__ = [ # noqa +__all__ = [ 'Config', 'DataType', 'PlaceType', diff --git a/python/paddle/inference/wrapper.py b/python/paddle/inference/wrapper.py index 51095647fee2fe..5d8da4268d8187 100644 --- a/python/paddle/inference/wrapper.py +++ b/python/paddle/inference/wrapper.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import os from typing import Set @@ -27,6 +28,11 @@ PaddlePlace, convert_to_mixed_precision_bind, ) +from paddle.base.log_helper import get_logger + +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' +) DataType = PaddleDType PlaceType = PaddlePlace @@ -96,6 +102,11 @@ def convert_to_mixed_precision( kwargs: Supported keys including 'white_list'. - white_list: Operators that do convert precision. 
''' + if backend is PlaceType.GPU and not core.is_compiled_with_cuda(): + _logger.error( + "You should use PaddlePaddle compiled with GPU when backend is set to PlaceType.GPU" + ) + + mixed_model_dirname = os.path.dirname(mixed_model_file) # Support mixed_params_file is empty, because some models don't have params, but convert_to_mixed_precision will call # constant_folding_pass, it will generate a new params file to save persistable vars, which is saved in the same diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py index 6c2e0dae678347..8d9a1909f07ca2 100755 --- a/python/paddle/io/__init__.py +++ b/python/paddle/io/__init__.py @@ -14,23 +14,25 @@ # TODO: define all functions about input & output in this directory -from .reader import DataLoader # noqa: F401 -from .dataloader import Dataset # noqa: F401 -from .dataloader import IterableDataset # noqa: F401 -from .dataloader import BatchSampler # noqa: F401 -from .dataloader import get_worker_info # noqa: F401 -from .dataloader import TensorDataset # noqa: F401 -from .dataloader import Sampler # noqa: F401 -from .dataloader import SequenceSampler # noqa: F401 -from .dataloader import RandomSampler # noqa: F401 -from .dataloader import DistributedBatchSampler # noqa: F401 -from .dataloader import ComposeDataset # noqa: F401 -from .dataloader import ChainDataset # noqa: F401 -from .dataloader import WeightedRandomSampler # noqa: F401 -from .dataloader import Subset # noqa: F401 -from .dataloader import random_split # noqa: F401 +from .dataloader import ( + BatchSampler, + ChainDataset, + ComposeDataset, + Dataset, + DistributedBatchSampler, + IterableDataset, + RandomSampler, + Sampler, + SequenceSampler, + Subset, + TensorDataset, + WeightedRandomSampler, + get_worker_info, + random_split, +) +from .reader import DataLoader -__all__ = [ # noqa +__all__ = [ 'Dataset', 'IterableDataset', 'TensorDataset', diff --git a/python/paddle/jit/__init__.py b/python/paddle/jit/__init__.py index f508f72478b00b..37b1203e68e631 100644 --- a/python/paddle/jit/__init__.py +++ b/python/paddle/jit/__init__.py @@ -23,7 +23,7 @@ from .dy2static.logging_utils import set_code_level, set_verbosity from .translated_layer import TranslatedLayer -__all__ = [ # noqa +__all__ = [ 'save', 'load', 'to_static', diff --git a/python/paddle/jit/api.py b/python/paddle/jit/api.py index 3873a150b6ab47..81c18b03c7381e 100644 --- a/python/paddle/jit/api.py +++ b/python/paddle/jit/api.py @@ -15,17 +15,17 @@ # Temporary disable isort to avoid circular import # This can be removed after the circular import is resolved -# isort: skip_file from __future__ import annotations +import inspect import os import pickle +import sys +import threading +import types import warnings from collections import OrderedDict -import inspect -import threading from typing import Any -import types import paddle from paddle.base import core, dygraph @@ -39,43 +39,40 @@ program_desc_tracing_guard, switch_to_static_graph, ) -from .dy2static import logging_utils -from .dy2static.convert_call_func import ( - ConversionOptions, - add_ignore_module, -) -from .dy2static.program_translator import ( - ProgramTranslator, - StaticFunction, - ASTStaticFunction, - SymbolicStaticFunction, - unwrap_decorators, -) -from paddle.jit.translated_layer import ( - TranslatedLayer, - INFER_MODEL_SUFFIX, - INFER_PARAMS_SUFFIX, - INFER_PARAMS_INFO_SUFFIX, - INFER_PROPERTY_SUFFIX, -) -from paddle.nn import Layer from paddle.base.executor import Executor, scope_guard from paddle.base.framework import ( Block, +
EagerParamBase, + Parameter, Program, Variable, - Parameter, - EagerParamBase, -) -from paddle.base.framework import ( _current_expected_place, _dygraph_guard, _dygraph_tracer, + dygraph_only, ) -from paddle.base.framework import dygraph_only from paddle.base.wrapped_decorator import wrap_decorator -from paddle.static.io import save_inference_model from paddle.framework import in_dynamic_mode +from paddle.nn import Layer +from paddle.static.io import save_inference_model + +from .dy2static import logging_utils +from .dy2static.convert_call_func import ConversionOptions, add_ignore_module +from .dy2static.program_translator import ( + ASTStaticFunction, + ProgramTranslator, + StaticFunction, + SymbolicStaticFunction, + convert_to_static, + unwrap_decorators, +) +from .translated_layer import ( + INFER_MODEL_SUFFIX, + INFER_PARAMS_INFO_SUFFIX, + INFER_PARAMS_SUFFIX, + INFER_PROPERTY_SUFFIX, + TranslatedLayer, +) def create_program_from_desc(program_desc): @@ -165,7 +162,7 @@ def __impl__(*args, **kwargs): "We will just return dygraph output." ) return dygraph_func(*args, **kwargs) - static_func = program_translator.get_func(dygraph_func) + static_func = convert_to_static(dygraph_func) return static_func(*args, **kwargs) return __impl__ @@ -236,28 +233,31 @@ def to_static( input_spec=None, build_strategy=None, backend=None, - enable_fallback=None, **kwargs, ): """ - Converts imperative dygraph APIs into declarative function APIs. Decorator + Converts dynamic graph APIs into static graph function APIs. Decorator @to_static handles the Program and Executor of static graph mode and returns - the result as dygraph Tensor(s). Users could use the returned dygraph - Tensor(s) to do imperative training, inference, or other operations. If the - decorated function calls other imperative function, the called one will be - converted into declarative function as well. + the result as dynamic graph Tensor(s). Users could use the returned dynamic + graph Tensor(s) to do dynamic graph training, inference, or other operations. + If the decorated function calls another dynamic graph function, the called one + will be converted into a static graph function as well. + Args: - function (callable): callable imperative function. - input_spec(list[InputSpec]|tuple[InputSpec]): list/tuple of InputSpec to specific the shape/dtype/name - information of each input Tensor. - build_strategy(BuildStrategy|None): This argument is used to compile the + function (callable): Callable dynamic graph function. If it is used as a + decorator, the decorated function will be parsed as this parameter. + input_spec (list[InputSpec]|tuple[InputSpec]): list/tuple of InputSpec to + specify the shape/dtype/name information of each input Tensor. + build_strategy (BuildStrategy|None): This argument is used to compile the converted program with the specified options, such as operators' fusion in the computational graph and memory optimization during the execution of the computational graph. For more information about build_strategy, please refer to :code:`paddle.static.BuildStrategy`. The default is None. - backend(str, Optional): Specifies compilation backend, which can be `CINN` or None. When backend is `CINN`, CINN compiler will be used to speed up training and inference. - kwargs: Support keys including `property`, set `property` to True if the fucntion is python property. - + backend(str, Optional): Specifies compilation backend, which can be `CINN` or + None.
When backend is `CINN`, CINN compiler will be used to speed up + training and inference. + kwargs: Support keys including `property`, set `property` to True if the function + is a Python property. Returns: Tensor(s): containing the numerical result. @@ -285,24 +285,31 @@ def to_static( """ property = kwargs.get("property", False) + full_graph = kwargs.get("full_graph", None) def decorated(python_func): """ Decorates a python function into a ASTStaticFunction object. """ - nonlocal enable_fallback - if enable_fallback is None: + nonlocal full_graph + if full_graph is None: flag = os.environ.get("ENABLE_FALL_BACK", None) - if flag == "True": - enable_fallback = True - else: # None or True - enable_fallback = False + if flag == "True" or flag is None: + full_graph = False + else: # False + full_graph = True + + if sys.version_info >= (3, 12) and not full_graph: + warnings.warn( + "full_graph=False is not supported in Python 3.12+. Set full_graph=True automatically" + ) + full_graph = True - StaticClass = StaticFunctionClass = { - True: SymbolicStaticFunction, - False: ASTStaticFunction, - }[enable_fallback] + StaticClass = { + False: SymbolicStaticFunction, + True: ASTStaticFunction, + }[full_graph] # Step 1. unwrap the function if it is already decorated. _, python_func = unwrap_decorators(python_func) @@ -1108,7 +1115,7 @@ def save(layer, path, input_spec=None, **configs): static_forward = to_static( inner_layer.forward, input_spec=inner_input_spec, - enable_fallback=False, + full_graph=True, ) concrete_program = ( static_forward.concrete_program_specify_input_spec( @@ -1146,7 +1153,7 @@ def save(layer, path, input_spec=None, **configs): static_function = to_static( static_func, input_spec=inner_input_spec, - enable_fallback=False, + full_graph=True, ) concrete_program = static_function.concrete_program diff --git a/python/paddle/jit/dy2static/__init__.py b/python/paddle/jit/dy2static/__init__.py index 136ab6dab7d401..522814d2d293ce 100644 --- a/python/paddle/jit/dy2static/__init__.py +++ b/python/paddle/jit/dy2static/__init__.py @@ -12,28 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License.
-from .utils import saw, UndefinedVar, ast_to_source_code -from .convert_operators import convert_logical_and as And # noqa: F401 -from .convert_operators import convert_var_dtype as AsDtype # noqa: F401 -from .convert_operators import convert_assert as Assert # noqa: F401 +from .assert_transformer import AssertTransformer # noqa: F401 +from .ast_transformer import DygraphToStaticAst # noqa: F401 from .convert_call_func import convert_call as Call # noqa: F401 +from .convert_operators import convert_assert as Assert # noqa: F401 +from .convert_operators import convert_attr as Attr # noqa: F401 from .convert_operators import convert_ifelse as IfElse # noqa: F401 from .convert_operators import convert_len as Len # noqa: F401 +from .convert_operators import convert_load as Ld # noqa: F401 +from .convert_operators import convert_logical_and as And # noqa: F401 from .convert_operators import convert_logical_not as Not # noqa: F401 from .convert_operators import convert_logical_or as Or # noqa: F401 from .convert_operators import convert_pop as Pop # noqa: F401 from .convert_operators import convert_shape as Shape # noqa: F401 +from .convert_operators import convert_shape_compare # noqa: F401 +from .convert_operators import convert_var_dtype as AsDtype # noqa: F401 from .convert_operators import convert_while_loop as While # noqa: F401 -from .convert_operators import unpack_by_structure as Unpack # noqa: F401 -from .convert_operators import convert_attr as Attr # noqa: F401 -from .convert_operators import convert_load as Ld # noqa: F401 from .convert_operators import indexable as Indexable # noqa: F401 -from .variable_trans_func import create_bool_as_type # noqa: F401 -from .variable_trans_func import to_static_variable # noqa: F401 -from .convert_operators import convert_shape_compare # noqa: F401 -from .assert_transformer import AssertTransformer -from .ast_transformer import DygraphToStaticAst -from .program_translator import convert_to_static -from .static_analysis import NodeVarType, StaticAnalysisVisitor +from .convert_operators import unpack_by_structure as Unpack # noqa: F401 +from .program_translator import convert_to_static # noqa: F401 +from .static_analysis import NodeVarType, StaticAnalysisVisitor # noqa: F401 +from .utils import UndefinedVar, ast_to_source_code, saw # noqa: F401 +from .variable_trans_func import ( # noqa: F401 + create_bool_as_type, + to_static_variable, +) __all__ = [] diff --git a/python/paddle/jit/dy2static/newir_partial_program.py b/python/paddle/jit/dy2static/newir_partial_program.py index 198cc105b3ec14..af8d826c87018b 100644 --- a/python/paddle/jit/dy2static/newir_partial_program.py +++ b/python/paddle/jit/dy2static/newir_partial_program.py @@ -13,7 +13,6 @@ # limitations under the License. 
import itertools -import os from copy import deepcopy import numpy as np @@ -27,7 +26,7 @@ from paddle.base.compiler import BuildStrategy from paddle.base.data_feeder import check_type, convert_dtype from paddle.base.dygraph.base import switch_to_static_graph -from paddle.base.framework import _apply_pass +from paddle.base.framework import _apply_pass, get_flags from paddle.framework import use_pir_api from paddle.optimizer.lr import LRScheduler from paddle.pir import OpResult, fake_op_result, is_fake_op_result @@ -862,7 +861,9 @@ def _apply_inplace_pass(self, forward_program, backward_program): "mem_opt_skip_vars": forward_mem_opt_skip_vars, "for_partial_block": True, } - if not os.getenv("FLAGS_enable_new_ir_in_executor"): + if not get_flags('FLAGS_enable_new_ir_in_executor')[ + 'FLAGS_enable_new_ir_in_executor' + ]: _apply_pass( forward_program, empty_startup_program, @@ -876,7 +877,9 @@ def _apply_inplace_pass(self, forward_program, backward_program): "mem_opt_skip_vars": backward_mem_opt_skip_vars, "for_partial_block": True, } - if not os.getenv("FLAGS_enable_new_ir_in_executor"): + if not get_flags('FLAGS_enable_new_ir_in_executor')[ + 'FLAGS_enable_new_ir_in_executor' + ]: _apply_pass( backward_program, empty_startup_program, diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 04255140ae9ca4..f890e1eb7d0233 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from copy import deepcopy import numpy as np @@ -24,7 +23,7 @@ from paddle.base.compiler import BuildStrategy from paddle.base.data_feeder import check_type, convert_dtype from paddle.base.dygraph.base import switch_to_static_graph -from paddle.base.framework import _apply_pass +from paddle.base.framework import _apply_pass, get_flags from paddle.base.unique_name import guard as UniqueNameGuard from paddle.optimizer.lr import LRScheduler @@ -839,7 +838,9 @@ def _apply_inplace_pass(self, forward_program, backward_program): "mem_opt_skip_vars": forward_mem_opt_skip_vars, "for_partial_block": True, } - if not os.getenv("FLAGS_enable_new_ir_in_executor"): + if not get_flags('FLAGS_enable_new_ir_in_executor')[ + 'FLAGS_enable_new_ir_in_executor' + ]: _apply_pass( forward_program, empty_startup_program, @@ -853,7 +854,9 @@ def _apply_inplace_pass(self, forward_program, backward_program): "mem_opt_skip_vars": backward_mem_opt_skip_vars, "for_partial_block": True, } - if not os.getenv("FLAGS_enable_new_ir_in_executor"): + if not get_flags('FLAGS_enable_new_ir_in_executor')[ + 'FLAGS_enable_new_ir_in_executor' + ]: _apply_pass( backward_program, empty_startup_program, diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index 642b4c8b9529e8..95a9c2c9fdc91e 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -15,7 +15,6 @@ import collections import inspect import os -import textwrap import threading import warnings import weakref @@ -56,7 +55,6 @@ ALREADY_D2S, NO_SHAPE_VAR_TYPE, ast_to_func, - ast_to_source_code, backend_guard, func_to_source_code, input_specs_compatible, @@ -680,8 +678,8 @@ def function_spec(self): def raise_error_template(func_str): def _raise_error(*args, **kwargs): error_template = ( - "Can't call {func} when enable_fallback=True." 
- "Use paddle.jit.to_static(enable_fallback=False) instead." + "Can't call {func} when full_graph=False. " + "Use paddle.jit.to_static(full_graph=True) instead." ) raise RuntimeError(error_template.format(func=func_str)) @@ -692,13 +690,15 @@ class SymbolicStaticFunction(StaticFunction): def __init__(self, function, input_spec=None, **kwargs): if input_spec is not None: warnings.warn( - "\nSymbolic Trace don't support input_spec arguments. It will Will not produce any effect.\n" - "1. You can disable fallback mode by `paddle.jit.to_static(enable_fallback=False)` to switch to AST to static, then you can assign input spec.\n" + "full_graph=False don't support input_spec arguments. It will not produce any effect.\n" + "You can set full_graph=True, then you can assign input spec.\n" ) super().__init__(function, input_spec, **kwargs) self.last_call_input_spec = None def _perform_call(self, *args, **kwargs): + from ..sot import symbolic_translate + args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs) ( input_args_with_spec, @@ -706,16 +706,6 @@ def _perform_call(self, *args, **kwargs): ) = self._function_spec.args_to_input_spec(args, kwargs) self.last_call_input_spec = input_args_with_spec - try: - from sot import symbolic_translate - except: - import os - - os.system( - "pip install git+https://github.com/PaddlePaddle/PaddleSOT@develop" - ) - from sot import symbolic_translate - build_strategy = self._kwargs.get("build_strategy", None) backend = self._kwargs.get("backend", None) traced_fun = symbolic_translate( @@ -1770,37 +1760,6 @@ def __init__(self): self.enable_to_static = True def enable(self, enable_to_static): - """ - Enable or disable the converting from imperative to static graph by - ProgramTranslator globally. - - Args: - enable_to_static (bool): True or False to enable or disable converting to static. - - Returns: - None. - - Examples: - .. code-block:: python - - >>> # doctest: +SKIP('`paddle.jit.to_static` can not run in xdoctest') - >>> import paddle - >>> def func(x): - ... if paddle.mean(x) > 0: - ... x_v = x - 1 - ... else: - ... x_v = x + 1 - ... return x_v - ... - ... - >>> prog_trans = paddle.jit.dy2static.program_translator.ProgramTranslator() - - >>> x = paddle.ones([1, 2]) - >>> x_v = prog_trans.get_output(func, x) - >>> print(x_v) - Tensor(shape=[1, 2], dtype=float32, place=Place(cpu), stop_gradient=True, - [[0., 0.]]) - """ check_type( enable_to_static, "enable_to_static", @@ -1809,274 +1768,6 @@ def enable(self, enable_to_static): ) self.enable_to_static = enable_to_static - def get_output(self, dygraph_func, *args, **kwargs): - """ - Returns the output dygraph Tensor for dygraph function. The dygraph - function will be translated into static graph function so the under - beneath numerical result will be calculated by static graph mode. - - Args: - dygraph_func (callable): the dygraph function. - *args (tuple): the input argument of dygraph_func. - **kwargs (dict): the input argument of dygraph_func. - - Returns: - Tensor or tuple of Tensors: the dygraph Tensor containing digital result. - - Examples: - .. code-block:: python - - >>> # doctest: +SKIP('`paddle.jit.to_static` can not run in xdoctest') - >>> import paddle - >>> def func(x): - ... if paddle.mean(x) > 0: - ... x_v = x - 1 - ... else: - ... x_v = x + 1 - ... return x_v - ... - ... 
- >>> prog_trans = paddle.jit.dy2static.program_translator.ProgramTranslator() - - >>> x = paddle.ones([1, 2]) - >>> x_v = prog_trans.get_output(func, x) - >>> print(x_v) - Tensor(shape=[1, 2], dtype=float32, place=Place(cpu), stop_gradient=True, - [[0., 0.]]) - """ - assert callable( - dygraph_func - ), "Input dygraph_func is not a callable in ProgramTranslator.get_output" - - if not self.enable_to_static: - # Here calls `warnings.warn` but not `logging_utils.warn` because by default warnings.warn(message) - # will show up **only once**. - logging_utils.warn( - "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable to False. " - "We will just return dygraph output. " - "Please call ProgramTranslator.enable(True) if you would like to get static output." - ) - return dygraph_func(*args, **kwargs) - try: - function_spec = FunctionSpec(dygraph_func) - cache_key = CacheKey.from_func_and_args( - function_spec, - args, - kwargs, - getattr(dygraph_func, '__self__', None), - ) - _, partial_program_layer = self._program_cache[cache_key] - - if args and isinstance(args[0], layers.Layer): - # Synchronize self.training attribute. - partial_program_layer.training = args[0].training - args = args[1:] - try: - return partial_program_layer(args) - except BaseException as e: - # NOTE: - # 1. If e is raised in compile time, e should have been attached to ERROR_DATA before; - # 2. If e raised in runtime, e should be attached to ERROR_DATA here. - if not hasattr(e, error.ERROR_DATA): - # runtime error - error.attach_error_data(e, in_runtime=True) - raise - except BaseException as e: - error_data = getattr(e, error.ERROR_DATA, None) - if error_data: - error_data.raise_new_exception() - else: - logging_utils.warn( - "Please file an issue at 'https://github.com/PaddlePaddle/Paddle/issues'" - " if you can't handle this {} yourself.".format(type(e)) - ) - raise e - - def get_func(self, dygraph_func): - """ - Returns a callable function which converts imperative dygraph APIs of - the input dygraph_func into declarative net-building APIs, which means - it doesn't return immediate digital result as get_output does. - Users should handle Program and Executor by themselves. - - Args: - dygraph_func (callable): the dygraph function. - - Returns: - callable: converting imperative dygraph APIs into declarative - net-building APIs. - - Examples: - .. code-block:: python - - >>> # doctest: +SKIP('`paddle.jit.to_static` can not run in xdoctest') - >>> import paddle - >>> def func(x): - ... if paddle.mean(x) > 0: - ... x_v = x - 1 - ... else: - ... x_v = x + 1 - ... return x_v - ... - >>> prog_trans = paddle.jit.dy2static.program_translator.ProgramTranslator() - >>> static_func = prog_trans.get_func(func) - >>> print(callable(static_func)) - True - """ - assert callable( - dygraph_func - ), "Input dygraph_func is not a callable in ProgramTranslator.get_func" - - if not self.enable_to_static: - logging_utils.warn( - "The ProgramTranslator.get_func doesn't work when setting ProgramTranslator.enable to False. We will " - "just return dygraph output. Please call ProgramTranslator.enable(True) if you would like to get static output." - ) - return dygraph_func - - static_func = convert_to_static(dygraph_func) - return static_func - - def get_program(self, dygraph_func, *args, **kwargs): - """ - Returns the translated static program and input/output Tensors from - dygraph function. The users can use the program to run by executor. - - Args: - dygraph_func (callable): the dygraph function. 
- *args (tuple): the input argument of dygraph_func. - **kwargs (dict): the input argument of dygraph_func. - - Returns: - tuple of (main_program, startup_program, inputs, outputs) whose - types are (Program, Program, list of Tensors, list of Tensors). - main_program: the converted main program. - startup_program: the converted startup program. - inputs: list of input Tensors which need to be fed. - outputs: list of output Tensors which users can fetch. - - Examples: - .. code-block:: python - - >>> # doctest: +SKIP('`paddle.jit.to_static` can not run in xdoctest') - >>> import paddle - >>> def func(x): - ... if paddle.mean(x) > 0: - ... x_v = x - 1 - ... else: - ... x_v = x + 1 - ... return x_v - ... - >>> prog_trans = paddle.jit.dy2static.program_translator.ProgramTranslator() - >>> x = paddle.ones([1, 2]) - >>> main_prog, start_prog, inputs, outputs = prog_trans.get_program(func, x) - >>> print([i.name for i in inputs]) - >>> # [u'generated_tensor_0'] the feed input Tensor name representing x - >>> print([o.name for o in outputs]) - >>> # [u'_generated_var_4'] the fetch output Tensor name representing x_v - """ - assert callable( - dygraph_func - ), "Input dygraph_func is not a callable in ProgramTranslator.get_program" - - if not self.enable_to_static: - logging_utils.warn( - "The ProgramTranslator.get_program doesn't work when setting ProgramTranslator.enable to False." - "We will just return dygraph output. " - "Please call ProgramTranslator.enable(True) if you would like to get static output." - ) - return dygraph_func(*args, **kwargs) - - function_spec = FunctionSpec(dygraph_func) - cache_key = CacheKey.from_func_and_args( - function_spec, args, kwargs, getattr(dygraph_func, '__self__', None) - ) - concrete_program, partial_program_layer = self._program_cache[cache_key] - - # Note: concrete_program hold all input/output infos include non-Variable - input_vars = [ - var - for var in concrete_program.inputs - if isinstance(var, framework.Variable) - ] - output_vars = [ - var - for var in concrete_program.outputs - if isinstance(var, framework.Variable) - ] - - return ( - concrete_program.main_program, - concrete_program.startup_program, - input_vars, - output_vars, - ) - - def get_code(self, dygraph_func): - """ - Returns the translated static function string code from dygraph function. - - Args: - dygraph_func (callable): the dygraph function. - - Returns: - str: the string code of translated static function. - - Examples: - .. code-block:: python - - >>> # doctest: +SKIP('`paddle.jit.to_static` can not run in xdoctest') - >>> import paddle - >>> def func(x): - ... if paddle.mean(x) > 0: - ... x_v = x - 1 - ... else: - ... x_v = x + 1 - ... return x_v - ... - >>> prog_trans = paddle.jit.dy2static.program_translator.ProgramTranslator() - - >>> code = prog_trans.get_code(func) - >>> print(type(code)) - <class 'str'> - """ - assert callable( - dygraph_func - ), "Input dygraph_func is not a callable in ProgramTranslator.get_code" - # Gets AST from dygraph function - - unwrap_func = unwrap(dygraph_func) - raw_code = inspect.getsource(unwrap_func) - code = textwrap.dedent(raw_code) - root = gast.parse(code) - - # Transform AST - dygraph_to_static = DygraphToStaticAst() - root = dygraph_to_static.get_static_ast(root) - - # Get source_code - source_code = ast_to_source_code(root) - return source_code - - def get_program_cache(self): - """ - Returns the ProgramCache instance. This method is used by PaddlePaddle - developers to manage program cache in ProgramTranslator. 
Normal users - don't have to call this method. - - Returns: - ProgramCache: ProgramCache instance of ProgramTranslator. - - Examples: - .. code-block:: python - - >>> import paddle - - >>> prog_trans = paddle.jit.dy2static.program_translator.ProgramTranslator() - >>> prog_cache = prog_trans.get_program_cache() - """ - return self._program_cache - def enable_to_static(enable_to_static_bool): """ diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py index 87eb1b45d34a52..fd5eba66c76842 100644 --- a/python/paddle/jit/dy2static/utils.py +++ b/python/paddle/jit/dy2static/utils.py @@ -40,16 +40,18 @@ from .ast_utils import ast_to_source_code from .static_analysis import StaticAnalysisVisitor -from .utils_helper import DYGRAPH_MODULE_PREFIX # noqa: F401 -from .utils_helper import DYGRAPH_TO_STATIC_MODULE_PREFIX # noqa: F401 -from .utils_helper import PADDLE_MODULE_PREFIX # noqa: F401 -from .utils_helper import NodeVarType # noqa: F401 -from .utils_helper import _is_api_in_module_helper # noqa: F401 -from .utils_helper import index_in_list # noqa: F401 -from .utils_helper import is_api_in_module # noqa: F401 -from .utils_helper import is_dygraph_api # noqa: F401 -from .utils_helper import is_numpy_api # noqa: F401; -from .utils_helper import is_paddle_api # noqa: F401 +from .utils_helper import ( # noqa: F401 + DYGRAPH_MODULE_PREFIX, + DYGRAPH_TO_STATIC_MODULE_PREFIX, + PADDLE_MODULE_PREFIX, + NodeVarType, + _is_api_in_module_helper, + index_in_list, + is_api_in_module, + is_dygraph_api, + is_numpy_api, + is_paddle_api, +) __all__ = [] diff --git a/python/paddle/jit/dy2static/utils_helper.py b/python/paddle/jit/dy2static/utils_helper.py index 06f96d2094a1e9..b54c026745f700 100644 --- a/python/paddle/jit/dy2static/utils_helper.py +++ b/python/paddle/jit/dy2static/utils_helper.py @@ -20,8 +20,7 @@ import paddle # noqa: F401 from paddle import base # noqa: F401 -from paddle.base import dygraph # noqa: F401 -from paddle.base import layers # noqa: F401 +from paddle.base import dygraph, layers # noqa: F401 from paddle.base.dygraph import to_variable # noqa: F401 from paddle.utils import gast @@ -184,3 +183,14 @@ def type_from_annotation(annotation): # raise warning if not found warn("Currently we don't support annotation: %s" % annotation_str) return NodeVarType.UNKNOWN + + +def set_dynamic_shape(variable, shape_list): + if paddle.base.dygraph.base.in_to_static_mode(): + assert isinstance( + variable, paddle.base.framework.Variable + ), "In to_static mode, variable must be a Variable." + variable.desc.set_shape(shape_list) + else: + # in dygraph mode, dynamic shape is not needed, just do nothing. + return diff --git a/python/paddle/jit/sot/__init__.py b/python/paddle/jit/sot/__init__.py new file mode 100644 index 00000000000000..1b45c0c55389b2 --- /dev/null +++ b/python/paddle/jit/sot/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import psdb # noqa: F401 +from .opcode_translator.breakpoint import ( # noqa: F401 + BM, + add_breakpoint, + add_event, +) +from .opcode_translator.skip_files import skip_function # noqa: F401 +from .translate import symbolic_translate # noqa: F401 diff --git a/python/paddle/jit/sot/infer_meta.py b/python/paddle/jit/sot/infer_meta.py new file mode 100644 index 00000000000000..a88338bdf2e740 --- /dev/null +++ b/python/paddle/jit/sot/infer_meta.py @@ -0,0 +1,282 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.amp.auto_cast import amp_state +from paddle.base.unique_name import UniqueNameGenerator +from paddle.base.unique_name import guard as UniqueNameGuard +from paddle.static import Program +from paddle.utils import flatten, is_sequence + +from .utils import Cache, Singleton, map_if_extend, meta_str + + +class MetaInfo: + def __init__( + self, shape, dtype, stop_gradient, name, persistable, type, place + ): + self.name = name + self.persistable = persistable + self.type = type + self.place = place + self.shape = shape + self.dtype = dtype + self.stop_gradient = stop_gradient + + @staticmethod + def from_tensor(tensor): + # We always use float32 in simulation if AMP is enabled. + dtype = tensor.dtype + current_amp_state = amp_state() + if ( + dtype == paddle.float16 + and current_amp_state is not None + and current_amp_state["dtype"] == "float16" + ): + dtype = paddle.float32 + return MetaInfo( + list(tensor.shape), + dtype, + tensor.stop_gradient, + tensor.name, + tensor.persistable, + tensor.type, + tensor.place, + ) + + def is_dynamic_shape(self): + """ + if -1 in shape, return True + else: return False + """ + return -1 in self.shape + + def to_input_spec(self): + return paddle.static.InputSpec( + self.shape, dtype=self.dtype, stop_gradient=self.stop_gradient + ) + + def guard_str(self): + return f"({self.shape}, {self.dtype}, {self.stop_gradient})" + + def __repr__(self): + return meta_str(self.shape, self.dtype, self.stop_gradient) + + def __eq__(self, meta): + return ( + self.shape == meta.shape + and self.dtype == meta.dtype + and self.stop_gradient == meta.stop_gradient + ) + + def __hash__(self): + return hash((tuple(self.shape), self.dtype, self.stop_gradient)) + + +@Singleton +class VariableCreator: + """ + We use the static graph Variable to infer the meta information of Tensor. + This singleton class is used to create Variable for infer meta. 
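+
+    A minimal usage sketch (the MetaInfo values are hand-written placeholders;
+    skipped in doctests because it relies on runtime state):
+
+        >>> # doctest: +SKIP('illustrative only')
+        >>> import paddle
+        >>> x_meta = MetaInfo(
+        ...     shape=[2, 3], dtype=paddle.float32, stop_gradient=True,
+        ...     name="x", persistable=False, type=None, place=None,
+        ... )
+        >>> out_meta = VariableCreator().infer_meta(paddle.abs, x_meta)
+        >>> out_meta.shape
+        [2, 3]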
+ """ + + def __init__(self): + self.var_cache = {} + self.main_program = Program() + self.startup_program = Program() + self.var_name_generator = UniqueNameGenerator("infer_meta_variable_") + + def gen_name(self, meta): + name = f"{meta.dtype}_{meta.stop_gradient}" + for l in meta.shape: + name += f"_{l}" + return name + + def create_var(self, meta): + var = self.main_program.global_block().create_var( + shape=meta.shape, + dtype=meta.dtype, + stop_gradient=meta.stop_gradient, + ) + assert not isinstance( + var, paddle.Tensor + ), "Expect a Variable, but got a Tensor." + return var + + def get_variable(self, meta): + var_feature_name = self.gen_name(meta) + if var_feature_name not in self.var_cache: + self.var_cache[var_feature_name] = self.create_var(meta) + return self.var_cache[var_feature_name] + + def infer_meta(self, func, *args, **kwargs): + with paddle.base.framework._dygraph_guard(None), UniqueNameGuard( + self.var_name_generator + ): + args, kwargs = convert_meta_to_variable( + args + ), convert_meta_to_variable(kwargs) + + with paddle.static.program_guard( + self.main_program, self.startup_program + ): + if isinstance(func, str): + # TODO(Aurelius84): Is length of args always greater than 0? + # Do we need add condition check here? + out = getattr(args[0], func)(*args[1:], **kwargs) + else: + out = func(*args, **kwargs) + + return convert_variable_to_meta_info(out) + + +def convert_meta_to_variable(args): + return map_if_extend( + args, + pred=lambda x: isinstance(x, MetaInfo), + true_fn=lambda x: VariableCreator().get_variable(x), + false_fn=lambda x: x, + ) + + +def convert_meta_to_input_spec(args): + return map_if_extend( + args, + pred=lambda x: isinstance(x, MetaInfo), + true_fn=lambda x: x.to_input_spec(), + # TODO(xiongkun): can x be tensor ? + false_fn=lambda x: paddle.static.InputSpec.from_tensor(x) + if isinstance(x, paddle.Tensor) + else x, + ) + + +def convert_variable_to_meta_info(args): + return map_if_extend( + args, + pred=lambda x: isinstance(x, paddle.static.Variable), + true_fn=lambda x: MetaInfo.from_tensor(x), + false_fn=lambda x: x, + ) + + +def infer_meta(func, *args, **kwargs): + fn = SpecialInferMeta().get_infermeta_fn(func) + if fn: + return fn(*args, **kwargs) + return VariableCreator().infer_meta(func, *args, **kwargs) + + +def infer_meta_for_layer(layer, *args, **kwargs): + assert isinstance( + layer, paddle.nn.Layer + ), f"Expect a Layer, but got {layer}." + layer = paddle.jit.to_static(layer, full_graph=True) + + args_, kwargs_ = convert_meta_to_input_spec((args, kwargs)) + + ( + concrete_program, + partial_program_layer, + ) = layer.forward.get_concrete_program(*args_, **kwargs_) + + out = partial_program_layer._restore_out( + paddle.utils.flatten( + convert_variable_to_meta_info(concrete_program.outputs) + ) + ) + layer.forward.rollback() + return out + + +@Singleton +class SpecialInferMeta: + """ + There are some functions that cannot be inferred directly through static graph, + and need to be implemented manually. This class is used to implement infer meta + for these functions. 
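+    For example, gradient computation is special-cased by ``infermeta_grad``
+    below; ``get_infermeta_fn`` looks up a handler named ``infermeta_<name>``
+    for the function being traced.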
+    """

+    def __init__(self):
+        pass

+    def get_infermeta_fn(self, fn):
+        try:
+            funcname = fn.__name__
+            return getattr(self, f"infermeta_{funcname}")
+        except AttributeError:
+            pass
+        return None

+    def infermeta_grad(
+        self,
+        outputs,
+        inputs,
+        grad_outputs=None,
+        retain_graph=None,
+        create_graph=False,
+        only_inputs=True,
+        allow_unused=False,
+        no_grad_vars=None,
+    ):
+        if not is_sequence(inputs):
+            inputs = [inputs]
+        return inputs


+@Singleton
+class InferMetaCache(Cache):
+    def key_fn(
+        self, func, *args, **kwargs
+    ):  # args & kwargs have been transformed to MetaInfo
+        try:
+            retval = hash(
+                (
+                    func,
+                    tuple(flatten(args)),
+                    tuple(kwargs.keys()),
+                    tuple(flatten(kwargs)),
+                )
+            )
+        except Exception:
+            return None
+        return retval

+    def value_fn(self, func, *args, **kwargs):
+        return infer_meta(func, *args, **kwargs)


+@Singleton
+class LayerInferMetaCache(Cache):
+    def key_fn(self, layer, *args, **kwargs):
+        params = [
+            MetaInfo.from_tensor(x)
+            for x in layer.parameters(include_sublayers=True)
+        ]
+        try:
+            retval = hash(
+                (
+                    layer,
+                    tuple(params),
+                    tuple(flatten(args)),
+                    tuple(kwargs.keys()),
+                    tuple(flatten(kwargs)),
+                )
+            )
+        except Exception:
+            return None
+        return retval

+    def value_fn(self, layer, *args, **kwargs):
+        return infer_meta_for_layer(layer, *args, **kwargs)
diff --git a/python/paddle/jit/sot/opcode_translator/__init__.py b/python/paddle/jit/sot/opcode_translator/__init__.py
new file mode 100644
index 00000000000000..bf230190e3e112
--- /dev/null
+++ b/python/paddle/jit/sot/opcode_translator/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .transform import eval_frame_callback  # noqa: F401
diff --git a/python/paddle/jit/sot/opcode_translator/breakpoint.py b/python/paddle/jit/sot/opcode_translator/breakpoint.py
new file mode 100644
index 00000000000000..6f3217dd8776ea
--- /dev/null
+++ b/python/paddle/jit/sot/opcode_translator/breakpoint.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
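+
+# An interactive session might look like this (an illustrative sketch;
+# "my_model.py" and line 42 are placeholder values):
+#
+#   >>> from paddle.jit import sot
+#   >>> sot.add_breakpoint("my_model.py", 42)  # pause when simulation reaches my_model.py:42
+#   >>> # ... run the to_static-decorated function; once paused in pdb:
+#   >>> sot.BM.bt()   # backtrace of the simulated (inline) call stack
+#   >>> sot.BM.dis()  # instructions and source around the current line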
+ +import inspect +import traceback +from dataclasses import dataclass + +from ..opcode_translator.instruction_utils import instrs_info +from ..utils import Singleton, log +from .executor.opcode_executor import OpcodeExecutorBase + +# this file is a debug utils files for quick debug +# >>> sot.add_breakpoint(file, line) +# >>> sot.remove_breakpoint(file, line) + + +@dataclass +class Breakpoint: + file: str + line: int + co_name: str + offset: int + + def __hash__(self): + return hash((self.file, self.line, self.co_name, self.offset)) + + +@Singleton +class BreakpointManager: + def __init__(self): + self.breakpoints = set() + self.executors = OpcodeExecutorBase.call_stack + self.activate = 0 + self.record_event = [] + + def clear_event(self, event): + self.record_event.clear() + + def add_event(self, event): + """ + event in ['All' ,'FallbackError', 'BreakGraphError', 'InnerError'] + """ + self.record_event.append(event) + + def add(self, file, line, coname=None, offset=None): + log(1, f"add breakpoint at {file}:{line}\n") + self.breakpoints.add(Breakpoint(file, line, coname, offset)) + + def addn(self, *lines): + """ + called inside a executor. add a list of line number in current file. + """ + if not isinstance(lines, (list, tuple)): + lines = [lines] + for line in lines: + file = self.cur_exe._code.co_filename + self.add(file, line) + + def clear(self): + self.breakpoints.clear() + + def hit(self, file, line, co_name, offset): + if Breakpoint(file, line, None, None) in self.breakpoints: + return True + if Breakpoint(file, line, co_name, offset) in self.breakpoints: + return True + return False + + def locate(self, exe): + for i, _e in enumerate(self.executors): + if _e is exe: + self.activate = i + return + raise RuntimeError("Not found executor.") + + def up(self): + if self.activate == 0: + return + self.activate -= 1 + print("current function is: ", self.cur_exe._code.co_name) + + def down(self): + if self.activate >= len(self.executors) - 1: + return + self.activate += 1 + print("current function is: ", self.cur_exe._code.co_name) + + def opcode(self, cur_exe=None): + if cur_exe is None: + cur_exe = self.cur_exe + instr = cur_exe._instructions[cur_exe._lasti - 1] + message = f"[Translate {cur_exe}]: (line {cur_exe._current_line:>3}) {instr.opname:<12} {instr.argval}, stack is {cur_exe._stack}\n" + return message + + def bt(self): + """ + display all inline calls: backtrace. + """ + for exe in self.executors: + lines, _ = inspect.getsourcelines(exe._code) + print( + " " + + exe._code.co_filename + + f"({exe._current_line})" + + f"{exe._code.co_name}()" + ) + print(f"-> {lines[0].strip()}") + print(f"-> {self.opcode(exe)}") + pass + + def on_event(self, event): + if "All" in self.record_event or event in self.record_event: + print("event captured.") + self.activate = len(self.executors) - 1 + breakpoint() + + def _dis_source_code(self): + cur_exe = self.executors[self.activate] + lines, start_line = inspect.getsourcelines(cur_exe._code) + cur_line = cur_exe._current_line + lines[ + cur_line - start_line + 1 : cur_line - start_line + 1 + ] = " ^^^^^ HERE \n" + print("\033[31mSource Code is: \033[0m") + print("".join(lines)) + + def dis(self, range=5): + """ + display all instruction code and source code. 
+ """ + print("displaying debug info...") + cur_exe = self.cur_exe + print(self._dis_source_code()) + + print(f"\n{cur_exe._code}") + lasti = cur_exe._lasti + lines = instrs_info(cur_exe._instructions, lasti - 1, range) + print("\n".join(lines)) + + @property + def cur_exe(self): + exe = self.executors[self.activate] + return exe + + def sir(self): + """ + display sir in a page. + """ + print("displaying sir...") + self.cur_exe.print_sir() + + def pe(self, e): + """ + print exception. + """ + lines = traceback.format_tb(e.__traceback__) + print("".join(lines)) + + +def add_breakpoint(file, line, co_name=None, offset=None): + BM.add(file, line, co_name, offset) + + +def add_event(event): + BM.add_event(event) + + +BM = BreakpointManager() diff --git a/python/paddle/jit/sot/opcode_translator/custom_code.py b/python/paddle/jit/sot/opcode_translator/custom_code.py new file mode 100644 index 00000000000000..da674fb673170a --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/custom_code.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import types +from typing import NamedTuple + + +class CustomCode(NamedTuple): + code: types.CodeType | None + disable_eval_frame: bool diff --git a/python/paddle/jit/sot/opcode_translator/executor/__init__.py b/python/paddle/jit/sot/opcode_translator/executor/__init__.py new file mode 100644 index 00000000000000..4d9db28d227077 --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import variable_dispatch # noqa: F401 diff --git a/python/paddle/jit/sot/opcode_translator/executor/dispatch_functions.py b/python/paddle/jit/sot/opcode_translator/executor/dispatch_functions.py new file mode 100644 index 00000000000000..9b00dcde0462b4 --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/dispatch_functions.py @@ -0,0 +1,54 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file stores the custom functions that will be called by the dispatch mechanism.
+
+from ...utils import BreakGraphError, FallbackError
+
+
+def raise_break_graph_fn(*args, **kwarg):
+    raise BreakGraphError("raise by raise_break_graph_fn.")
+
+
+def raise_not_implement_fn(*args, **kwarg):
+    raise FallbackError("raise by raise_not_implement_fn.")
+
+
+# just a helper function for the `in` operator
+def operator_in(left, right):
+    return left in right
+
+
+def operator_not_in(left, right):
+    return left not in right
+
+
+def operator_exception_match(left, right):
+    pass
+
+
+def operator_BAD(left, right):
+    pass
+
+
+def operator_is_none(val):
+    pass
+
+
+def operator_is_not_none(val):
+    pass
+
+
+def tensor_numel(x):
+    pass
diff --git a/python/paddle/jit/sot/opcode_translator/executor/dispatcher.py b/python/paddle/jit/sot/opcode_translator/executor/dispatcher.py
new file mode 100644
index 00000000000000..315066f27e820c
--- /dev/null
+++ b/python/paddle/jit/sot/opcode_translator/executor/dispatcher.py
@@ -0,0 +1,294 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import copy
+import inspect
+import operator
+from functools import cached_property, reduce
+from typing import TYPE_CHECKING, Any, Callable, Dict, Tuple, TypeVar
+
+from ...utils import InnerError, NameGenerator, hashable
+
+if TYPE_CHECKING:
+    T = TypeVar("T")
+    Args = Tuple[T, ...]
+    Kwargs = Dict[str, T]
+
+
+def format_type(type_: type[Any] | tuple[type[Any], ...]) -> str:
+    if not isinstance(type_, tuple):
+        type_ = (type_,)
+    return " | ".join([t.__name__ for t in type_])
+
+
+def format_param(param: Parameter) -> str:
+    kind = param.kind
+    # TODO: support VAR_KEYWORD
+    if kind == inspect.Parameter.VAR_POSITIONAL:
+        return f"*{format_type(param.type)}"
+    else:
+        return format_type(param.type)
+
+
+def convert_annotation_to_type(type_str: str) -> tuple[type[Any], ...]:
+    """
+    Convert a type annotation to its runtime value. Because we are using :pep:`563`
+    future annotation syntax, we cannot use `get_type_hints <https://docs.python.org/3.8/library/typing.html#typing.get_type_hints>`_
+    directly. Currently, only the builtins and variables namespaces are supported.
+
+    Returns:
+        tuple: The converted type.
+    """
+
+    import builtins
+
+    from .
import variables + + type_str = type_str.strip() + if type_str == "Any": + type_str = "object" + + if "|" in type_str: + return reduce( + operator.add, map(convert_annotation_to_type, type_str.split("|")) + ) + + search_namespaces = [variables, builtins] + for namespace in search_namespaces: + if hasattr(namespace, type_str): + return (getattr(namespace, type_str),) + raise InnerError(f"Cannot find type {type_str} in {search_namespaces}") + + +class Parameter: + name_gen = NameGenerator("param_") + annotation: str + name: str + + def __init__( + self, + annotation: str, + *, + kind: inspect._ParameterKind = inspect.Parameter.POSITIONAL_OR_KEYWORD, + name: str | None = None, + default: Any = inspect._empty, + ): + self.name = name if name is not None else Parameter.name_gen.next() + self.annotation = annotation + self.kind = kind + self.default = default + + def to_parameter(self) -> inspect.Parameter: + return inspect.Parameter( + self.name, + kind=self.kind, + annotation=self.annotation, + default=copy.copy(self.default), + ) + + @cached_property + def type(self) -> tuple[type[Any], ...]: + return convert_annotation_to_type(self.annotation) + + def match_arg(self, arg: Any) -> bool: + # TODO: support VAR_KEYWORD + if self.kind == inspect.Parameter.VAR_POSITIONAL: + is_tuple = isinstance(arg, tuple) + return is_tuple and all(isinstance(a, self.type) for a in arg) + else: + return isinstance(arg, self.type) + + @staticmethod + def from_str(annotation: str) -> Parameter: + return Parameter(annotation) + + @staticmethod + def from_parameter(parameter: inspect.Parameter) -> Parameter: + if parameter.annotation != parameter.empty and not isinstance( + parameter.annotation, str + ): + raise InnerError( + f"Parameter {parameter} has annotation {parameter.annotation} " + "which is not a string. Please add `from __future__ import annotations` " + "to the top of your file." + ) + annotation = ( + parameter.annotation + if parameter.annotation != parameter.empty + else "Any" + ) + + return Parameter( + annotation, + kind=parameter.kind, + name=parameter.name, + default=parameter.default, + ) + + def __repr__(self) -> str: + default_repr = f"= {self.default!r}" + return f"Parameter({', '.join([self.annotation, default_repr])})" + + +def optional(annotation: str, default: Any = None) -> Parameter: + return Parameter(annotation, default=default) + + +class Pattern: + parameters: dict[str, Parameter] + signature: inspect.Signature + + def __init__( + self, + *parameters: Parameter, + ): + self.parameters = { + parameter.name: parameter for parameter in parameters + } + self.signature = inspect.Signature( + [parameter.to_parameter() for parameter in self.parameters.values()] + ) + + def match_inputs(self, /, *args: Any, **kwargs: Any) -> bool: + """ + Match the input parameters of the function. + + Returns: + bool: Whether the input parameters match the pattern. + """ + try: + bound_args = self.signature.bind(*args, **kwargs) + except TypeError: + return False + for arg_name, arg_value in bound_args.arguments.items(): + if arg_name not in self.parameters: + continue + if not self.parameters[arg_name].match_arg(arg_value): + return False + return True + + def __repr__(self) -> str: + types_repr = ", ".join( + [format_param(param) for param in self.parameters.values()] + ) + return f"Pattern({types_repr})" + + +class Dispatcher: + """ + Used for pattern registration and distribution. 
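+    Handlers are tried in registration order; the first ``Pattern`` whose
+    parameter types match the runtime arguments wins (see ``dispatch`` below).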
+ + For more design ideas, refer to the `Builtin dispatcher <https://github.com/PaddlePaddle/PaddleSOT/blob/develop/docs/design/builtin-dispatcher.md>`_ for details. + + Examples: + + >>> def builtin_add(a: int, b: int) -> int: + ... ... + ... + >>> Dispatcher.register(builtin_add, ("int", "int"), lambda a, b: a + b) + >>> handler = Dispatcher.dispatch(builtin_add, 1, 2) + >>> handler(1, 2) + 3 + """ + + handlers: dict[ + Callable[..., Any], list[tuple[Pattern, Callable[..., Any]]] + ] = {} + graph: Any = None + + @classmethod + def register( + cls, + fn: Callable[..., Any], + parameters: tuple[str | Parameter, ...], + handler: Callable[..., Any], + ): + """ + Registering function signature. + + Args: + fn: The function to be registered. + parameters: The parameters of the function to be registered. + handler: The handler function. + """ + _parameters = tuple( + Parameter.from_str(parameter) + if isinstance(parameter, str) + else parameter + for parameter in parameters + ) + if fn not in cls.handlers: + cls.handlers[fn] = [] + cls.handlers[fn].append((Pattern(*_parameters), handler)) + + @classmethod + def register_decorator(cls, fn: Callable[..., Any]): + """ + Decorator mode of register, Used to register some complex functions. + + Args: + fn: The function to be registered. + + Examples: + >>> def builtin_add(a: int, b: int) -> int: + ... ... + ... + >>> @Dispatcher.register_decorator(builtin_add) + ... def builtin_add_dispatcher(a: int, b: int) -> int: + ... return a + b + ... + >>> handler = Dispatcher.dispatch(builtin_add, 1, 2) + >>> handler(1, 2) + 3 + """ + + def decorator(handler: Callable[..., Any]): + signature = inspect.signature(handler) + parameters = tuple( + Parameter.from_parameter(parameter) + for parameter in signature.parameters.values() + ) + cls.register(fn, parameters, handler) + + return decorator + + @classmethod + def call(cls, fn, *args, **kwargs): + func = cls.dispatch(fn, *args, **kwargs) + if func is None: + raise InnerError( + f"Cannot find handler for {fn} with args {args} and kwargs {kwargs}" + ) + return func(*args, **kwargs) + + @classmethod + def dispatch( + cls, fn: Callable[..., Any], *args: Any, **kwargs: Any + ) -> Callable[..., Any] | None: + """ + Find the matching handler from the registered functions. + + Args: + fn: The function to be dispatched. + args: The args of the function. + kwargs: The kwargs of the function. + """ + if not hashable(fn) or fn not in cls.handlers: + return None + for pattern, handler in cls.handlers[fn]: + if pattern.match_inputs(*args, **kwargs): + return handler + return None diff --git a/python/paddle/jit/sot/opcode_translator/executor/executor_cache.py b/python/paddle/jit/sot/opcode_translator/executor/executor_cache.py new file mode 100644 index 00000000000000..67d656f4dcd752 --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/executor_cache.py @@ -0,0 +1,230 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
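+
+# Rough control flow of this module (a summary of the code below, not new API):
+#
+#   frame --> OpcodeExecutorCache()(frame)
+#     |- code object seen for the first time: translate(frame) and cache
+#     |  the resulting (CustomCode, Guard) pair
+#     `- otherwise: lookup(frame) returns the first cached CustomCode whose
+#        guard passes on this frame, translating again on a full miss.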
+ +from __future__ import annotations + +import traceback +import types +from typing import List, Tuple + +from ...profiler import EventGuard, event_register +from ...psdb import NO_FALLBACK_CODES +from ...utils import ( + BreakGraphError, + FallbackError, + InnerError, + Singleton, + is_strict_mode, + log, + log_do, +) +from ..custom_code import CustomCode +from .guard import Guard +from .opcode_executor import OpcodeExecutor, OpcodeExecutorBase +from .pycode_generator import PyCodeGen + +GuardedFunction = Tuple[CustomCode, Guard] +GuardedFunctions = List[GuardedFunction] + +dummy_guard: Guard = lambda frame: True +dummy_guard.expr = "lambda frame: True" +dummy_guard.lambda_expr = "lambda frame: True" + + +@Singleton +class OpcodeExecutorCache: + """ + A singleton class that implements a cache for translated instructions. + This cache is used to store previously translated instructions along with their corresponding guard functions. + + Attributes: + cache (dict): A dictionary that maps code objects to tuples of a cache getter function and a list of guarded functions. + translate_count (int): The count of how many instructions have been translated. It is used to test whether the cache hits. + """ + + MAX_CACHE_SIZE = 20 + cache: dict[types.CodeType, GuardedFunctions] + translate_count: int + + def __init__(self): + self.cache = {} + self.translate_count = 0 + + def clear(self): + """ + Clears the cache and resets the translate count. + """ + self.cache.clear() + self.translate_count = 0 + + def __call__(self, frame: types.FrameType, **kwargs) -> CustomCode: + code: types.CodeType = frame.f_code + if code not in self.cache: + log(2, f"[Cache]: Firstly call {code}\n") + new_custom_code, guard_fn = self.translate(frame, **kwargs) + self.cache[code] = [(new_custom_code, guard_fn)] + return new_custom_code + guarded_fns = self.cache[code] + return self.lookup(frame, guarded_fns, **kwargs) + + @event_register("lookup") + def lookup( + self, frame: types.FrameType, guarded_fns: GuardedFunctions, **kwargs + ) -> CustomCode: + """ + Looks up the cache for a matching code object and returns a custom code object if a matching guard function is found, otherwise None. + + Args: + frame (types.FrameType): The frame whose code object needs to be looked up in the cache. + guarded_fns (GuardedFunctions): The list of guarded functions associated with the code object. + + Returns: + CustomCode | None: The custom code object if a matching guard function is found, otherwise None. 
+ """ + + if len(guarded_fns) >= self.MAX_CACHE_SIZE: + log(2, "[Cache]: Exceed max cache size, skip it\n") + return CustomCode(None, False) + + for custom_code, guard_fn in guarded_fns: + try: + with EventGuard("try guard"): + guard_result = guard_fn(frame) + if guard_result: + log( + 2, + f"[Cache]: Cache hit, Guard is \n{getattr(guard_fn, 'expr', 'None')}\n", + ) + return custom_code + else: + log_do( + 4, + self.analyse_guard_global_object(guard_fn), + ) + log( + 2, + f"[Cache]: Cache miss, Guard is \n{getattr(guard_fn, 'expr', 'None')}\n", + ) + log_do( + 2, + self.analyse_guard_error(guard_fn, frame), + ) + except Exception as e: + log(2, f"[Cache]: Guard function error: {e}\n") + continue + + log(2, "[Cache]: all guards missed\n") + new_custom_code, guard_fn = self.translate(frame, **kwargs) + guarded_fns.append((new_custom_code, guard_fn)) + return new_custom_code + + def translate( + self, frame: types.FrameType, **kwargs + ) -> tuple[CustomCode, Guard]: + """ + Translates the given frame's code object and returns the cache getter function and a guarded function for the translated code object. + + Args: + frame (types.FrameType): The frame whose code object needs to be translated. + + Returns: + tuple[CustomCode, Guard]: The cache getter function and a guarded function for the translated code object. + """ + code: types.CodeType = frame.f_code + self.translate_count += 1 + custom_new_code, guard_fn = start_translate(frame, **kwargs) + return custom_new_code, guard_fn + + def analyse_guard_global_object(self, guard_fn): + def inner(): + for key in guard_fn.__globals__.keys(): + if key.startswith("__object"): + print( + f"[Cache] meet global object: {key} : {guard_fn.__globals__[key]}", + ) + + return inner + + def analyse_guard_error(self, guard_fn, frame): + def inner(): + guard_expr = guard_fn.lambda_expr + lambda_head = "lambda frame: " + guard_expr = guard_expr.replace(lambda_head, "") + guards = guard_expr.split(" and ") + for guard_str in guards: + guard = eval(lambda_head + guard_str, guard_fn.__globals__) + result = False + try: + result = guard(frame) + except Exception as e: + print( + f"[Cache]: skip checking {guard_str}\n because error occured {e}" + ) + if result is False: + print(f"[Cache]: missed at {guard_str}") + return + print("[Cache]: missed guard not found.") + + return inner + + +def start_translate(frame: types.FrameType, **kwargs) -> GuardedFunction: + """ + Starts the translation process for the given frame and returns the translated code object and its guard function, or None if translation fails. + + Args: + frame: The frame to be translated. + + Returns: + GuardedFunction | None: The translated code object and its guard function, or None if translation fails. 
+    """
+    simulator = OpcodeExecutor(frame, **kwargs)
+    try:
+        new_custom_code, guard_fn = simulator.transform()
+        return new_custom_code, guard_fn
+    # TODO(zrr1999): the InnerError handler may need to be placed before (FallbackError, BreakGraphError)
+    # TODO(0x45f): handle BreakGraphError to trigger fallback
+    except BreakGraphError as e:
+        raise RuntimeError(
+            f"Found BreakGraphError raised, it should not be caught at start_translate!\n{e}"
+        )
+    except FallbackError as e:
+        if simulator._code in NO_FALLBACK_CODES:
+            raise InnerError(
+                f"{simulator._code.co_name} should not fallback, but got '{e}'"
+            )
+        # if disable_eval_frame is True, it means we want to fall back for speed rather than because an error occurred
+        if is_strict_mode() and e.disable_eval_frame is False:
+            raise
+        log(
+            2,
+            f"Unsupported frame is {frame.f_code}, error message is: \n"
+            + "".join(traceback.format_exception(type(e), e, e.__traceback__)),
+        )

+        # NOTE: If the resume fn needs to fall back, we should replace NullVariable with NULL, otherwise it will fail to run
+        py_codegen = PyCodeGen(frame)
+        new_code = py_codegen.replace_null_variable()
+        # simulation not complete, not sure whether this code has sir, set disable_eval_frame = False
+        guard_fn = (
+            dummy_guard if e.disable_eval_frame is False else simulator.guard_fn
+        )
+        return (
+            CustomCode(new_code, e.disable_eval_frame),
+            guard_fn,
+        )
+    except Exception as e:
+        raise InnerError(OpcodeExecutorBase.error_message_summary(e)) from e
+    finally:
+        simulator.cleanup()
diff --git a/python/paddle/jit/sot/opcode_translator/executor/function_graph.py b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py
new file mode 100644
index 00000000000000..0859ecfec46b9a
--- /dev/null
+++ b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py
@@ -0,0 +1,684 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is specifically used to handle the problem
+# of generating a Graph from a linear function call.
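+#
+# Conceptually (a simplified sketch, not additional API), simulating
+#
+#     def foo(x):            # x is bound as a TensorVariable
+#         y = paddle.abs(x)  # recorded via call_paddle_api -> sir_ctx.call_API
+#         return y + 1       # recorded via call_tensor_method (roughly, Tensor.__add__)
+#
+# leaves a linear SIR behind; start_compile() then compiles that SIR into a
+# dy2st static function and emits bytecode that calls it with the inputs.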
+
+from __future__ import annotations
+
+import builtins
+import inspect
+from collections import namedtuple
+from copy import deepcopy
+from functools import cached_property
+from typing import Any, Callable
+
+from ...infer_meta import InferMetaCache, LayerInferMetaCache, MetaInfo
+from ...profiler import EventGuard, event_register
+from ...symbolic.statement_ir import Symbol
+from ...symbolic.symbolic_context import SymbolicTraceContext
+from ...utils import (
+    NameGenerator,
+    OrderedSet,
+    inner_error_default_handler,
+    is_inplace_api,
+    is_paddle_api,
+    log,
+    log_do,
+    map_if,
+    show_trackers,
+    tmp_name_guard,
+)
+from .guard import Guard, StringifyExpression, make_guard
+from .mutable_data import MutationDel, MutationNew, MutationSet
+from .pycode_generator import PyCodeGen
+from .side_effects import (
+    DictSideEffectRestorer,
+    GlobalDelSideEffectRestorer,
+    GlobalSetSideEffectRestorer,
+    ListSideEffectRestorer,
+    ObjDelSideEffectRestorer,
+    ObjSetSideEffectRestorer,
+    SideEffectRestorer,
+    SideEffects,
+)
+from .tracker import BuiltinTracker, DummyTracker
+from .variables import (
+    DictVariable,
+    GlobalVariable,
+    ListVariable,
+    NullVariable,
+    PaddleLayerVariable,
+    TensorVariable,
+    VariableBase,
+    VariableFactory,
+    find_traceable_vars,
+    map_variables,
+)
+
+
+def convert_to_meta(inputs: Any):
+    """
+    Convert the input variables to meta info if they are TensorVariable.
+    """
+
+    def func(x):
+        if isinstance(x, TensorVariable):
+            return x.meta
+        if isinstance(x, VariableBase):
+            return x.get_py_value()
+        return x
+
+    return map_variables(func, inputs)
+
+
+def convert_to_symbol(inputs: Any):
+    """
+    Convert the input variables to symbols if they can be symbolic.
+    """
+
+    def func(x):
+        if isinstance(x, (TensorVariable, PaddleLayerVariable)):
+            return x.get_symbol()
+        if isinstance(x, VariableBase):
+            return x.get_py_value()
+        return x
+
+    return map_variables(func, inputs)
+
+
+class FunctionGraph:
+    """
+    A graph representation corresponding to each FunctionFrame.
+    It records the variables bound as inputs of the current call and the
+    outputs the call produces, and it can be compiled into a function of
+    f_locals that produces the same outputs.
+    """
+
+    OUT_VAR_PREFIX = "___SIR_out_"
+    Memo = namedtuple(
+        "function_graph_memo",
+        [
+            'inner_out',
+            'input_variables',
+            "stmt_ir",
+            "global_guards",
+            "side_effects_state",
+            "print_variables",
+            "inplace_tensors",
+        ],
+    )
+
+    def __init__(self, frame, **kwargs):
+        self.sir_ctx = SymbolicTraceContext()
+        self.inner_out = set()
+        self.input_variables = []  # Store variables required within a function
+        self.pycode_gen = PyCodeGen(frame, disable_eval_frame=True)
+        self.side_effects = SideEffects()
+        self._global_guarded_variables: OrderedSet[VariableBase] = OrderedSet()
+        self._print_variables = []
+        self._inplace_tensors = OrderedSet()
+        self.build_strategy = kwargs.get('build_strategy', None)
+        self._kwargs = kwargs
+
+    @cached_property
+    def _builtins(self):
+        builtins_ = {}
+        # prepare builtins
+        for name, value in builtins.__dict__.items():
+            builtins_[name] = VariableFactory.from_value(
+                value, self, BuiltinTracker(name), debug_name=name
+            )
+        return builtins_
+
+    def add_print_variables(self, variable):
+        """
+        Used to support psdb_print.
+        """
+        self._print_variables.append(variable)
+
+    def add_inplace_tensors(self, variable):
+        """
+        Used to track tensors that are mutated by inplace APIs.
+        """
+        self._inplace_tensors.add(variable)
+
+    def need_add_input(self, var):
+        """
+        Determine whether `var` is an input of the graph.
+ + Args: + var: The input variable. + + """ + if var.id in self.inner_out: + return False + for v in self.input_variables: + if v.id == var.id: + return False + return True + + def save_memo(self) -> FunctionGraph.Memo: + """ + Save the state of the current FunctionGraph, for future state recovery, it is used for state recovery during inline call error reporting + + NOTE: + Why don't use __deepcopy__, because memo is not a deepcopy, i.e inner_out is only a shallow copy, SIR is a deepcopy. + """ + saved_stmt_ir = deepcopy(self.sir_ctx.TOS) + return FunctionGraph.Memo( + inner_out=set(self.inner_out), + input_variables=list(self.input_variables), + stmt_ir=saved_stmt_ir, + global_guards=OrderedSet(self._global_guarded_variables), + side_effects_state=self.side_effects.get_state(), + print_variables=list(self._print_variables), + inplace_tensors=OrderedSet(self._inplace_tensors), + ) + + def restore_memo(self, memo: FunctionGraph.Memo): + """ + Restore the state of graph to memo. + + Args: + memo: Previously recorded memo + + """ + self.inner_out = memo.inner_out + self.input_variables = memo.input_variables + self.sir_ctx.replace_TOS(memo.stmt_ir) + self._global_guarded_variables = memo.global_guards + self.side_effects.restore_state(memo.side_effects_state) + self._print_variables = memo.print_variables + self._inplace_tensors = memo.inplace_tensors + + def collect_input_variables(self, inputs: list[VariableBase]): + """ + Variables required within the method + + Args: + inputs: Required VariableBase + """ + + def collect(inp): + if isinstance(inp, VariableBase) and self.need_add_input(inp): + self.input_variables.append(inp) + + map_variables( + collect, + inputs, + ) + + @property + @event_register("guard_fn") + def guard_fn(self) -> Guard: + with tmp_name_guard(): + guards = [] + with EventGuard( + "guard_fn: find vars and make stringify guard", event_level=1 + ): + for variable in find_traceable_vars( + self.input_variables + list(self._global_guarded_variables) + ): + guards.extend(variable.make_stringify_guard()) + + guards = OrderedSet(guards) + + for guard in guards: + assert isinstance( + guard, StringifyExpression + ), "guard must be StringifyExpression." 
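+            # The deduplicated checks are AND-ed into a single
+            # `lambda frame: ...` guard by make_guard (see guard.py below).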
+ + return make_guard(guards) + + def start_compile_with_name_store(self, ret_vars, to_store_vars): + class VariableLoader: + def __init__(self, index_for_load, pycode_gen): + self._index_for_load = index_for_load + self._pycode_gen: PyCodeGen = pycode_gen + + def load(self, var, allow_push_null=True): + if isinstance(var, NullVariable): + if allow_push_null: + var.reconstruct(self._pycode_gen) + else: + # Avoid passing NULL as a parameter to the resume function + self._pycode_gen.gen_load_null_variable() + return + self._pycode_gen.gen_load_fast(self._index_for_load[var.id]) + + # var_id -> local_name mapping + index_for_load = {} + to_store_vars = list( + filter(lambda x: not isinstance(x, NullVariable), to_store_vars) + ) + self.start_compile(*(ret_vars + to_store_vars)) + name_gen = NameGenerator("__start_compile_saved_") + for var in to_store_vars: + index_for_load[var.id] = name_gen.next() + + def _log_fn(): + print( + f"[StartCompile] saved var: {index_for_load[var.id]} = ", + var, + ) + + log_do(4, _log_fn) + + for var in to_store_vars[::-1]: + self.pycode_gen.gen_store_fast(index_for_load[var.id]) + return VariableLoader(index_for_load, self.pycode_gen) + + @event_register("start_compile", event_level=2) + def start_compile(self, *ret_vars: VariableBase): + """ + Generate bytecode based on the information collected by the simulation execution. + + This consists of the following steps: + - Compile the FunctionGraph into a dy2st StaticFunction and load it in the generated bytecode + - Load the group network input + - Calling the generated dy2st StaticFunction + - Restore the side effects + - Restore the output + - Return the top of the stack + """ + from ..breakpoint import BreakpointManager + + BreakpointManager().on_event("start_compile") + + ret_items = [ + ret_item + for ret_var in ret_vars + for ret_item in ret_var.flatten_items() + ] + + tensor_items = self._find_tensor_outputs(ret_items) + compiled_fn, statment_ir = self.sir_ctx.compile_fn( + [Symbol(tensor_var.var_name) for tensor_var in tensor_items], + **self._kwargs, + ) + input_names = statment_ir.inputs + compiled_fn_name = f"__compiled_fn_{statment_ir.name}" + # prepare function and inputs + self.pycode_gen.gen_load_object(compiled_fn, compiled_fn_name) + for name in input_names: + found = False + for variable in self.input_variables: + if ( + isinstance(variable, TensorVariable) + and variable.get_symbol().name == name + ): + variable.tracker.gen_instructions(self.pycode_gen) + found = True + break + assert found, f"can't find input {name} in SIR." + # Pack all args into a tuple, because we don't support *args now. + self.pycode_gen.gen_build_tuple(count=len(input_names)) + # call the compiled_fn + self.pycode_gen.gen_call_function(argc=1) + + # Store outputs to f_locals + self.pycode_gen.gen_unpack_sequence(count=len(tensor_items)) + for tensor_var in tensor_items: + self.pycode_gen.gen_store_fast(tensor_var.out_var_name) + # restore the outputs. 
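+        # Each return variable re-emits the bytecode that rebuilds its Python
+        # value, re-packing any containers around the fetched tensors.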
+        for ret_var in ret_vars:
+            ret_var.reconstruct(self.pycode_gen)

+        # deal with side effects
+        self.restore_inplace_tensor(self._inplace_tensors)
+        self.restore_print_stmts(self._print_variables)
+        self.restore_side_effects(self.side_effects.proxy_variables)
+        self.pycode_gen.gen_enable_eval_frame()

+        tracker_output_path = show_trackers()
+        if tracker_output_path:
+            from .tracker_viewer import view_tracker

+            view_tracker(list(ret_vars), tracker_output_path, format="png")

+    def call_paddle_api(
+        self,
+        func: Callable[..., Any],
+        *args: VariableBase,
+        **kwargs: VariableBase,
+    ):
+        """
+        Record a Paddle networking API call into the SIR.

+        Args:
+            func: the paddle api
+        """
+        assert is_paddle_api(func)
+        # not a fallback api, start symbolic trace.
+        # TODO(xiokgun): may have python builtin objects inside metas.
+        # TODO(xiokgun): 4 kinds of python arguments. support it !!
+        log(3, f"call paddle.api : {func.__name__}", "\n")

+        def message_handler(*args, **kwargs):
+            return f"Call paddle_api error: {func.__name__}, maybe it is not an operator api?"

+        return inner_error_default_handler(self.symbolic_call, message_handler)(
+            InferMetaCache(), self.sir_ctx.call_API, func, *args, **kwargs
+        )

+    def call_tensor_method(
+        self, method_name: str, *args: VariableBase, **kwargs
+    ):
+        """
+        Call a tensor method and start symbolic trace.

+        Args:
+            method_name: the tensor method name
+        """

+        def message_handler(*args, **kwargs):
+            return f"Call tensor_method error: Tensor.{method_name}, maybe it is not a valid operator api?"

+        return inner_error_default_handler(self.symbolic_call, message_handler)(
+            InferMetaCache(),
+            self.sir_ctx.call_METHOD,
+            method_name,
+            *args,
+            **kwargs,
+        )

+    @staticmethod
+    def get_opcode_executor_stack():
+        # NOTE: only for debug.
+        # dependent on OpcodeExecutor.
+        from .opcode_executor import OpcodeExecutorBase

+        if len(OpcodeExecutorBase.call_stack) == 0:
+            # In test cases, we can meet this scenario.
+            return []
+        current_executor = OpcodeExecutorBase.call_stack[-1]
+        current_line = current_executor._current_line
+        filename = current_executor._code.co_filename
+        source_lines, start_line = inspect.getsourcelines(
+            current_executor._code
+        )
+        # TODO(SigureMo): In 3.11, lineno may be changed after multiple breakgraphs,
+        # We need to find a way to fix this.
+        line_idx = min(current_line - start_line, len(source_lines) - 1)
+        code_line = source_lines[line_idx]
+        stack = []
+        stack.append(
+            '  File "{}", line {}, in {}'.format(
+                filename,
+                current_line,
+                current_executor._code.co_name,
+            )
+        )
+        stack.append(f'    {code_line}')
+        return stack

+    def call_layer(
+        self,
+        layer: PaddleLayerVariable,
+        *args: VariableBase,
+        **kwargs: VariableBase,
+    ):
+        """
+        Call a paddle layer and start symbolic trace.

+        Args:
+            layer: the paddle layer
+        """

+        def infer_meta_fn(layer, *metas, **kwmetas):
+            metas = LayerInferMetaCache()(layer.value, *metas, **kwmetas)
+            return metas

+        def compute_fn(layer, inputs, outputs, stacks):
+            self.sir_ctx.call_LAYER(
+                layer.value,
+                inputs=inputs,
+                outputs=outputs,
+                stacks=stacks,
+            )

+        def message_handler(*args, **kwargs):
+            return f"Call paddle layer error: {layer}, maybe it is not a valid paddle layer?"

+        return inner_error_default_handler(self.symbolic_call, message_handler)(
+            infer_meta_fn, compute_fn, layer, *args, **kwargs
+        )

+    def symbolic_call(self, infer_meta_fn, compute_fn, func, *args, **kwargs):
+        """
+        Use infer_meta_fn and compute_fn to convert func into a symbolic function.
+ + Args: + infer_meta_fn: function for infer meta, (func, metas, kwmetas) -> output_metas + compute_fn : function for sir compile, (func, input_symbols, outputs_symbols) -> None + func : symbolic function + """ + self.collect_input_variables(list(args)) + self.collect_input_variables(list(kwargs.values())) + metas = convert_to_meta(args) + kwmetas = convert_to_meta(kwargs) + + out_metas = infer_meta_fn(func, *metas, **kwmetas) + inputs_symbols = ( + convert_to_symbol(args), + convert_to_symbol(kwargs), + ) + log(3, f" inputs : {inputs_symbols}", "\n") + + outputs = map_if( + out_metas, + pred=lambda x: isinstance(x, MetaInfo), + true_fn=lambda x: TensorVariable( + x, + self, + tracker=DummyTracker(list(args) + list(kwargs.values())), + ), + false_fn=lambda x: x, + ) + stmt_stacks = [] + log_do( + 3, + lambda: stmt_stacks.extend( + FunctionGraph.get_opcode_executor_stack() + ), + ) + if outputs is not None: + if is_inplace_api(func): + # if we want to use a non-inplace api (static api) to replace an inplace behavior (in simulation) + # just set it back in SIR, and return outputs to replace tensor meta (it might changes?) + # in this case, the output will not exactly be used + compute_fn( + func, + inputs_symbols, + convert_to_symbol(args[0]), + stmt_stacks, + ) + else: + compute_fn( + func, + inputs_symbols, + convert_to_symbol(outputs), + stmt_stacks, + ) # symbolic only contain symbols. + self._put_inner(outputs) + return VariableFactory.from_value( + outputs, self, DummyTracker(list(args) + list(kwargs.values())) + ) + else: + return None + + def _put_inner(self, vars: VariableBase): + """ + put inner variable to inner_out + """ + map_if( + vars, + pred=lambda x: isinstance(x, VariableBase), + true_fn=lambda x: self.inner_out.add(x.id), + false_fn=lambda x: None, + ) + + def add_global_guarded_variable(self, variable: VariableBase): + """ + Add variable to global guarded variable + """ + self._global_guarded_variables.add(variable) + + def remove_global_guarded_variable(self, variable: VariableBase): + """ + Remove variable to global guarded variable + """ + if variable in self._global_guarded_variables: + self._global_guarded_variables.remove(variable) + + def _find_tensor_outputs( + self, outputs: list[VariableBase] + ) -> OrderedSet[TensorVariable]: + """ + Return all TensorVariable. find TensorVariables participating in networking from the output Variables + + Args: + outputs: output variables + """ + output_tensors: OrderedSet[TensorVariable] = OrderedSet() + # Find Tensor Variables from outputs. + for output in outputs: + if isinstance(output.tracker, DummyTracker): + if isinstance(output, TensorVariable): + output_tensors.add(output) + else: + # Guard output that can not be traced. + self.add_global_guarded_variable(output) + # Find Tensor Variables from side effects Variables. 
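+        # Tensors reachable only through mutated containers or attributes must
+        # also be fetched from the compiled program, otherwise the side effect
+        # restoration bytecode emitted later would have nothing to write back.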
+ for side_effect_var in self.side_effects.proxy_variables: + if isinstance(side_effect_var, (ListVariable, DictVariable)): + for var in side_effect_var.flatten_items(): + if ( + isinstance(var.tracker, DummyTracker) + and isinstance(var, TensorVariable) + and side_effect_var.tracker.is_traceable() + ): + output_tensors.add(var) + else: + if isinstance(side_effect_var, GlobalVariable): + proxy_records = side_effect_var.proxy.records + elif side_effect_var.tracker.is_traceable(): + # for attr side effect + proxy_records = side_effect_var.attr_proxy.records + else: + continue + for record in proxy_records: + if isinstance(record, (MutationSet, MutationNew)): + for var in record.value.flatten_items(): + if isinstance( + var.tracker, DummyTracker + ) and isinstance(var, TensorVariable): + output_tensors.add(var) + # Find Tensor in print_stmts + for print_stmt in self._print_variables: + for var in print_stmt.flatten_items(): + if isinstance(var.tracker, DummyTracker) and isinstance( + var, TensorVariable + ): + output_tensors.add(var) + + # add inplace tensors into output tensors. + for inplace_tensor in self._inplace_tensors: + output_tensors.add(inplace_tensor) + + return output_tensors + + def restore_print_stmts(self, variables: list[VariableBase]): + for var in variables: + var.reconstruct( + self.pycode_gen, + use_tracker=False, + add_to_global_guarded_vars=False, + ) + + def restore_inplace_tensor(self, variables: list[VariableBase]): + for var in variables: + if not var.tracker.is_traceable(): + continue + var.reconstruct( + self.pycode_gen, + use_tracker=True, + add_to_global_guarded_vars=False, + ) + self.pycode_gen.gen_load_method( + "_inplace_assign" + ) # NOTE: paddle related logic. + var.reconstruct( + self.pycode_gen, + use_tracker=False, + add_to_global_guarded_vars=True, + ) + self.pycode_gen.gen_call_method(1) + self.pycode_gen.gen_pop_top() + + def restore_side_effects(self, variables: list[VariableBase]): + """ + Generate side effect recovery code for variables with side effects + + Args: + variables: Variables that may have side effects. 
+ """ + restorers: list[SideEffectRestorer] = [] + + for var in variables: + # skip inner variables + if not var.tracker.is_traceable() and not isinstance( + var, GlobalVariable + ): + continue + if isinstance(var, DictVariable): + restorers.append(DictSideEffectRestorer(var)) + elif isinstance(var, ListVariable): + restorers.append(ListSideEffectRestorer(var)) + else: + if isinstance(var, GlobalVariable): + for record in var.proxy.records[::-1]: + if isinstance(record, (MutationSet, MutationNew)): + restorers.append( + GlobalSetSideEffectRestorer( + record.key, + record.value, + ) + ) + elif isinstance(record, MutationDel): + restorers.append( + GlobalDelSideEffectRestorer(record.key) + ) + else: + for record in var.attr_proxy.records[::-1]: + if isinstance(record, (MutationSet, MutationNew)): + restorers.append( + ObjSetSideEffectRestorer( + var, + record.key, + record.value, + ) + ) + elif isinstance(record, MutationDel): + restorers.append( + ObjDelSideEffectRestorer( + var, + record.key, + ) + ) + + for restorer in restorers: + restorer.pre_gen(self.pycode_gen) + for restorer in restorers[::-1]: + restorer.post_gen(self.pycode_gen) diff --git a/python/paddle/jit/sot/opcode_translator/executor/guard.py b/python/paddle/jit/sot/opcode_translator/executor/guard.py new file mode 100644 index 00000000000000..b839c064f407da --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/guard.py @@ -0,0 +1,183 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import types +import weakref +from typing import TYPE_CHECKING, Any, Callable, TypeVar + +from ...profiler import EventGuard +from ...utils import InnerError, current_tmp_name_records, log, log_do + +Guard = Callable[[types.FrameType], bool] + +if TYPE_CHECKING: + from .variables import VariableBase + + CheckGuardInputT = TypeVar("CheckGuardInputT", bound=VariableBase) + +# NOTE(SigureMo): [How to write Stringify Guard?] +# 1. we should capture free variables manually, the string cannot capture free +# variables automatically. +# 2. Be aware that the comparison logic before and after stringify may be different. +# 3. we should compute as much as possible at "compile time" and encode the +# computation in the Guard string, rather than passing it to runtime to minimize +# runtime overhead. + + +class StringifyExpression: + """ + Used to store string based expressions for generating Guard. 
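+
+    A rough composition sketch (illustrative only): sub-expressions fill the
+    "{}" placeholders of an outer expression, as object_equal_stringify_guard
+    below does:
+
+        inner = StringifyExpression("frame.f_locals['x']", [], {})
+        outer = StringifyExpression("{} == 1", [inner], {})
+        # outer.debug_expr == "frame.f_locals['x'] == 1"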
+    """
+
+    def __init__(self, str_expr, sub_exprs, free_vars):
+        expr = str_expr.format(*[arg.expr for arg in sub_exprs])
+        self.expr = current_tmp_name_records().add_tmp_var(expr)
+        self.debug_expr = str_expr.format(
+            *[arg.debug_expr for arg in sub_exprs]
+        )
+        self.free_vars = free_vars
+
+    def __post_init__(self):
+        self.check_expr(self.expr)
+
+    def check_expr(self, expr: str):
+        try:
+            pass
+            # ast.parse(expr) # TODO(xiongkun): too slow
+        except SyntaxError as e:
+            raise InnerError(f"Invalid expression: {expr}") from e
+
+    def __hash__(self):
+        if self.free_vars:
+            return hash((self.debug_expr, id(self)))
+        else:
+            return hash(self.debug_expr)
+
+
+def union_free_vars(*free_vars: dict[str, Any]):
+    return {k: v for d in free_vars for k, v in d.items()}
+
+
+def make_guard(stringify_guards: list[StringifyExpression]) -> Guard:
+    """
+    Make a guard from a list of StringifyExpression.
+
+    For more design ideas, refer to the `Stringify guard <https://github.com/PaddlePaddle/PaddleSOT/blob/develop/docs/design/stringify-guard.md>`_ for details.
+
+    Args:
+        stringify_guards: a list of StringifyExpression.
+    """
+    with EventGuard("make_guard"):
+        num_guards = len(stringify_guards)
+        if not num_guards:
+            guard = lambda frame: True
+            guard.expr = "lambda frame: True"
+            return guard
+
+        def analyse_expressions(stringify_exprs, tmp_names):
+            func_string = "def built_guard_fn(frame):\n"
+            lambda_string = "lambda frame: "
+            free_vars = {}
+
+            for k, v in tmp_names.items():
+                func_string += f"    {v} = {k}\n"
+
+            func_result = ""
+            for str_expr in stringify_exprs:
+                func_result += str_expr.expr + " and "
+                lambda_string += str_expr.debug_expr + " and "
+                free_vars = union_free_vars(free_vars, str_expr.free_vars)
+
+            func_string += f"    return {func_result[:-5]}"
+
+            return func_string, free_vars, lambda_string[:-5]
+
+        (
+            func_string,
+            free_vars,
+            lambda_string,
+        ) = analyse_expressions(
+            stringify_guards, current_tmp_name_records().tmp_names_record
+        )
+
+        exec(
+            func_string,
+            free_vars,
+        )
+
+        guard = free_vars['built_guard_fn']
+        log(3, f"[Guard]: {lambda_string}\n")
+        guard.lambda_expr = lambda_string
+        guard.expr = func_string
+        assert callable(guard), "guard must be callable."
+
+        return guard
+
+
+def support_weak_ref(obj):
+    if isinstance(obj, types.FunctionType):
+        return True
+    return False
+
+
+def check_guard(
+    fn: Callable[[CheckGuardInputT], list[StringifyExpression]]
+) -> Callable[[CheckGuardInputT], list[StringifyExpression]]:
+    def wrapper(self: CheckGuardInputT) -> list[StringifyExpression]:
+        assert (
+            self.tracker.is_traceable()
+        ), "Cannot make guard from a non-traceable guard variable."
+ + def guard_log(): + frame_value_tracer = self.tracker.trace_value_from_frame() + print( + f"[Guard]: guard_fn for {self}, tracker={self.tracker.__class__.__name__}, value={frame_value_tracer.expr}" + ) + + log_do(4, guard_log) + return fn(self) + + return wrapper + + +@check_guard +def object_equal_stringify_guard(self) -> list[StringifyExpression]: + frame_value_tracer = self.tracker.trace_value_from_frame() + + obj_free_var_name = f"__{self.id}" + weak_ref_obj = self.get_py_value() + if support_weak_ref(weak_ref_obj): + weak_ref_obj = weakref.ref(self.get_py_value()) + return [ + StringifyExpression( + f"{obj_free_var_name}() is not None and {{}} == {obj_free_var_name}()", + [frame_value_tracer], + union_free_vars( + frame_value_tracer.free_vars, + {obj_free_var_name: weak_ref_obj}, + ), + ) + ] + return [ + StringifyExpression( + f"{{}} == {obj_free_var_name}", + [frame_value_tracer], + union_free_vars( + frame_value_tracer.free_vars, + {obj_free_var_name: self.get_py_value()}, + ), + ) + ] diff --git a/python/paddle/jit/sot/opcode_translator/executor/instr_flag.py b/python/paddle/jit/sot/opcode_translator/executor/instr_flag.py new file mode 100644 index 00000000000000..1dd795439d4597 --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/instr_flag.py @@ -0,0 +1,36 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flags for instructions + + +class FORMAT_VALUE_FLAG: + FVC_MASK = 0x3 + FVC_NONE = 0x0 + FVC_STR = 0x1 + FVC_REPR = 0x2 + FVC_ASCII = 0x3 + FVS_MASK = 0x4 + FVS_HAVE_SPEC = 0x4 + + +class MAKE_FUNCTION_FLAG: + MF_HAS_CLOSURE = 0x08 + MF_HAS_ANNOTATION = 0x04 + MF_HAS_KWDEFAULTS = 0x02 + MF_HAS_DEFAULTS = 0x01 + + +class CALL_FUNCTION_EX_FLAG: + CFE_HAS_KWARGS = 0x01 diff --git a/python/paddle/jit/sot/opcode_translator/executor/mutable_data.py b/python/paddle/jit/sot/opcode_translator/executor/mutable_data.py new file mode 100644 index 00000000000000..d6bda43d42ef4e --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/mutable_data.py @@ -0,0 +1,289 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Callable, Generic, TypeVar + +if TYPE_CHECKING: + from typing_extensions import Concatenate, ParamSpec, TypeAlias + + P = ParamSpec("P") + R = TypeVar("R") + + MutableDataT = TypeVar("MutableDataT", bound="MutableData") + DataGetter: TypeAlias = Callable[[MutableDataT, Any], Any] + +InnerMutableDataT = TypeVar( + "InnerMutableDataT", bound="dict[str, Any] | list[Any]" +) + + +class Mutation: + ABBR: str + + +class MutationSet(Mutation): + """ + Setting a value. + This mutation is used for MutableDictLikeData and MutableListLikeData. + """ + + ABBR = "S" + + def __init__(self, key, value): + self.key = key + self.value = value + + def __repr__(self): + return f"MutationSet({self.key}, {self.value})" + + +class MutationDel(Mutation): + """ + Deleting a value. + This mutation is used for MutableDictLikeData and MutableListLikeData. + """ + + ABBR = "D" + + def __init__(self, key): + self.key = key + + def __repr__(self): + return f"MutationDel({self.key})" + + +class MutationNew(Mutation): + """ + Adding a new value. + This mutation is only used for MutableDictLikeData. + """ + + ABBR = "N" + + def __init__(self, key, value): + self.key = key + self.value = value + + def __repr__(self): + return f"MutationNew({self.key}, {self.value})" + + +class MutationInsert(Mutation): + """ + Inserting a value. + This mutation is only used for MutableListLikeData. + """ + + ABBR = "I" + + def __init__(self, index, value): + self.index = index + self.value = value + + def __repr__(self): + return f"MutationInsert({self.index}, {self.value})" + + +class MutationPermutate(Mutation): + """ + Permutating all the values. + This mutation is only used for MutableListLikeData. + """ + + ABBR = "P" + + def __init__(self, permutation): + self.permutation = permutation + + def __repr__(self): + return f"MutationPermutate({self.permutation})" + + +def record_mutation( + mutation_fn: Callable[Concatenate[MutableDataT, P], Mutation] +) -> Callable[Concatenate[MutableDataT, P], None]: + def wrapper(self, *args: P.args, **kwargs: P.kwargs): + mutation = mutation_fn(self, *args, **kwargs) + self.records.append(mutation) + + return wrapper + + +class MutableData(Generic[InnerMutableDataT]): + """ + An intermediate data structure between data and variable, it records all the mutations. 
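+
+    A rough sketch of the record/replay model (illustrative; assumes a getter
+    that returns MutableData.Empty() for keys missing from the original data):
+
+        data = MutableDictLikeData({"a": 1}, getter)
+        data.set("b", 2)   # records MutationNew("b", 2), version -> 1
+        data.delete("a")   # records MutationDel("a"), version -> 2
+        data.rollback(1)   # drops the delete, keeping only the first record
+        data.reproduce()   # replays the kept records over the read cache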
+ """ + + read_cache: InnerMutableDataT + + class Empty: + def __repr__(self): + return "Empty()" + + def __init__(self, data: Any, getter: DataGetter): + self.original_data = data + self.getter = getter + self.records: list[Mutation] = [] + + def is_empty(self, value): + return isinstance(value, MutableData.Empty) + + @property + def version(self): + return len(self.records) + + @property + def has_changed(self): + return self.version != 0 + + def rollback(self, version: int): + assert version <= self.version + self.records[:] = self.records[:version] + + def get(self, key): + raise NotImplementedError() + + def set(self, key, value): + raise NotImplementedError() + + def apply(self, mutation: Mutation, write_cache: InnerMutableDataT): + raise NotImplementedError() + + def reproduce(self, version: int | None = None) -> InnerMutableDataT: + if version is None: + version = self.version + write_cache = self.read_cache.copy() + for mutation in self.records[:version]: + self.apply(mutation, write_cache) + return write_cache + + def __repr__(self) -> str: + records_abbrs = "".join([mutation.ABBR for mutation in self.records]) + return f"{self.__class__.__name__}({records_abbrs})" + + +class MutableDictLikeData(MutableData["dict[str, Any]"]): + def __init__(self, data: Any, getter: DataGetter): + super().__init__(data, getter) + self.read_cache = {} + + def clear_read_cache(self): + self.read_cache.clear() + + def get(self, key: Any): + # TODO(SigureMo): Optimize performance of this. + write_cache = self.reproduce(self.version) + if key not in write_cache: + self.read_cache[key] = self.getter(self, key) + return self.reproduce(self.version)[key] + + def get_all(self): + original_keys = list(self.original_data.keys()) + for mutation in self.records: + if isinstance(mutation, MutationNew): + original_keys.append(mutation.key) + elif isinstance(mutation, MutationDel): + original_keys.remove(mutation.key) + return {key: self.get(key) for key in original_keys} + + @record_mutation + def set(self, key: Any, value: Any) -> Mutation: + is_new = False + if self.is_empty(self.get(key)): + is_new = True + return ( + MutationSet(key, value) if not is_new else MutationNew(key, value) + ) + + @record_mutation + def delete(self, key): + return MutationDel(key) + + def apply(self, mutation: Mutation, write_cache: dict[str, Any]): + if isinstance(mutation, MutationNew): + write_cache[mutation.key] = mutation.value + elif isinstance(mutation, MutationSet): + write_cache[mutation.key] = mutation.value + elif isinstance(mutation, MutationDel): + write_cache[mutation.key] = MutableData.Empty() + else: + raise ValueError(f"Unknown mutation type {mutation}") + + def reproduce(self, version: int | None = None): + if version is None: + version = self.version + write_cache = self.read_cache.copy() + for mutation in self.records[:version]: + self.apply(mutation, write_cache) + return write_cache + + +class MutableListLikeData(MutableData["list[Any]"]): + def __init__(self, data: Any, getter: DataGetter): + super().__init__(data, getter) + self.read_cache = [ + self.getter(self, idx) for idx in range(len(self.original_data)) + ] + + def clear_read_cache(self): + self.read_cache[:] = [] + + @property + def length(self): + return len(self.reproduce()) + + def get(self, key): + write_cache = self.reproduce(self.version) + return write_cache[key] + + def get_all(self) -> list[Any]: + items = self.reproduce(self.version) + return items + + @record_mutation + def set(self, key: int, value: Any): + return 
MutationSet(self._regularize_index(key), value) + + @record_mutation + def delete(self, key: int): + return MutationDel(self._regularize_index(key)) + + @record_mutation + def insert(self, index: int, value: Any): + return MutationInsert(self._regularize_index(index), value) + + @record_mutation + def permutate(self, permutation: list[int]): + return MutationPermutate(permutation) + + def _regularize_index(self, index: int): + if index < 0: + index += self.length + return index + + def apply(self, mutation: Mutation, write_cache: list[Any]): + if isinstance(mutation, MutationSet): + write_cache[mutation.key] = mutation.value + elif isinstance(mutation, MutationDel): + write_cache[:] = ( + write_cache[: mutation.key] + write_cache[mutation.key + 1 :] + ) + elif isinstance(mutation, MutationInsert): + write_cache.insert(mutation.index, mutation.value) + elif isinstance(mutation, MutationPermutate): + write_cache[:] = [write_cache[i] for i in mutation.permutation] + else: + raise ValueError(f"Unknown mutation type {mutation}") diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py new file mode 100644 index 00000000000000..6d9ec8829497a5 --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py @@ -0,0 +1,2070 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import dis +import functools +import inspect +import operator +import sys +import traceback +import types +from dataclasses import dataclass +from itertools import chain +from typing import Any, Callable + +import opcode + +from ...profiler import EventGuard, event_register +from ...psdb import NO_BREAKGRAPH_CODES +from ...utils import ( + BreakGraphError, + FallbackError, + InnerError, + OrderedSet, + SotUndefinedVar, + log, + log_do, + min_graph_size, +) +from ..custom_code import CustomCode +from ..instruction_utils import ( + Instruction, + Space, + analysis_inputs, + analysis_used_names_with_space, + calc_stack_effect, + get_instructions, +) +from ..instruction_utils.opcode_info import JumpDirection, PopJumpCond +from .dispatch_functions import ( + operator_BAD, + operator_exception_match, + operator_in, + operator_is_none, + operator_is_not_none, + operator_not_in, +) +from .dispatcher import Dispatcher +from .function_graph import FunctionGraph +from .instr_flag import CALL_FUNCTION_EX_FLAG as CFE +from .instr_flag import FORMAT_VALUE_FLAG as FV +from .instr_flag import MAKE_FUNCTION_FLAG as MF +from .pycode_generator import PyCodeGen +from .tracker import ( + CellTracker, + ConstTracker, + DanglingTracker, + DummyTracker, + LocalTracker, +) +from .variable_stack import VariableStack +from .variables import ( + BuiltinVariable, + CellVariable, + ConstantVariable, + ContainerVariable, + DictVariable, + GlobalVariable, + ListVariable, + MethodVariable, + NullVariable, + SequenceIterVariable, + SliceVariable, + TensorVariable, + TupleVariable, + UserDefinedFunctionVariable, + VariableBase, + VariableFactory, +) + +SUPPORT_COMPARE_OP = { + ">": operator.gt, + "<": operator.lt, + ">=": operator.ge, + "<=": operator.le, + "==": operator.eq, + "!=": operator.ne, + "is not": operator.is_not, + "is": operator.is_, + "in": operator_in, + "not in": operator_not_in, + "exception match": operator_exception_match, + "BAD": operator_BAD, +} + + +@dataclass +class Stop: + state: str + + +def tos_op_wrapper(fn: Callable): + """ + A decorator function that wraps an opcode operation and applies certain functionality to it. + + Args: + fn: The opcode operation to be wrapped. + + Returns: + The wrapped opcode operation. + """ + nargs = len(inspect.signature(fn).parameters) + + @call_break_graph_decorator(push_n=1) + def inner(self: OpcodeExecutorBase, instr: Instruction): + args = self.stack.pop_n(nargs) + res = BuiltinVariable(fn, graph=self._graph, tracker=DanglingTracker())( + *args + ) + self.stack.push(res) + + return inner + + +def tos_inplace_op_wrapper(fn: Callable): + """ + A decorator function that wraps an inplace opcode operation and applies certain functionality to it. + + Args: + fn: The inplace opcode operation to be wrapped. + + Returns: + The wrapped inplace opcode operation. + + """ + + @call_break_graph_decorator(push_n=1) + def inner(self: OpcodeExecutorBase, instr: Instruction): + """ + Inner function that represents the wrapped inplace opcode operation. + + Args: + self: The instance of the OpcodeExecutorBase class. + instr: The instruction to be executed. + + """ + args = self.stack.pop_n(2) + res = BuiltinVariable(fn, graph=self._graph, tracker=DanglingTracker())( + *args + ) + res.debug_name = args[0].debug_name + self.stack.push(res) + + return inner + + +def pop_jump_if_op_wrapper(fns: list[Callable[[Any], Any]]): + """ + A decorator function that wraps a POP_JUMP_*_IF_* opcode operation and applies certain functionality to it. 
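+
+    For example, POP_JUMP_IF_FALSE below is built as
+    pop_jump_if_op_wrapper([bool, operator.not_]): the popped value is first
+    converted with bool and then negated before deciding whether to jump.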
+
+    Args:
+        fns: The list of condition functions, applied in order to the popped value.
+
+    Returns:
+        The wrapped POP_JUMP_*_IF_* opcode operation.
+
+    """
+
+    @jump_break_graph_decorator
+    def inner(self: OpcodeExecutorBase, instr: Instruction):
+        """
+        Inner function that represents the wrapped POP_JUMP_IF opcode operation.
+
+        Args:
+            self: The instance of the OpcodeExecutorBase class.
+            instr: The instruction to be executed.
+
+        """
+        pred_obj = self.stack.pop()
+
+        try:
+            self._graph.add_global_guarded_variable(pred_obj)
+            res = pred_obj
+            for fn in fns:
+                res = BuiltinVariable(
+                    fn, graph=self._graph, tracker=DanglingTracker()
+                )(res)
+
+            assert isinstance(res, ConstantVariable)
+            is_jump = res.get_py_value()
+            assert isinstance(is_jump, bool)
+            if is_jump:
+                assert instr.jump_to is not None
+                self.jump_to(instr.jump_to)
+        except BreakGraphError:
+            raise FallbackError(
+                f"Currently don't support predicate {pred_obj.__class__.__name__}"
+            )
+
+    return inner
+
+
+def jump_break_graph_decorator(normal_jump: Callable):
+    """
+    A decorator function that breaks off the graph when a JUMP-related instruction is encountered.
+
+    Args:
+        normal_jump: The normal jump operation.
+
+    Returns:
+        The wrapped jump operation.
+
+    """
+
+    def inner(self: OpcodeExecutor, instr: Instruction):
+        result = self.stack.top
+        if isinstance(result, TensorVariable):
+            self.stack.pop()
+            # fallback when in OpcodeExecutor
+            # raise error in OpcodeInlineExecutor
+            log(3, "[BreakGraph] jump break graph, because the condition is a tensor\n")
+            self._break_graph_in_jump(result, instr)
+            return Stop(state="BreakGraph")
+        else:
+            return normal_jump(self, instr)
+
+    return inner
+
+
+def call_break_graph_decorator(push_n: int | Callable[[int | None], int]):
+    """
+    A decorator function that breaks off the graph when a function CALL instruction is encountered.
+
+    Args:
+        push_n: The number of values the instruction pushes onto the stack, or a callable that computes it from the oparg.
+
+    Returns:
+        The decorated function.
+
+    """
+
+    def decorate(call_fn: Callable):
+        @functools.wraps(call_fn)
+        def wrapper(self: OpcodeExecutor, instr: Instruction):
+            origin_stack = self.stack.copy()
+            try:
+                return call_fn(self, instr)
+            except BreakGraphError as e:
+                if self._code in NO_BREAKGRAPH_CODES:
+                    raise InnerError(
+                        f"{self._code.co_name} should not break graph, but got '{e}'"
+                    )
+                if isinstance(self, OpcodeExecutor):
+                    log(3, f"[BreakGraph] call function Break graph: {e}\n")
+                    self._break_graph_in_call(origin_stack, instr, push_n)
+                    return Stop(state="BreakGraph")
+                else:
+                    raise e
+
+        return wrapper
+
+    return decorate
+
+
+def fallback_when_occur_error(fn: Callable):
+    """
+    A decorator function that provides fallback behavior when an error occurs during graph processing.
+
+    Args:
+        fn: The function to be wrapped.
+
+    Returns:
+        The wrapped function.
+
+    """
+
+    def inner(*args, **kwargs):
+        try:
+            return fn(*args, **kwargs)
+        except Exception as e:
+            raise FallbackError(
+                f'[Fallback] An exception occurred when processing break graph, fallback to dygraph, error message is: \n{type(e)} : {e}\n'
+            )
+
+    return inner
+
+
+class OpcodeExecutorBase:
+    """
+    Base class for executing opcode instructions.
+
+    The OpcodeExecutorBase class provides methods and functionality to execute opcode instructions.
+
+    If you want to learn more about Python instructions, see https://docs.python.org/3/library/dis.html for details.
+
+    Args:
+        code: The bytecode of the function to be executed.
+        graph: The function graph.
+
+    Attributes:
+        call_stack (list[OpcodeExecutorBase]): A list to keep track of the call stack.
+        stack (VariableStack): The stack used for storing variables during execution.
+        _co_consts: List to store constants.
+        _locals (dict): Dictionary to store local variables.
+        _globals (dict): Dictionary to store global variables.
+        _builtins (dict): Dictionary to store built-in variables.
+        _lasti (int): Index of the last executed instruction.
+        _code (types.CodeType): The code object to be executed.
+        _instructions: Iterator of opcode instructions.
+        _graph (FunctionGraph): The function graph representing the code.
+        _current_line: The current line number of the execution.
+        new_code: Placeholder for new code (to be generated by PyCodeGen).
+        guard_fn: Placeholder for guard function.
+        _name (str): Name of the executor.
+
+    """
+
+    call_stack: list[OpcodeExecutorBase] = []
+
+    @staticmethod
+    def validate_value(value):
+        assert isinstance(
+            value, VariableBase
+        ), f"value: {value}, type should be VariableBase (or derived), but got {type(value)}"
+        assert not isinstance(value.tracker, DanglingTracker) or isinstance(
+            value, (NullVariable, CellVariable)
+        ), f"dangling variable {value} should not be pushed onto the stack."
+
+    def __init__(self, code: types.CodeType, graph: FunctionGraph):
+        OpcodeExecutorBase.call_stack.append(self)
+        # fake env for run; the new env should be generated by PyCodeGen
+        self.stack = VariableStack(validate_value_func=self.validate_value)
+        self._co_consts = []
+        self._locals = {}
+        self._globals: GlobalVariable = None  # type: ignore
+        self._builtins = {}
+        self._cells = {}  # position to put cells
+        self._lasti = 0  # idx of instruction list
+        self._code = code
+        self._current_line: int = -1
+        self._instructions = get_instructions(self._code)
+        self._graph = graph
+        self.new_code: types.CodeType | None = None
+        self.guard_fn = None
+        self._name = "Executor"
+        self._call_shape: tuple[
+            str, ...
+        ] | None = None  # store kwnames for Python 3.11+
+        self._prepare_virtual_env()
+
+        self.stop_state = None
+
+    def print_sir(self):
+        """
+        Prints the Static Instruction Representation (SIR) in the executor.
+
+        """
+        print(self._graph.sir_ctx.TOS)
+
+    def _prepare_virtual_env(self):
+        """
+        Prepares the virtual environment for the executor.
+
+        Raises:
+            NotImplementedError: If the method is not implemented.
+
+        """
+        raise NotImplementedError("Please implement virtual_env.")
+
+    def _break_graph_in_jump(self, result, instr: Instruction):
+        """
+        Breaks the graph in JUMP instructions.
+
+        Args:
+            result: The execution result.
+            instr: The jump instruction.
+
+        Raises:
+            NotImplementedError: If the method is not implemented.
+
+        """
+        raise NotImplementedError()
+
+    def transform(self):
+        """
+        Abstract method that needs to be implemented to symbolically translate each instruction.
+
+        Raises:
+            NotImplementedError: If the method is not implemented.
+
+        """
+        raise NotImplementedError()
+
+    def get_var(self, name: str):
+        """
+        Gets the variable with the given name.
+
+        Args:
+            name: The name of the variable.
+
+        Returns:
+            The variable.
+
+        Raises:
+            InnerError: If the variable cannot be found.
+
+        """
+        if name in self._locals.keys():
+            return self._locals[name]
+        elif name in self._cells.keys():  # in closure
+            return self._cells[name].cell_content()
+        elif name in self._globals.keys():
+            return self._globals.get(name)
+        elif name in self._builtins.keys():
+            return self._builtins[name]
+        else:
+            raise InnerError(f'Cannot get var: {name}')
+
+    def has_var(self, name: str, space: str = "any"):
+        if space == "any":
+            return name in set(
+                chain(
+                    self._locals.keys(),
+                    self._cells.keys(),
+                    self._globals.keys(),
+                    self._builtins.keys(),
+                )
+            )
+        elif space == Space.locals:
+            return name in self._locals
+        elif space == Space.cells:
+            return name in self._cells
+        elif space == Space.globals:
+            return name in set(
+                chain(
+                    self._globals.keys(),
+                    self._builtins.keys(),
+                )
+            )
+        return False
+
+    def pop_call_stack_until_self(self):
+        """
+        Pops the call stack until the current executor.
+
+        """
+        assert (
+            self in OpcodeExecutorBase.call_stack
+        ), f"{self} not in call stack"
+        while OpcodeExecutorBase.call_stack.pop() is not self:
+            pass
+
+    @staticmethod
+    def error_message_summary(original_error: Exception) -> str:
+        """
+        Creates a summary of the error message during execution.
+
+        Args:
+            original_error: The original error.
+
+        Returns:
+            The summary error message.
+
+        """
+        indent = 2 * " "
+        message_lines = ["In simulated execution:", ""]
+        for current_simulator in OpcodeExecutorBase.call_stack:
+            code = current_simulator._code
+            current_line = current_simulator._current_line
+            lines, start = inspect.getsourcelines(code)
+            real_name = code.co_name
+            message_lines.append(
+                f"{indent}  File \"{code.co_filename}\", line {current_line}, in {real_name}"
+            )
+            if current_line != -1:
+                message_lines.append(
+                    f"{indent}  {lines[current_line-start].rstrip()}"
+                )
+        error_message = traceback.format_exception_only(
+            type(original_error), original_error
+        )
+        for line in error_message:
+            line = line.rstrip()
+            message_lines.append(f"{indent}  {line}")
+        return "\n".join(message_lines)
+
+    def run(self):
+        """
+        Executes the opcode.
+
+        """
+        log(3, f"start executing opcode: {self._code}\n")
+        self._lasti = 0
+        while True:
+            if self._lasti >= len(self._instructions):
+                raise InnerError("lasti out of range.")
+            cur_instr = self._instructions[self._lasti]
+            self._lasti += 1
+            is_stop = self.step(cur_instr)
+            if is_stop:
+                self.stop_state = is_stop.state
+                self.pop_call_stack_until_self()
+                break
+
+    def step(self, instr: Instruction):
+        """
+        Executes a single step of the opcode.
+
+        Args:
+            instr: The instruction to be executed.
+
+        Returns:
+            A Stop instance if execution should stop, None otherwise.
+
+        Raises:
+            FallbackError: If the opcode is not supported.
+
+        """
+        if instr.starts_line is not None:
+            self._current_line = instr.starts_line
+        if not hasattr(self, instr.opname):
+            raise FallbackError(f"opcode: {instr.opname} is not supported.")
+        log_message = f"[Translate {self._name}]: (line {self._current_line:>3}) {instr.opname:<12} {instr.argval}, stack is {self.stack}\n"
+        log(3, log_message)
+        code_file = self._code.co_filename
+        code_line = self._current_line
+        code_name = self._code.co_name
+        code_offset = instr.offset
+        from ..breakpoint import BreakpointManager
+
+        if BreakpointManager().hit(
+            code_file, code_line, code_name, code_offset
+        ):
+            BreakpointManager().locate(self)
+            print(log_message)
+            breakpoint()  # breakpoint for debug
+
+        with EventGuard(f"{instr.opname}", event_level=1):
+            return getattr(self, instr.opname)(instr)  # run single step.
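+
+    # NOTE: `step` dispatches on the opcode name, so every instruction is
+    # handled by the method of the same name defined below. A minimal sketch
+    # of the dispatch (illustrative):
+    #
+    #     handler = getattr(self, instr.opname)  # e.g. self.LOAD_FAST
+    #     stop = handler(instr)                  # a Stop instance or None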
+ + def indexof(self, instr: Instruction): + """ + Gets the index of the instruction. + + Args: + instr: The instruction. + + Returns: + The index of the instruction. + + """ + return self._instructions.index(instr) + + def jump_to(self, instr: Instruction): + """ + Jumps to the given instruction. + + Args: + instr: The instruction to jump to. + + """ + self._lasti = self.indexof(instr) + + def COPY(self, instr: Instruction): + assert isinstance(instr.arg, int) + self.stack.push(self.stack.peek[instr.arg]) + + def DUP_TOP(self, instr: Instruction): + self.stack.push(self.stack.top) + + def DUP_TOP_TWO(self, instr: Instruction): + for ref in self.stack.peek[:2]: + self.stack.push(ref) + + def ROT_N(self, instr: Instruction): + assert instr.argval is not None + self._rot_top_n(instr.argval) + + def _rot_top_n(self, n: int): + # a1 a2 a3 ... an <- TOS + # the stack changes to + # an a1 a2 a3 an-1 <- TOS + assert ( + len(self.stack) >= n + ), f"There are not enough elements on the stack. {n} is needed." + top = self.stack.pop() + self.stack.insert(n - 1, top) + + def POP_TOP(self, instr: Instruction): + self.stack.pop() + + def PUSH_NULL(self, instr: Instruction): + self.stack.push(NullVariable()) + + def ROT_TWO(self, instr: Instruction): + self._rot_top_n(2) + + def ROT_THREE(self, instr: Instruction): + self._rot_top_n(3) + + def ROT_FOUR(self, instr: Instruction): + self._rot_top_n(4) + + def RESUME(self, instr: Instruction): + # RESUME is a no-op, it just for internal tracing, debugging and optimization checks. + pass + + def SWAP(self, instr: Instruction): + assert isinstance(instr.arg, int) + self.stack.top, self.stack.peek[instr.arg] = ( + self.stack.peek[instr.arg], + self.stack.top, + ) + + # unary operators + UNARY_POSITIVE = tos_op_wrapper(operator.pos) + UNARY_NEGATIVE = tos_op_wrapper(operator.neg) + UNARY_NOT = tos_op_wrapper(operator.not_) + UNARY_INVERT = tos_op_wrapper(operator.invert) + + # binary operators + BINARY_POWER = tos_op_wrapper(operator.pow) + BINARY_MULTIPLY = tos_op_wrapper(operator.mul) + BINARY_MATRIX_MULTIPLY = tos_op_wrapper(operator.matmul) + BINARY_FLOOR_DIVIDE = tos_op_wrapper(operator.floordiv) + BINARY_TRUE_DIVIDE = tos_op_wrapper(operator.truediv) + BINARY_MODULO = tos_op_wrapper(operator.mod) + BINARY_ADD = tos_op_wrapper(operator.add) + BINARY_SUBTRACT = tos_op_wrapper(operator.sub) + BINARY_LSHIFT = tos_op_wrapper(operator.lshift) + BINARY_RSHIFT = tos_op_wrapper(operator.rshift) + BINARY_AND = tos_op_wrapper(operator.and_) + BINARY_OR = tos_op_wrapper(operator.or_) + BINARY_XOR = tos_op_wrapper(operator.xor) + + def BINARY_OP(self, instr: Instruction): + opname, _ = opcode._nb_ops[instr.arg] + opname = ( + opname.replace("NB_", "BINARY_") + .replace("BINARY_INPLACE", "INPLACE") + .replace("REMAINDER", "MODULO") + ) + return getattr(self, opname)(instr) + + @call_break_graph_decorator(push_n=1) + def BINARY_SUBSCR(self, instr: Instruction): + key = self.stack.pop() + container = self.stack.pop() + assert isinstance(key, VariableBase) + # TODO(xiongkun): getitem / getattr support key and attr as variable. + if isinstance(key, TensorVariable) and isinstance( + container, TensorVariable + ): + # NOTE(xiongkun): tensor[tensor] should support. 
+ output = self._graph.call_tensor_method( + "__getitem__", container, key + ) + self.stack.push(output) + return + + if isinstance(key, TensorVariable): + raise BreakGraphError( + f"Key is a TensorVariable in BINARY_SUBSCR, {container}[{key}]" + ) + + result = BuiltinVariable( + operator.getitem, self._graph, DanglingTracker() + )(container, key) + self.stack.push(result) + + # inplace operators + # paddle variable do not have inplace operators. For example when call `y **= x`, will call var.__pow__ + INPLACE_POWER = tos_inplace_op_wrapper(operator.ipow) + INPLACE_MULTIPLY = tos_inplace_op_wrapper(operator.imul) + INPLACE_MATRIX_MULTIPLY = tos_inplace_op_wrapper(operator.imatmul) + INPLACE_FLOOR_DIVIDE = tos_inplace_op_wrapper(operator.ifloordiv) + INPLACE_TRUE_DIVIDE = tos_inplace_op_wrapper(operator.itruediv) + INPLACE_MODULO = tos_inplace_op_wrapper(operator.imod) + INPLACE_ADD = tos_inplace_op_wrapper(operator.iadd) + INPLACE_SUBTRACT = tos_inplace_op_wrapper(operator.isub) + INPLACE_LSHIFT = tos_inplace_op_wrapper(operator.ilshift) + INPLACE_RSHIFT = tos_inplace_op_wrapper(operator.irshift) + INPLACE_AND = tos_inplace_op_wrapper(operator.iand) + INPLACE_OR = tos_inplace_op_wrapper(operator.ior) + INPLACE_XOR = tos_inplace_op_wrapper(operator.ixor) + + def NOP(self, instr: Instruction): + pass + + @call_break_graph_decorator(push_n=1) + def LOAD_ATTR(self, instr: Instruction): + attr_name = self._code.co_names[instr.arg] + attr_name_var = ConstantVariable.wrap_literal(attr_name, self._graph) + obj = self.stack.pop() + self.stack.push( + BuiltinVariable( + getattr, graph=self._graph, tracker=DanglingTracker() + )(obj, attr_name_var) + ) + + def LOAD_CONST(self, instr: Instruction): + var = self._co_consts[instr.arg] + self.stack.push(var) + + def MAKE_CELL(self, instr: Instruction): + self._locals[instr.argval] = self._cells[instr.argval] + + def LOAD_CLOSURE(self, instr: Instruction): + if sys.version_info >= (3, 11): + self.LOAD_FAST(instr) + return + namemap = self._code.co_cellvars + self._code.co_freevars + name = namemap[instr.arg] + self.stack.push(self._cells[name]) + + def LOAD_DEREF(self, instr: Instruction): + if sys.version_info >= (3, 11): + self.stack.push(self._locals[instr.argval].cell_content()) + return + namemap = self._code.co_cellvars + self._code.co_freevars + name = namemap[instr.arg] + self.stack.push(self._cells[name].cell_content()) + + def COPY_FREE_VARS(self, instr: Instruction): + for i in range(instr.arg): + freevar_name = self._code.co_freevars[i] + self._locals[freevar_name] = self._cells[freevar_name] + + def LOAD_FAST(self, instr: Instruction): + var = self._locals[instr.argval] + self.stack.push(var) + + def DELETE_FAST(self, instr: Instruction): + varname = self._code.co_varnames[instr.arg] + del self._locals[varname] + + def LOAD_GLOBAL(self, instr: Instruction): + namei: int = instr.arg + push_null = False + if sys.version_info >= (3, 11): + push_null = namei & 1 + namei >>= 1 + if push_null: + self.stack.push(NullVariable()) + name = self._code.co_names[namei] + if name in self._globals.keys(): + value = self._globals.get(name) + elif name in self._builtins.keys(): + value = self._builtins[name] + else: + raise InnerError(f"{name} not in globals and builtins") + self.stack.push(value) + + def LOAD_METHOD(self, instr: Instruction): + method_name = self._code.co_names[instr.arg] + method_name_var = ConstantVariable.wrap_literal( + method_name, self._graph + ) + obj = self.stack.pop() + + method = BuiltinVariable( + getattr, graph=self._graph, 
tracker=DanglingTracker() + )(obj, method_name_var) + + if isinstance(method, MethodVariable): + # bound method, push the unbound method and the self + self.stack.push(method.fn) + self.stack.push(obj) + else: + # unbound method, push the dummy and the function + self.stack.push(NullVariable()) + self.stack.push(method) + + @call_break_graph_decorator(push_n=0) + def STORE_ATTR(self, instr: Instruction): + obj = self.stack.pop() + val = self.stack.pop() + key = self._code.co_names[instr.arg] + key_var = ConstantVariable.wrap_literal(key, self._graph) + BuiltinVariable( + setattr, self._graph, DummyTracker([obj, key_var, val]) + )(obj, key_var, val) + + def DELETE_ATTR(self, instr: Instruction): + obj = self.stack.pop() + key = instr.argval + key_var = ConstantVariable.wrap_literal(key, self._graph) + BuiltinVariable(delattr, self._graph, DummyTracker([obj, key_var]))( + obj, key_var + ) + + def STORE_DEREF(self, instr: Instruction): + if sys.version_info >= (3, 11): + self._cells[instr.argval].set_value(self.stack.pop()) + self._locals[instr.argval] = self._cells[instr.argval] + return + namemap = self._code.co_cellvars + self._code.co_freevars + name = namemap[instr.arg] + self._cells[name].set_value(self.stack.pop()) + + def STORE_FAST(self, instr: Instruction): + """ + TODO: side effect may happen + """ + var = self.stack.pop() + name = self._code.co_varnames[instr.arg] + var.debug_name = name + self._locals[name] = var + + def STORE_GLOBAL(self, instr: Instruction): + var = self.stack.pop() + name = self._code.co_names[instr.arg] + var.debug_name = name + self._globals.set(name, var) + + def DELETE_GLOBAL(self, instr: Instruction): + self._globals.delete(self._code.co_names[instr.arg]) + + @call_break_graph_decorator(push_n=0) + def STORE_SUBSCR(self, instr: Instruction): + key = self.stack.pop() + container = self.stack.pop() + value = self.stack.pop() + assert isinstance(key, VariableBase) + self._graph.add_global_guarded_variable(key) + if isinstance(key, TensorVariable): + raise BreakGraphError( + f"Key is a TensorVariable in STORE_SUBSCR, {container}[{key}] = {value}" + ) + # TODO(xiongkun): support tensor[tensor] = tensor, dy2static is not the same with dygraph. + container[key.get_py_value()] = value + value.debug_name = f"{container.debug_name}[{key.debug_name}]" + + def DELETE_SUBSCR(self, instr: Instruction): + key = self.stack.pop() + container = self.stack.pop() + assert isinstance(key, VariableBase) + self._graph.add_global_guarded_variable(key) + BuiltinVariable(operator.delitem, self._graph, DanglingTracker())( + container, key + ) + + def BUILD_LIST(self, instr: Instruction): + list_size = instr.arg + assert list_size <= len( + self.stack + ), f"OpExecutor want BUILD_LIST with size {list_size}, but current stack do not have enough elems." + val_list = self.stack.pop_n(list_size) + self.stack.push( + ListVariable( + val_list, graph=self._graph, tracker=DummyTracker(val_list) + ) + ) + + def BUILD_TUPLE(self, instr: Instruction): + tuple_size = instr.arg + assert tuple_size <= len( + self.stack + ), f"OpExecutor want BUILD_TUPLE with size {tuple_size}, but current stack do not have enough elems." + val_tuple = self.stack.pop_n(tuple_size) + self.stack.push( + TupleVariable( + tuple(val_tuple), + graph=self._graph, + tracker=DummyTracker(val_tuple), + ) + ) + + def BUILD_STRING(self, instr: Instruction): + count = instr.arg + assert count <= len( + self.stack + ), f"OpExecutor want BUILD_STRING with size {count}, but current stack do not have enough elems." 
+ str_list = self.stack.pop_n(count) + new_str = '' + for s in str_list: + assert s.get_py_type() is str + new_str += s.get_py_value() + self.stack.push( + ConstantVariable(new_str, self._graph, DummyTracker(str_list)) + ) + + @call_break_graph_decorator(push_n=1) + def BUILD_SLICE(self, instr: Instruction): + if instr.arg == 3: + step = self.stack.pop() + else: + step = ConstantVariable.wrap_literal(None, self._graph) + stop = self.stack.pop() + start = self.stack.pop() + + self.stack.push( + SliceVariable( + slice(start, stop, step), + graph=self._graph, + tracker=DummyTracker([start, stop, step]), + ) + ) + + def build_map( + self, keys: list[VariableBase], values: list[VariableBase] + ) -> VariableBase: + built_map = {} + for key, value in zip(keys, values): + assert isinstance(key, VariableBase) + # Add key to global guarded variable to avoid missing the key guard + self._graph.add_global_guarded_variable(key) + key = key.get_py_value() + built_map[key] = value + return DictVariable( + built_map, + graph=self._graph, + tracker=DummyTracker(keys + values), + ) + + def BUILD_MAP(self, instr: Instruction): + map_size = instr.arg + assert map_size * 2 <= len( + self.stack + ), f"OpExecutor want BUILD_MAP with size {map_size} * 2, but current stack do not have enough elems." + val_for_dict = self.stack.pop_n(map_size * 2) + keys = val_for_dict[::2] + values = val_for_dict[1::2] + self.stack.push(self.build_map(keys, values)) + + def BUILD_CONST_KEY_MAP(self, instr: Instruction): + map_size = instr.arg + assert map_size + 1 <= len( + self.stack + ), f"OpExecutor want BUILD_CONST_KEY_MAP with size {map_size} + 1, but current stack do not have enough elems." + keys = self.stack.pop().get_items() + assert len(keys) == map_size + values = self.stack.pop_n(map_size) + self.stack.push(self.build_map(keys, values)) + + def build_seq_unpack(self, instr: Instruction): + oparg = instr.arg + assert isinstance(oparg, int) + unpack_values = self.stack.pop_n(oparg) + + retval = [] + for item in unpack_values: + assert isinstance(item, (TupleVariable, ListVariable)) + retval.extend(item.get_wrapped_items()) + + if instr.opname in { + "BUILD_TUPLE_UNPACK_WITH_CALL", + "BUILD_TUPLE_UNPACK", + }: + retval = tuple(retval) + + self.stack.push( + VariableFactory.from_value( + retval, self._graph, DummyTracker(unpack_values) + ) + ) + + def BUILD_TUPLE_UNPACK_WITH_CALL(self, instr: Instruction): + self.build_seq_unpack(instr) + + def BUILD_TUPLE_UNPACK(self, instr: Instruction): + self.build_seq_unpack(instr) + + def BUILD_LIST_UNPACK(self, instr: Instruction): + self.build_seq_unpack(instr) + + def BUILD_MAP_UNPACK(self, instr: Instruction): + oparg = instr.arg + assert isinstance(oparg, int) + unpack_values = self.stack.pop_n(oparg) + + retval = {} + for item in unpack_values: + assert item.get_py_type() is dict + retval.update(item.get_wrapped_items()) + + self.stack.push( + VariableFactory.from_value( + retval, self._graph, DummyTracker(unpack_values) + ) + ) + + def BUILD_MAP_UNPACK_WITH_CALL(self, instr: Instruction): + oparg = instr.arg + assert isinstance(oparg, int) + unpack_values = self.stack.pop_n(oparg) + + retval = {} + for item in unpack_values: + assert item.get_py_type() is dict + wrapped_item = item.get_wrapped_items() + if wrapped_item.items() & retval.items(): + raise InnerError( + "BUILD_MAP_UNPACK_WITH_CALL found repeated key." 
+ ) + retval.update(wrapped_item) + + self.stack.push( + VariableFactory.from_value( + retval, self._graph, DummyTracker(unpack_values) + ) + ) + + def PRECALL(self, instr: Instruction): + assert isinstance(instr.arg, int) + is_method_layout = not isinstance( + self.stack.peek[instr.arg + 2], NullVariable + ) + nargs = instr.arg + int(is_method_layout) + method = self.stack.peek[nargs + 1] + if not is_method_layout and isinstance(method, MethodVariable): + unbound_method = method.fn + self_var = method.bound_instance + self.stack.peek[nargs + 1] = self_var + self.stack.peek[nargs + 2] = unbound_method + + def KW_NAMES(self, instr: Instruction): + assert self._call_shape is None + assert isinstance(instr.arg, int) + self._call_shape = self._co_consts[instr.arg].get_py_value() + + @call_break_graph_decorator(push_n=1) + def CALL(self, instr: Instruction): + assert isinstance(instr.arg, int) + assert instr.arg + 2 <= len(self.stack) + is_method = not isinstance(self.stack.peek[instr.arg + 2], NullVariable) + total_args = instr.arg + int(is_method) + kwnames = self._call_shape if self._call_shape is not None else [] + n_kwargs = len(kwnames) + n_positional_args = total_args - n_kwargs + kwargs_list = self.stack.pop_n(n_kwargs) + kwargs = dict(zip(kwnames, kwargs_list)) + args = self.stack.pop_n(n_positional_args) + fn = self.stack.pop() + if not is_method: + # pop the NULL variable + self.stack.pop() + self.stack.push(fn(*args, **kwargs)) + self._call_shape = None + + @call_break_graph_decorator(push_n=1) + def CALL_FUNCTION(self, instr: Instruction): + assert isinstance(instr.arg, int) + n_args = instr.arg + assert isinstance(n_args, int) + args = self.stack.pop_n(n_args) + kwargs = {} + fn = self.stack.pop() + ret = fn(*args, **kwargs) + self.stack.push(ret) + + @call_break_graph_decorator(push_n=1) + def CALL_FUNCTION_KW(self, instr: Instruction): + n_args = instr.arg + assert n_args + 2 <= len(self.stack) + + kwargs_keys = self.stack.pop() + assert isinstance(kwargs_keys, TupleVariable) + assert len(kwargs_keys) > 0 + kwargs_keys = [ + x.get_py_value() if isinstance(x, VariableBase) else x + for x in kwargs_keys.get_py_value() + ] + + # split arg_list to args and kwargs + arg_list = self.stack.pop_n(n_args) + args = arg_list[: -len(kwargs_keys)] + kwargs_values = arg_list[-len(kwargs_keys) :] + kwargs = dict(zip(kwargs_keys, kwargs_values)) + + fn = self.stack.pop() + ret = fn(*args, **kwargs) + self.stack.push(ret) + + @call_break_graph_decorator(push_n=1) + def CALL_FUNCTION_EX(self, instr: Instruction): + flag = instr.arg + if flag & CFE.CFE_HAS_KWARGS: + kwargs_variable = self.stack.pop() + assert isinstance(kwargs_variable, DictVariable) + kwargs = kwargs_variable.get_wrapped_items() + else: + kwargs = {} + + args_variable = self.stack.pop() + assert isinstance(args_variable, (TupleVariable, ListVariable)) + args = args_variable.get_wrapped_items() + + fn = self.stack.pop() + if sys.version_info >= (3, 11): + null = self.stack.pop() + assert isinstance(null, NullVariable) + ret = fn(*args, **kwargs) + self.stack.push(ret) + + @call_break_graph_decorator(push_n=1) + def CALL_METHOD(self, instr: Instruction): + n_args = instr.arg + assert isinstance(n_args, int) + args = self.stack.pop_n(n_args) + self_var = self.stack.pop() + method = self.stack.pop() + if isinstance(method, NullVariable): + method = self_var + else: + args = [self_var] + args + self.stack.push(method(*args)) + + @call_break_graph_decorator( + push_n=1 + ) # call instance, in, not in may call 
TensorVariable.get_py_value, which raise BreakGraphError + def COMPARE_OP(self, instr: Instruction): + op = dis.cmp_op[instr.arg] + right, left = self.stack.pop(), self.stack.pop() + self.stack.push( + BuiltinVariable( + SUPPORT_COMPARE_OP[op], self._graph, DanglingTracker() + )(left, right) + ) + + @call_break_graph_decorator(push_n=1) + def IS_OP(self, instr: Instruction): + # It will only be 0 or 1 + assert instr.arg == 0 or instr.arg == 1 + right, left = self.stack.pop(), self.stack.pop() + op = "is" if instr.arg == 0 else "is not" + self.stack.push( + BuiltinVariable( + SUPPORT_COMPARE_OP[op], self._graph, DanglingTracker() + )(left, right) + ) + + def MAKE_FUNCTION(self, instr: Instruction): + if sys.version_info < (3, 11): + fn_name = self.stack.pop() + codeobj = self.stack.pop() + if sys.version_info >= (3, 11): + # MAKE_FUNCTION behavior actually changed in 3.11, see + # https://github.com/python/cpython/pull/93189/ + assert hasattr(codeobj.value, "co_qualname") + fn_name = ConstantVariable( + codeobj.value.co_qualname, self._graph, DummyTracker([codeobj]) + ) + + global_dict = self._globals.get_value() + + related_list = [fn_name, codeobj] + + flag = instr.arg + if flag & MF.MF_HAS_CLOSURE: + # closure should be a tuple of Variables + closure_variable = self.stack.pop() + assert isinstance(closure_variable, TupleVariable) + closure = [] + for item in closure_variable.get_wrapped_items(): + closure.append(types.CellType()) + closure[-1].cell_contents = item + closure = tuple(closure) + else: + closure = () + + if flag & MF.MF_HAS_ANNOTATION: + # can not set annotation in python env, skip it + related_list.append(self.stack.pop()) + + if flag & MF.MF_HAS_KWDEFAULTS: + raise FallbackError( + "Found need func_kwdefaults when MAKE_FUNCTION." 
+ ) + + if flag & MF.MF_HAS_DEFAULTS: + ''' + default_args should have tracker too, like: + + def f(x): + def g(z=x): + pass + ''' + default_args_variable = self.stack.pop() + assert isinstance(default_args_variable, TupleVariable) + related_list.append(default_args_variable) + default_args = tuple(default_args_variable.get_wrapped_items()) + else: + default_args = () + + new_fn = types.FunctionType( + codeobj.get_py_value(), + global_dict, + fn_name.get_py_value(), + default_args, + closure, + ) + self.stack.push( + UserDefinedFunctionVariable( + new_fn, self._graph, DummyTracker(related_list) + ) + ) + + def GET_ITER(self, instr: Instruction): + source_obj = self.stack.pop() + iter_variable = BuiltinVariable(iter, self._graph, DanglingTracker())( + source_obj + ) + self.stack.push(iter_variable) + + def JUMP_ABSOLUTE(self, instr: Instruction): + assert instr.jump_to is not None + self.jump_to(instr.jump_to) + + def JUMP_FORWARD(self, instr: Instruction): + self.JUMP_ABSOLUTE(instr) + + def JUMP_BACKWARD(self, instr: Instruction): + # TODO: check interrupt + self.JUMP_ABSOLUTE(instr) + + def JUMP_BACKWARD_NO_INTERRUPT(self, instr: Instruction): + self.JUMP_ABSOLUTE(instr) + + @call_break_graph_decorator(push_n=1) + def CONTAINS_OP(self, instr: Instruction): + # It will only be 0 or 1 + assert instr.arg == 0 or instr.arg == 1 + right, left = self.stack.pop(), self.stack.pop() + op = "in" if instr.arg == 0 else "not in" + self.stack.push( + BuiltinVariable( + SUPPORT_COMPARE_OP[op], self._graph, DanglingTracker() + )(left, right) + ) + + @jump_break_graph_decorator + def JUMP_IF_FALSE_OR_POP(self, instr: Instruction): + pred_obj = self.stack.top + if isinstance(pred_obj, (ConstantVariable, ContainerVariable)): + self._graph.add_global_guarded_variable(pred_obj) + is_jump = not bool(pred_obj) + if is_jump: + assert instr.jump_to is not None + self.jump_to(instr.jump_to) + else: + self.stack.pop() + return + raise FallbackError( + "Currently don't support predicate a non-const / non-tensor obj." + ) + + @jump_break_graph_decorator + def JUMP_IF_TRUE_OR_POP(self, instr: Instruction): + pred_obj = self.stack.top + if isinstance(pred_obj, (ConstantVariable, ContainerVariable)): + self._graph.add_global_guarded_variable(pred_obj) + is_jump = bool(pred_obj) + if is_jump: + assert instr.jump_to is not None + self.jump_to(instr.jump_to) + else: + self.stack.pop() + return + raise FallbackError( + "Currently don't support predicate a non-const / non-tensor obj." 
+        )
+
+    POP_JUMP_IF_FALSE = pop_jump_if_op_wrapper([bool, operator.not_])
+    POP_JUMP_FORWARD_IF_FALSE = POP_JUMP_IF_FALSE
+    POP_JUMP_BACKWARD_IF_FALSE = POP_JUMP_IF_FALSE
+
+    POP_JUMP_IF_TRUE = pop_jump_if_op_wrapper([bool])
+    POP_JUMP_FORWARD_IF_TRUE = POP_JUMP_IF_TRUE
+    POP_JUMP_BACKWARD_IF_TRUE = POP_JUMP_IF_TRUE
+
+    POP_JUMP_FORWARD_IF_NONE = pop_jump_if_op_wrapper([operator_is_none])
+    POP_JUMP_BACKWARD_IF_NONE = POP_JUMP_FORWARD_IF_NONE
+
+    POP_JUMP_FORWARD_IF_NOT_NONE = pop_jump_if_op_wrapper(
+        [operator_is_not_none]
+    )
+    POP_JUMP_BACKWARD_IF_NOT_NONE = POP_JUMP_FORWARD_IF_NOT_NONE
+
+    @call_break_graph_decorator(push_n=lambda arg: arg)
+    def UNPACK_SEQUENCE(self, instr: Instruction):
+        sequence = self.stack.pop()
+        seq_iter = BuiltinVariable(iter, self._graph, DanglingTracker())(
+            sequence
+        )
+        unpacked = []
+        for _ in range(instr.arg):
+            unpacked.append(seq_iter.next())
+        for item in reversed(unpacked):
+            self.stack.push(item)
+
+    def UNPACK_EX(self, instr: Instruction):
+        getitem = BuiltinVariable(
+            operator.getitem, self._graph, DanglingTracker()
+        )
+        assert instr.arg is not None
+        sequence = self.stack.pop()
+        if not isinstance(
+            sequence, (ListVariable, TupleVariable, TensorVariable)
+        ):
+            raise FallbackError(f"Unpack {sequence} is not implemented.")
+
+        if instr.argval >= 256:
+            # NOTE: Python limits star-unpacking to 256 targets (beyond that it reports
+            # "SyntaxError: too many expressions in star-unpacking assignment"), so an
+            # oparg >= 256 means the high byte counts targets after the star, as in:
+            # a, b, *c, d = e
+            front_nums = instr.arg & 0xFF
+            back_nums = instr.arg >> 8
+            assert (
+                len(sequence) >= front_nums + back_nums
+            ), f"Want to unpack {sequence} to {front_nums + back_nums}, but {len(sequence)} is smaller than {front_nums + back_nums}."
+
+            for i in range(
+                len(sequence) - 1, len(sequence) - back_nums - 1, -1
+            ):
+                self.stack.push(getitem(sequence, i))
+
+            slice_var = SliceVariable(
+                slice(front_nums, len(sequence) - back_nums - 1),
+                self._graph,
+                DummyTracker([sequence]),
+            )
+        else:
+            # a, b, c, *d = e
+            assert (
+                len(sequence) >= instr.arg
+            ), f"Want to unpack {sequence} to {instr.arg}, but {len(sequence)} is smaller than {instr.arg}."
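+            # e.g. `a, b, *rest = e` gives instr.arg == 2: push e[2:] first,
+            # then e[1] and e[0], so the non-starred targets pop off in order.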
+ + slice_obj = slice(instr.arg, None) + slice_var = SliceVariable( + slice_obj, self._graph, ConstTracker(slice_obj) + ) + front_nums = instr.arg + self.stack.push(getitem(sequence, slice_var)) + for i in range(front_nums - 1, -1, -1): + self.stack.push(getitem(sequence, i)) + + def FORMAT_VALUE(self, instr: Instruction): + flag = instr.arg + assert flag is not None + which_conversion = flag & FV.FVC_MASK + have_fmt_spec = bool((flag & FV.FVS_MASK) == FV.FVS_HAVE_SPEC) + + fmt_spec = self.stack.pop().get_py_value() if have_fmt_spec else "" + value = self.stack.pop() + + if which_conversion == FV.FVC_NONE: + convert_fn = None + elif which_conversion == FV.FVC_STR: + convert_fn = "__str__" + elif which_conversion == FV.FVC_REPR: + convert_fn = "__repr__" + elif which_conversion == FV.FVC_ASCII: + convert_fn = "__ascii__" + else: + raise InnerError( + f"Unexpected conversion flag {flag} for FORMAT_VALUE" + ) + + # different type will lead to different Tracker, so call self.stack.push in different branch + if isinstance(value, ConstantVariable): + result = value.get_py_value() + if convert_fn is not None: + result = getattr(result, convert_fn)(result) + + if not isinstance(result, str) or fmt_spec != "": + result = format(result, fmt_spec) + + self.stack.push( + ConstantVariable(result, self._graph, DummyTracker([value])) + ) + else: + raise FallbackError(f"Do not support format {type(value)} now") + + # NOTE: This operation will generate SideEffects, and the mechanism has not been completed yet + def DICT_UPDATE(self, instr: Instruction): + dict_value = self.stack.pop() + assert isinstance(instr.arg, int) + BuiltinVariable(dict.update, self._graph, tracker=DanglingTracker())( + self.stack.peek[instr.arg], dict_value + ) + + def DICT_MERGE(self, instr: Instruction): + dict_value = self.stack.pop() + assert isinstance(instr.arg, int) + for key in dict_value.get_wrapped_items().keys(): + result = ( + self.stack.peek[instr.arg].get_wrapped_items().get(key, None) + ) + if result is not None: + raise InnerError( + f"got multiple values for keyword argument '{key}'" + ) + BuiltinVariable(dict.update, self._graph, tracker=DanglingTracker())( + self.stack.peek[instr.arg], dict_value + ) + + def LIST_APPEND(self, instr: Instruction): + list_value = self.stack.pop() + assert isinstance(instr.arg, int) + BuiltinVariable(list.append, self._graph, tracker=DanglingTracker())( + self.stack.peek[instr.arg], list_value + ) + + def MAP_ADD(self, instr: Instruction): + key, value = self.stack.pop_n(2) + assert isinstance(instr.arg, int) + BuiltinVariable(operator.setitem, self._graph, DanglingTracker())( + self.stack.peek[instr.arg], key, value + ) + + def LIST_EXTEND(self, instr: Instruction): + list_value = self.stack.pop() + assert isinstance(instr.arg, int) + BuiltinVariable(list.extend, self._graph, tracker=DanglingTracker())( + self.stack.peek[instr.arg], list_value + ) + + def LIST_TO_TUPLE(self, instr: Instruction): + list_value = self.stack.pop() + self.stack.push( + TupleVariable( + list_value.get_wrapped_items(), + self._graph, + DummyTracker([list_value]), + ) + ) + + +class OpcodeExecutor(OpcodeExecutorBase): + """ + A class that represents an executor for opcode operations. + + Args: + frame: The frame object. 
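+        **kwargs: Options forwarded to the underlying FunctionGraph.
+
+    A rough usage sketch (illustrative; in practice the eval frame hook
+    drives this):
+
+        executor = OpcodeExecutor(frame)
+        executor.run()                # simulate the bytecode
+        new_code = executor.new_code  # the rewritten code object, if any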
+ + """ + + def __init__(self, frame: types.FrameType, **kwargs): + graph = FunctionGraph(frame, **kwargs) + self._frame = frame + self._name = "Executor" + self.call_stack[:] = [] + super().__init__(frame.f_code, graph) + Dispatcher.graph = graph + + def cleanup(self): + self._graph.pycode_gen = None + Dispatcher.graph = None + + @event_register("OpcodeExecutor: _prepare_virtual_env", event_level=2) + def _prepare_virtual_env(self): + """ + Prepare the virtual environment for execution by adding variables from locals, globals, builtins, and constants. + + """ + log( + 3, + f"[Executor] code options: co_cellvars={self._frame.f_code.co_cellvars}\n", + ) + free_or_cell_vars = ( + self._frame.f_code.co_cellvars + self._frame.f_code.co_freevars + ) + for name, value in self._frame.f_locals.items(): + tracker = ( + CellTracker(name) + if name in free_or_cell_vars + else LocalTracker(name) + ) + self._locals[name] = VariableFactory.from_value( + value, self._graph, tracker, debug_name=name + ) + + for name in free_or_cell_vars: + # create a cell for each variable. + self._cells[name] = CellVariable() # put in cells. + if name in self._locals: + self._cells[name].set_value(self._locals[name]) + + self._globals = GlobalVariable( + self._frame.f_globals, + self._graph, + DanglingTracker(), + ) + + self._builtins = self._graph._builtins + + for value in self._code.co_consts: + self._co_consts.append( + VariableFactory.from_value( + value, self._graph, ConstTracker(value) + ) + ) + + def _create_resume_fn(self, index, stack_size=0): + """ + Create a resume function and its inputs at the specified index. + + Args: + index: The index at which the resume function is created. + stack_size: The size of the stack. + + Returns: + The resume function and its inputs. + + """ + pycode_gen = PyCodeGen(self._frame) + fn, inputs = pycode_gen.gen_resume_fn_at(index, stack_size) + return fn, inputs + + @fallback_when_occur_error + def _break_graph_in_jump(self, result: VariableBase, instr: Instruction): + """ + Break the graph at a JUMP instruction. + + Args: + result: The result variable of the jump instruction. + instr: The jump instruction. + + """ + self._graph.add_global_guarded_variable(result) + stack_size = len(self.stack) + if_fn, if_inputs = self._create_resume_fn( + self.indexof(instr) + 1, stack_size + ) + else_fn, else_inputs = self._create_resume_fn( + self.indexof(instr.jump_to), stack_size + ) + + # gen call static fn opcode + inputs_name = if_inputs | else_inputs + inputs_var = [ + self.get_var(name) + for name in inputs_name + if self.get_var(name) is not result + ] + ret_vars = [ + result, + ] + inputs_var + # Collect all the to store variables. 
+        store_vars = []
+        for stack_arg in self.stack:
+            store_vars.append(stack_arg)
+        for name in inputs_name:
+            store_vars.append(self.get_var(name))
+
+        var_loader = self._graph.start_compile_with_name_store(
+            ret_vars, store_vars
+        )
+        # only pop the inputs of the if/else resume fn, and keep the bool tensor result on the stack
+        for _ in inputs_var:
+            self._graph.pycode_gen.gen_pop_top()
+
+        # gen call if/else resume fn opcode
+        if if_fn is not None:
+            self._graph.pycode_gen.gen_load_object(
+                if_fn, if_fn.__code__.co_name
+            )
+            insert_index = len(self._graph.pycode_gen._instructions) - 1
+            for i, stack_arg in enumerate(self.stack):
+                var_loader.load(
+                    stack_arg, allow_push_null=i >= len(self.stack) - 1
+                )
+            for name in if_inputs:
+                var_loader.load(self.get_var(name))
+            self._graph.pycode_gen.gen_call_function(
+                argc=if_fn.__code__.co_argcount,
+            )
+            self._graph.pycode_gen.gen_return()
+        else:
+            insert_index = len(self._graph.pycode_gen._instructions) - 1
+            self._graph.pycode_gen.gen_return()
+
+        if else_fn is not None:
+            self._graph.pycode_gen.gen_load_object(
+                else_fn, else_fn.__code__.co_name
+            )
+            jump_to = self._graph.pycode_gen._instructions[-1]
+            for i, stack_arg in enumerate(self.stack):
+                var_loader.load(
+                    stack_arg, allow_push_null=i >= len(self.stack) - 1
+                )
+            for name in else_inputs:
+                var_loader.load(self.get_var(name))
+            self._graph.pycode_gen.gen_call_function(
+                argc=else_fn.__code__.co_argcount,
+            )
+            self._graph.pycode_gen.gen_return()
+        else:
+            self._graph.pycode_gen.gen_return()
+            jump_to = self._graph.pycode_gen._instructions[-1]
+
+        # gen jump opcode
+        self._graph.pycode_gen._insert_instr(
+            insert_index, instr.opname, jump_to=jump_to
+        )
+
+        self.new_code = self._graph.pycode_gen.gen_pycode()
+        self.guard_fn = self._graph.guard_fn
+
+    @fallback_when_occur_error
+    def _break_graph_in_call(
+        self,
+        origin_stack: VariableStack,
+        instr: Instruction,
+        push_n: int | Callable[[int | None], int],
+    ):
+        """
+        Break the graph at a CALL instruction.
+
+        Args:
+            origin_stack: The original stack.
+            instr: The call instruction.
+            push_n: The number of elements to be pushed onto the stack.
+
+        """
+        push_n = push_n(instr.arg) if callable(push_n) else push_n
+        index = self.indexof(instr)
+        self.stack = origin_stack
+
+        # gen call static fn opcode
+        ret_vars = [
+            arg
+            for arg in self.stack
+            if isinstance(arg, (TensorVariable, ContainerVariable))
+        ]
+        resume_input_name = analysis_inputs(self._instructions, index + 1)
+        ret_vars = ret_vars + [
+            self.get_var(name)
+            for name in resume_input_name
+            if self.get_var(name) not in ret_vars
+        ]
+
+        # Collect all the variables to store.
+        store_vars = []
+        for stack_arg in self.stack:
+            store_vars.append(stack_arg)
+        for name in resume_input_name:
+            store_vars.append(self.get_var(name))
+        var_loader = self._graph.start_compile_with_name_store(
+            ret_vars, store_vars
+        )
+
+        for _ in ret_vars:
+            self._graph.pycode_gen.gen_pop_top()
+
+        # gen graph break call fn opcode
+        stack_effect = calc_stack_effect(instr)
+        pop_n = push_n - stack_effect
+
+        for i, stack_arg in enumerate(self.stack):
+            var_loader.load(
+                stack_arg, allow_push_null=i >= len(self.stack) - pop_n
+            )
+
+        # gen call resume fn opcode
+        # NOTE(SigureMo): In Python 3.11, we need to generate KW_NAMES if the call shape is not None.
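+        # For example, `f(x, y=1)` compiles to KW_NAMES (("y",)) followed by
+        # PRECALL/CALL 2 on Python 3.11, so the kw names must be restored
+        # before the re-emitted call instruction below.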
+        self._graph.pycode_gen.gen_kw_names(self._call_shape)
+        self._graph.pycode_gen.add_pure_instructions([instr])
+        self.stack.pop_n(pop_n)
+        stack_size = len(self.stack) + push_n
+
+        resume_fn, _ = self._create_resume_fn(index + 1, stack_size)
+        if resume_fn:
+            self._graph.pycode_gen.gen_load_object(
+                resume_fn, resume_fn.__code__.co_name
+            )
+            # NOTE(zrr1999): We need to shift the resume_fn under its arguments.
+            # In Python 3.11+, NULL + resume_fn should be shifted together.
+            shift_n = 2 if sys.version_info >= (3, 11) else 1
+            self._graph.pycode_gen.gen_shift_n(shift_n, stack_size + shift_n)
+            for name in resume_input_name:
+                var_loader.load(self.get_var(name))
+            self._graph.pycode_gen.gen_call_function(
+                argc=resume_fn.__code__.co_argcount,
+            )
+
+        # gen RETURN_VALUE
+        self._graph.pycode_gen.gen_return()
+
+        self.new_code = self._graph.pycode_gen.gen_pycode()
+        self.guard_fn = self._graph.guard_fn
+
+    def transform(self):
+        self.run()
+        if self.new_code is None:
+            raise InnerError("OpcodeExecutor returned an empty new_code.")
+        # simulation stopped at RETURN_VALUE and the SIR is large enough => disable eval frame
+        simulate_complete = bool(self.stop_state == "Return")
+        if simulate_complete:
+            if self._graph.sir_ctx.TOS.graph_size() < min_graph_size():
+                raise FallbackError(
+                    "Fallback after simulation: the captured graph is smaller than min_graph_size().",
+                    disable_eval_frame=True,
+                )
+            else:
+                # If the simulation stopped with a complete graph, all the code
+                # in self.new_code is already surrounded by the eval frame
+                # triggers, so we need not set disable_eval_frame=False here
+                # (it already is).
+                return (
+                    CustomCode(self.new_code, True),
+                    self.guard_fn,
+                )
+        else:
+            # If we returned because of a break graph, the eval frame needs to stay enabled.
+            return (
+                CustomCode(self.new_code, False),
+                self.guard_fn,
+            )
+
+    def _gen_loop_body_between(
+        self, inputs: list, for_iter_idx: int, start: int, end: int
+    ) -> types.FunctionType:
+        """
+        Generates the loop body between the specified indices in the instruction list.
+
+        Args:
+            inputs: The input variable names of the loop body function.
+            for_iter_idx (int): The index of the FOR_ITER opcode.
+            start (int): The start index of the loop body.
+            end (int): The end index of the loop body.
+
+        Returns:
+            types.FunctionType: The generated loop body function object.
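+
+        A rough sketch (illustrative, not the exact generated bytecode): for
+        `for i in it: total += i`, the generated loop body behaves like
+
+            def loop_body(total, i, _break_flag):
+                total += i          # original body
+                return total, i, _break_flag
+
+        where `_break_flag` is stored as False when the original body breaks
+        out of the loop.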
+ + """ + pycode_gen = PyCodeGen(self._frame) + origin_instrs = get_instructions(pycode_gen._origin_code) + + for_iter = origin_instrs[for_iter_idx] + + # for balance the stack (the loop body will pop iter first before break or return) + # this None is used for replace the iterator obj in stack top + pycode_gen.gen_load_const(None) + + # extend loop body main logic + pycode_gen.extend_instrs(origin_instrs[start:end]) + + # break should jump to this nop + nop_for_break = pycode_gen._add_instr("NOP") + + # need do additional operates when break + pycode_gen.gen_load_const(False) + pycode_gen.gen_store_fast(inputs[-1]) + pycode_gen.gen_load_const(None) # keep stack balance + + # continue should jump to this nop + nop_for_continue = pycode_gen._add_instr("NOP") + pycode_gen.gen_pop_top() + + # relocate jump + out_loop = for_iter.jump_to + for instr in pycode_gen._instructions: + if instr.jump_to == for_iter: + instr.jump_to = nop_for_continue + if instr.jump_to == out_loop: + instr.jump_to = nop_for_break + + # outputs is the same as inputs + pycode_gen.gen_outputs_and_return(inputs) + return pycode_gen.create_fn_with_inputs(inputs) + + @fallback_when_occur_error + def _break_graph_in_for_loop( + self, iterator: VariableBase, for_iter: Instruction + ): + ''' + for_iter: the FOR_ITER opcode + + need find out opcodes which unpack value from FOR_ITER, by analysing stack + + case 1: + for i in iter: + + FOR_ITER + STORE_FAST i + + case 2: + for i,j in iter: + + FOR_ITER + UNPACK_SEQUENCE 2 + STORE_FAST i + STORE_FAST j + + TODO: check var is in globals or builtins, only locals considered now + ''' + # 0. prepare sub functions + # 0.1 find the range of loop body + assert for_iter.jump_to is not None + loop_body_start_idx = self.indexof(for_iter) + 1 + loop_body_end_idx = self.indexof(for_iter.jump_to) + curent_stack = 1 + + while True: + if loop_body_start_idx >= len(self._instructions): + raise InnerError("Can not balance stack in loop body.") + cur_instr = self._instructions[loop_body_start_idx] + # do not consider jump instr + stack_effect = calc_stack_effect(cur_instr, jump=False) + curent_stack += stack_effect + loop_body_start_idx += 1 + if curent_stack == 0: + break + + # 0.2 create loop body function + all_used_vars = analysis_used_names_with_space( + self._instructions, loop_body_start_idx, loop_body_end_idx + ) + loop_body_inputs = [ + k + for k, v in all_used_vars.items() + if v in (Space.locals, Space.cells) + ] + ["_break_flag"] + + loop_body_fn = self._gen_loop_body_between( + loop_body_inputs, + self.indexof(for_iter), + loop_body_start_idx, + loop_body_end_idx, + ) + + log(3, "[Resumed Function]: break graph in loop create loop body as\n") + log_do(3, lambda: dis.dis(loop_body_fn)) + + # 0.3 create after loop part function + after_loop_fn, fn_inputs = self._create_resume_fn( + loop_body_end_idx, len(self.stack) + ) + + total_inputs = OrderedSet(list(fn_inputs) + list(loop_body_inputs[:-1])) + + # 1. part before for-loop, start compile + ret_names = [ + name + for name in total_inputs + if name in chain(self._locals, self._cells) + ] + ret_vars = [self.get_var(name) for name in ret_names] + store_vars = [ret_vars[idx] for idx in range(len(ret_names))] + store_vars.extend(iter(self.stack)) + store_vars.append(iterator.get_hold()) + var_loader = self._graph.start_compile_with_name_store( + ret_vars, store_vars + ) + + for _ in ret_vars: + self._graph.pycode_gen.gen_pop_top() + + # 2. 
+        for idx in range(len(ret_names)):
+            var_loader.load(ret_vars[idx])
+            self._graph.pycode_gen.gen_store(ret_names[idx], self._code)
+
+        # 3. set up vars which are created in the loop
+        undefined_names = set()
+        for name in loop_body_inputs[:-1]:
+            if not self.has_var(name, all_used_vars[name]):
+                undefined_names.add(name)
+                self._graph.pycode_gen.gen_load_const(SotUndefinedVar())
+                self._graph.pycode_gen.gen_store(name, self._code)
+
+        # close eval_frame
+        # TODO: a more effective strategy is needed here
+        # self._graph.pycode_gen.gen_disable_eval_frame()
+
+        # 4.1 load iterator
+        iterator.reconstruct(self._graph.pycode_gen)
+
+        # 4.2 gen FOR_ITER and unpack data
+        self._graph.pycode_gen.extend_instrs(
+            self._instructions[self.indexof(for_iter) : loop_body_start_idx]
+        )
+
+        # 5. call loop body
+        # 5.1 load loop body
+        self._graph.pycode_gen.gen_load_object(
+            loop_body_fn, loop_body_fn.__code__.co_name
+        )
+
+        # 5.2 load loop body inputs
+        for name in loop_body_inputs[:-1]:
+            self._graph.pycode_gen.gen_load(name)
+
+        # 5.3 load break flag
+        self._graph.pycode_gen.gen_load_const(True)
+
+        # 5.4 call loop body
+        self._graph.pycode_gen.gen_call_function(
+            argc=loop_body_fn.__code__.co_argcount
+        )
+
+        # 5.5 unpack and store retval, keep break_flag on the stack
+        self._graph.pycode_gen.gen_unpack_sequence(len(loop_body_inputs))
+
+        for name in loop_body_inputs[:-1]:
+            self._graph.pycode_gen.gen_store(name, self._code)
+
+        # 6. add jump if break
+        jump_if_break = self._graph.pycode_gen.gen_pop_jump(
+            direction=JumpDirection.FORWARD, suffix=PopJumpCond.FALSE
+        )
+
+        # 7. jump back to FOR_ITER
+        self._graph.pycode_gen.gen_jump(
+            for_iter, direction=JumpDirection.BACKWARD
+        )
+        nop = self._graph.pycode_gen._add_instr("NOP")
+        for_iter.jump_to = nop
+        jump_if_break.jump_to = nop
+
+        # open eval_frame
+        # TODO: a more effective strategy is needed here
+        # self._graph.pycode_gen.gen_enable_eval_frame()
+
+        # 8. call after_loop_fn
+        self._graph.pycode_gen.gen_load_object(
+            after_loop_fn, after_loop_fn.__code__.co_name
+        )
+
+        for stack_arg in self.stack:
+            var_loader.load(stack_arg)
+        for name in fn_inputs:
+            if not self.has_var(name) and name not in undefined_names:
+                undefined_names.add(name)
+                self._graph.pycode_gen.gen_load_const(SotUndefinedVar())
+                self._graph.pycode_gen.gen_store(name, self._code)
+            self._graph.pycode_gen.gen_load(name)
+
+        self._graph.pycode_gen.gen_call_function(
+            argc=after_loop_fn.__code__.co_argcount
+        )
+
+        self._graph.pycode_gen.gen_return()
+        self.new_code = self._graph.pycode_gen.gen_pycode()
+        self.guard_fn = self._graph.guard_fn
+
+    def _inline_call_for_loop(
+        self, iterator: VariableBase, for_iter: Instruction
+    ):
+        assert for_iter.jump_to is not None
+        pycode_gen = PyCodeGen(self._frame)
+        origin_instrs = get_instructions(pycode_gen._origin_code)
+
+        start_idx = self.indexof(for_iter)
+        end_idx = self.indexof(for_iter.jump_to)
+
+        all_used_vars = analysis_used_names_with_space(
+            origin_instrs, start_idx, end_idx
+        )
+
+        inputs = [
+            k
+            for k, v in all_used_vars.items()
+            if v in (Space.locals, Space.cells)
+        ] + [iterator.id]
+
+        # 1. load iter
+        pycode_gen.gen_load_fast(iterator.id)
+
+        # 2. copy main logic
+        pycode_gen.extend_instrs(origin_instrs[start_idx:end_idx])
+
+        # 3. add break/continue markers and relocate jumps
+        for_iter_instr = origin_instrs[start_idx]
+        assert for_iter_instr.jump_to is not None
+        out_loop_instr = for_iter_instr.jump_to
+
+        pycode_gen.gen_jump(out_loop_instr, direction=JumpDirection.FORWARD)
+        nop_for_continue = pycode_gen._add_instr("NOP")
+
+        jump = pycode_gen.gen_jump(
+            for_iter_instr, direction=JumpDirection.BACKWARD
+        )
+
+        nop_for_break = pycode_gen._add_instr("NOP")
+
+        for instr in pycode_gen._instructions:
+            if instr.jump_to == for_iter_instr:
+                instr.jump_to = nop_for_continue
+
+            if (
+                instr.jump_to in origin_instrs
+                and origin_instrs.index(instr.jump_to) >= end_idx
+            ):
+                instr.jump_to = nop_for_break
+
+        jump.jump_to = for_iter_instr
+        pycode_gen.gen_outputs_and_return(inputs)
+        inline_call_fn = pycode_gen.create_fn_with_inputs(inputs)
+
+        log(
+            3,
+            f"[Resumed Function]: Inline call for loop function {inline_call_fn.__code__.co_name}\n",
+        )
+        log_do(3, lambda: dis.dis(inline_call_fn))
+
+        # TODO: update globals builtins
+        fn = UserDefinedFunctionVariable(
+            inline_call_fn,
+            self._graph,
+            DanglingTracker(),
+        )
+
+        input_vars = [
+            self.get_var(name)
+            if self.has_var(name, all_used_vars[name])
+            else SotUndefinedVar()
+            for name in inputs[:-1]
+        ] + [iterator]
+        ret = fn(*input_vars)
+        # slice the return values with [:-1] to drop the trailing iterator
+        slice_const = slice(None, -1, None)
+        slice_variable = SliceVariable(
+            slice_const, self._graph, ConstTracker(slice_const)
+        )
+        for name, val in zip(inputs[:-1], ret[slice_variable]):
+            self._locals[name] = val
+
+    def FOR_ITER(self, instr):
+        iterator = self.stack.pop()
+        backup_iter_idx = None
+
+        start = self.indexof(instr)
+        end = self.indexof(instr.jump_to)
+        for i in range(start, end):
+            if self._instructions[i].opname == "RETURN_VALUE":
+                raise FallbackError("Found RETURN_VALUE in for loop body.")
+
+        self._graph.add_global_guarded_variable(iterator)
+
+        try:
+            if not isinstance(iterator, SequenceIterVariable):
+                raise BreakGraphError()
+
+            backup_iter_idx = iterator.idx
+
+            self._inline_call_for_loop(iterator, instr)
+            self._lasti = self.indexof(instr.jump_to)
+        except BreakGraphError as e:
+            log(3, f"{e}")
+            if backup_iter_idx is not None:
+                iterator.idx = backup_iter_idx
+            self._graph.remove_global_guarded_variable(iterator)
+            self._break_graph_in_for_loop(iterator, instr)
+            return Stop(state="BreakGraph")
+
+    def RETURN_VALUE(self, instr: Instruction):
+        assert (
+            len(self.stack) == 1
+        ), f"Stack must have one element, but got {len(self.stack)} elements."
+        ret_val = self.stack.pop()
+        self._graph.start_compile(ret_val)
+        self._graph.pycode_gen.gen_return()
+        self.new_code = self._graph.pycode_gen.gen_pycode()
+        self.guard_fn = self._graph.guard_fn
+        return Stop(state="Return")
diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_inline_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_inline_executor.py
new file mode 100644
index 00000000000000..c24e94b07ffb26
--- /dev/null
+++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_inline_executor.py
@@ -0,0 +1,330 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import contextlib +import inspect +import re +from typing import TYPE_CHECKING + +from ...profiler import event_register +from ...utils import BreakGraphError, log +from ..instruction_utils import Instruction +from .guard import StringifyExpression, union_free_vars +from .opcode_executor import OpcodeExecutorBase, Stop +from .tracker import ConstTracker, DanglingTracker, DummyTracker, Tracker +from .variables import ( + CellVariable, + FunctionGlobalVariable, + IterVariable, + SequenceIterVariable, + VariableBase, +) + +if TYPE_CHECKING: + from .pycode_generator import PyCodeGen + from .variables import FunctionVariable + + +class FunctionGlobalTracker(Tracker): + """ + A tracker class that represents a function global variable. + + Args: + fn: FunctionVariable object. + name: The name of the global variable. + + """ + + def __init__(self, fn: FunctionVariable, name: str): + super().__init__([fn]) + self.fn = fn + self.name = name + + def gen_instructions(self, codegen: PyCodeGen): + """ + Generate bytecode instructions in order to put the variables at the top of the stack. + + Args: + codegen: The PyCodeGen object used to generate bytecode. + + """ + self.fn.tracker.gen_instructions(codegen) + codegen.gen_load_attr("__globals__") + codegen.gen_load_const(self.name) + codegen.gen_subscribe() + + def trace_value_from_frame(self) -> StringifyExpression: + """ + Trace the value of the function global variable from the frame. + + Returns: + StringifyExpression: The traced value of the function global variable. + + """ + fn_tracer = self.fn.tracker.trace_value_from_frame() + return StringifyExpression( + f"{{}}.__globals__['{self.name}']", + [fn_tracer], + union_free_vars(fn_tracer.free_vars), + ) + + def __repr__(self) -> str: + return f"FunctionGlobalTracker(fn={self.fn}, name={self.name})" + + +class FunctionClosureTracker(Tracker): + """ + A tracker class that represents a function closure variable. + + Args: + fn: The FunctionVariable object. + idx: The index of the closure variable. + + """ + + def __init__(self, fn: FunctionVariable, idx: int): + super().__init__([fn]) + self.fn = fn + self.idx = idx + + def gen_instructions(self, codegen: PyCodeGen): + """ + Generate bytecode instructions to trace the value of the function closure variable. + + Args: + codegen: The PyCodeGen object used to generate bytecode. + + """ + self.fn.tracker.gen_instructions(codegen) + codegen.gen_load_attr("__closure__") + codegen.gen_load_const(self.idx) + codegen.gen_subscribe() + codegen.gen_load_attr("cell_contents") + + def trace_value_from_frame(self): + """ + Trace the value of the function closure variable from the frame. + + Returns: + The traced value of the function closure variable. 
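+        For example, with idx == 0 the traced expression is equivalent to
+        `fn.__closure__[0].cell_contents` evaluated against the frame.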
+ + """ + fn_tracer = self.fn.tracker.trace_value_from_frame() + return StringifyExpression( + f"{{}}.__closure__[{self.idx}].cell_contents", + [fn_tracer], + union_free_vars(fn_tracer.free_vars), + ) + + def __repr__(self) -> str: + return f"FunctionClosureTracker(fn={self.fn}, idx={self.idx})" + + +@contextlib.contextmanager +def signature_clear_guard(fn, name): + if not hasattr(fn, name): + yield + else: + saved_attr = getattr(fn, name) + delattr(fn, name) + yield + setattr(fn, name, saved_attr) + + +class OpcodeInlineExecutor(OpcodeExecutorBase): + """ + A class that represents an executor for inlined opcode operations. + + Args: + fn_variable: The function variable. + + """ + + def __init__( + self, + fn_variable: FunctionVariable, + *args, + **kwargs, + ): + self._fn_var = fn_variable + self.return_value: VariableBase | None = None + self._fn_value = fn_variable.value + super().__init__(fn_variable.get_code(), fn_variable.graph) + self._name = "Inline" + self._prepare_locals(*args, **kwargs) + self._prepare_closure() + + def _handle_comps(self): + is_comp = any( + x in self._fn_value.__name__ + for x in ['<listcomp>', '<dictcomp>', '<genexpr>'] + ) + if not is_comp: + return + pattern = r'implicit\d+' + for name in list(self._locals.keys()): + if re.match(pattern, name): + self._locals[name.replace('implicit', '.')] = self._locals[name] + + def _prepare_locals(self, *args, **kwargs): + """ + Prepare local variables for execution by adding them to the locals dictionary. + + """ + from .variables import VariableBase, VariableFactory + + # temparay clear the fn.__signature__ to avoid signature check error + with signature_clear_guard( + self._fn_value, "__signature__" + ), signature_clear_guard(self._fn_value, "__wrapped__"): + sig = inspect.signature(self._fn_value) + bound_args = sig.bind(*args, **kwargs) + bound_args.apply_defaults() + for name, value in bound_args.arguments.items(): + assert name in sig.parameters + # Convert varargs and kwargs to Variable + if sig.parameters[name].kind == inspect.Parameter.VAR_POSITIONAL: + tracker = DummyTracker(value) + elif sig.parameters[name].kind == inspect.Parameter.VAR_KEYWORD: + tracker = DummyTracker(list(value.values())) + # Convert default args to Variable + elif not isinstance(value, VariableBase): + tracker = ConstTracker(value) + else: + tracker = value.tracker + value = VariableFactory.from_value(value, self._graph, tracker) + self._locals[name] = value + + self._handle_comps() + + log( + 5, f"[INLINE CALL] {self._code.co_name} with locals: ", self._locals + ) + + def _prepare_closure(self): + """ + Prepare closure variables for execution by adding them to the closure list. + + """ + from .variables import VariableFactory + + closure = self._fn_var.get_py_value().__closure__ + for name in self._code.co_cellvars + self._code.co_freevars: + # create a cell for each variable. + self._cells[name] = CellVariable() # put in cells. 
+            if name in self._locals:
+                self._cells[name].set_value(self._locals[name])
+
+        if closure is None:
+            return
+        assert len(closure) == len(self._code.co_freevars)
+        for idx, (name, cell) in enumerate(
+            zip(self._code.co_freevars, closure)
+        ):
+            value = cell.cell_contents
+            value = VariableFactory.from_value(
+                value, self._graph, FunctionClosureTracker(self._fn_var, idx)
+            )
+            # wrap the value in a CellVariable if it is not one already
+            if not isinstance(value, CellVariable):
+                value = CellVariable(value)
+            self._cells[name] = value
+
+    @event_register("OpcodeInlineExecutor: _prepare_virtual_env", event_level=2)
+    def _prepare_virtual_env(self):
+        """
+        Prepare the virtual environment for execution by adding variables from globals, builtins, and constants.
+
+        """
+        from .variables import VariableFactory
+
+        self._globals = FunctionGlobalVariable(
+            self._fn_var,
+            self._fn_value.__globals__,
+            self._graph,
+            DanglingTracker(),
+        )
+
+        self._builtins = self._graph._builtins
+
+        # prepare consts
+        for value in self._code.co_consts:
+            self._co_consts.append(
+                VariableFactory.from_value(
+                    value, self._graph, ConstTracker(value)
+                )
+            )
+
+    def inline_call(self) -> VariableBase:
+        """
+        Execute the inline call of the function.
+        """
+        self.run()
+        assert self.return_value is not None
+        return self.return_value
+
+    def RETURN_VALUE(self, instr: Instruction):
+        assert (
+            len(self.stack) == 1
+        ), f"Stack must have one element, but got {len(self.stack)} elements."
+        self.return_value = self.stack.pop()
+        return Stop(state="Return")
+
+    def _break_graph_in_jump(self, result, instr: Instruction):
+        """
+        Helper method to raise a BreakGraphError when breaking the graph in a jump operation.
+
+        Args:
+            result: The result of the operation.
+            instr (Instruction): The jump instruction.
+        """
+        raise BreakGraphError(
+            "OpcodeInlineExecutor does not support _break_graph_in_jump."
+        )
+
+    def _create_resume_fn(self, index: int, stack_size: int = 0):
+        """
+        Helper method to create a resume function for the executor.
+
+        Args:
+            index (int): The index of the instruction to resume execution from.
+            stack_size (int, optional): The size of the stack. Defaults to 0.
+        """
+        raise BreakGraphError(
+            "OpcodeInlineExecutor does not support _create_resume_fn."
+        )
+
+    def FOR_ITER(self, instr: Instruction):
+        iterator = self.stack.top
+        assert isinstance(iterator, IterVariable)
+
+        self._graph.add_global_guarded_variable(iterator)
+
+        # simply advance the iterator
+        if isinstance(
+            iterator,
+            SequenceIterVariable,
+        ):
+            try:
+                self.stack.push(iterator.next())
+            except StopIteration:
+                self.stack.pop()
+                assert isinstance(instr.jump_to, Instruction)
+                self._lasti = self.indexof(instr.jump_to)
+
+        else:
+            self._graph.remove_global_guarded_variable(iterator)
+            raise BreakGraphError(
+                f"Found {iterator.__class__.__name__} as iterator."
+            )
diff --git a/python/paddle/jit/sot/opcode_translator/executor/pycode_generator.py b/python/paddle/jit/sot/opcode_translator/executor/pycode_generator.py
new file mode 100644
index 00000000000000..3e2032dcc3a800
--- /dev/null
+++ b/python/paddle/jit/sot/opcode_translator/executor/pycode_generator.py
@@ -0,0 +1,1072 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This module is used for abstract code generation:
+# we only need to care about what kind of bytecode our code needs to generate,
+# without worrying about the exact offsets of the bytecode instructions in the
+# code object.
+
+from __future__ import annotations
+
+import random
+import sys
+import types
+from functools import cached_property
+from typing import TYPE_CHECKING
+
+import opcode
+
+import paddle
+
+from ...utils import (
+    FallbackError,
+    InnerError,
+    OrderedSet,
+    ResumeFnNameFactory,
+    is_clean_code,
+    list_contain_by_id,
+    list_find_index_by_id,
+    no_eval_frame,
+)
+from ..instruction_utils import (
+    analysis_inputs,
+    calc_stack_effect,
+    gen_instr,
+    get_instructions,
+    instrs_info,
+    modify_instrs,
+    modify_vars,
+)
+from ..instruction_utils.opcode_info import (
+    PYOPCODE_CACHE_SIZE,
+    UNCONDITIONAL_JUMP,
+    JumpDirection,
+    PopJumpCond,
+)
+from .instr_flag import CALL_FUNCTION_EX_FLAG
+
+CODE_NAME_RNG = random.Random(2023)
+
+if TYPE_CHECKING:
+    from typing import Any
+
+    from ..instruction_utils import Instruction
+
+
+def get_pycode_attributes() -> list[str]:
+    """
+    Returns a list of attribute names for PyCodeObject.
+    NOTE(SigureMo): The order should be consistent with the signature specified in code_doc
+    3.8: https://github.com/python/cpython/blob/3.8/Objects/codeobject.c#L416-L421
+    3.10: https://github.com/python/cpython/blob/3.10/Objects/codeobject.c#L523-L543
+    3.11: https://github.com/python/cpython/blob/3.11/Objects/codeobject.c#L1494-L1516
+
+    Returns:
+        list[str]: The attribute names for PyCodeObject.
+    """
+    pycode_attributes = [
+        "co_argcount",
+        "co_posonlyargcount",
+        "co_kwonlyargcount",
+        "co_nlocals",
+        "co_stacksize",
+        "co_flags",
+        "co_code",
+        "co_consts",
+        "co_names",
+        "co_varnames",
+        "co_filename",
+        "co_name",
+    ]
+    if sys.version_info >= (3, 11):
+        pycode_attributes.append("co_qualname")
+    pycode_attributes.append("co_firstlineno")
+    if sys.version_info >= (3, 10):
+        pycode_attributes.append("co_linetable")
+    else:
+        pycode_attributes.append("co_lnotab")
+    if sys.version_info >= (3, 11):
+        pycode_attributes.append("co_exceptiontable")
+    pycode_attributes += [
+        "co_freevars",
+        "co_cellvars",
+    ]
+    return pycode_attributes
+
+
+PYCODE_ATTRIBUTES = get_pycode_attributes()
+
+
+def gen_code_options(code: types.CodeType) -> dict[str, Any]:
+    """
+    Generates a dictionary of code options for the given code object.
+
+    Args:
+        code (types.CodeType): The code object.
+
+    Returns:
+        dict[str, any]: The code options.
+    """
+    code_options = {}
+    for k in PYCODE_ATTRIBUTES:
+        val = getattr(code, k)
+        if isinstance(val, tuple):
+            val = list(val)
+        code_options[k] = val
+
+    return code_options
+
+
+def gen_new_opcode(
+    instrs: list[Instruction], code_options: dict[str, Any], keys: list[str]
+) -> types.CodeType:
+    """
+    Generates a new code object with the given instructions, code options, and keys.
+
+    Args:
+        instrs (list[Instruction]): The instructions for the new code object.
+        code_options (dict[str, any]): The code options for the new code object.
+        keys (list[str]): The keys to specify the order of code options.
+
+    Returns:
+        types.CodeType: The new code object.
+    """
+    bytecode, linetable = assemble(instrs, code_options["co_firstlineno"])
+    if sys.version_info >= (3, 10):
+        # Python deprecated co_lnotab in 3.10, use co_linetable instead
+        # https://peps.python.org/pep-0626/
+        code_options["co_linetable"] = linetable
+    else:
+        code_options["co_lnotab"] = linetable
+    code_options["co_code"] = bytecode
+    code_options["co_nlocals"] = len(code_options["co_varnames"])
+    code_options["co_stacksize"] = stacksize(instrs)
+    if sys.version_info >= (3, 11):
+        # TODO: generate 3.11 exception table
+        code_options["co_exceptiontable"] = bytes([])
+    for key, val in code_options.items():
+        if isinstance(val, list):
+            code_options[key] = tuple(val)
+    # code_options is a dict, so use keys to make sure the input order is correct
+    return types.CodeType(*[code_options[k] for k in keys])
+
+
+def assemble(
+    instructions: list[Instruction], firstlineno: int
+) -> tuple[bytes, bytes]:
+    """
+    Assembles a list of instructions into bytecode and lnotab.
+
+    Args:
+        instructions (list[Instruction]): The list of instructions to assemble.
+        firstlineno (int): The starting line number.
+
+    Returns:
+        tuple[bytes, bytes]: The assembled bytecode and lnotab.
+    """
+    code = []
+    linetable = []
+
+    calc_linetable, update_cursor = create_linetable_calculator(firstlineno)
+
+    for instr in instructions:
+        # set linetable; Python 3.11 needs to set the linetable for each instruction
+        if instr.starts_line is not None or sys.version_info >= (3, 11):
+            linetable.extend(calc_linetable(instr.starts_line, len(code)))
+            update_cursor(instr.starts_line, len(code))
+
+        # get bytecode
+        arg = instr.arg or 0
+        code.extend((instr.opcode, arg & 0xFF))
+        # fill CACHE
+        for _ in range(get_instruction_size(instr) // 2 - 1):
+            code.extend((0, 0))
+
+    if sys.version_info >= (3, 11):
+        # End hook for Python 3.11
+        linetable.extend(calc_linetable(None, len(code)))
+    elif sys.version_info >= (3, 10):
+        # End hook for Python 3.10
+        linetable.extend(calc_linetable(0, len(code)))
+
+    return bytes(code), bytes(linetable)
+
+
+def to_byte(num):
+    """
+    Converts a negative number to an unsigned byte.
+
+    Args:
+        num (int): The number to convert.
+
+    Returns:
+        int: The converted unsigned byte.
+    """
+    if num < 0:
+        num += 256
+    return num
+
+
+def get_instruction_size(instr: Instruction) -> int:
+    cache_size = 0
+    if sys.version_info >= (3, 11):
+        cache_size = PYOPCODE_CACHE_SIZE.get(instr.opname, 0)
+    return 2 * (cache_size + 1)
+
+
+def create_linetable_calculator(firstlineno: int):
+    """
+    Creates a line table calculator function.
+
+    Args:
+        firstlineno (int): The starting line number.
+
+    Returns:
+        Callable: The line table calculator function.
+    """
+    cur_lineno = firstlineno
+    cur_bytecode = 0
+    line_offset = 0  # For Python 3.10
+
+    def update_cursor(starts_line: int | None, code_length: int):
+        nonlocal cur_lineno, cur_bytecode
+        cur_bytecode = code_length
+        if starts_line is not None:
+            cur_lineno = starts_line
+
+    def calc_lnotab(starts_line: int, code_length: int):
+        """
+        Calculates the lnotab for Python 3.8 and 3.9.
+        https://github.com/python/cpython/blob/3.9/Objects/lnotab_notes.txt
+
+        Args:
+            starts_line (int): The line number where the instruction starts.
+            code_length (int): The length of the code.
+
+        Returns:
+            list[int]: The lnotab.
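+
+        For example, an instruction emitted 6 bytes after the previous entry
+        and 2 source lines below it contributes the byte pair (6, 2).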
+ """ + nonlocal cur_lineno, cur_bytecode + line_offset = starts_line - cur_lineno + byte_offset = code_length - cur_bytecode + result = [] + + while line_offset or byte_offset: + line_offset_step = min(max(line_offset, -128), 127) + byte_offset_step = min(max(byte_offset, 0), 255) + result.extend((byte_offset_step, to_byte(line_offset_step))) + line_offset -= line_offset_step + byte_offset -= byte_offset_step + return result + + def calc_linetable_py310(starts_line: int, code_length: int): + """ + Calculates the linetable for Python 3.10. + https://github.com/python/cpython/blob/3.10/Objects/lnotab_notes.txt + + Args: + starts_line (int): The line number where the instruction starts. + code_length (int): The length of the code. + + Returns: + list[int]: The linetable. + """ + nonlocal cur_lineno, cur_bytecode, line_offset + byte_offset = code_length - cur_bytecode + result = [] + while line_offset or byte_offset: + line_offset_step = min(max(line_offset, -127), 127) + byte_offset_step = min(max(byte_offset, 0), 254) + result.extend((byte_offset_step, to_byte(line_offset_step))) + line_offset -= line_offset_step + byte_offset -= byte_offset_step + line_offset = starts_line - cur_lineno + return result + + def _encode_varint(num: int): + """ + Encode unsigned integer into variable-length format. + """ + continue_flag = 0b01 << 6 + stop_flag = 0b00 << 6 + while num >= 0x40: + yield (num & 0x3F) | continue_flag + num >>= 6 + yield num | stop_flag + + def _encode_svarint(num: int): + """ + Encode signed integer into variable-length format. + """ + unsigned_value = (((-num) << 1) | 1) if num < 0 else (num << 1) + yield from _encode_varint(unsigned_value) + + def _encode_bytecode_to_entries_py311(line_offset: int, byte_offset: int): + if not byte_offset: + return [] + if 0 < byte_offset <= 8: + entry_head = 0b1_1101_000 | (byte_offset - 1) + return [entry_head, *list(_encode_svarint(line_offset))] + return [ + *_encode_bytecode_to_entries_py311(line_offset, 8), + *_encode_bytecode_to_entries_py311(line_offset, byte_offset - 8), + ] + + def calc_linetable_py311(starts_line: int | None, code_length: int): + """ + Calculates the linetable for Python 3.11. + https://github.com/python/cpython/blob/3.11/Objects/locations.md + + Args: + starts_line (int): The line number where the instruction starts. + code_length (int): The length of the code. + + Returns: + list[int]: The linetable. + """ + nonlocal cur_lineno, cur_bytecode + line_offset = starts_line - cur_lineno if starts_line is not None else 0 + byte_offset = (code_length - cur_bytecode) // 2 + return _encode_bytecode_to_entries_py311(line_offset, byte_offset) + + if sys.version_info >= (3, 11): + return calc_linetable_py311, update_cursor + elif sys.version_info >= (3, 10): + return calc_linetable_py310, update_cursor + else: + return calc_lnotab, update_cursor + + +def compile_exception_table(): + """Compile the exception table, it is used for Python 3.11+. + See https://github.com/python/cpython/blob/3.11/Objects/exception_handling_notes.txt + """ + # TODO + ... + + +def stacksize(instructions: list[Instruction]) -> float: + """ + Calculates the maximum stack size before each opcode is called. + + Args: + instructions (list[Instruction]): The list of instructions. + + Returns: + int: The maximum stack size. 
+ """ + max_stack = [float("-inf")] * len(instructions) + + max_stack[0] = 0 + + queue = [] + queue.append(0) + + def update_stacksize(lasti: int, nexti: int, stack_effect: int): + """ + Updates the maximum stack size. + + Args: + lasti (int): The index of the last instruction. + nexti (int): The index of the next instruction. + stack_effect (int): The effect on the stack size. + + Returns: + None + """ + old_max = max_stack[nexti] + max_stack[nexti] = max( + max_stack[nexti], max_stack[lasti] + stack_effect + ) + if old_max != max_stack[nexti]: + if nexti not in queue: # may be slow, we can use a flag. + queue.append(nexti) + + while len(queue) > 0: + idx = queue[0] + del queue[0] + instr = instructions[idx] + opname = instr.opname + if ( + idx + 1 < len(instructions) + and instr.opname not in UNCONDITIONAL_JUMP + ): + stack_effect = calc_stack_effect(instr, jump=False) + update_stacksize(idx, idx + 1, stack_effect) + + if instr.opcode in opcode.hasjabs or instr.opcode in opcode.hasjrel: + stack_effect = calc_stack_effect(instr, jump=True) + target_idx = instructions.index(instr.jump_to) + update_stacksize(idx, target_idx, stack_effect) + + # assert min(min_stack) >= 0 # min_stack may be a negative number when try: except is got. + return max(max_stack) + + +class PyCodeGen: + """Helper to create new code object""" + + def __init__( + self, frame: types.FrameType, disable_eval_frame: bool = False + ): + """ + Initializes a PyCodeGen object. + + Args: + frame: The frame to be translated. + disable_eval_frame (bool): Whether to disable the evaluation frame. Defaults to False. + """ + self._frame = frame + self._origin_code = frame.f_code + self._code_options = gen_code_options(self._origin_code) + self.update_code_name("", is_resumed_fn=False) + self._f_globals = frame.f_globals + self._instructions = [] + self.disable_eval_frame = disable_eval_frame + if self.disable_eval_frame: + self.gen_disable_eval_frame() + + def insert_prefix_instructions(self): + """ + Insert prefix instructions to the instruction list. + In Python 3.11+, we need to insert MAKE_CELL and COPY_FREE_VARS before the + first instruction. 
+ The implementation is based on cpython implementation: + https://github.com/python/cpython/blob/f45ef5edabb1cc0748f3326e7114b8aaa0424392/Python/compile.c#L8177 + """ + prefixes = [] + if sys.version_info >= (3, 11): + if self._code_options["co_cellvars"]: + # Insert MAKE_CELL + name_map = list( + OrderedSet(self._code_options["co_varnames"]) + | OrderedSet(self._code_options["co_cellvars"]) + ) + + for i in self._code_options["co_cellvars"]: + idx: int = name_map.index(i) + prefixes.append(gen_instr("MAKE_CELL", arg=idx, argval=i)) + + if self._code_options["co_freevars"]: + n_freevars = len(self._code_options["co_freevars"]) + # Insert COPY_FREE_VARS + prefixes.append( + gen_instr( + "COPY_FREE_VARS", arg=n_freevars, argval=n_freevars + ) + ) + + # Insert RESUME + prefixes.append(gen_instr("RESUME", arg=0, argval=0)) + self._instructions[:] = prefixes + self._instructions + + def update_code_name(self, fn_name, is_resumed_fn): + if is_resumed_fn: + self._code_options[ + 'co_name' + ] = f"${fn_name}@{self._code_options['co_name'][1:]}" + else: + if self._code_options['co_name'].startswith("$"): + self._code_options[ + 'co_name' + ] = f"#{self._code_options['co_name']}" + elif not self._code_options['co_name'].startswith("#"): + random_number = int(CODE_NAME_RNG.random() * 100000000) + self._code_options[ + 'co_name' + ] = f"#{self._code_options['co_name']}_{hex(random_number & 0xFFFFF)[2:]:0>5}" + + def gen_pycode(self) -> types.CodeType: + """ + Generates a new pycode that is runnable. + + Returns: + CodeType: The generated code object. + """ + self.insert_prefix_instructions() + modify_instrs(self._instructions) + modify_vars(self._instructions, self._code_options) + new_code = gen_new_opcode( + self._instructions, self._code_options, PYCODE_ATTRIBUTES + ) + return new_code + + def gen_resume_fn_at( + self, index: int, stack_size: int = 0 + ) -> tuple[None | types.FunctionType, OrderedSet[str]]: + """ + Generates a resume function at the specified index in the instruction list. + + Args: + index (int): The index in the instruction list to generate the resume function. + stack_size (int): The size of the stack. Defaults to 0. + + Returns: + tuple: The resume function object and the inputs to the function. + + """ + self._instructions = get_instructions(self._origin_code) + # TODO(dev): could give an example code here? 
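+        # A sketch of the transformation (hypothetical example):
+        #     def fn(x):
+        #         y = x + 1     # <- graph break, resume from the next instr
+        #         return y * 2
+        # roughly yields a resume function
+        #     def $resume_0(y):
+        #         return y * 2
+        # prefixed with LOAD_FAST stubs for any stack values and a
+        # JUMP_FORWARD to the resume index.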
+        if self._instructions[index].opname == 'RETURN_VALUE':
+            return None, OrderedSet()
+        inputs = analysis_inputs(self._instructions, index)
+        fn_name = ResumeFnNameFactory().next()
+        stack_arg_str = fn_name + '_stack_{}'
+        self._instructions = (
+            [
+                gen_instr('LOAD_FAST', argval=stack_arg_str.format(i))
+                for i in range(stack_size)
+            ]
+            + [gen_instr('JUMP_FORWARD', jump_to=self._instructions[index])]
+            + self._instructions
+        )
+
+        self._code_options['co_argcount'] = len(inputs) + stack_size
+        # inputs should be at the front of the co_varnames
+        self._code_options['co_varnames'] = list(
+            [stack_arg_str.format(i) for i in range(stack_size)]
+            + list(inputs)
+            + [
+                var_name
+                for var_name in self._origin_code.co_varnames
+                if var_name not in inputs
+            ]
+        )
+
+        self.update_code_name(fn_name, is_resumed_fn=True)
+
+        new_code = self.gen_pycode()
+        if len(new_code.co_freevars) + len(new_code.co_cellvars) > 0:
+            raise FallbackError("Break graph in closure is not supported.")
+        fn = types.FunctionType(new_code, self._f_globals, new_code.co_name)
+
+        return fn, inputs
+
+    @cached_property
+    def global_null_variable(self):
+        from .variables.basic import NullVariable
+
+        return NullVariable()
+
+    def gen_disable_eval_frame(self):
+        """
+        Generates instructions to disable the evaluation frame.
+        """
+        if is_clean_code():
+            return
+        self.gen_load_object(
+            paddle.framework.core.set_eval_frame, "paddle_set_eval_frame_fn"
+        )
+        self.gen_load_const(None)
+        self.gen_call_function(1)
+        self.gen_store_fast("___old_eval_frame")
+
+    def gen_enable_eval_frame(self):
+        """
+        Generates instructions to enable the evaluation frame.
+        """
+        if is_clean_code():
+            return
+        self.gen_load_object(
+            paddle.framework.core.set_eval_frame, "paddle_set_eval_frame_fn"
+        )
+        self.gen_load_fast("___old_eval_frame")
+        self.gen_call_function(1)
+        self.gen_pop_top()
+
+    def gen_outputs_and_return(self, outputs):
+        for name in outputs:
+            self.gen_load(name)
+        self.gen_build_tuple(len(outputs))
+        self.gen_return()
+
+    def create_fn_with_inputs(self, inputs: list) -> types.FunctionType:
+        """
+        Creates a function with specific input and output variables.
+
+        Args:
+            inputs (list): The input variables.
+
+        Returns:
+            function: The created function object.
+        """
+        self._code_options['co_argcount'] = len(inputs)
+        self._code_options['co_varnames'] = list(
+            list(inputs)
+            + [
+                var_name
+                for var_name in self._origin_code.co_varnames
+                if var_name not in inputs
+            ]
+        )
+        fn_name = ResumeFnNameFactory().next()
+        self.update_code_name(fn_name, is_resumed_fn=True)
+        new_code = self.gen_pycode()
+        if len(new_code.co_freevars) + len(new_code.co_cellvars) > 0:
+            raise FallbackError("Break graph in closure is not supported.")
+        fn = types.FunctionType(new_code, self._f_globals, new_code.co_name)
+        return fn
+
+    def gen_load_const(self, value: Any):
+        """
+        Generates instructions to load a constant value.
+        """
+        # Python's `list.index` finds an item equal to the query, i.e.
+        # `query == item` returning True. Since `1 == True`, this can yield an
+        # incorrect index. To avoid this problem, we compare by id.
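+        # For example, [True].index(1) == 0 because `1 == True`, so an int
+        # constant 1 could wrongly reuse the index of a bool constant True.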
+ if not list_contain_by_id(self._code_options["co_consts"], value): + self._code_options["co_consts"].append(value) + idx = list_find_index_by_id(self._code_options["co_consts"], value) + self._add_instr("LOAD_CONST", arg=idx, argval=value) + + def gen_print_log(self, message): + """print a log""" + import paddle + + self.gen_load_object( + paddle.framework.core.set_eval_frame, "dbg_set_eval_frame" + ) + self.gen_load_const(None) + self.gen_call_function(1) + self.gen_store_fast("old_eval_frame") + self.gen_load_global("print", push_null=True) + self.gen_load_const(message) + self.gen_call_function(1) + self.gen_pop_top() + self.gen_load_object( + paddle.framework.core.set_eval_frame, "dbg_set_eval_frame" + ) + self.gen_load_fast("old_eval_frame") + self.gen_call_function(1) + self.gen_pop_top() + + def gen_dbg_function(self, dbg_fun): + """debug bytecode helper function. + Usage like: + def dbg_func(): + import inspect + import dis + print("dbg here.") + print(locals()) + frame = inspect.currentframe().f_back + code = (inspect.currentframe().f_back.f_code) + breakpoint() + print(inspect.currentframe().f_back.f_locals['y']) + + self.pycode_gen.gen_dbg_function(dbg_func) + """ + import paddle + + self.gen_load_object( + paddle.framework.core.set_eval_frame, "dbg_set_eval_frame" + ) + self.gen_load_const(None) + self.gen_call_function(1) + self.gen_store_fast("old_eval_frame") + self.gen_load_object(dbg_fun, "dbg1") + self.gen_call_function(0) + self.gen_pop_top() + self.gen_load_object( + paddle.framework.core.set_eval_frame, "dbg_set_eval_frame" + ) + self.gen_load_fast("old_eval_frame") + self.gen_call_function(1) + self.gen_pop_top() + + @property + def cell_free_storage(self): + return ( + self._code_options["co_cellvars"] + + self._code_options["co_freevars"] + ) + + def gen_load(self, name): + if name in self.cell_free_storage: + self.gen_load_deref(name) + elif name in self._code_options["co_varnames"]: + self.gen_load_fast(name) + elif name in self._code_options["co_names"]: + self.gen_load_global(name, push_null=False) + else: + raise InnerError( + f"Want gen_load, but {name} can not found in code object." + ) + + def gen_store(self, name, code): + """ + Generate the bytecode for storing a variable identified by 'name' + in the corresponding symbol table and generate the appropriate + store code based on the symbol table analysis. + + Args: + name (str): The name of the variable. + """ + if name in (code.co_freevars + code.co_cellvars): + self.gen_store_deref(name) + elif name in code.co_varnames: + self.gen_store_fast(name) + elif name in code.co_names: + self.gen_store_global(name) + else: + raise InnerError( + f"Want gen_store, but {name} can not found in code object." + ) + + def gen_load_global(self, name, push_null=False): + """ + Generate the bytecode for loading a global variable. + + Args: + name (str): The name of the global variable. + """ + if name not in self._code_options["co_names"]: + self._code_options["co_names"].append(name) + idx = self._code_options["co_names"].index(name) + if sys.version_info >= (3, 11): + idx <<= 1 + if push_null: + idx |= 1 + self._add_instr("LOAD_GLOBAL", arg=idx, argval=name) + + def gen_load_object(self, obj, obj_name: str, push_null: bool = True): + """ + Generate the bytecode for loading an object. + + Args: + obj (Any): The object to load. + obj_name (str): The name of the object. 
+ """ + + if obj_name not in self._f_globals: + self._f_globals[obj_name] = obj + self.gen_load_global(obj_name, push_null=push_null) + + def gen_load_null_variable(self): + """ + Generate the bytecode for loading a null variable. + """ + null_var = self.global_null_variable + self.gen_load_object(null_var, "___null_var", push_null=False) + + def gen_load_fast(self, name): + """ + Generate the bytecode for loading a local variable. + + Args: + name (str): The name of the local variable. + """ + if name not in self._code_options["co_varnames"]: + self._code_options["co_varnames"].append(name) + idx = self._code_options["co_varnames"].index(name) + self._add_instr("LOAD_FAST", arg=idx, argval=name) + + def gen_load_deref(self, name): + if name not in self.cell_free_storage: + self._code_options["co_freevars"].append(name) + if sys.version_info >= (3, 11): + # Because the co_varnames maybe changed after other codegen + # operations, we need re-calculate the index in modify_vars + idx = ( + self._code_options["co_varnames"] + + self._code_options["co_freevars"] + ).index(name) + else: + idx = self.cell_free_storage.index(name) + self._add_instr("LOAD_DEREF", arg=idx, argval=name) + + def gen_load_attr(self, name: str): + if name not in self._code_options["co_names"]: + self._code_options["co_names"].append(name) + idx = self._code_options["co_names"].index(name) + self._add_instr("LOAD_ATTR", arg=idx, argval=name) + + def gen_store_attr(self, name: str): + if name not in self._code_options["co_names"]: + self._code_options["co_names"].append(name) + idx = self._code_options["co_names"].index(name) + self._add_instr("STORE_ATTR", arg=idx, argval=name) + + def gen_delete_attr(self, name: str): + if name not in self._code_options["co_names"]: + self._code_options["co_names"].append(name) + idx = self._code_options["co_names"].index(name) + self._add_instr("DELETE_ATTR", arg=idx, argval=name) + + def gen_load_method(self, name: str): + if name not in self._code_options["co_names"]: + self._code_options["co_names"].append(name) + idx = self._code_options["co_names"].index(name) + self._add_instr("LOAD_METHOD", arg=idx, argval=name) + + def gen_delete_global(self, name: str): + if name not in self._code_options["co_names"]: + self._code_options["co_names"].append(name) + idx = self._code_options["co_names"].index(name) + self._add_instr("DELETE_GLOBAL", arg=idx, argval=name) + + def gen_import_name(self, name: str): + if name not in self._code_options["co_names"]: + self._code_options["co_names"].append(name) + idx = self._code_options["co_names"].index(name) + self._add_instr("IMPORT_NAME", arg=idx, argval=name) + + def gen_push_null(self): + if sys.version_info >= (3, 11): + self._add_instr("PUSH_NULL") + else: + # There is no PUSH_NULL bytecode before python3.11, so we push + # a NULL element to the stack through the following bytecode + self.gen_load_const(0) + self.gen_load_const(None) + self.gen_import_name('sys') + self.gen_store_fast('sys') + self.gen_load_fast('sys') + self.gen_load_method('getsizeof') + self.gen_pop_top() + + def gen_store_fast(self, name): + if name not in self._code_options["co_varnames"]: + self._code_options["co_varnames"].append(name) + idx = self._code_options["co_varnames"].index(name) + self._add_instr("STORE_FAST", arg=idx, argval=name) + + def gen_store_global(self, name): + if name not in self._code_options["co_names"]: + self._code_options["co_names"].append(name) + idx = self._code_options["co_names"].index(name) + self._add_instr("STORE_GLOBAL", arg=idx, 
argval=name) + + def gen_store_deref(self, name): + if name not in self.cell_free_storage: + self._code_options["co_freevars"].append(name) + if sys.version_info >= (3, 11): + # Because the co_varnames maybe changed after other codegen + # operations, we need re-calculate the index in modify_vars + idx = ( + self._code_options["co_varnames"] + + self._code_options["co_freevars"] + ).index(name) + else: + idx = self.cell_free_storage.index(name) + self._add_instr("STORE_DEREF", arg=idx, argval=name) + + def gen_store_subscr(self): + self._add_instr("STORE_SUBSCR") + + def gen_subscribe(self): + self._add_instr("BINARY_SUBSCR") + + def gen_build_tuple(self, count): + self._add_instr("BUILD_TUPLE", arg=count, argval=count) + + def gen_build_list(self, count): + self._add_instr("BUILD_LIST", arg=count, argval=count) + + def gen_build_map(self, count): + self._add_instr("BUILD_MAP", arg=count, argval=count) + + def gen_build_slice(self, argc): + self._add_instr("BUILD_SLICE", arg=argc, argval=argc) + + def gen_unpack_sequence(self, count): + self._add_instr("UNPACK_SEQUENCE", arg=count, argval=count) + + def gen_call_function(self, argc=0): + if sys.version_info >= (3, 11): + self._add_instr("PRECALL", arg=argc, argval=argc) + self._add_instr("CALL", arg=argc, argval=argc) + else: + self._add_instr("CALL_FUNCTION", arg=argc, argval=argc) + + def gen_call_function_ex(self, has_kwargs): + flag = 0 + if has_kwargs: + flag |= CALL_FUNCTION_EX_FLAG.CFE_HAS_KWARGS + self._add_instr("CALL_FUNCTION_EX", arg=flag, argval=flag) + + def gen_call_method(self, argc=0): + if sys.version_info >= (3, 11): + self._add_instr("PRECALL", arg=argc, argval=argc) + self._add_instr("CALL", arg=argc, argval=argc) + else: + self._add_instr("CALL_METHOD", arg=argc, argval=argc) + + def gen_kw_names(self, kw_names: tuple[str, ...] | None): + if kw_names is None: + return + if sys.version_info < (3, 11): + raise InnerError("gen_kw_names is not supported before python3.11") + if kw_names not in self._code_options["co_consts"]: + self._code_options["co_consts"].append(kw_names) + idx = self._code_options["co_consts"].index(kw_names) + self._add_instr("KW_NAMES", arg=idx, argval=kw_names) + + def gen_pop_top(self): + self._add_instr("POP_TOP") + + def gen_rot_n(self, n): + if n <= 1: + return + if sys.version_info >= (3, 11): + for i in range(n, 1, -1): + self._add_instr("SWAP", arg=i) + elif sys.version_info >= (3, 10): + self._add_instr("ROT_N", arg=n) + else: + if n <= 4: + self._add_instr("ROT_" + ["TWO", "THREE", "FOUR"][n - 2]) + else: + + def rot_n_fn(n): + vars = [f"var{i}" for i in range(n)] + rotated = reversed(vars[-1:] + vars[:-1]) + fn = eval(f"lambda {','.join(vars)}: ({','.join(rotated)})") + fn = no_eval_frame(fn) + fn.__name__ = f"rot_{n}_fn" + return fn + + self.gen_build_tuple(n) + self.gen_load_const(rot_n_fn(n)) + self.gen_rot_n(2) + self._add_instr("CALL_FUNCTION_EX", arg=0) + self.gen_unpack_sequence(n) + + def gen_shift_n(self, s: int, n: int): + """ + Generate the bytecode for shifting the stack. + + Args: + s (int): Steps to shift. + n (int): The number of elements to shift. 
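+
+        For example, with the top three stack items [..., 3, 4, 5],
+        gen_shift_n(1, 3) leaves [..., 5, 3, 4], i.e. a rotation by one step.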
+ """ + if s == 0 or n <= 1: + return + + # NOTE(zrr1999): right shift s steps is equal to left shift n-s steps + if abs(s) > n // 2: + new_s = s - n if s > 0 else s + n + self.gen_shift_n(new_s, n) + return + if s > 0: + # NOTE: s=1, n=3 [1,2,3,4,5] -> [1,2,5,3,4] + # s=2, n=3 [1,2,3,4,5] -> [1,2,4,5,3] + if s == 1: + self.gen_rot_n(n) + else: + self.gen_rot_n(n) + self.gen_shift_n(s - 1, n) + + else: # s < 0 + if sys.version_info >= (3, 11): + # NOTE: s=-1, n=3 [1,2,3,4,5] -> [1,2,4,5,3] + if s == -1: + for i in range(2, n + 1): + self._add_instr("SWAP", arg=i) + else: + self.gen_shift_n(-1, n) + self.gen_shift_n(s + 1, n) + else: + raise NotImplementedError( + "shift_n is not supported before python3.11" + ) + + def gen_swap(self, n): + if sys.version_info >= (3, 11): + self._add_instr("SWAP", arg=n) + else: + raise NotImplementedError("swap is not supported before python3.11") + + def gen_jump( + self, + jump_to: Instruction | None = None, + *, + direction: JumpDirection = JumpDirection.FORWARD, + ) -> Instruction: + if sys.version_info >= (3, 11): + return self._add_instr(f"JUMP_{direction.value}", jump_to=jump_to) + else: + return self._add_instr("JUMP_ABSOLUTE", jump_to=jump_to) + + def gen_pop_jump( + self, + jump_to: Instruction | None = None, + *, + direction: JumpDirection = JumpDirection.FORWARD, + suffix: PopJumpCond = PopJumpCond.NONE, + ) -> Instruction: + if sys.version_info >= (3, 11): + return self._add_instr( + f"POP_JUMP_{direction.value}_IF_{suffix.value}", jump_to=jump_to + ) + else: + return self._add_instr( + f"POP_JUMP_IF_{suffix.value}", jump_to=jump_to + ) + + def gen_return(self): + self._add_instr("RETURN_VALUE") + + def gen_get_iter(self): + self._add_instr("GET_ITER") + + def add_pure_instructions(self, instructions): + """ + add instructions and do nothing. + """ + self._instructions.extend(instructions) + + def _add_instr(self, *args, **kwargs): + instr = gen_instr(*args, **kwargs) + self._instructions.append(instr) + return instr + + def _insert_instr(self, index, *args, **kwargs): + instr = gen_instr(*args, **kwargs) + self._instructions.insert(index, instr) + + def pprint(self): + print('\n'.join(instrs_info(self._instructions))) + + def extend_instrs(self, instrs): + self._instructions.extend(instrs) + + def pop_instr(self): + self._instructions.pop() + + def replace_null_variable(self): + """ + Replace all NullVariables in the bytecode. + + Returns: + Optional[Tuple[Any, Callable]]: The new code object and its guard function, or None if no dummy variables are found. + """ + from .variables.basic import NullVariable + + instructions = get_instructions(self._origin_code) + has_null_variable = False + for instr in instructions: + if ( + instr.opname == 'LOAD_FAST' + and instr.argval in self._frame.f_locals.keys() + and isinstance(self._frame.f_locals[instr.argval], NullVariable) + ): + has_null_variable = True + self._frame.f_locals[instr.argval].reconstruct(self) + else: + self.add_pure_instructions([instr]) + + if has_null_variable: + new_code = self.gen_pycode() + return new_code + else: + return None diff --git a/python/paddle/jit/sot/opcode_translator/executor/side_effects.py b/python/paddle/jit/sot/opcode_translator/executor/side_effects.py new file mode 100644 index 00000000000000..f9f8fc20141a13 --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/side_effects.py @@ -0,0 +1,234 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, NamedTuple, TypeVar + +from .mutable_data import MutableData +from .variables import VariableBase + +if TYPE_CHECKING: + from .mutable_data import DataGetter + from .pycode_generator import PyCodeGen + + MutableDataT = TypeVar("MutableDataT", bound=MutableData) + + +class SideEffectsState(NamedTuple): + data_id_to_proxy: dict[int, MutableData] + proxy_variables: list[VariableBase] + mutable_variables: list[VariableBase] + proxy_versions: list[int] + mutable_attrs: list[dict[str, Any]] + + +class SideEffects: + def __init__(self): + self.data_id_to_proxy: dict[int, MutableData] = {} + self.proxy_variables: list[VariableBase] = [] + self.mutable_variables: list[VariableBase] = [] + + def record_proxy_variable(self, variable: VariableBase): + if variable not in self.proxy_variables: + self.proxy_variables.append(variable) + + def record_mutable_variable(self, variable: VariableBase): + if variable not in self.mutable_variables: + self.mutable_variables.append(variable) + + def get_proxy( + self, + proxy_type: type[MutableDataT], + data: Any, + getter: DataGetter, + ) -> MutableDataT: + data_id = id(data) + if data_id not in self.data_id_to_proxy: + self.data_id_to_proxy[data_id] = proxy_type(data, getter) + return self.data_id_to_proxy[data_id] # type: ignore + + def get_state(self): + return SideEffectsState( + self.data_id_to_proxy.copy(), + self.proxy_variables.copy(), + self.mutable_variables.copy(), + [proxy.version for proxy in self.data_id_to_proxy.values()], + [ + {attr: getattr(var, attr)} + for var in self.mutable_variables + for attr in var.mutable_attrs + ], + ) + + def restore_state(self, state: SideEffectsState): + self.data_id_to_proxy = state.data_id_to_proxy + self.proxy_variables = state.proxy_variables + self.mutable_variables = state.mutable_variables + # NOTE(SigureMo): We can use the `strict=True` option in zip after + # Python 3.10. + assert len(self.data_id_to_proxy.values()) == len( + state.proxy_versions + ), "proxy_versions length not match" + assert len(self.mutable_variables) == len( + state.mutable_attrs + ), "mutable_attrs length not match" + + for proxy, version in zip( + self.data_id_to_proxy.values(), state.proxy_versions + ): + proxy.rollback(version) + + for (variable, attr), attr_dict in zip( + ( + (var, attr) + for var in self.mutable_variables + for attr in var.mutable_attrs + ), + (attr_dict for attr_dict in state.mutable_attrs), + ): + setattr(variable, attr, attr_dict[attr]) + + +class SideEffectRestorer: + def pre_gen(self, codegen: PyCodeGen): + raise NotImplementedError() + + def post_gen(self, codegen: PyCodeGen): + raise NotImplementedError() + + +class DictSideEffectRestorer(SideEffectRestorer): + """ + old_dict.clear() + old_dict.update(new_dict) + """ + + def __init__(self, var: VariableBase): + super().__init__() + self.var = var + + def pre_gen(self, codegen: PyCodeGen): + # Reference to the original dict. 
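+        # Stack layout built here, bottom to top:
+        #   old_dict.update  |  new_dict  |  old_dict.clear
+        # post_gen then pops in reverse order: clear() runs first, and
+        # update(new_dict) re-fills the (now empty) original dict.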
+        # load old_dict.update and new_dict to stack.
+        self.var.reconstruct(codegen)
+        codegen.gen_load_method("update")
+        # Generate dict by each key-value pair.
+        self.var.reconstruct(codegen, use_tracker=False)
+        # load old_dict.clear to stack.
+        self.var.reconstruct(codegen)
+        codegen.gen_load_method("clear")
+
+    def post_gen(self, codegen: PyCodeGen):
+        # Call methods to apply side effects.
+        codegen.gen_call_method(0)  # call clear
+        codegen.gen_pop_top()
+        codegen.gen_call_method(1)  # call update
+        codegen.gen_pop_top()
+
+
+class ListSideEffectRestorer(SideEffectRestorer):
+    """
+    old_list[:] = new_list
+    """
+
+    def __init__(self, var: VariableBase):
+        super().__init__()
+        self.var = var
+
+    def pre_gen(self, codegen: PyCodeGen):
+        # Reference to the original list.
+        # load new_list to stack.
+        self.var.reconstruct(codegen, use_tracker=False)
+        # load old_list[:] to stack.
+        self.var.reconstruct(codegen)
+        codegen.gen_load_const(None)
+        codegen.gen_load_const(None)
+        codegen.gen_build_slice(2)
+
+    def post_gen(self, codegen: PyCodeGen):
+        # Call STORE_SUBSCR to apply side effects.
+        codegen.gen_store_subscr()
+
+
+class GlobalSetSideEffectRestorer(SideEffectRestorer):
+    """
+    global_var = new_value
+    """
+
+    def __init__(self, name: str, var: VariableBase):
+        super().__init__()
+        self.name = name
+        self.var = var
+
+    def pre_gen(self, codegen: PyCodeGen):
+        self.var.reconstruct(codegen)
+
+    def post_gen(self, codegen: PyCodeGen):
+        codegen.gen_store_global(self.name)
+
+
+class GlobalDelSideEffectRestorer(SideEffectRestorer):
+    """
+    del global_var
+    """
+
+    def __init__(self, name: str):
+        super().__init__()
+        self.name = name
+
+    def pre_gen(self, codegen: PyCodeGen):
+        # do nothing
+        ...
+
+    def post_gen(self, codegen: PyCodeGen):
+        codegen.gen_delete_global(self.name)
+
+
+class ObjSetSideEffectRestorer(SideEffectRestorer):
+    """
+    obj.attr = new_value
+    """
+
+    def __init__(self, obj: VariableBase, name: str, var: VariableBase):
+        super().__init__()
+        self.obj = obj
+        self.name = name
+        self.var = var
+
+    def pre_gen(self, codegen: PyCodeGen):
+        # value
+        self.var.reconstruct(codegen)
+        # obj
+        self.obj.reconstruct(codegen)
+
+    def post_gen(self, codegen: PyCodeGen):
+        codegen.gen_store_attr(self.name)
+
+
+class ObjDelSideEffectRestorer(SideEffectRestorer):
+    """
+    del obj.attr
+    """
+
+    def __init__(self, obj: VariableBase, name: str):
+        super().__init__()
+        self.obj = obj
+        self.name = name
+
+    def pre_gen(self, codegen: PyCodeGen):
+        self.obj.reconstruct(codegen)
+
+    def post_gen(self, codegen: PyCodeGen):
+        codegen.gen_delete_attr(self.name)
diff --git a/python/paddle/jit/sot/opcode_translator/executor/tracker.py b/python/paddle/jit/sot/opcode_translator/executor/tracker.py
new file mode 100644
index 00000000000000..c085e14b5b3824
--- /dev/null
+++ b/python/paddle/jit/sot/opcode_translator/executor/tracker.py
@@ -0,0 +1,387 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
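Editorial aside before the tracker module: a tracker records how a value was obtained from the frame, so the executor can both regenerate the value (gen_instructions) and guard on it (trace_value_from_frame). A minimal sketch of the guard idea follows; FakeFrame, guard_expr and the eval check are illustrative stand-ins, not the real sot guard machinery:

```python
# A guard is essentially a checkable expression string plus its free
# variables. LocalTracker below, for example, emits "frame.f_locals['x']".
class FakeFrame:
    def __init__(self, f_locals):
        self.f_locals = f_locals


frame = FakeFrame({"x": 3})
guard_expr = "frame.f_locals['x'] == 3"

# If the guard still holds on a later call, cached translated code can be
# reused; otherwise the function must be retraced.
print(eval(guard_expr, {"frame": frame}))  # True
```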
+
+from __future__ import annotations
+
+import builtins
+import sys
+from typing import TYPE_CHECKING
+
+from ...utils import InnerError, NameGenerator
+from .guard import StringifyExpression, union_free_vars
+
+if TYPE_CHECKING:
+    from typing import Sequence
+
+    from .pycode_generator import PyCodeGen
+    from .variables import VariableBase
+
+
+class Tracker:
+    """
+    Tracker is a base class responsible for tracking variables or objects in Python code.
+    It is used to identify how a variable is derived from the initial state of the frame.
+
+    Args:
+        inputs: The list of variables to be tracked.
+
+    Note:
+        It serves as an abstract class and should not be instantiated directly.
+    """
+
+    inputs: Sequence[VariableBase]
+    name_generator = NameGenerator("tracker_")
+
+    def __init__(self, inputs: Sequence[VariableBase], changed: bool = False):
+        self.inputs = inputs
+        self.changed = changed
+        self.id = Tracker.name_generator.next()
+
+    def gen_instructions(self, codegen: PyCodeGen) -> None:
+        """
+        Generate instructions based on the tracked variables.
+
+        Args:
+            codegen (PyCodeGen): An instance of PyCodeGen to generate instructions.
+        """
+        raise NotImplementedError()
+
+    # TODO(xiongkun): trace_value_from_frame is not a good name, it should be
+    # more related to guard but not traceable.
+    def trace_value_from_frame(self) -> StringifyExpression:
+        """
+        Trace the value of the tracked variables from the frame. It is used for generating the guard.
+
+        Returns:
+            The value of the tracked variables.
+        """
+        raise NotImplementedError()
+
+    def is_traceable(self) -> bool:
+        """
+        Determine if all the tracked variables can be traced from the frame.
+
+        Returns:
+            bool, True if all tracked variables are traceable, False otherwise.
+        """
+        if self.changed:
+            return False
+        for input in self.inputs:
+            if not input.tracker.is_traceable():
+                return False
+        return True
+
+    def need_guard(self) -> bool:
+        return self.is_traceable()
+
+
+class DummyTracker(Tracker):
+    """
+    DummyTracker is a subclass of Tracker that specifically tracks variables that cannot be reproduced from the frame.
+    It is mostly generated by complex operations (instructions).
+
+    Args:
+        inputs (list[VariableBase]): The input variables associated with the generated variables.
+    """
+
+    def __init__(self, inputs: Sequence[VariableBase]):
+        super().__init__(inputs)
+
+    def gen_instructions(self, codegen: PyCodeGen):
+        raise InnerError("DummyTracker has no instructions")
+
+    def trace_value_from_frame(self):
+        raise InnerError("DummyTracker can't trace value from frame")
+
+    def is_traceable(self):
+        return False
+
+    def __repr__(self) -> str:
+        return f"DummyTracker(num_inputs={len(self.inputs)})"
+
+    def need_guard(self) -> bool:
+        return False
+
+
+class DanglingTracker(Tracker):
+    """
+    DanglingTracker is a subclass of Tracker that specifically tracks variables that are not in the frame.
+    Variables whose tracker is DanglingTracker should not be placed on the stack, except for NullVariable.
+    DanglingTracker is often used in conjunction with BuiltinVariable to reuse the dispatch mechanism.
+ + Examples: + >>> import operator + >>> from sot.opcode_translator.executor.variables import BuiltinVariable, ConstantVariable + >>> a = ConstantVariable.wrap_literal(1, None) + >>> b = ConstantVariable.wrap_literal(2, None) + >>> c = BuiltinVariable(operator.add, None, DanglingTracker())(a, b) + >>> c.value + 3 + """ + + def __init__(self): + super().__init__([]) + + def gen_instructions(self, codegen: PyCodeGen): + raise InnerError("DanglingTracker has no instructions") + + def trace_value_from_frame(self): + raise InnerError("DanglingTracker can't trace value from frame") + + def is_traceable(self): + return False + + def __repr__(self) -> str: + return "DanglingTracker()" + + +class LocalTracker(Tracker): + """ + LocalTracker is a subclass of Tracker that specifically tracks variables from f_locals of frame. + + Args: + name (str): The name of the variable in f_locals to be tracked. + """ + + def __init__(self, name: str): + super().__init__([]) + self.name = name + + def gen_instructions(self, codegen: PyCodeGen) -> None: + codegen.gen_load_fast(self.name) + + def trace_value_from_frame(self) -> StringifyExpression: + return StringifyExpression(f"frame.f_locals['{self.name}']", [], {}) + + def __repr__(self) -> str: + return f"LocalTracker(name={self.name})" + + +class CellTracker(LocalTracker): + def gen_instructions(self, codegen: PyCodeGen): + codegen.gen_load_deref(self.name) + + def trace_value_from_frame(self): + return StringifyExpression(f"frame.f_locals['{self.name}']", [], {}) + + def __repr__(self) -> str: + return f"CellTracker(name={self.name})" + + +class GlobalTracker(Tracker): + """ + GlobalTracker is a subclass of Tracker that specifically tracks variables from f_globals of frame. + + Args: + name (str): The name of the variable in f_globals to be tracked. + """ + + def __init__(self, name: str): + super().__init__([]) + self.name = name + + def gen_instructions(self, codegen: PyCodeGen) -> None: + codegen.gen_load_global(self.name, push_null=False) + + def trace_value_from_frame(self) -> StringifyExpression: + return StringifyExpression(f"frame.f_globals['{self.name}']", [], {}) + + def __repr__(self) -> str: + return f"GlobalTracker(name={self.name})" + + +class BuiltinTracker(Tracker): + """ + BuiltinTracker is a subclass of Tracker that specifically tracks variables from f_builtins of frame. + + Args: + name (str): The name of the variable in f_builtins to be tracked. + """ + + def __init__(self, name: str): + super().__init__([]) + self.name = name + + def gen_instructions(self, codegen: PyCodeGen) -> None: + codegen.gen_load_global(self.name, push_null=False) + + def trace_value_from_frame(self) -> StringifyExpression: + return StringifyExpression( + f"builtins.__dict__['{self.name}']", [], {"builtins": builtins} + ) + + def __repr__(self) -> str: + return f"BuiltinTracker(name={self.name})" + + +class ConstTracker(Tracker): + """ + ConstTracker is a subclass of Tracker that specifically tracks a constant value. + + Args: + value (Any): The value of the constant. 
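+
+    Note:
+        The tracked value is re-emitted with LOAD_CONST, so a constant
+        never needs a guard (need_guard always returns False).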
+ """ + + def __init__(self, value): + super().__init__([]) + self.value = value + + def gen_instructions(self, codegen: PyCodeGen): + codegen.gen_load_const(self.value) + + def trace_value_from_frame(self): + return StringifyExpression(f"{self.value!r}", [], {}) + + def __repr__(self) -> str: + return f"ConstTracker(value={self.value})" + + def need_guard(self) -> bool: + return False + + +class GetAttrTracker(Tracker): + """ + GetAttrTracker is a subclass of Tracker that specifically tracks the attribute access of an variable. + + Args: + obj (VariableBase): The object whose attribute is to be tracked. + attr (str): The attribute to be tracked. + """ + + def __init__(self, obj: VariableBase, attr: str, changed: bool = False): + super().__init__([obj], changed) + self.obj = obj + self.attr = attr + + def gen_instructions(self, codegen: PyCodeGen): + self.obj.tracker.gen_instructions(codegen) + codegen.gen_load_attr(self.attr) + + def trace_value_from_frame(self): + obj_tracer = self.obj.tracker.trace_value_from_frame() + if self.attr.isidentifier(): + expr = f"{{}}.{self.attr}" + else: + expr = f"getattr({{}}, '{self.attr}')" + return StringifyExpression( + expr, + [obj_tracer], + union_free_vars(obj_tracer.free_vars), + ) + + def __repr__(self) -> str: + return f"GetAttrTracker(attr={self.attr})" + + def need_guard(self) -> bool: + return self.is_traceable() and self.obj.tracker.need_guard() + + +class GetItemTracker(Tracker): + """ + GetItemTracker is a subclass of Tracker that specifically tracks item access of a container variable. + + It generates instructions and traces the item value from the frame. + + Args: + container_var (VariableBase): The container object whose item is to be tracked. + key: The key/index of the item to be tracked. + """ + + def __init__(self, container_var: VariableBase, key: object, changed=False): + super().__init__([container_var], changed) + self.container = container_var + self.key = key + + def gen_instructions(self, codegen: PyCodeGen): + self.container.tracker.gen_instructions(codegen) + if isinstance(self.key, slice): + codegen.gen_load_const(self.key.start) + codegen.gen_load_const(self.key.stop) + codegen.gen_load_const(self.key.step) + codegen.gen_build_slice(3) + else: + codegen.gen_load_const(self.key) + codegen.gen_subscribe() + + def trace_value_from_frame(self): + container_tracer = self.container.tracker.trace_value_from_frame() + return StringifyExpression( + f"{{}}[{self.key!r}]", + [container_tracer], + union_free_vars(container_tracer.free_vars), + ) + + def __repr__(self) -> str: + return f"GetItemTracker(key={self.key!r})" + + def need_guard(self) -> bool: + return self.is_traceable() and self.container.tracker.need_guard() + + +class GetIterTracker(Tracker): + """ + GetIterTracker is a subclass of Tracker that specifically tracks iteration of a variable. + + It generates instructions and traces the iterator from the frame. + + Args: + iter_source (VariableBase): The source variable to be iterated. 
+ """ + + def __init__(self, iter_source: VariableBase): + super().__init__([iter_source]) + self.iter_source = iter_source + + def gen_instructions(self, codegen: PyCodeGen): + self.iter_source.tracker.gen_instructions(codegen) + codegen._add_instr("GET_ITER") + + def trace_value_from_frame(self): + iter_source_tracer = self.iter_source.tracker.trace_value_from_frame() + return StringifyExpression( + "iter({})", + [iter_source_tracer], + union_free_vars(iter_source_tracer.free_vars), + ) + + def __repr__(self) -> str: + return "GetIterTracker()" + + +class CreateLayerTracker(Tracker): + def __init__(self, layer_class, args, kwargs): + super().__init__([layer_class] + list(args) + list(kwargs.values())) + self.layer_class = layer_class + self.args = args + self.kwargs = kwargs + + def gen_instructions(self, codegen: PyCodeGen): + if sys.version_info >= (3, 11): + codegen.gen_push_null() + + self.layer_class.reconstruct(codegen) + for variable in self.args: + variable.reconstruct(codegen) + + if len(self.kwargs) == 0: + codegen.gen_call_function(argc=len(self.args)) + else: + codegen.gen_build_tuple(len(self.args)) + for k, v in self.kwargs.items(): + codegen.gen_load_const(k) + v.reconstruct(codegen) + codegen.gen_build_map(len(self.kwargs)) + codegen.gen_call_function_ex(has_kwargs=True) + + def __repr__(self) -> str: + return f"CreateLayerTracker(Layer={self.layer_class}, args={self.args}, kwargs={self.kwargs})" diff --git a/python/paddle/jit/sot/opcode_translator/executor/tracker_viewer.py b/python/paddle/jit/sot/opcode_translator/executor/tracker_viewer.py new file mode 100644 index 00000000000000..f132c34abcac16 --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/tracker_viewer.py @@ -0,0 +1,115 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import queue +from typing import TYPE_CHECKING + +from .tracker import DummyTracker +from .variables import VariableBase + +SIR_GRAPH_CLUSTER_NAME = "cluster_sir_part" + +if TYPE_CHECKING: + import graphviz + + +def try_import_graphviz(): + try: + import graphviz + + return graphviz + except ImportError: + return None + + +def draw_variable(graph: graphviz.Digraph, var: VariableBase): + """ + Draw and colour a node in the graph. + + Args: + graph (graphviz.Digraph): The graph to draw the variable. + var (VariableBase): The variable to draw. 
+ + Returns: + None + """ + # Draw Variable + graph.attr('node', shape='oval', style="filled", fillcolor='aliceblue') + graph.attr('edge', style='solid') + graph.node(var.id, str(var)) + + # Draw Tracker + tracker = var.tracker + graph.attr('node', shape='rect', style='filled', fillcolor='beige') + if isinstance(tracker, DummyTracker): + graph.attr('edge', style='dashed') + graph.attr('node', shape='rect', style='filled', fillcolor='goldenrod') + graph.node(tracker.id, str(tracker)) + + # Draw edge (Tracker -> Variable) + graph.edge(tracker.id, var.id) + + # Draw edge (Tracker inputs -> Tracker) + graph.attr('node', shape='oval', style="filled", fillcolor='cadetblue') + graph.attr('edge', style='solid') + for input in tracker.inputs: + graph.edge(input.id, tracker.id) + + +def view_tracker( + root_variables: list[VariableBase], filename: str, format: str +): + """ + Generates a graph visualization starting from the given root variables and save it to the given file. + + Args: + root_variables (list[VariableBase]): The root variables to start the visualization from. + filename (str): The name of the file used to save the results of the visualisation. + format (str): The format (e.g., `pdf`, `png` and 'svg' etc.) of the file to save the visualization to. + + Returns: + None + """ + # TODO(SigureMo): + # 1. Colorize the trackers + # 2. Highlight the user specific node, to speedup debug process + graphviz = try_import_graphviz() + if graphviz is None: + print("Cannot import graphviz, please install it first.") + return + + graph = graphviz.Digraph("graph", filename=filename, format=format) + visited = set() + var_queue = queue.Queue() + for var in root_variables: + var_queue.put(var) + + while not var_queue.empty(): + var = var_queue.get() + if var.id in visited: + continue + visited.add(var.id) + if isinstance(var.tracker, DummyTracker): + with graph.subgraph(name=SIR_GRAPH_CLUSTER_NAME) as sir_part: + sir_part.attr(color='green') + draw_variable(sir_part, var) + else: + draw_variable(graph, var) + for input in var.tracker.inputs: + if input not in var_queue.queue: + var_queue.put(input) + + graph.render(view=False) diff --git a/python/paddle/jit/sot/opcode_translator/executor/variable_dispatch.py b/python/paddle/jit/sot/opcode_translator/executor/variable_dispatch.py new file mode 100644 index 00000000000000..9eb10fb81bcd53 --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/variable_dispatch.py @@ -0,0 +1,1109 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
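A side note on the viewer added above: view_tracker is a plain breadth-first walk over variables and their tracker inputs. The same traversal on a toy node type, as a runnable sketch (Node and the printed "draw" are illustrative stand-ins for VariableBase and draw_variable):

```python
import queue


class Node:
    """Toy stand-in for VariableBase: an id plus tracker inputs."""

    def __init__(self, id, inputs=()):
        self.id, self.inputs = id, list(inputs)


a, b = Node("a"), Node("b")
root = Node("c", inputs=[a, b])

visited, var_queue = set(), queue.Queue()
var_queue.put(root)
while not var_queue.empty():
    var = var_queue.get()
    if var.id in visited:
        continue
    visited.add(var.id)
    print("draw", var.id)  # view_tracker calls draw_variable(...) here
    for input in var.inputs:
        if input not in var_queue.queue:  # same de-dup check as above
            var_queue.put(input)
```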
+ +from __future__ import annotations + +import math +import operator +from functools import partial, reduce +from typing import TYPE_CHECKING + +import paddle + +from ...utils import BreakGraphError, FallbackError +from ...utils.magic_methods import ( + BINARY_OPS, + UNARY_OPS, + magic_method_builtin_dispatch, +) +from .dispatch_functions import ( + operator_in, + operator_is_none, + operator_is_not_none, + operator_not_in, + raise_break_graph_fn, + tensor_numel, +) +from .dispatcher import Dispatcher, optional +from .tracker import ConstTracker, DanglingTracker, DummyTracker +from .variables import ( + BuiltinVariable, + ConstantVariable, + ContainerVariable, + DictVariable, + EnumerateVariable, + ListVariable, + MapVariable, + NumpyVariable, + RangeVariable, + SliceVariable, + TupleVariable, + VariableBase, + VariableFactory, +) + +if TYPE_CHECKING: + from .variables import DataVariable, TensorVariable + + +def add_guard(var: VariableBase): + var.graph.add_global_guarded_variable(var) + return var + + +def raise_err_handle(error): + def inner(*args, **kwargs): + raise error + + return inner + + +# slice +Dispatcher.register( + slice, + ("VariableBase",), + lambda stop: SliceVariable( + slice(stop), + graph=stop.graph, + tracker=DummyTracker([stop]), + ), +) + +Dispatcher.register( + slice, + ("VariableBase", "VariableBase"), + lambda start, stop: SliceVariable( + slice(start, stop), + graph=stop.graph, + tracker=DummyTracker([start, stop]), + ), +) + +Dispatcher.register( + slice, + ("VariableBase", "VariableBase", "VariableBase"), + lambda start, stop, step: SliceVariable( + slice(start, stop, step), + graph=stop.graph, + tracker=DummyTracker([start, stop, step]), + ), +) + + +# iter +Dispatcher.register( + iter, + ("VariableBase",), + lambda variable: variable.get_iter(), +) + + +# in +Dispatcher.register( + operator_in, + ("VariableBase", "IterVariable"), + raise_err_handle(BreakGraphError("Codes like: `variable in iterator`.")), +) + +Dispatcher.register( + operator_in, + ("TensorVariable", "VariableBase"), + lambda left, right: ConstantVariable( + left.id + in [ + x.id + for x in right.get_py_value(allow_tensor=True) + if hasattr(x, "id") + ], + left.graph, + tracker=DummyTracker([left, right]), + ), +) + +Dispatcher.register( + operator_in, + ("VariableBase", "VariableBase"), + lambda left, right: ConstantVariable( + left.get_py_value(allow_tensor=True) + in right.get_py_value(allow_tensor=True), + left.graph, + tracker=DummyTracker([left, right]), + ), +) + +Dispatcher.register( + operator_not_in, + ("VariableBase", "IterVariable"), + raise_err_handle( + BreakGraphError("Codes like: `variable not in iterator`.") + ), +) + +Dispatcher.register( + operator_not_in, + ("TensorVariable", "VariableBase"), + lambda left, right: ConstantVariable( + left.id + not in [ + x.id + for x in right.get_py_value(allow_tensor=True) + if hasattr(x, "id") + ], + left.graph, + tracker=DummyTracker([left, right]), + ), +) + +Dispatcher.register( + operator_not_in, + ("VariableBase", "VariableBase"), + lambda left, right: ConstantVariable( + left.get_py_value(allow_tensor=True) + not in right.get_py_value(allow_tensor=True), + left.graph, + tracker=DummyTracker([left, right]), + ), +) + + +# dict +Dispatcher.register( + dict, + (), + lambda: DictVariable( + {}, + graph=Dispatcher.graph, + tracker=DummyTracker([]), + ), +) + +Dispatcher.register( + dict, + ("DictVariable",), + lambda var: var.copy(), +) + + +@Dispatcher.register_decorator(dict) +def dispatch_dict(var: ListVariable | TupleVariable): + 
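+    # dict() called on a ListVariable/TupleVariable of (key, value) pairs:
+    # unpack each pair with operator.getitem, guard the key, and rebuild a
+    # DictVariable whose tracker records `var` as its input.
+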
res_dict = {} + length_var = BuiltinVariable(len, var.graph, DanglingTracker())(var) + getitem = BuiltinVariable(operator.getitem, var.graph, DanglingTracker()) + for index in range(length_var.get_py_value()): + index_value = getitem(var, index) + # check + assert isinstance(index_value, (ListVariable, TupleVariable)) + assert len(index_value) == 2 + # recombination + key = getitem(index_value, 0) + value = getitem(index_value, 1) + value.graph.add_global_guarded_variable(key) + res_dict.update({key.get_py_value(): value}) + return DictVariable(res_dict, var.graph, DummyTracker([var])) + + +@Dispatcher.register_decorator(dict.fromkeys) +def dispatch_dict_fromkeys(seq: ListVariable | TupleVariable, default: VariableBase = None): # type: ignore + if default is None: + default = ConstantVariable.wrap_literal(None, seq.graph) + res_dict = {} + getitem = BuiltinVariable(operator.getitem, seq.graph, DanglingTracker()) + for index in range(len(seq)): + index_value = getitem(seq, index) + seq.graph.add_global_guarded_variable(index_value) + res_dict.update({index_value.get_py_value(): default}) + return DictVariable(res_dict, seq.graph, DummyTracker([seq])) + + +Dispatcher.register( + dict.get, + ("DictVariable", "ConstantVariable", optional("VariableBase")), + lambda var, key, default=None: var.get(key.get_py_value(), default), +) +Dispatcher.register( + dict.keys, + ("DictVariable",), + lambda var: var.keys(), +) + +Dispatcher.register( + operator.not_, + ("VariableBase",), + lambda x: ConstantVariable( + not x.get_py_value(allow_tensor=False), x.graph, DummyTracker([x]) + ), +) + +Dispatcher.register( + dict.values, + ("DictVariable",), + lambda var: var.values(), +) +Dispatcher.register( + dict.items, + ("DictVariable",), + lambda var: var.items(), +) +Dispatcher.register( + dict.setdefault, + ("DictVariable", "ConstantVariable", optional("VariableBase")), + lambda var, key, default=None: var.setdefault(key.get_py_value(), default), +) +Dispatcher.register( + dict.update, + ("DictVariable", "DictVariable"), + lambda var, other: var.update(other), +) +Dispatcher.register( + dict.copy, + ("DictVariable",), + lambda var: var.copy(), +) +Dispatcher.register( + dict.clear, + ("DictVariable",), + lambda var: var.clear(), +) +Dispatcher.register( + dict.pop, + ("DictVariable", "ConstantVariable"), + lambda var, key: var.pop(key.get_py_value()), +) +Dispatcher.register( + dict.pop, + ("DictVariable", "ConstantVariable", "VariableBase"), + lambda var, key, default: var.pop(key.get_py_value(), default), +) +Dispatcher.register( + dict.popitem, + ("DictVariable",), + lambda var: var.popitem(), +) + +# tuple +Dispatcher.register( + tuple, + ("ContainerVariable",), + lambda var: TupleVariable( + tuple(var.get_wrapped_items()), + graph=var.graph, + tracker=DummyTracker([var]), + ), +) +Dispatcher.register( + tuple, + ("SequenceIterVariable",), + lambda var: TupleVariable( + tuple(var.to_list()), + graph=var.graph, + tracker=DummyTracker([var]), + ), +) +Dispatcher.register( + tuple.count, + ("TupleVariable", "VariableBase"), + lambda var, value: var.count(value), +) +Dispatcher.register( + tuple.index, + ("TupleVariable", "VariableBase"), + lambda var, value: var.index(value), +) + +# list +Dispatcher.register( + list, + (), + lambda: ListVariable( + [], + graph=Dispatcher.graph, + tracker=DummyTracker([]), + ), +) + +Dispatcher.register( + list, + ("ContainerVariable",), + lambda var: ListVariable( + list(var.get_wrapped_items()), + graph=var.graph, + tracker=DummyTracker([var]), + ), +) + 
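All registrations in this file share one shape: a callable, a tuple of type-name strings for the arguments (including unions such as "ListVariable | TupleVariable" and optional(...)), and a handler. A stripped-down sketch of such a string-keyed dispatch table, as a toy stand-in rather than the real Dispatcher (which also resolves unions, optionals and the class hierarchy):

```python
# Toy dispatch table: handlers keyed by (function, tuple of type names).
table = {}


def register(fn, sig, handler):
    table[(fn, sig)] = handler


def dispatch(fn, *args):
    sig = tuple(type(a).__name__ for a in args)
    handler = table.get((fn, sig))
    if handler is None:
        raise TypeError(f"no handler for {fn.__name__}{sig}")
    return handler(*args)


register(len, ("list",), lambda xs: len(xs))
print(dispatch(len, [1, 2, 3]))  # 3
```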
+Dispatcher.register( + list, + ("IterVariable",), + lambda var: ListVariable( + var.to_list(), + graph=var.graph, + tracker=DummyTracker([var]), + ), +) +Dispatcher.register( + list.extend, + ("ListVariable", "ListVariable | TupleVariable"), + lambda var, other: var.extend(other), +) +Dispatcher.register( + list.append, + ("ListVariable", "VariableBase"), + lambda var, other: var.append(other), +) +Dispatcher.register( + list.insert, + ("ListVariable", "ConstantVariable", "VariableBase"), + lambda var, index, obj: var.insert(index.get_py_value(), obj), +) +Dispatcher.register( + list.remove, + ("ListVariable", "VariableBase"), + lambda var, other: var.remove(other), +) +Dispatcher.register( + list.pop, + ("ListVariable", optional("ConstantVariable")), + lambda var, index=None: var.pop(index), +) +Dispatcher.register( + list.clear, + ("ListVariable",), + lambda var: var.clear(), +) +Dispatcher.register( + list.sort, + ("ListVariable",), + lambda var: var.sort(), +) +Dispatcher.register( + list.reverse, + ("ListVariable",), + lambda var: var.reverse(), +) +Dispatcher.register( + list.copy, + ("ListVariable",), + lambda var: var.copy(), +) +Dispatcher.register( + list.count, + ("ListVariable", "VariableBase"), + lambda var, obj: var.count(obj), +) +Dispatcher.register( + list.index, + ("ListVariable", "VariableBase"), + lambda var, obj: var.index(obj), +) +Dispatcher.register( + operator.add, + ("ListVariable", "ListVariable"), + lambda var, other: var.concat(other), +) +Dispatcher.register( + operator.add, + ("TupleVariable", "TupleVariable"), + lambda var, other: var.concat(other), +) +Dispatcher.register( + operator.mul, + ("ListVariable | TupleVariable", "ConstantVariable"), + lambda var, other: var.repeat(other), +) + +# getattr +Dispatcher.register( + getattr, + ("VariableBase", "ConstantVariable", optional("VariableBase")), + lambda var, name, default=None: var.getattr( + add_guard(name).get_py_value(), default + ), +) + +# hasattr +Dispatcher.register( + hasattr, + ("VariableBase", "ConstantVariable"), + lambda var, name: var.hasattr(add_guard(name).get_py_value()), +) + +Dispatcher.register( + delattr, + ("VariableBase", "VariableBase"), + lambda var, name: var.delattr(add_guard(name).get_py_value()), +) + +Dispatcher.register( + setattr, + ("VariableBase", "VariableBase", "VariableBase"), + lambda var, name, value: var.setattr(add_guard(name).get_py_value(), value), +) + +# len +Dispatcher.register( + len, + ("ContainerVariable | ContainerLayerVariable",), + lambda var: var.len(), +) + +# range +# stop +Dispatcher.register( + range, + ("ConstantVariable",), + lambda stop: RangeVariable( + range(stop.get_py_value()), + graph=stop.graph, + tracker=DummyTracker([stop]), + ), +) + +# start, stop +Dispatcher.register( + range, + ("ConstantVariable", "ConstantVariable"), + lambda start, stop: RangeVariable( + range(start.get_py_value(), stop.get_py_value()), + graph=stop.graph, + tracker=DummyTracker([start, stop]), + ), +) +# start, stop, step +Dispatcher.register( + range, + ("ConstantVariable", "ConstantVariable", "ConstantVariable"), + lambda start, stop, step: RangeVariable( + range(start.get_py_value(), stop.get_py_value(), step.get_py_value()), + graph=stop.graph, + tracker=DummyTracker([start, stop, step]), + ), +) +# TODO(zmh): Modify +# enumerate +Dispatcher.register( + enumerate, + ("VariableBase",), + lambda var: EnumerateVariable.from_iterator( + var, graph=var.graph, tracker=DummyTracker([var]) + ), +) + + +# map +Dispatcher.register( + map, + ( + "CallableVariable", + 
"VariableBase", + ), + lambda fn, var: MapVariable.from_iterator( + fn, var, graph=var.graph, tracker=DummyTracker([var]) + ), +) + + +# reversed +@Dispatcher.register_decorator(reversed) +def dispatch_reversed(var: ContainerVariable): + from .tracker import DanglingTracker + from .variables import BuiltinVariable, SequenceIterVariable + + length_var = BuiltinVariable(len, var.graph, DanglingTracker())(var) + assert isinstance(length_var, ConstantVariable) + getitem = BuiltinVariable(operator.getitem, var.graph, DanglingTracker()) + out = reversed([getitem(var, i) for i in range(length_var.get_py_value())]) + out_var = ListVariable( + list(out), graph=var.graph, tracker=DummyTracker([var]) + ) + return SequenceIterVariable( + out_var, + graph=var.graph, + tracker=DummyTracker([var]), + ) + + +# isinstance +Dispatcher.register( + isinstance, + ("TensorVariable", "VariableBase"), + lambda left, right: ConstantVariable( + isinstance( + paddle.to_tensor(0), + right.get_py_value(allow_tensor=True), + ), + left.graph, + DummyTracker([left, right]), + ), +) + +Dispatcher.register( + isinstance, + ("VariableBase", "VariableBase"), + lambda left, right: ConstantVariable( + isinstance( + left.get_py_value(allow_tensor=True), + right.get_py_value(allow_tensor=True), + ), + left.graph, + DummyTracker([left, right]), + ), +) + +# bool +Dispatcher.register( + bool, + ("ContainerVariable",), + lambda var: var.bool(), +) +Dispatcher.register( + operator.truth, + ("ConstantVariable",), + lambda var: var.bool(), +) + +# str +Dispatcher.register( + str, + ("ConstantVariable",), + lambda var: var.str(), +) + + +@Dispatcher.register_decorator(str.format) +def str_format(var: ConstantVariable, *args: ConstantVariable): + return var.format(*args) + + +Dispatcher.register( + str.lower, + ("ConstantVariable",), + lambda var: var.lower(), +) + + +@Dispatcher.register_decorator(str.startswith) +def str_startswith(var: ConstantVariable, substr: ConstantVariable, beg: ConstantVariable = None, end: ConstantVariable = None): # type: ignore + value = var.get_py_value() + if end is None: + end = ConstantVariable(len(value), var.graph, DanglingTracker()) + if beg is None: + beg = ConstantVariable(0, var.graph, DanglingTracker()) + + res = value.startswith( + substr.get_py_value(), beg.get_py_value(), end.get_py_value() + ) + return ConstantVariable( + res, var.graph, DummyTracker([var, substr, beg, end]) + ) + + +@Dispatcher.register_decorator(str.endswith) +def str_endswith(var: ConstantVariable, substr: ConstantVariable, beg: ConstantVariable = None, end: ConstantVariable = None): # type: ignore + value = var.get_py_value() + if end is None: + end = ConstantVariable(len(value), var.graph, DanglingTracker()) + if beg is None: + beg = ConstantVariable(0, var.graph, DanglingTracker()) + + res = value.endswith( + substr.get_py_value(), beg.get_py_value(), end.get_py_value() + ) + return ConstantVariable( + res, var.graph, DummyTracker([var, substr, beg, end]) + ) + + +# getitem +# TODO: Should pass its Variable into the getitem and perform operations such as getting value in the getitem. 
like this:https://github.com/PaddlePaddle/PaddleSOT/pull/198#discussion_r1241110949 +Dispatcher.register( + operator.getitem, + ( + "TensorVariable", + "Any", + ), + lambda var, key: var.getitem( + VariableFactory.from_value( + key, graph=var.graph, tracker=ConstTracker(key) + ) + ), +) + +Dispatcher.register( + operator.getitem, + ( + "VariableBase", + "int | str", + ), + lambda var, key: var.getitem( + VariableFactory.from_value( + key, graph=var.graph, tracker=ConstTracker(key) + ) + ), +) + +Dispatcher.register( + operator.getitem, + ( + "VariableBase", + "ConstantVariable | SliceVariable", + ), + lambda var, key: var.getitem(key), +) + +# setitem +Dispatcher.register( + operator.setitem, + ( + "VariableBase", + "int | str | ConstantVariable | TensorVariable", + "int | str | ConstantVariable | TensorVariable", + ), + lambda var, key, value: var.setitem(key.get_py_value(), value), +) + +# delitem +Dispatcher.register( + operator.delitem, + ( + "VariableBase", + "int | str | TensorVariable", + ), + lambda var, key: var.delitem(key), +) +Dispatcher.register( + operator.delitem, + ( + "VariableBase", + "ConstantVariable", + ), + lambda var, key: var.delitem(key.get_py_value()), +) + + +# TensorVariable +Dispatcher.register( + paddle.is_tensor, + ("TensorVariable",), + lambda var: var.is_tensor(), +) +Dispatcher.register( + paddle.is_complex, + ("TensorVariable",), + lambda var: var.is_complex(), +) +Dispatcher.register( + paddle.is_integer, + ("TensorVariable",), + lambda var: var.is_integer(), +) +Dispatcher.register( + paddle.is_floating_point, + ("TensorVariable",), + lambda var: var.is_floating_point(), +) +Dispatcher.register( + paddle.rank, + ("TensorVariable",), + lambda var: var.ndim, +) + +Dispatcher.register( + operator.is_, + ("TensorVariable", "TensorVariable"), + lambda var, other: ConstantVariable( + var.get_symbol() == other.get_symbol(), + var.graph, + tracker=DummyTracker([var, other]), + ), +) + +Dispatcher.register( + operator.is_, + ("TensorVariable", "VariableBase"), + lambda var, other: ConstantVariable( + False, + var.graph, + tracker=DummyTracker([var, other]), + ), +) + +Dispatcher.register( + operator.is_, + ("VariableBase", "TensorVariable"), + lambda var, other: ConstantVariable( + False, + var.graph, + tracker=DummyTracker([var, other]), + ), +) + +# VariableBase +Dispatcher.register( + operator.is_, + ("VariableBase", "VariableBase"), + lambda var, other: ConstantVariable( + var.get_py_value() is other.get_py_value(), + var.graph, + tracker=DummyTracker([var, other]), + ), +) + + +@Dispatcher.register_decorator(operator.is_not) +def is_not_func(var: VariableBase, other: VariableBase): + handler = Dispatcher.dispatch(operator.is_, var, other) + if handler is None: + raise FallbackError( + f"Not found implementation operator.is for {var} and {other}." + ) + return handler(var, other).bool_not() + + +# is None +Dispatcher.register( + operator_is_none, + ("VariableBase",), + lambda var: BuiltinVariable(operator.is_, var.graph, DanglingTracker())( + var, ConstantVariable.wrap_literal(None, var.graph) + ), +) + +# is not None +Dispatcher.register( + operator_is_not_none, + ("VariableBase",), + lambda var: BuiltinVariable(operator.is_not, var.graph, DanglingTracker())( + var, ConstantVariable.wrap_literal(None, var.graph) + ), +) + + +# NOTE(SigureMo): Don't directly capture free var inside for-loop, use partial instead. 
+# ```python +# lambdas = [] +# for i in range(10): +# lambdas.append(lambda: i) +# for fn in lambdas: +# print(fn()) # result is 9, 9, 9, 9, 9, 9, 9, 9, 9, 9 +# ``` +# Rewrite by partial: +# ```python +# lambdas = [] +# for i in range(10): +# lambdas.append(partial(lambda i: i, i)) +# for fn in lambdas: +# print(fn()) # result is 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +# ``` + +# Constant +for unary_fn in UNARY_OPS: + for magic_method in magic_method_builtin_dispatch(unary_fn): + Dispatcher.register( + unary_fn, + ("ConstantVariable",), + partial( + lambda fn, var: VariableFactory.from_value( + fn(var.get_py_value()), + var.graph, + tracker=DummyTracker([var]), + ), + unary_fn, + ), + ) +for binary_fn in BINARY_OPS: + for magic_method in magic_method_builtin_dispatch(binary_fn): + Dispatcher.register( + binary_fn, + ("ConstantVariable", "ConstantVariable"), + partial( + lambda fn, var, other: VariableFactory.from_value( + fn(var.get_py_value(), other.get_py_value()), + var.graph, + tracker=DummyTracker([var, other]), + ), + binary_fn, + ), + ) +# Tensor +fallback_tensor_unary_method = { + int, + bool, + operator.truth, +} + +Dispatcher.register(tensor_numel, ("TensorVariable",), lambda x: x.numel()) + +for unary_fn in UNARY_OPS: + if unary_fn in fallback_tensor_unary_method: + Dispatcher.register( + unary_fn, + ("TensorVariable",), + raise_break_graph_fn, + ) + continue + + if unary_fn is len: + Dispatcher.register( + unary_fn, + ("TensorVariable",), + lambda x: x.len(), + ) + continue + + for magic_method in magic_method_builtin_dispatch(unary_fn): + Dispatcher.register( + unary_fn, + ("TensorVariable",), + partial( + lambda magic_name, var: var.graph.call_tensor_method( + magic_name, var + ), + magic_method.name, + ), + ) +for binary_fn in BINARY_OPS: + for magic_method in magic_method_builtin_dispatch(binary_fn): + # skip all inplace magic method name, we will dispatch it to non-inplace + # magic methods + if magic_method.is_inplace: + continue + + if not magic_method.is_reverse: + Dispatcher.register( + binary_fn, + ( + "TensorVariable", + "TensorVariable | ConstantVariable | NumpyVariable", + ), + partial( + lambda magic_name, var, other: var.graph.call_tensor_method( + magic_name, var, other + ), + magic_method.name, + ), + ) + else: + # skip __mod__ for str and TensorVariable + if magic_method.name == "__rmod__": + + @Dispatcher.register_decorator(operator.mod) + def tensor_mod_dispatcher( + var: ConstantVariable, other: TensorVariable + ): + if var.get_py_type() is str: + raise BreakGraphError( + "(ConstantVariable % TensorVariable) raise a callback. " + ) + raise FallbackError("Tensor doesn't support __rmod__") + + else: + Dispatcher.register( + binary_fn, + ( + "ConstantVariable | NumpyVariable", + "TensorVariable", + ), + partial( + lambda reverse_magic_name, var, other: other.graph.call_tensor_method( + reverse_magic_name, other, var + ), + magic_method.name, + ), + ) + +# Register dispatch for NumpyVariable: fallback ! 
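On the "fallback" remark above: the loops that follow install handlers whose only job is to raise FallbackError, so tracing aborts and the numpy operation runs eagerly in dygraph. A toy illustration of that registration shape (register_raiser and the handlers dict are hypothetical, not the real Dispatcher API):

```python
import operator


class FallbackError(Exception):
    """Stand-in for sot's FallbackError."""


handlers = {}


def register_raiser(fn):
    def raiser(*args):
        raise FallbackError("Numpy operator need fallback to dygraph")

    handlers[fn] = raiser


for unary_fn in (operator.neg, operator.pos, abs):
    register_raiser(unary_fn)

try:
    handlers[abs]("a-fake-numpy-variable")
except FallbackError as exc:
    print("fallback:", exc)
```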
+for unary_fn in UNARY_OPS:
+    if unary_fn in [bool]:
+        continue
+    for magic_method in magic_method_builtin_dispatch(unary_fn):
+
+        @Dispatcher.register_decorator(unary_fn)
+        def numpy_unary_dispatcher(var: NumpyVariable):
+            raise FallbackError('Numpy operator need fallback to dygraph')
+
+
+Dispatcher.register(
+    operator.eq,
+    ("NumpyVariable", "ConstantVariable | NumpyVariable"),
+    lambda left, right: constant_numpy_equal(right, left),
+)
+
+
+for binary_fn in BINARY_OPS:
+    for magic_method in magic_method_builtin_dispatch(binary_fn):
+
+        @Dispatcher.register_decorator(binary_fn)
+        def numpy_binary_dispatcher(var: NumpyVariable, other: NumpyVariable):
+            raise FallbackError('Numpy operator need fallback to dygraph')
+
+
+# Register dispatch for DataVariable: directly call and return a wrapped variable.
+def data_variable_binary_dispatcher(var, other, operator):
+    return VariableFactory.from_value(
+        operator(var.get_py_value(), other.get_py_value()),
+        var.graph,
+        DummyTracker([var, other]),
+    )
+
+
+for binary_fn in BINARY_OPS:
+    for magic_method in magic_method_builtin_dispatch(binary_fn):
+        Dispatcher.register(
+            binary_fn,
+            ("DataVariable", "Any"),
+            partial(data_variable_binary_dispatcher, operator=binary_fn),
+        )
+        Dispatcher.register(
+            binary_fn,
+            ("Any", "DataVariable"),
+            partial(data_variable_binary_dispatcher, operator=binary_fn),
+        )
+
+for unary_fn in UNARY_OPS:
+    for magic_method in magic_method_builtin_dispatch(unary_fn):
+
+        def data_variable_unary_dispatcher(var: DataVariable, fn):
+            return VariableFactory.from_value(
+                fn(var.get_py_value()),
+                var.graph,
+                DummyTracker([var]),
+            )
+
+        Dispatcher.register(
+            unary_fn,
+            ("DataVariable",),
+            partial(data_variable_unary_dispatcher, fn=unary_fn),
+        )
+
+
+Dispatcher.register(
+    math.ceil,
+    ("ConstantVariable",),
+    lambda var: ConstantVariable(
+        math.ceil(var.get_py_value()),
+        var.graph,
+        tracker=DummyTracker([var]),
+    ),
+)
+
+Dispatcher.register(
+    math.floor,
+    ("ConstantVariable",),
+    lambda var: ConstantVariable(
+        math.floor(var.get_py_value()),
+        var.graph,
+        tracker=DummyTracker([var]),
+    ),
+)
+
+Dispatcher.register(
+    ord,
+    ("ConstantVariable",),
+    lambda var: var.ord(),
+)
+
+Dispatcher.register(
+    chr,
+    ("ConstantVariable",),
+    lambda var: var.chr(),
+)
+
+
+# pow
+# base ** exp % mod
+@Dispatcher.register_decorator(pow)
+def dispatch_pow(base: VariableBase, exp: VariableBase, mod: VariableBase = None):  # type: ignore
+    graph = base.graph
+    result = BuiltinVariable(operator.pow, graph, DanglingTracker())(base, exp)
+    # NOTE: the modulo step must be gated on `mod`, not `exp` (which is
+    # always provided); otherwise pow(base, exp) would compute `result % None`.
+    if mod is not None:
+        result = BuiltinVariable(operator.mod, graph, DanglingTracker())(
+            result, mod
+        )
+    return result
+
+
+Dispatcher.register(
+    math.pow,
+    ("ConstantVariable", "ConstantVariable"),
+    lambda var1, var2: ConstantVariable(
+        math.pow(var1.get_py_value(), var2.get_py_value()),
+        var1.graph,
+        tracker=DummyTracker([var1, var2]),
+    ),
+)
+
+
+@Dispatcher.register_decorator(sum)
+def dispatch_sum(var: ContainerVariable | TensorVariable, start: VariableBase = None):  # type: ignore
+    if start is None:
+        start = ConstantVariable.wrap_literal(0, var.graph)
+    elements = [
+        var.getitem(ConstantVariable.wrap_literal(i, var.graph))
+        for i in range(len(var))
+    ]
+    result = reduce(
+        BuiltinVariable(operator.add, var.graph, DanglingTracker()),
+        elements,
+        start,
+    )
+    return result
+
+
+Dispatcher.register(
+    max,
+    ("ListVariable",),
+    lambda var: var.max(),
+)
+
+Dispatcher.register(
+    min,
+    ("ListVariable",),
+    lambda var: var.min(),
+)
+
+Dispatcher.register(
+    math.sqrt,
("ConstantVariable",), + lambda var: ConstantVariable( + math.sqrt(var.get_py_value()), + var.graph, + tracker=DummyTracker([var]), + ), +) + + +def constant_numpy_equal(left, right): + numpy_ans = left.get_py_value() == right.get_py_value() + return NumpyVariable( + numpy_ans, + left.graph, + tracker=DummyTracker([left, right]), + ) + + +Dispatcher.register( + operator.eq, + ("ConstantVariable", "NumpyVariable"), + lambda left, right: constant_numpy_equal(left, right), +) + +Dispatcher.register( + bool, + ("NumpyVariable",), + lambda x: ConstantVariable( + bool(x.get_py_value()), + x.graph, + tracker=DummyTracker([x]), + ), +) diff --git a/python/paddle/jit/sot/opcode_translator/executor/variable_stack.py b/python/paddle/jit/sot/opcode_translator/executor/variable_stack.py new file mode 100644 index 00000000000000..e7389de5b88050 --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/variable_stack.py @@ -0,0 +1,216 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Callable, Generic, TypeVar, overload + +if TYPE_CHECKING: + ValidateValueFunc = Callable[[Any], None] + + +StackDataT = TypeVar("StackDataT") + + +class VariableStack(Generic[StackDataT]): + """ + A stack class for storing variables. + + Examples: + >>> var1, var2, var3, var4 = range(1, 5) + >>> stack = VariableStack() + >>> stack.push(var1) + >>> stack.push(var3) + >>> stack.insert(1, var2) + >>> stack + [1, 2, 3] + >>> stack.pop() + 3 + >>> stack.pop_n(2) + [1, 2] + >>> stack.push(var1) + >>> stack.push(var2) + >>> stack.push(var3) + >>> stack + [1, 2, 3] + >>> stack.top + 3 + >>> stack.peek[1] + 3 + >>> stack.peek[:1] + [3] + >>> stack.peek[:2] + [2, 3] + >>> stack.peek[1] = var4 + >>> stack + [1, 2, 4] + + """ + + class VariablePeeker: + @overload + def __getitem__(self, index: int) -> StackDataT: + ... + + @overload + def __getitem__(self, index: slice) -> list[StackDataT]: + ... + + @overload + def __call__(self, index: int = 1) -> StackDataT: + ... + + @overload + def __call__(self, index: slice) -> list[StackDataT]: + ... 
+ + def __init__( + self, data: list[StackDataT], validate_value_func: ValidateValueFunc + ): + self._data = data + self.validate_value_func = validate_value_func + + def __getitem__( + self, index: int | slice + ) -> StackDataT | list[StackDataT]: + if isinstance(index, int): + assert 0 < index <= len(self._data) + return self._data[-index] + if isinstance(index, slice): + assert ( + index.start is None and index.step is None + ), "slice which has start or step not supported" + assert 0 < index.stop <= len(self._data) + return self._data[-index.stop :] + raise NotImplementedError(f"index type {type(index)} not supported") + + def __setitem__(self, index: int, value: Any): + assert isinstance( + index, int + ), f"index type {type(index)} not supported" + assert ( + 0 < index <= len(self._data) + ), f"index should be in [1, {len(self._data)}], but get {index}" + self.validate_value_func(value) + self._data[-index] = value + + def __call__( + self, index: int | slice = 1 + ) -> StackDataT | list[StackDataT]: + return self[index] + + def __init__( + self, + data: list[StackDataT] | None = None, + *, + validate_value_func: ValidateValueFunc | None = None, + ): + if data is None: + data = [] + else: + data = data.copy() + self.validate_value_func = ( + (lambda _: None) + if validate_value_func is None + else validate_value_func + ) + self._data = data + self._peeker = VariableStack.VariablePeeker( + self._data, self.validate_value_func + ) + + def copy(self): + return VariableStack( + self._data, validate_value_func=self.validate_value_func + ) + + def push(self, val: StackDataT): + """ + Pushes a variable onto the stack. + + Args: + val: The variable to be pushed. + + """ + self.validate_value_func(val) + self._data.append(val) + + def insert(self, index: int, val: StackDataT): + """ + Inserts a variable onto the stack. + + Args: + index: The index at which the variable is to be inserted, the top of the stack is at index 0. + val: The variable to be inserted. + + """ + assert ( + 0 <= index <= len(self) + ), f"index should be in [0, {len(self)}], but get {index}" + self.validate_value_func(val) + self._data.insert(len(self) - index, val) + + def pop(self) -> StackDataT: + """ + Pops the top value from the stack. + + Returns: + The popped value. + + """ + assert len(self) > 0, "stack is empty" + return self._data.pop() + + def pop_n(self, n: int) -> list[StackDataT]: + """ + Pops the top n values from the stack. + + Args: + n: The number of values to pop. + + Returns: + A list of the popped values. + + """ + assert ( + len(self) >= n >= 0 + ), f"n should be in [0, {len(self)}], but get {n}" + if n == 0: + return [] + retval = self._data[-n:] + self._data[-n:] = [] + return retval + + @property + def peek(self) -> VariablePeeker: + return self._peeker + + @property + def top(self) -> StackDataT: + assert len(self) > 0, "stack is empty" + return self.peek[1] + + @top.setter + def top(self, value): + assert len(self) > 0, "stack is empty" + self.peek[1] = value + + def __iter__(self): + return iter(self._data) + + def __len__(self) -> int: + return len(self._data) + + def __repr__(self) -> str: + return str(self._data) diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/__init__.py b/python/paddle/jit/sot/opcode_translator/executor/variables/__init__.py new file mode 100644 index 00000000000000..9611734ffffcdd --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/variables/__init__.py @@ -0,0 +1,63 @@ +# Copyright (c) 2023 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import ( # noqa: F401 + ConstTypes, + VariableBase, + VariableFactory, + find_traceable_vars, + map_variables, +) +from .basic import ( # noqa: F401 + CellVariable, + ConstantVariable, + DataVariable, + DygraphTracerVariable, + FunctionGlobalVariable, + GlobalVariable, + ModuleVariable, + NullVariable, + NumpyVariable, + ObjectVariable, + SliceVariable, + TensorVariable, +) +from .callable import ( # noqa: F401 + BuiltinVariable, + CallableVariable, + ClassVariable, + ContainerLayerVariable, + FunctionVariable, + LayerVariable, + MethodVariable, + PaddleApiVariable, + PaddleLayerVariable, + UserDefinedFunctionVariable, + UserDefinedGeneratorVariable, + UserDefinedLayerVariable, +) +from .container import ( # noqa: F401 + ContainerVariable, + DictVariable, + ListVariable, + RangeVariable, + TupleVariable, +) +from .iter import ( # noqa: F401 + EnumerateVariable, + IterVariable, + MapVariable, + SequenceIterVariable, + UserDefinedIterVariable, +) diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/base.py b/python/paddle/jit/sot/opcode_translator/executor/variables/base.py new file mode 100644 index 00000000000000..17cb99aeef516a --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/variables/base.py @@ -0,0 +1,618 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import inspect +import operator +from functools import cached_property +from queue import Queue +from typing import TYPE_CHECKING, Any, Callable, Optional + +import paddle + +from ....profiler import event_register +from ....utils import NameGenerator, get_unbound_method, log +from ....utils.exceptions import FallbackError, HasNoAttributeError +from ..dispatcher import Dispatcher +from ..guard import StringifyExpression, check_guard, union_free_vars +from ..mutable_data import MutableDictLikeData +from ..pycode_generator import PyCodeGen +from ..tracker import ( + DummyTracker, + GetAttrTracker, + GetItemTracker, + GetIterTracker, + Tracker, +) + +if TYPE_CHECKING: + from ..function_graph import FunctionGraph + + # Each variable object should implement a method called `from_value`, + # which should adhere to the FromValueFunc signature. 
+ FromValueFunc = Callable[ + [Any, FunctionGraph, Tracker], Optional["VariableBase"] + ] + + +ConstTypes = (int, float, str, bool, type(None)) + + +@event_register("find_traceable_vars") +def find_traceable_vars( + root_vars: list[VariableBase], +) -> list[VariableBase]: + """ + This function is used to find all traceable variables in the given list of variables. + + Args: + root_vars (list[VariableBase]): A list of root variables from which the ordering starts. + + Returns: + list[VariableBase]: A list of variables that are traceable. + """ + results: list[VariableBase] = [] + visited: set[VariableBase] = set() + queue: Queue[VariableBase] = Queue() + + for root in root_vars: + queue.put(root) + + while not queue.empty(): + var = queue.get() + if var in visited: + continue + + visited.add(var) + if var.tracker.need_guard(): + results.append(var) + continue + + # Pruning traceable variable, if the variable is traceable, we don't need to + # trace its inputs. + inputs = var.get_inputs() + + for var in inputs: + if var not in visited and var not in queue.queue: + queue.put(var) + + return results + + +def map_variables(map_func, variables: list[VariableBase]): + """ + This function maps the given map_func to the given list of variables in a recursive manner. + Args: + map_func (Callable[[VariableBase], Any]): The function to be mapped to each variable. + variables (list[VariableBase]): A list of variables to which the map_func is to be applied. + + Returns: + tuple: The result of applying the map_func to the variables. + """ + + def _map_variable(variable: VariableBase | object): + from .basic import SliceVariable + from .container import ContainerVariable + + if isinstance(variable, ContainerVariable): + return paddle.utils.map_structure( + _map_variable, variable.get_wrapped_items() + ) + + if isinstance(variable, SliceVariable): + return slice( + map_func(variable.getattr("start")), + map_func(variable.getattr("stop")), + map_func(variable.getattr("step")), + ) + + return map_func(variable) + + return paddle.utils.map_structure(_map_variable, variables) + + +class VariableFactory: + """ + A factory class for creating variables from arbitrary values. + + This class provides a set of registration and factory methods for creating variables + of different types based on the type of the input value. + + """ + + registered_funcs: dict[str, list[str]] = {"default": []} + mapping_str_func: dict[str, FromValueFunc] = {} + + @staticmethod + def default_from_value(value, graph, tracker): + """ + A default factory function that creates an ObjectVariable from the given value. + + Args: + value: The input value. + graph: The FunctionGraph object that this variable is associated with. + tracker: The Tracker object that tracks the information of this variable. + + Returns: + ObjectVariable: A new ObjectVariable representing the input value. + """ + from .basic import ObjectVariable + + return ObjectVariable(value, graph, tracker) + + @staticmethod + def register_from_value(*, successor: str | None = None): + """ + A decorator function that registers a function for creating a Variable from a value. + + Args: + successor (str | None, optional): The name of the successor function that will be called after this function when creating a Variable. If None, the function is added to a default list of functions. + + Returns: + The _register_from_value decorator function, which takes the function to be registered as an argument. 
+ """ + registered_funcs = VariableFactory.registered_funcs + mapping_str_func = VariableFactory.mapping_str_func + + def _register_from_value(func: FromValueFunc): + """ + Function to register a function for creating a Variable from a value + """ + # Get the name of the function + name = func.__qualname__.split(".")[0] + # Map the name of the function to the function + mapping_str_func[name] = func + if successor is None: + registered_funcs["default"].append( + name + ) # If successor is None, add the function to the "default" list + elif successor not in registered_funcs.keys(): + registered_funcs[successor] = [ + name + ] # If the successor is not in the registered_funcs dictionary, set the value to a list containing only name + else: + registered_funcs[successor].append( + name + ) # If the successor is in the registered_funcs dictionary, append name to the existing list of functions for that successor + + log( + 4, VariableFactory.registered_funcs + ) # Print the registered_funcs dictionary if the logging level is at least 4 + return _register_from_value + + @staticmethod + def from_value( + value: Any, + graph: FunctionGraph, + tracker: Tracker, + *, + debug_name: str | None = None, + ) -> VariableBase: + """ + Create a new variable object from the given value. + + This method searches through the registered from_value functions to find one + that can create a variable object from the given value. If no matching function + is found, the default_from_value function is used. + + Args: + value (Any): The input value. + graph (FunctionGraph): The FunctionGraph object that this variable is associated with. + tracker (Tracker): The Tracker object that tracks the information of this variable. + debug_name (str | None): An optional debug name for the variable. + + Returns: + VariableBase: A new variable object representing the input value. + """ + registered_funcs = VariableFactory.registered_funcs + + def _find_var(key: str = "default") -> VariableBase | None: + for name in registered_funcs[key]: + if name in registered_funcs.keys(): + # If the function name is a key in the registered_funcs dictionary, recursively find a Variable using that function + var = _find_var(name) + if var is not None: + return var + # Get the function corresponding to the name from the mapping_str_func dictionary + func = VariableFactory.mapping_str_func[name] + var = func( + value, graph, tracker + ) # Call the function to create a Variable from the value + if var is not None: + return var + + var = _find_var() + if var is None: + var = VariableFactory.default_from_value( + value, graph, tracker + ) # If a Variable could not be found using the registered functions, use the default function to create a new Variable + var.debug_name = debug_name + return var + + +class VariableBase: + """ + VariableBase is a basic concept and each symbols in VM stack is regarded as + an Variable Object in symblic tracing process. + + There are two key data structures during Python runtime: + PyFrameObject, which provides the instance for function logical lock usage, + and PyCodeObject, which provides the bytecode for the corresponding function. + With these data, the Python virtual machine executes the bytecode sequentially on a stack to complete function logic. + + Args: + tracker(Tracker): The Tracker object that tracks the information of this variable. + + Note: + We should push an object of a subclass of VariableBase instead of an object of VariableBase onto the VM stack. 
+        It serves as an abstract class and should not be instantiated directly.
+    """
+
+    tracker: Tracker  # An attribute to store the Tracker object associated with the variable
+    value: Any
+    name_generator = NameGenerator(
+        "object_"
+    )  # A class-level attribute to generate names for new variables
+    mutable_attrs = []
+
+    def __init__(self, graph: FunctionGraph, tracker: Tracker):
+        self.graph = graph
+        self.tracker = tracker
+        self.id = VariableBase.name_generator.next()
+        self._debug_name: str | None = None
+
+    @property
+    def main_info(self) -> dict[str, Any]:
+        """
+        Property method to return a dictionary of main information about the variable
+
+        Returns:
+            main_info: Main information of the variable.
+        """
+        return {}
+
+    @property
+    def debug_info(self) -> dict[str, Any]:
+        """
+        Property method to return a dictionary of debug information about the variable
+        """
+        return {
+            "debug_name": self.debug_name,
+            "id": self.id,
+        }
+
+    @property
+    def debug_name(self) -> str:
+        """
+        Generate a debug_name for each variable.
+
+        Returns:
+            _debug_name: the name of the variable.
+        """
+        if self._debug_name is not None:
+            # Return the cached self._debug_name if it is not None.
+            return self._debug_name
+        inputs = self.tracker.inputs
+        if isinstance(self.tracker, GetItemTracker):
+            self._debug_name = (
+                f"{self.tracker.container.debug_name}[{self.tracker.key}]"
+            )
+        elif isinstance(self.tracker, GetAttrTracker):
+            self._debug_name = (
+                f"{self.tracker.obj.debug_name}.{self.tracker.attr}"
+            )
+        elif len(inputs) == 0:
+            self._debug_name = "tmp_var"
+        else:  # len(inputs) > 0
+            for input in inputs:
+                assert input is not None
+            self._debug_name = "tmp_var_" + "_".join(
+                input.debug_name for input in inputs
+            )
+        return self._debug_name
+
+    @debug_name.setter
+    def debug_name(self, name):
+        self._debug_name = name
+
+    def __hash__(self):
+        return hash(self.id)
+
+    @check_guard
+    def make_stringify_guard(self) -> list[StringifyExpression]:
+        """
+        Create a StringifyExpression object that represents a guard expression for this variable.
+
+        Returns:
+            StringifyExpression: An object that contains the guard expression and the free variables used in the expression.
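+
+        Example:
+            An illustrative rendering (not the literal generated source): for
+            a variable wrapping the constant ``1``, the two guards below are
+            conceptually ``id(type(x)) == id(int)`` and ``x == 1``, where
+            ``x`` is the expression traced back from the frame.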
+ """ + + # Get a ValueTracer object from the Tracker object associated with the variable + frame_value_tracer = self.tracker.trace_value_from_frame() + + return [ + StringifyExpression( + f"id(type({{}})) == {id(self.get_py_type())}", + [frame_value_tracer], + union_free_vars(frame_value_tracer.free_vars), + ), + StringifyExpression( + f"{{}} == {self.get_py_value()!r}", + [frame_value_tracer], + union_free_vars(frame_value_tracer.free_vars), + ), + ] + + def get_py_value(self, allow_tensor=False) -> Any: + """ + Abstract method to get the value of the variable + """ + raise NotImplementedError() + + def get_py_type(self): + """ + Method to get the type of the variable's value + """ + return type(self.get_py_value()) + + def is_none(self) -> bool: + """ + Method to check if the variable's value is None + """ + return self.get_py_value() is None + + def reconstruct( + self, + codegen: PyCodeGen, + *, + use_tracker: bool = True, + add_to_global_guarded_vars: bool = True, + ): + if self.tracker.is_traceable() and use_tracker: + self.tracker.gen_instructions(codegen) + else: + if add_to_global_guarded_vars: + self.graph.add_global_guarded_variable(self) + self._reconstruct(codegen) + + def _reconstruct(self, codegen: PyCodeGen): + """ + Abstract method to construct an opcode and append it into codegen.instructions + """ + raise FallbackError( + f'{self.__class__.__name__} does not implement "_reconstruct" method' + ) + + def flatten_items(self) -> list[VariableBase]: + """ + Recursively flatten the items in this container variable to a list of Variable objects. + + Returns: + list[VariableBase]: Flattened items of a container variable. + """ + from .container import ContainerVariable + + if not isinstance(self, ContainerVariable): + return [self] + flattened_items = [] + for item in self.get_items(): + flattened_items.extend(item.flatten_items()) + return flattened_items + + def get_inputs(self) -> list[VariableBase]: + """ + This method is used to get the inputs for the current variable. + + Returns: + list[VariableBase]: Inputs for the current variable. + """ + return self.tracker.inputs + + def get_traceable_inputs(self) -> list[VariableBase]: + """ + This method is used to get the traceable inputs for the current variable. + + Returns: + list[VariableBase]: Traceable inputs for the current variable. + """ + return list( + filter(lambda x: x.tracker.is_traceable(), self.tracker.inputs) + ) + + def call_function(self, /, *args, **kwargs): + pass + + @cached_property + def attr_proxy(self): + return self.graph.side_effects.get_proxy( + MutableDictLikeData, self.get_py_value(), self.attr_proxy_getter + ) + + def attr_proxy_getter(self, proxy: MutableDictLikeData, name: str): + if not hasattr(proxy.original_data, name): # can't true. 
+            return MutableDictLikeData.Empty()
+
+        attr = getattr(proxy.original_data, name)
+        if inspect.ismethod(attr) or (
+            hasattr(attr, "__self__")
+            and inspect.ismethoddescriptor(
+                getattr(attr.__self__.__class__, name, None)
+            )
+        ):
+            from .callable import MethodVariable
+
+            fn = None
+            if inspect.ismethoddescriptor(
+                getattr(attr.__self__.__class__, name, None)
+            ):
+                class_var = VariableFactory.from_value(
+                    self.get_py_type(),
+                    self.graph,
+                    GetAttrTracker(self, "__class__"),
+                )
+                fn = VariableFactory.from_value(
+                    getattr(attr.__self__.__class__, name),
+                    self.graph,
+                    GetAttrTracker(class_var, name),
+                )
+            return MethodVariable.wrap_method(
+                value=attr,
+                instance=self,
+                fn=fn,
+                graph=self.graph,
+                tracker=GetAttrTracker(self, name),
+                method_name=name,
+            )
+
+        return VariableFactory.from_value(
+            attr, self.graph, tracker=GetAttrTracker(self, name)
+        )
+
+    def hasattr(self, name: str):
+        from .basic import ConstantVariable
+
+        try:
+            self.getattr(name)
+            return ConstantVariable(
+                True, graph=self.graph, tracker=DummyTracker([self])
+            )
+        except HasNoAttributeError:
+            # NOTE(SigureMo): Only when HasNoAttributeError is raised can we
+            # be sure that the attribute does not exist. Any other error
+            # should be raised as-is.
+            return ConstantVariable(
+                False, graph=self.graph, tracker=DummyTracker([self])
+            )
+
+    def getattr(self, name: str, default=None):
+        result = self.attr_proxy.get(name)
+        if isinstance(result, MutableDictLikeData.Empty):
+            if default is not None:
+                assert isinstance(default, VariableBase)
+                return default
+            raise HasNoAttributeError(
+                f"{self.__class__.__name__} {self} has no attribute {name}"
+            )
+        return result
+
+    def setattr(self, key: str, value):
+        from .basic import ConstantVariable
+
+        self.attr_proxy.set(key, value)
+        self.graph.side_effects.record_proxy_variable(self)
+        return ConstantVariable.wrap_literal(None, self.graph)
+
+    def delattr(self, key: str):
+        from .basic import ConstantVariable
+
+        self.attr_proxy.delete(key)
+        self.graph.side_effects.record_proxy_variable(self)
+        return ConstantVariable.wrap_literal(None, self.graph)
+
+    def __setitem__(self, key, value):
+        return self.setitem(key, value)
+
+    def setitem(self, key, value):
+        raise FallbackError(f"{self} does not support setitem.")
+
+    def __repr__(self):
+        info = {**self.main_info, **self.debug_info}
+        info_str = ", ".join([f"{value}" for value in info.values()])
+        return f"{self.__class__.__name__}({info_str})"
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __getitem__(self, idx):
+        return Dispatcher.call(operator.getitem, self, idx)
+
+    def getitem(self, item):
+        class_var = VariableFactory.from_value(
+            self.get_py_value().__class__,
+            self.graph,
+            GetAttrTracker(self, '__class__'),
+        )
+        fn_var = VariableFactory.from_value(
+            get_unbound_method(self.get_py_value(), '__getitem__'),
+            self.graph,
+            GetAttrTracker(class_var, '__getitem__'),
+        )
+        self.graph.add_global_guarded_variable(item)
+        item = item.get_py_value()
+        output = fn_var(self, item)
+        return output
+
+    def __call__(self, /, *args, **kwargs):
+        """
+        Call the object represented by this variable with the given arguments.
+
+        Args:
+            *args: Positional arguments to pass to the object's __call__ method.
+            **kwargs: Keyword arguments to pass to the object's __call__ method.
+
+        Returns:
+            VariableBase: A new variable representing the result of calling the object's __call__ method.
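+
+        Example:
+            An illustrative sketch: ``callable_var(arg_var)`` resolves
+            ``type(value).__call__`` through a class variable, prepends
+            ``self`` when ``__call__`` is a bound method, and simulates the
+            call instead of executing it eagerly.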
+ """ + from .callable import BuiltinVariable, UserDefinedFunctionVariable + + class_var = VariableFactory.from_value( + self.get_py_value().__class__, + self.graph, + GetAttrTracker(self, '__class__'), + ) + assert class_var is not None + # if __call__ is a method, we should add self to arguments. + if inspect.ismethod(self.get_py_value().__call__): + args = (self,) + args + unbound_method = get_unbound_method(self.get_py_value(), '__call__') + if hasattr(unbound_method, "__code__"): + fn_var = UserDefinedFunctionVariable( + unbound_method, + self.graph, + GetAttrTracker(class_var, '__call__'), + ) + else: + fn_var = BuiltinVariable( + self.value, + self.graph, + GetAttrTracker(class_var, '__call__'), + ) + output = fn_var(*args, **kwargs) + return output + + def get_iter(self): + from .iter import UserDefinedIterVariable + + return UserDefinedIterVariable(self, self.graph, GetIterTracker(self)) + + @VariableFactory.register_from_value() + def from_value( + value: Any, + graph: FunctionGraph | None, + tracker: Tracker, + ) -> VariableBase | None: + """ + Create a new variable from a given value, or return None if the value cannot be converted to a variable. + Args: + value (Any): The value to create a variable from. + graph (FunctionGraph | None): The graph in which the variable will be used. + tracker (Tracker): The variable tracker to put the new variable in if created. + + Returns: + VariableBase | None: A new variable if one can be created from the given value, or None if the value cannot be converted to a variable. + """ + if isinstance(value, VariableBase): + return value + return None diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py b/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py new file mode 100644 index 00000000000000..ba0a7f51c91a03 --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py @@ -0,0 +1,888 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import operator +import types +from functools import cached_property, reduce +from typing import TYPE_CHECKING, Any + +import numpy as np + +import paddle + +from ....infer_meta import MetaInfo +from ....symbolic.statement_ir import Symbol +from ....utils import ( + BreakGraphError, + FallbackError, + NameGenerator, + paddle_tensor_methods, +) +from ....utils.exceptions import HasNoAttributeError, InnerError +from ..dispatch_functions import tensor_numel +from ..guard import ( + StringifyExpression, + check_guard, + object_equal_stringify_guard, + union_free_vars, +) +from ..mutable_data import MutableDictLikeData +from ..pycode_generator import PyCodeGen +from ..tracker import ( + ConstTracker, + DanglingTracker, + DummyTracker, + GetAttrTracker, + GetIterTracker, + GlobalTracker, + Tracker, +) +from .base import ConstTypes, VariableBase, VariableFactory + +if TYPE_CHECKING: + from ..function_graph import FunctionGraph + from .callable import FunctionVariable + + +FP_DTYPE_ABBRS = { + paddle.bfloat16: 'bfloat16', + paddle.float64: 'float64', + paddle.float32: 'float32', + paddle.float16: 'float16', +} + +CP_DTYPE_ABBRS = { + paddle.complex64: 'complex64', + paddle.complex128: 'complex128', +} + +INT_DTYPE_ABBRS = { + paddle.int8: 'int8', + paddle.int16: 'int16', + paddle.int32: 'int32', + paddle.int64: 'int64', + paddle.uint8: 'uint8', +} + +DTYPE_ABBRS = { + **FP_DTYPE_ABBRS, + **CP_DTYPE_ABBRS, + **INT_DTYPE_ABBRS, + paddle.bool: 'bool', +} + + +class ConstantVariable(VariableBase): + """ + ConstantVariable is a subclass of VariableBase used to wrap a Variable of the const type. + + Args: + value(Any): The value to be wrapped. + tracker(Tracker): The Tracker object that tracks the information of this variable. + """ + + def __init__( + self, + value: Any, + graph: FunctionGraph, + tracker: Tracker, + ): + super().__init__(graph, tracker) + self.value = value + + def get_py_value(self, allow_tensor=False): + return self.value + + @property + def debug_name(self) -> str: + return f"{self.value}" + + @debug_name.setter + def debug_name(self, name): + pass + + def _reconstruct(self, codegen: PyCodeGen): + codegen.gen_load_const(self.value) + + @property + def main_info(self) -> dict[str, Any]: + return {"value": self.value} + + def __bool__(self) -> bool: + return bool(self.value) + + def bool(self): + return ConstantVariable(bool(self), self.graph, DummyTracker([self])) + + def bool_not(self): + assert isinstance( + self.get_py_value(), bool + ), "Bool_not can only be applied to a bool variable." 
+        return ConstantVariable(
+            not bool(self.get_py_value()), self.graph, DummyTracker([self])
+        )
+
+    def str(self):
+        return ConstantVariable(
+            str(self.value), self.graph, DummyTracker([self])
+        )
+
+    def format(self, *args):
+        return ConstantVariable(
+            str(self.value).format(*[str(a.value) for a in args]),
+            self.graph,
+            DummyTracker([self, *args]),
+        )
+
+    def lower(self):
+        return ConstantVariable(
+            str(self.value).lower(),
+            self.graph,
+            DummyTracker([self]),
+        )
+
+    def ord(self):
+        return ConstantVariable(
+            ord(self.value),
+            self.graph,
+            DummyTracker([self]),
+        )
+
+    def chr(self):
+        return ConstantVariable(
+            chr(self.value),
+            self.graph,
+            DummyTracker([self]),
+        )
+
+    @VariableFactory.register_from_value()
+    def from_value(value: Any, graph: FunctionGraph, tracker: Tracker):
+        if type(value) in ConstTypes:
+            return ConstantVariable(value, graph, tracker)
+        return None
+
+    @staticmethod
+    def wrap_literal(value: Any, graph: FunctionGraph) -> ConstantVariable:
+        """
+        Wrap a literal value in a ConstantVariable.
+
+        Args:
+            value(Any): The literal value to be wrapped.
+
+        Returns:
+            ConstantVariable: A new ConstantVariable object that wraps the given value.
+        """
+        if isinstance(value, ConstantVariable):
+            return value
+        assert isinstance(
+            value, ConstTypes
+        ), f"value: {value}, type: {type(value)}"
+        return ConstantVariable(value, graph, ConstTracker(value))
+
+
+class PrintStmtVariable(VariableBase):
+    def __init__(self, value: Any, graph: FunctionGraph):
+        # TODO: graph should not be None
+        super().__init__(None, DanglingTracker())
+        self.args, self.kwargs = value
+        self.graph = graph
+
+    def _reconstruct(self, codegen: PyCodeGen):
+        # Do we need to guard all printed variables? It may be too strict.
+        for var in self.args:
+            self.graph.add_global_guarded_variable(var)
+        for var in self.kwargs.values():
+            self.graph.add_global_guarded_variable(var)
+        # currently don't consider kwargs
+        codegen.gen_load_global("print", push_null=True)
+        for var in self.args:
+            var.reconstruct(codegen)
+        codegen.gen_call_function(len(self.args))
+        codegen.gen_pop_top()
+
+    def flatten_items(self):
+        return self.args
+
+
+IMPLEMENTED_TENSOR_PROPERTIES = set()
+
+
+def tensor_property(func):
+    IMPLEMENTED_TENSOR_PROPERTIES.add(func.__name__)
+    return property(func)
+
+
+class DataVariable(VariableBase):
+    """
+    A value-only object.
+    If none of its magic methods change the function_graph state (no tensor
+    ops, guards, or side effects), we call it a ValueObjectVariable and
+    directly apply Python operators to it.
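+
+    Example:
+        An illustrative case: ``paddle.dtype`` values never mutate the traced
+        graph when compared or printed, so ``TensorDtypeVariable`` below can
+        safely subclass this value-only wrapper.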
+ """ + + def __init__( + self, + value: Any, + graph: FunctionGraph, + tracker: Tracker, + ): + super().__init__(graph, tracker) + self.value = value + + def get_py_value(self, allow_tensor=False): + return self.value + + +class TensorDtypeVariable(DataVariable): + def __init__(self, value, graph, tracker): + super().__init__(value, graph, tracker) + + @check_guard + def make_stringify_guard(self) -> list[StringifyExpression]: + if isinstance(self.tracker, GetAttrTracker) and isinstance( + self.tracker.obj, TensorVariable + ): + tensor_value_tracer = ( + self.tracker.obj.tracker.trace_value_from_frame() + ) + return [ + StringifyExpression( + f"str(MetaInfo.from_tensor({{}}).dtype) == '{str(self.value)}'", + [tensor_value_tracer], + {"MetaInfo": MetaInfo}, + ) + ] + else: + return object_equal_stringify_guard(self) + + @property + def main_info(self) -> dict[str, Any]: + return { + "dtype": self.value, + } + + @VariableFactory.register_from_value() + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + if isinstance(value, paddle.dtype): + return TensorDtypeVariable(value, graph, tracker) + + +class TensorVariable(VariableBase): + """ + TensorVariable is a subclass of VariableBase used to wrap a Variable of the tensor type. + + Args: + tensor (paddle.Tensor | MetaInfo): The tensor to be wrapped. + graph (FunctionGraph): The FunctionGraph object that this variable is associated with. + tracker (Tracker): The Tracker object that tracks the information of this variable. + """ + + var_name_generator = NameGenerator("var_") + mutable_attrs = ["meta"] + + def __init__( + self, + tensor: paddle.Tensor | MetaInfo, + graph: FunctionGraph, + tracker: Tracker, + ): + super().__init__(graph, tracker) + if isinstance(tensor, paddle.Tensor): + self.value = None + self.meta = MetaInfo.from_tensor(tensor) + elif isinstance(tensor, MetaInfo): + self.value = None + self.meta = tensor + else: + raise InnerError( + "Required type(tensor) is paddle.Tensor or ProxyTensor, but received {}.".format( + type(tensor).__name__ + ) + ) + self.origin_meta = self.meta + self.var_name = TensorVariable.var_name_generator.next() + self.graph.side_effects.record_mutable_variable(self) + + def __len__(self): + if self.meta.shape[0] == -1: + raise BreakGraphError( + "length of tensor variable with first dimension == -1" + ) + return self.meta.shape[0] + + def get_py_value(self, allow_tensor=False): + if allow_tensor: + + class SotTensor: + def __init__(self, id_): + self.id = id_ + + def __eq__(self, var): + if not hasattr(var, "id"): + return False + else: + return self.id == var.id + + return SotTensor(self.id) + + raise BreakGraphError( + "Called TensorVariable.get_py_value. Should not use Tensor's value in simulating." 
+ ) + + def get_py_type(self): + return paddle.Tensor + + def get_symbol(self) -> Symbol: + return Symbol(self.var_name) + + @property + def out_var_name(self): + return f"{self.graph.OUT_VAR_PREFIX}{self.var_name}" + + def _reconstruct(self, codegen: PyCodeGen): + codegen.gen_load_fast(self.out_var_name) + + @check_guard + def make_stringify_guard(self) -> list[StringifyExpression]: + frame_value_tracer = self.tracker.trace_value_from_frame() + + return [ + StringifyExpression( + f"MetaInfo.from_tensor({{}}).guard_str() == '{self.origin_meta.guard_str()}'", + [frame_value_tracer], + union_free_vars( + {"MetaInfo": MetaInfo}, + frame_value_tracer.free_vars, + ), + ) + ] + + def get_iter(self): + from .iter import SequenceIterVariable + + return SequenceIterVariable(self, self.graph, GetIterTracker(self)) + + @property + def main_info(self) -> dict[str, Any]: + return { + "shape": self.meta.shape, + "dtype": DTYPE_ABBRS[self.meta.dtype], + "stop_gradient": self.meta.stop_gradient, + "var_name": self.var_name, + } + + def getitem(self, key): + return self.graph.call_tensor_method('__getitem__', self, key) + + def setitem(self, key, value): + self.graph.add_global_guarded_variable(value) + + key_var = VariableFactory.from_value( + key, self.graph, tracker=ConstTracker(key) + ) + new_tensor = self.graph.call_paddle_api( + paddle.static.setitem, + self, + key_var, + value, + ) + + self.meta = new_tensor.meta + self.graph.add_inplace_tensors(self) + + @tensor_property + def T(self): + """ + Return a new TensorVariable object that wraps the result of calling the transpose method on the wrapped value of this TensorVariable. + """ + from .container import ListVariable + + perm = list(range(len(self.meta.shape) - 1, -1, -1)) + perm_var = ListVariable(perm, self.graph, tracker=ConstTracker(perm)) + assert perm_var is not None + out = self.graph.call_paddle_api(paddle.transpose, self, perm_var) + return out + + @tensor_property + def ndim(self): + """ + Return a ConstantVariable object that represents the number of dimensions of the wrapped value of this TensorVariable. + """ + return ConstantVariable( + len(self.meta.shape), self.graph, DummyTracker([self]) + ) + + @tensor_property + def size(self): + """ + Return a ConstantVariable object that represents the total number of elements in the wrapped value of this TensorVariable. + """ + # TODO: maybe break graph. + if self.meta.is_dynamic_shape(): + raise BreakGraphError( + f"Getting size for a dynamic shape tensor causes graph break. shape = {self.meta.shape}" + ) + elements = reduce(operator.mul, self.meta.shape, 1) + return ConstantVariable(elements, self.graph, DummyTracker([self])) + + @tensor_property + def shape(self): + if self.meta.is_dynamic_shape(): + raise BreakGraphError( + f"Getting shape for a dynamic shape tensor causes graph break. shape = {self.meta.shape}" + ) + from .container import ListVariable + + return ListVariable( + self.meta.shape, self.graph, tracker=DummyTracker([self]) + ) + + def numel(self): + return self.size + + def len(self): + if len(self.meta.shape) == 0: + raise InnerError("len() of a 0-D tensor is wrong") + first_dim = self.meta.shape[0] + if first_dim == -1: + raise BreakGraphError( + "Getting len() for a dynamic shape tensor causes graph break." 
+ ) + + return ConstantVariable(first_dim, self.graph, DummyTracker([self])) + + def is_tensor(self): + return ConstantVariable(True, self.graph, DummyTracker([self])) + + def is_complex(self): + dtype = self.meta.dtype + is_cp_dtype = dtype in CP_DTYPE_ABBRS + return ConstantVariable(is_cp_dtype, self.graph, DummyTracker([self])) + + def is_integer(self): + dtype = self.meta.dtype + is_int_dtype = dtype in INT_DTYPE_ABBRS + return ConstantVariable(is_int_dtype, self.graph, DummyTracker([self])) + + def is_floating_point(self): + dtype = self.meta.dtype + is_fp_dtype = dtype in FP_DTYPE_ABBRS + return ConstantVariable(is_fp_dtype, self.graph, DummyTracker([self])) + + def getattr(self, name: str, default=None): + if default is not None: + raise FallbackError( + "default argument for getattr is not implemented" + ) + method_name_to_builtin_fn = { + "dim": paddle.rank, + "numel": tensor_numel, + "ndimension": paddle.rank, + "is_tensor": paddle.is_tensor, + "is_complex": paddle.is_complex, + "is_integer": paddle.is_integer, + "is_floating_point": paddle.is_floating_point, + } + if name in ["dtype", "type", "name", "persistable", "stop_gradient"]: + if name == "name" and self.meta.name.startswith( + "infer_meta_variable_tmp" + ): + raise BreakGraphError(f"{self.meta.name} is a middle tensor.") + return VariableFactory.from_value( + getattr(self.meta, name), + self.graph, + tracker=GetAttrTracker(self, name), + ) + elif name in IMPLEMENTED_TENSOR_PROPERTIES: + return getattr(self, name) + elif name in method_name_to_builtin_fn: + # TODO: backward, gradient + from .callable import BuiltinVariable + + builtin_fn = method_name_to_builtin_fn[name] + + return BuiltinVariable( + builtin_fn, self.graph, DanglingTracker() + ).bind(self, name) + elif name in paddle_tensor_methods: + from .callable import TensorFunctionVariable + + fn_var = TensorFunctionVariable( + name, graph=self.graph, tracker=DanglingTracker() + ) + return fn_var.bind(self, name) + else: + raise HasNoAttributeError(f"Unknown Tensor attribute: {name}") + + def setattr(self, key, val): + # support tensor variable store attr, like: + # t.stop_gradient = True + self.graph.call_tensor_method( + "__setattr__", + self, + VariableFactory().from_value(key, self.graph, ConstTracker(key)), + val, + ) + + def delattr(self, key): + raise BreakGraphError("Don't support TensorVariable delattr") + + @VariableFactory.register_from_value() + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + if isinstance(value, (paddle.Tensor, MetaInfo)): + return TensorVariable(value, graph, tracker) + return None + + +class ObjectVariable(VariableBase): + """ + ObjectVariable is a subclass of VariableBase used to wrap a Variable of the object type. + + Args: + obj(Any): The object to be wrapped. + graph(FunctionGraph): The FunctionGraph object that this variable is associated with. + tracker(Tracker): The Tracker object that tracks the information of this variable. + """ + + make_stringify_guard = object_equal_stringify_guard + + def __init__(self, obj, graph, tracker): + super().__init__(graph, tracker) + self.value = obj + + @property + def main_info(self) -> dict[str, Any]: + return {"value": self.value} + + def get_py_value(self, allow_tensor=False) -> Any: + return self.value + + +class SliceVariable(VariableBase): + """ + SliceVariable is a subclass of VariableBase used to wrap a Variable of the slice type. + + Args: + slice_(slice): The slice to be wrapped. 
+        graph(FunctionGraph): The FunctionGraph object that this variable is associated with.
+        tracker(Tracker): The Tracker object that tracks the information of this variable.
+    """
+
+    def __init__(self, slice_: slice, graph, tracker):
+        super().__init__(graph, tracker)
+        self.value = slice_
+
+    @property
+    def debug_name(self) -> str:
+        return ":".join(
+            [
+                str(self.value.start) if self.value.start is not None else "",
+                str(self.value.stop) if self.value.stop is not None else "",
+                str(self.value.step) if self.value.step is not None else "",
+            ]
+        )
+
+    @debug_name.setter
+    def debug_name(self, name):
+        pass
+
+    @cached_property
+    def attr_proxy(self):
+        return self.graph.side_effects.get_proxy(
+            MutableDictLikeData, self.value, self.attr_proxy_getter
+        )
+
+    @property
+    def main_info(self) -> dict[str, Any]:
+        return {"value": self.value}
+
+    def get_py_value(self, allow_tensor=False):
+        return slice(
+            self.getattr("start").get_py_value(),
+            self.getattr("stop").get_py_value(),
+            self.getattr("step").get_py_value(),
+        )
+
+    @check_guard
+    def make_stringify_guard(self) -> list[StringifyExpression]:
+        frame_value_tracer = self.tracker.trace_value_from_frame()
+        result = (
+            [
+                StringifyExpression(
+                    "isinstance({}, slice)",
+                    [frame_value_tracer],
+                    frame_value_tracer.free_vars,
+                ),
+            ]
+            + self.getattr("start").make_stringify_guard()
+            + self.getattr("stop").make_stringify_guard()
+            + self.getattr("step").make_stringify_guard()
+        )
+        return result
+
+    def _reconstruct(self, codegen: PyCodeGen):
+        if all(
+            isinstance(x, ConstantVariable)
+            for x in [
+                self.getattr("start"),
+                self.getattr("stop"),
+                self.getattr("step"),
+            ]
+        ):
+            self.graph.add_global_guarded_variable(self)
+            self.getattr("start").reconstruct(codegen)
+            self.getattr("stop").reconstruct(codegen)
+            self.getattr("step").reconstruct(codegen)
+            codegen.gen_build_slice(3)
+        else:
+            super()._reconstruct(codegen)
+
+    def setattr(self, key, val):
+        raise BreakGraphError("Don't support SliceVariable setattr")
+
+    def delattr(self, key):
+        raise BreakGraphError("Don't support SliceVariable delattr")
+
+    @VariableFactory.register_from_value()
+    def from_value(value: Any, graph: FunctionGraph, tracker: Tracker):
+        if isinstance(value, slice):
+            return SliceVariable(value, graph, tracker)
+        return None
+
+
+class ModuleVariable(VariableBase):
+    """
+    ModuleVariable is a subclass of VariableBase used to wrap a Variable of the module type.
+
+    Args:
+        func: The module to be wrapped.
+        graph: The FunctionGraph object that this variable is associated with.
+        tracker: The Tracker object that tracks the information of this variable.
+    """
+
+    def __init__(self, func, graph, tracker):
+        super().__init__(graph, tracker)
+        self.value = func
+
+    def get_py_value(self, allow_tensor=False):
+        return self.value
+
+    @property
+    def main_info(self) -> dict[str, Any]:
+        return {"value": self.value}
+
+    @VariableFactory.register_from_value()
+    def from_value(value: Any, graph: FunctionGraph, tracker: Tracker):
+        if isinstance(value, types.ModuleType):
+            return ModuleVariable(value, graph, tracker)
+        return None
+
+    # Happens in an inline import statement.
+ make_stringify_guard = object_equal_stringify_guard + + +class DygraphTracerVariable(VariableBase): + # TODO(SigureMo): Remove this trick after we add CompareTracker + def __init__(self, value, graph, tracker): + super().__init__(graph, tracker) + self.value = value + + def get_py_value(self, allow_tensor=False): + return self.value + + @check_guard + def make_stringify_guard(self) -> list[StringifyExpression]: + return [] + + @property + def main_info(self) -> dict[str, Any]: + return { + "is_none": self.value is None, + } + + @VariableFactory.register_from_value() + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + if isinstance(value, paddle.base.dygraph.tracer.Tracer): + return DygraphTracerVariable(value, graph, tracker) + return None + + +class NumpyVariable(VariableBase): + """ + NumpyVariable is a subclass of VariableBase used to wrap a Variable of the numpy type. + + Args: + value: The numpy value to be wrapped. + graph: The FunctionGraph object that this variable is associated with. + tracker: The Tracker object that tracks the information of this variable. + """ + + def __init__(self, value, graph, tracker): + super().__init__(graph, tracker) + self.value = value + + @property + def main_info(self) -> dict[str, Any]: + return {"value": self.value} + + def get_py_value(self, allow_tensor=False) -> Any: + return self.value + + @check_guard + def make_stringify_guard(self) -> list[StringifyExpression]: + if isinstance(self.get_py_value(), np.number): + frame_value_tracer = self.tracker.trace_value_from_frame() + + def format_dtype(dtype: np.dtype): + return f"np.{str(dtype)}" + + def format_number(number: np.number): + return f"{format_dtype(number.dtype)}({str(number.item())})" + + return [ + StringifyExpression( + f"{{}} == {format_number(self.get_py_value())}", + [frame_value_tracer], + union_free_vars(frame_value_tracer.free_vars, {"np": np}), + ), + StringifyExpression( + f"{{}}.dtype == {format_dtype(self.get_py_value().dtype)}", + [frame_value_tracer], + union_free_vars(frame_value_tracer.free_vars, {"np": np}), + ), + ] + else: + return object_equal_stringify_guard(self) + + @VariableFactory.register_from_value() + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + if isinstance(value, (np.ndarray, np.number)): + return NumpyVariable(value, graph, tracker) + return None + + +class NullVariable(VariableBase): + """ + NullVariable is a subclass of VariableBase used to represent a placeholder variable that has no value or reference associated with it. 
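+
+    For example, CPython 3.11+ pushes such a NULL placeholder before a call
+    (see ``PUSH_NULL``); ``reconstruct`` below mirrors that with
+    ``gen_push_null``.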
+ """ + + def __init__(self): + # TODO: graph should be not None + super().__init__(None, DanglingTracker()) + + def reconstruct(self, codegen: PyCodeGen): + codegen.gen_push_null() + + +class CellVariable(VariableBase): + def __init__(self, value=None): + # TODO: graph should be not None + super().__init__( + None, DanglingTracker() + ) # should reconstruct cell variable + assert isinstance(value, (VariableBase, type(None))) + self.set_value(value) + + def reconstruct( + self, + codegen: PyCodeGen, + *, + use_tracker: bool = True, + add_to_global_guarded_vars: bool = True, + ): + raise FallbackError("Break graph in closure is not support.") + + def cell_content(self): + return self.value + + def set_value(self, value): + self.value = value + + def empty(self): + return self.value is None + + +class GlobalVariable(VariableBase): + def __init__( + self, + val_dict, + graph: FunctionGraph, + tracker: Tracker, + ): + super().__init__(graph, tracker) + self.proxy = self.graph.side_effects.get_proxy( + MutableDictLikeData, val_dict, self.proxy_getter + ) + + def proxy_getter(self, proxy: MutableDictLikeData, key: Any): + if key not in proxy.original_data: + return MutableDictLikeData.Empty() + return VariableFactory.from_value( + proxy.original_data[key], + self.graph, + tracker=GlobalTracker(key), + ) + + def get_value(self): + return dict(self.proxy.get_all().items()) + + def keys(self): + return self.proxy.get_all().keys() + + def get(self, key): + if isinstance(key, VariableBase): + raise InnerError( + f"[{self.__class__.__name__}]: recieved {key} to get value." + ) + return self.proxy.get(key) + + def set(self, key, value): + if isinstance(key, VariableBase): + raise InnerError( + f"[{self.__class__.__name__}]: recieved {key} as key." + ) + if not isinstance(value, VariableBase): + raise InnerError( + f"[{self.__class__.__name__}]: recieved {value} to set value." + ) + self.proxy.set(key, value) + self.graph.side_effects.record_proxy_variable(self) + + def delete(self, key): + self.proxy.delete(key) + self.graph.side_effects.record_proxy_variable(self) + + +class FunctionGlobalVariable(GlobalVariable): + def __init__( + self, + fn: FunctionVariable, + val_dict: dict[str, Any], + graph: FunctionGraph, + tracker: Tracker, + ): + super().__init__(val_dict, graph, tracker) + self.fn = fn + + def proxy_getter(self, proxy: MutableDictLikeData, key: Any): + from ..opcode_inline_executor import FunctionGlobalTracker + + if key not in proxy.original_data: + return MutableDictLikeData.Empty() + return VariableFactory.from_value( + proxy.original_data[key], + self.graph, + tracker=FunctionGlobalTracker(self.fn, key), + ) diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/callable.py b/python/paddle/jit/sot/opcode_translator/executor/variables/callable.py new file mode 100644 index 00000000000000..819580710beba8 --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/variables/callable.py @@ -0,0 +1,759 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import inspect
+import operator
+import types
+from functools import reduce
+from typing import TYPE_CHECKING, Any, Callable
+
+import paddle
+
+from .... import psdb
+from ....profiler import EventGuard
+from ....utils import (
+    is_break_graph_api,
+    is_break_graph_tensor_methods,
+    is_builtin_fn,
+    is_paddle_api,
+    magic_method_builtin_dispatch,
+)
+from ....utils.exceptions import BreakGraphError, FallbackError, SotErrorBase
+from ..dispatcher import Dispatcher
+from ..guard import (
+    StringifyExpression,
+    check_guard,
+    object_equal_stringify_guard,
+    union_free_vars,
+)
+from ..tracker import (
+    ConstTracker,
+    CreateLayerTracker,
+    DanglingTracker,
+    DummyTracker,
+    GetAttrTracker,
+    GetItemTracker,
+    GetIterTracker,
+    Tracker,
+)
+from .base import VariableBase, VariableFactory
+from .basic import ConstantVariable, PrintStmtVariable, SliceVariable
+
+if TYPE_CHECKING:
+    from ..function_graph import FunctionGraph
+
+
+PD_ALL_CONTAINERS = (paddle.nn.Sequential, paddle.nn.LayerList)
+PD_SEQ_CONTAINERS = (paddle.nn.Sequential, paddle.nn.LayerList)
+
+
+class CallableVariable(VariableBase):
+    """
+    CallableVariable is a subclass of VariableBase used to wrap a callable variable.
+
+    Args:
+        graph(FunctionGraph): The FunctionGraph object that this variable is associated with.
+        tracker(Tracker): The Tracker object that tracks the information of this variable.
+    """
+
+    def __init__(self, graph: FunctionGraph, tracker: Tracker):
+        super().__init__(graph, tracker)
+
+    def __call__(self, /, *args, **kwargs) -> VariableBase:
+        """Why do we need '/' to make self positional-only?
+
+        If kwargs contains {'self': xxx}, this function call would raise an error.
+        See: test_str_format.py for details.
+        """
+        with EventGuard(f"call_function: {self.__class__.__name__}"):
+            return self.call_function(*args, **kwargs)
+
+    def call_function(self, /, *args, **kwargs):
+        raise NotImplementedError("call_function is not implemented.")
+
+
+class FunctionVariable(CallableVariable):
+    """
+    FunctionVariable is a subclass of CallableVariable used to wrap a function variable.
+
+    Args:
+        fn (Callable[..., Any]): The function to be wrapped.
+        graph(FunctionGraph): The FunctionGraph object that this variable is associated with.
+        tracker(Tracker): The Tracker object that tracks the information of this variable.
+    """
+
+    def __init__(
+        self, fn: Callable[..., Any], graph: FunctionGraph, tracker: Tracker
+    ):
+        super().__init__(graph, tracker)
+        self.value = fn
+
+    def get_py_value(self, allow_tensor=False):
+        return self.value
+
+    def get_code(self) -> types.CodeType:
+        return self.value.__code__
+
+    def bind(self, instance: VariableBase, name: str):
+        method_var = MethodVariable(
+            instance,
+            self,
+            graph=self.graph,
+            tracker=GetAttrTracker(instance, name),
+        )
+        class_var = VariableFactory.from_value(
+            instance.get_py_type(),
+            graph=self.graph,
+            tracker=GetAttrTracker(instance, "__class__"),
+        )
+        assert class_var is not None
+        self.tracker = GetAttrTracker(class_var, name)
+        return method_var
+
+    make_stringify_guard = object_equal_stringify_guard
+
+
+class UserDefinedFunctionVariable(FunctionVariable):
+    """
+    UserDefinedFunctionVariable is a subclass of FunctionVariable used to wrap a user-defined function.
+
+    Args:
+        fn (Callable[..., Any]): The user-defined function to be wrapped.
+        graph(FunctionGraph): The FunctionGraph object that this variable is associated with.
+ tracker(Tracker): The Tracker object that tracks the information of this variable. + """ + + def __init__( + self, fn: Callable[..., Any], graph: FunctionGraph, tracker: Tracker + ): + super().__init__(fn, graph, tracker) + + def handle_psdb_function(self, /, *args, **kwargs): + # special function for inner debug. + if self.value is psdb.assert_true: + return ConstantVariable.wrap_literal( + self.value(args[0].value), self.graph + ) + elif self.value is psdb.print: + sot_prefix = ConstantVariable.wrap_literal("[SOT]", self.graph) + self.graph.add_print_variables( + PrintStmtVariable(([sot_prefix, *args], kwargs), self.graph) + ) + return ConstantVariable.wrap_literal(None, self.graph) + elif self.value is psdb.breakpoint: + # do nothing. just return None. + from ...breakpoint import BM + + BM.locate(BM.executors[-1]) + BM.add(BM.cur_exe._code.co_filename, BM.cur_exe._current_line) + return ConstantVariable.wrap_literal(None, self.graph) + elif self.value is psdb.breakgraph: + raise BreakGraphError("breakgraph by psdb.breakgraph") + elif self.value is psdb.fallback: + raise FallbackError("fallback by psdb.fallback") + elif self.value is psdb.in_sot: + return ConstantVariable.wrap_literal(True, self.graph) + return None + + def call_function(self, /, *args, **kwargs) -> VariableBase: + from ..opcode_inline_executor import OpcodeInlineExecutor + + result = self.handle_psdb_function(*args, **kwargs) + if result is not None: + return result + + checkpoint = self.graph.save_memo() + try: + inline_executor = OpcodeInlineExecutor(self, *args, **kwargs) + with EventGuard( + f"Inline Call: {inline_executor._code.co_name.replace('<', '(').replace('>', ')')}, file {inline_executor._code.co_filename}, line {int(inline_executor._code.co_firstlineno)}" + ): + output = inline_executor.inline_call() + except SotErrorBase as e: + self.graph.restore_memo(checkpoint) + raise BreakGraphError( + f"({e}) raised while inline call {self.value.__code__}." + ) + return output + + @VariableFactory.register_from_value() + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + if isinstance(value, (types.FunctionType)): + return UserDefinedFunctionVariable(value, graph, tracker) + if isinstance( + value, paddle.jit.dy2static.program_translator.StaticFunction + ): + return UserDefinedFunctionVariable( + value.dygraph_function, graph, tracker + ) + return None + + @property + def main_info(self) -> dict[str, Any]: + return { + "name": self.value.__name__, + } + + +class PaddleApiVariable(FunctionVariable): + """ + PaddleApiVariable is a subclass of FunctionVariable used to wrap a paddlepaddle API function. + + Args: + fn (Callable[..., Any]): The paddlepaddle API to be wrapped. + graph(FunctionGraph): The FunctionGraph object that this variable is associated with. + tracker(Tracker): The Tracker object that tracks the information of this variable. 
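+
+    Example:
+        An illustrative case: ``paddle.nn.functional.relu`` is a paddle API,
+        so calling its wrapper records ``graph.call_paddle_api(relu, ...)``
+        on the traced graph rather than running the op eagerly.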
+ """ + + def __init__( + self, fn: Callable[..., Any], graph: FunctionGraph, tracker: Tracker + ): + super().__init__(fn, graph, tracker) + + def call_function(self, /, *args, **kwargs): + if is_break_graph_api(self.value): + raise BreakGraphError( + f"breakgraph by unsupport function: {self.value.__name__}" + ) + return self.graph.call_paddle_api(self.value, *args, **kwargs) + + @VariableFactory.register_from_value( + successor="UserDefinedFunctionVariable" + ) + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + if callable(value) and is_paddle_api(value): + return PaddleApiVariable(value, graph, tracker) + return None + + @property + def main_info(self) -> dict[str, Any]: + return { + "name": self.value.__name__, + } + + make_stringify_guard = object_equal_stringify_guard + + +class TensorFunctionVariable(FunctionVariable): + """ + TensorFunctionVariable is a subclass of FunctionVariable used to wrap a method of a tensor. + + Args: + method_name (str): The name of the tensor method to be wrapped. + graph(FunctionGraph): The FunctionGraph object that this variable is associated with. + tracker(Tracker): The Tracker object that tracks the information of this variable. + """ + + def __init__( + self, method_name: str, graph: FunctionGraph, tracker: Tracker + ): + fn = getattr(paddle.static.Variable, method_name) + super().__init__(fn, graph, tracker) + self.method_name = method_name + + def call_function(self, /, *args, **kwargs): + if is_break_graph_tensor_methods(self.method_name): + raise BreakGraphError() + return self.graph.call_tensor_method(self.method_name, *args, **kwargs) + + def bind(self, instance: VariableBase, name: str): + method_var = MethodVariable( + instance, + self, + graph=self.graph, + tracker=GetAttrTracker(instance, name), + ) + class_var = VariableFactory.from_value( + instance.get_py_type(), + graph=self.graph, + tracker=ConstTracker(instance.get_py_type()), + ) + assert class_var is not None + self.tracker = GetAttrTracker(class_var, name) + return method_var + + @property + def main_info(self) -> dict[str, Any]: + return { + "name": self.value.__name__, + } + + +class MethodVariable(CallableVariable): + """ + MethodVariable is a subclass of CallableVariable used to wrap a method variable. + + Args: + bound_instance (VariableBase): The instance of the method. + fn (VariableBase): The method to be wrapped. + graph(FunctionGraph): The FunctionGraph object that this variable is associated with. + tracker(Tracker): The Tracker object that tracks the information of this variable. + method_name (str): The name of the method to be wrapped. 
+ """ + + def __init__( + self, + bound_instance: VariableBase, + fn: VariableBase, + graph: FunctionGraph, + tracker: Tracker, + *, + method_name: str | None = None, + ): + super().__init__(graph, tracker) + self.bound_instance = bound_instance + self.fn = fn + self.method_name = method_name + + def get_py_value(self, allow_tensor=False): + return self.fn.get_py_value().__get__( + self.bound_instance.get_py_value(allow_tensor), + self.bound_instance.get_py_value(allow_tensor).__class__, + ) + + def _reconstruct(self, pycode_gen): + assert self.method_name is not None + self.tensor.reconstruct(pycode_gen) + pycode_gen.gen_load_attr(self.method_name) + + def call_function(self, /, *args, **kwargs): + return self.fn(*(self.bound_instance, *args), **kwargs) + + @staticmethod + def wrap_method( + value: types.MethodType, + *, + graph: FunctionGraph, + tracker: Tracker, + instance: VariableBase | None = None, + fn: VariableBase | None = None, + method_name: str | None = None, + ): + # NOTE(SigureMo): Since the method_self need method_var as the obj + # of the tracker, we need to temporarily set the tracker of method_self + # to DummyTracker, and set it to GetAttrTracker after method_var is created. + instance_var = ( + VariableFactory.from_value(value.__self__, graph, DanglingTracker()) + if instance is None + else instance + ) + + fn_var = ( + VariableFactory.from_value(value.__func__, graph, DanglingTracker()) + if fn is None + else fn + ) + + method_var = MethodVariable( + instance_var, + fn_var, + method_name=method_name, + graph=graph, + tracker=tracker, + ) + if instance is None: + instance_var.tracker = GetAttrTracker(method_var, "__self__") + if fn is None: + fn_var.tracker = GetAttrTracker(method_var, "__func__") + return method_var + + @VariableFactory.register_from_value() + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + if inspect.ismethod(value): + return MethodVariable.wrap_method( + value=value, tracker=tracker, graph=graph + ) + return None + + @property + def main_info(self) -> dict[str, Any]: + return { + "method": self.method_name, + } + + +class LayerVariable(CallableVariable): + """ + LayerVariable is a subclass of CallableVariable used to wrap a layer. + + Args: + layer (paddle.nn.Layer): The layer to be wrapped. + graph(FunctionGraph): The FunctionGraph object that this variable is associated with. + tracker(Tracker): The Tracker object that tracks the information of this variable. 
+ """ + + def __init__( + self, layer: paddle.nn.Layer, graph: FunctionGraph, tracker: Tracker + ): + super().__init__(graph, tracker) + self.value = layer + + def get_py_value(self, allow_tensor=False): + return self.value + + def call_function(self, /, *args, **kwargs): + fn_var = UserDefinedFunctionVariable( + self.value.__class__.__call__, + self.graph, + GetAttrTracker(self, "__call__"), + ) + + return fn_var(*(self, *args), **kwargs) + + @check_guard + def make_stringify_guard(self) -> list[StringifyExpression]: + frame_value_tracer = self.tracker.trace_value_from_frame() + return [ + StringifyExpression( + f"id({{}}) == {id(self.get_py_value())}", + [frame_value_tracer], + union_free_vars(frame_value_tracer.free_vars), + ), + StringifyExpression( + f"{{}}.training == {self.get_py_value().training}", + [frame_value_tracer], + union_free_vars(frame_value_tracer.free_vars), + ), + ] + + +class ContainerLayerVariable(LayerVariable): + def __init__( + self, layer: paddle.nn.Layer, graph: FunctionGraph, tracker: Tracker + ): + super().__init__(layer, graph, tracker) + + def __len__(self): + return len(self.value) + + def len(self): + return ConstantVariable(len(self), self.graph, DummyTracker([self])) + + def getitem(self, key): + if isinstance(self.value, PD_SEQ_CONTAINERS) and isinstance( + key, SliceVariable + ): + try: + slice_py_value = key.get_py_value() + new_layer_list = self.value[slice_py_value] + self.graph.add_global_guarded_variable(key) + return VariableFactory.from_value( + new_layer_list, + self.graph, + GetItemTracker(self, slice_py_value), + ) + except Exception as e: + raise BreakGraphError( + f"call {self.value.__class__.__name__}.__getitem__ with slice as key, and slice with py value failed: {e}." + ) + + else: + return super().getitem(key) + + def get_iter(self): + if isinstance(self.value, PD_SEQ_CONTAINERS): + from .iter import SequenceIterVariable + + return SequenceIterVariable(self, self.graph, GetIterTracker(self)) + else: + return super().get_iter() + + def make_stringify_guard(self) -> list[StringifyExpression]: + if isinstance(self.value, PD_SEQ_CONTAINERS): + frame_value_tracer = self.tracker.trace_value_from_frame() + + len_guard = StringifyExpression( + f"len({{}}) == {len(self.value)}", + [frame_value_tracer], + frame_value_tracer.free_vars, + ) + + guards = [len_guard] + for idx, layer in enumerate(self.value): + layer_variable = VariableFactory.from_value( + layer, self.graph, GetItemTracker(self, idx) + ) + guards.extend(layer_variable.make_stringify_guard()) + + return guards + else: + return super().make_stringify_guard() + + @property + def main_info(self) -> dict[str, Any]: + return { + "name": self.value.__class__.__name__, + } + + @VariableFactory.register_from_value(successor="PaddleLayerVariable") + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + if isinstance(value, PD_ALL_CONTAINERS): + return ContainerLayerVariable(value, graph, tracker) + return None + + +class PaddleLayerVariable(LayerVariable): + """ + PaddleLayerVariable is a subclass of LayerVariable used to wrap a paddlepaddle layer. + + Args: + layer (paddle.nn.Layer): The paddle built-in layer to be wrapped. + graph(FunctionGraph): The FunctionGraph object that this variable is associated with. + tracker(Tracker): The Tracker object that tracks the information of this variable. 
+ """ + + def __init__( + self, layer: paddle.nn.Layer, graph: FunctionGraph, tracker: Tracker + ): + super().__init__(layer, graph, tracker) + + def call_function(self, /, *args, **kwargs): + self.graph.add_global_guarded_variable(self) + return self.graph.call_layer(self, *args, **kwargs) + + def make_stringify_guard(self) -> list[StringifyExpression]: + if isinstance(self.tracker, CreateLayerTracker): + return reduce( + operator.add, + [var.make_stringify_guard() for var in self.tracker.inputs], + ) + else: + return super().make_stringify_guard() + + @property + def main_info(self) -> dict[str, Any]: + return { + "name": self.value.__class__.__name__, + } + + @VariableFactory.register_from_value(successor="UserDefinedLayerVariable") + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + # TODO(SigureMo): Add a more common way to check if a value is a paddle builtin layer. + if isinstance(value, paddle.nn.Layer): + # If there is a user-defined behavior, such as a container class layer + # or a hook on the layer, it needs to be converted to UserDefinedLayerVariable, + # otherwise converted to PaddleLayerVariable + if ( + hasattr(value, "_forward_pre_hooks") + and value._forward_pre_hooks + or hasattr(value, "_forward_post_hooks") + and value._forward_post_hooks + ): + return None + if value.__module__.startswith("paddle.nn."): + return PaddleLayerVariable(value, graph, tracker) + return None + + +class UserDefinedLayerVariable(LayerVariable): + """ + UserDefinedLayerVariable is a subclass of LayerVariable used to wrap a user-defined layer. + + Args: + layer (paddle.nn.Layer): The user-defined layer to be wrapped. + graph(FunctionGraph): The FunctionGraph object that this variable is associated with. + tracker(Tracker): The Tracker object that tracks the information of this variable. + """ + + def __init__( + self, layer: paddle.nn.Layer, graph: FunctionGraph, tracker: Tracker + ): + super().__init__(layer, graph, tracker) + + @property + def main_info(self) -> dict[str, Any]: + return { + "name": self.value.__class__.__name__, + } + + @VariableFactory.register_from_value(successor="PaddleApiVariable") + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + if isinstance(value, paddle.nn.Layer): + return UserDefinedLayerVariable(value, graph, tracker) + return None + + +class BuiltinVariable(FunctionVariable): + """ + BuiltinVariable is a subclass of FunctionVariable used to wrap a built-in function. + Args: + fn (Callable[..., Any]): The built-in function to be wrapped. + graph(FunctionGraph): The FunctionGraph object that this variable is associated with. + tracker(Tracker): The Tracker object that tracks the information of this variable. 
+ """ + + def __init__( + self, fn: Callable[..., Any], graph: FunctionGraph, tracker: Tracker + ): + super().__init__(fn, graph, tracker) + self.value = fn + + def call_function(self, /, *args, **kwargs): + # Lookup the handler from dispatcher + handler = Dispatcher.dispatch(self.value, *args, **kwargs) + if handler is not None: + return handler(*args, **kwargs) + + # Try to inline call the magic function + magic_methods = magic_method_builtin_dispatch(self.value) + for magic_method in magic_methods: + sorted_args = args + if magic_method.is_reverse: + sorted_args = sorted_args[::-1] + arg_type = sorted_args[0].get_py_type() + if hasattr(arg_type, magic_method.name): + class_fn = getattr(arg_type, magic_method.name) + class_var = VariableFactory.from_value( + arg_type, + self.graph, + GetAttrTracker(args[0], "__class__"), + ) + assert isinstance(class_var, VariableBase) + fn_var = VariableFactory.from_value( + class_fn, + self.graph, + GetAttrTracker(class_var, class_fn.__name__), + ) + assert isinstance(fn_var, VariableBase) + return fn_var(*args) + + # Break graph if neither of the above conditions is met + arg_types = ", ".join([type(arg).__name__ for arg in args]) + fn_name = ( + self.value.__name__ + if hasattr(self.value, '__name__') + else self.value + ) + raise BreakGraphError( + f"Not support builtin function: {fn_name} with args: Args({arg_types})" + ) + + @VariableFactory.register_from_value(successor="ClassVariable") + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + if is_builtin_fn(value): + return BuiltinVariable(value, graph, tracker) + return None + + @property + def main_info(self) -> dict[str, Any]: + return { + "name": self.value.__name__, + } + + +class UserDefinedGeneratorVariable(FunctionVariable): + """ + UserDefinedGeneratorVariable is a subclass of FunctionVariable used to wrap a user-defined generator. + Args: + fn (Callable[..., Any]): The user-defined generator to be wrapped. + graph(FunctionGraph): The FunctionGraph object that this variable is associated with. + tracker(Tracker): The Tracker object that tracks the information of this variable. 
+ """ + + def __init__( + self, fn: Callable[..., Any], graph: FunctionGraph, tracker: Tracker + ): + super().__init__(fn, graph, tracker) + + def call_function(self, /, *args, **kwargs): + iter_ = self.value(*args, **kwargs) + var = VariableFactory.from_value( + iter_, self.graph, DummyTracker([self]) + ) + return var + + @property + def main_info(self) -> dict[str, Any]: + return {"name": self.value.__name__} + + @VariableFactory.register_from_value( + successor="UserDefinedFunctionVariable" + ) + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + if inspect.isgeneratorfunction(value): + return UserDefinedGeneratorVariable(value, graph, tracker) + return None + + +class ClassVariable(CallableVariable): + def __init__(self, class_: type, graph: FunctionGraph, tracker: Tracker): + super().__init__(graph, tracker) + self.value = class_ + + def get_py_value(self, allow_tensor=False): + return self.value + + def call_function(self, /, *args, **kwargs): + new_object = self.value.__new__(self.value) + + # do not have init function + if self.value.__init__ is object.__init__: + return VariableFactory.from_value( + new_object, self.graph, DummyTracker([self]) + ) + + if not hasattr(self.value.__init__, "__code__"): + fn_var = BuiltinVariable( + self.value.__init__, + self.graph, + GetAttrTracker(self, "__init__"), + ) + else: + fn_var = UserDefinedFunctionVariable( + self.value.__init__, + self.graph, + GetAttrTracker(self, "__init__"), + ) + + # need classify variable type here? + new_object_variable = VariableFactory.from_value( + new_object, + self.graph, + DummyTracker([self] + list(args) + list(kwargs.values())), + ) + fn_var(new_object_variable, *args, **kwargs) + return new_object_variable + + make_stringify_guard = object_equal_stringify_guard + + @VariableFactory.register_from_value() + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + if inspect.isclass(value): + return ClassVariable(value, graph, tracker) + return None + + +class PaddleLayerClassVariable(ClassVariable): + def __init__(self, class_: type, graph: FunctionGraph, tracker: Tracker): + super().__init__(class_, graph, tracker) + + def call_function(self, /, *args, **kwargs): + input_py_args = [var.get_py_value() for var in args] + input_py_kwargs = {k: v.get_py_value() for k, v in kwargs.items()} + new_layer = self.value(*input_py_args, **input_py_kwargs) + return PaddleLayerVariable( + new_layer, self.graph, CreateLayerTracker(self, args, kwargs) + ) + + @VariableFactory.register_from_value(successor="ClassVariable") + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + if ( + inspect.isclass(value) + and issubclass(value, paddle.nn.Layer) + and value.__module__.startswith("paddle.nn.") + ): + return PaddleLayerClassVariable(value, graph, tracker) + return None diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/container.py b/python/paddle/jit/sot/opcode_translator/executor/variables/container.py new file mode 100644 index 00000000000000..b1c318e9187bd1 --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/executor/variables/container.py @@ -0,0 +1,1011 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/container.py b/python/paddle/jit/sot/opcode_translator/executor/variables/container.py
new file mode 100644
index 00000000000000..b1c318e9187bd1
--- /dev/null
+++ b/python/paddle/jit/sot/opcode_translator/executor/variables/container.py
@@ -0,0 +1,1011 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import operator
+from collections import OrderedDict
+from functools import reduce
+from typing import TYPE_CHECKING, Any
+
+from ....utils.exceptions import FallbackError, InnerError
+from ..dispatcher import Dispatcher
+from ..guard import StringifyExpression, check_guard
+from ..mutable_data import MutableDictLikeData, MutableListLikeData
+from ..pycode_generator import PyCodeGen
+from ..tracker import (
+    ConstTracker,
+    DanglingTracker,
+    DummyTracker,
+    GetItemTracker,
+    GetIterTracker,
+    Tracker,
+)
+from .base import ConstTypes, VariableBase, VariableFactory
+from .basic import ConstantVariable
+from .callable import BuiltinVariable, UserDefinedFunctionVariable
+
+if TYPE_CHECKING:
+    from ..function_graph import FunctionGraph
+
+
+class ContainerVariable(VariableBase):
+    """
+    ContainerVariable is a wrapper for container types, such as range, list, tuple, dict.
+    """
+
+    @property
+    def init_value(self):
+        return self.value
+
+    def get_items(self) -> list[VariableBase]:
+        raise FallbackError('ContainerVariable.get_items is not implemented')
+
+    def get_wrapped_items(self):
+        raise FallbackError(
+            "ContainerVariable.get_wrapped_items is not implemented"
+        )
+
+    def __len__(self):
+        raise FallbackError('ContainerVariable.__len__ is not implemented')
+
+    def len(self):
+        return ConstantVariable(len(self), self.graph, DummyTracker([self]))
+
+    def __bool__(self) -> bool:
+        return len(self) > 0
+
+    def bool(self):
+        return ConstantVariable(bool(self), self.graph, DummyTracker([self]))
+
+    @check_guard
+    def make_stringify_guard(self) -> list[StringifyExpression]:
+        frame_value_tracer = self.tracker.trace_value_from_frame()
+
+        type_guard = StringifyExpression(
+            f"isinstance({{}}, {self.get_py_type().__name__})",
+            [frame_value_tracer],
+            frame_value_tracer.free_vars,
+        )
+        len_guard = StringifyExpression(
+            f"len({{}}) == {len(self.init_value)}",
+            [frame_value_tracer],
+            frame_value_tracer.free_vars,
+        )
+        if isinstance(self, (ListVariable, TupleVariable)):
+            guard_variables = self.proxy.reproduce(0)
+
+        elif isinstance(self, DictVariable):
+            guard_variables = filter(
+                lambda var: not isinstance(var, MutableDictLikeData.Empty),
+                self.proxy.reproduce(0).values(),
+            )
+        else:
+            raise InnerError(f"Unsupported container type: {type(self)}")
+        return reduce(
+            operator.add,
+            [[type_guard, len_guard]]
+            + [item.make_stringify_guard() for item in guard_variables],
+        )
+
+
+class ListVariable(ContainerVariable):
+    """
+    ListVariable is a wrapper for list and contains common APIs for list methods.
+
+    Args:
+        val_list(List[VariableBase]): the list to wrap
+        graph(FunctionGraph): The FunctionGraph object that this variable is associated with.
+        tracker(Tracker): The Tracker object that tracks the information of this variable.
+ """ + + def __init__( + self, + val_list: list[VariableBase], + graph: FunctionGraph, + tracker: Tracker, + ): + super().__init__(graph, tracker) + + # everything in stack is VariableBase, so just accept the input list is ok + self.proxy = self.graph.side_effects.get_proxy( + MutableListLikeData, val_list, self.proxy_getter + ) + self.value = val_list + + def proxy_getter(self, proxy: MutableListLikeData, key: Any): + if key < 0 or key >= len(proxy.original_data): + return MutableListLikeData.Empty() + return VariableFactory.from_value( + proxy.original_data[key], + self.graph, + tracker=GetItemTracker(self, key, changed=proxy.has_changed), + ) + + def get_py_value(self, allow_tensor=False): + items = self.proxy.get_all() + return [item.get_py_value(allow_tensor) for item in items] + + def get_py_type(self): + return list + + def _reconstruct(self, codegen: PyCodeGen): + size = len(self) + for idx in range(size): + Dispatcher.call(operator.getitem, self, idx).reconstruct(codegen) + codegen.gen_build_list(size) + + def get_items(self): + size = len(self) + return [ + Dispatcher.call(operator.getitem, self, idx) for idx in range(size) + ] + + def get_wrapped_items(self): + return self.get_items() + + def get_iter(self): + from .iter import SequenceIterVariable + + return SequenceIterVariable(self, self.graph, GetIterTracker(self)) + + @property + def main_info(self) -> dict[str, Any]: + return { + "len": len(self), + } + + def __len__(self): + return self.proxy.length + + def getitem(self, key): + self.graph.add_global_guarded_variable(key) + key = key.get_py_value() + if isinstance(key, int): + res = self.proxy.get(key) + if self.proxy.is_empty(res): + raise InnerError(f"List {self} out of range (index={key})") + return res + elif isinstance(key, slice): + items = self.proxy.get_all() + return VariableFactory.from_value( + items[key], + self.graph, + tracker=GetItemTracker( + self, key, changed=self.proxy.has_changed + ), + ) + else: + raise InnerError( + f"Unsupported key type {key.__class__.__name__} for ListVariable" + ) + + def setitem(self, key, value): + if not isinstance(value, VariableBase): + raise InnerError( + f"[{self.__class__.__name__}]: received {value} to set value." + ) + if isinstance(key, int): + self.proxy.set(key, value) + elif isinstance(key, slice) and isinstance( + value, (ListVariable, TupleVariable) + ): + start, end, step = key.indices(self.proxy.length) + indices = list(range(start, end, step)) + if step == 1: + # replace a continuous range + for i, idx in enumerate(indices): + self.proxy.delete(idx - i) + for i, item in enumerate(value.get_wrapped_items()): + self.proxy.insert(start + i, item) + else: + # replace some elements + if len(indices) != len(value): + raise InnerError( + f"Attempt to replace {len(indices)} items with {len(value)}" + ) + for i, idx in enumerate(indices): + self.proxy.set(idx, value[i]) + else: + raise InnerError( + f"Unsupported key type {key.__class__.__name__} and value type {value.__class__.__name__} for ListVariable" + ) + + self.graph.side_effects.record_proxy_variable(self) + return ConstantVariable.wrap_literal(None, self.graph) + + def __delitem__(self, key): + return self.delitem(key) + + def delitem(self, key): + if isinstance(key, VariableBase): + raise InnerError( + f"[{self.__class__.__name__}]: received {key} as key to delete." 
+ ) + self.proxy.delete(key) + self.graph.side_effects.record_proxy_variable(self) + return ConstantVariable.wrap_literal(None, self.graph) + + def insert(self, index: int, value: VariableBase): + self.proxy.insert(index, value) + self.graph.side_effects.record_proxy_variable(self) + return ConstantVariable.wrap_literal(None, self.graph) + + def append(self, value: VariableBase): + self.insert(self.proxy.length, value) + self.graph.side_effects.record_proxy_variable(self) + return ConstantVariable.wrap_literal(None, self.graph) + + def extend(self, data): + for item in data.proxy.get_all(): + self.append(item) + self.graph.side_effects.record_proxy_variable(self) + return ConstantVariable.wrap_literal(None, self.graph) + + def concat(self, list_): + assert isinstance(list_, ListVariable) + return ListVariable( + self.proxy.get_all() + list_.proxy.get_all(), + self.graph, + DummyTracker([self, list_]), + ) + + def repeat(self, length): + assert isinstance(length, ConstantVariable) + return ListVariable( + self.proxy.get_all() * length.value, + self.graph, + DummyTracker([self, length]), + ) + + def pop(self, index: ConstantVariable | None = None): + if index is None: + index = ConstantVariable.wrap_literal(-1, self.graph) + res = self.proxy.get(index.get_py_value()) + self.proxy.delete(index.get_py_value()) + self.graph.side_effects.record_proxy_variable(self) + return res + + def copy(self): + return ListVariable( + self.proxy.get_all(), + self.graph, + DummyTracker([self]), + ) + + def clear(self): + for idx in range(self.proxy.length): + self.delitem(0) + self.graph.side_effects.record_proxy_variable(self) + return ConstantVariable.wrap_literal(None, self.graph) + + def remove(self, value): + for idx in range(self.proxy.length): + if self[idx].get_py_value(allow_tensor=True) == value.get_py_value( + allow_tensor=True + ): + self.delitem(idx) + break + else: + raise InnerError(f"List {self} does not contain {value}") + self.graph.side_effects.record_proxy_variable(self) + return ConstantVariable.wrap_literal(None, self.graph) + + def sort(self, key=None, reverse=None): + if ( + key is None + or isinstance(key, ConstantVariable) + and key.get_py_value() is None + ): + key = UserDefinedFunctionVariable( + lambda x: x, self.graph, DanglingTracker() + ) + assert key is not None + if reverse is None: + reverse = ConstantVariable.wrap_literal(False, self.graph) + + permutation = list(range(self.proxy.length)) + permutation.sort( + key=lambda x: key.get_py_value()( + Dispatcher.call(operator.getitem, self, x).value + ), + reverse=reverse.get_py_value(), + ) + self.proxy.permutate(permutation) + self.graph.side_effects.record_proxy_variable(self) + return ConstantVariable.wrap_literal(None, self.graph) + + def reverse(self): + permutation = list(range(self.proxy.length)) + permutation.reverse() + self.proxy.permutate(permutation) + self.graph.side_effects.record_proxy_variable(self) + return ConstantVariable.wrap_literal(None, self.graph) + + def count(self, value: VariableBase): + count: int = 0 + getitem = BuiltinVariable( + operator.getitem, self.graph, DanglingTracker() + ) + for index in range(len(self)): + index_value = getitem(self, index) + if index_value.id == value.id: + count += 1 + continue + eq = BuiltinVariable(operator.eq, self.graph, DanglingTracker())( + index_value, value + ) + eq_bool = BuiltinVariable(bool, self.graph, DanglingTracker())(eq) + assert isinstance( + eq_bool, ConstantVariable + ), "bool should return ConstantVariable" + if eq.get_py_value() is True: + count += 1 
+                continue
+
+        return ConstantVariable(count, self.graph, DummyTracker([self, value]))
+
+    def index(self, value: VariableBase):
+        res = 0
+        getitem = BuiltinVariable(
+            operator.getitem, self.graph, DanglingTracker()
+        )
+        for index in range(len(self)):
+            index_value = getitem(self, index)
+            if index_value.id == value.id:
+                return ConstantVariable(
+                    res, self.graph, DummyTracker([self, value])
+                )
+            eq = BuiltinVariable(operator.eq, self.graph, DanglingTracker())(
+                index_value, value
+            )
+            eq_bool = BuiltinVariable(bool, self.graph, DanglingTracker())(eq)
+            assert isinstance(
+                eq_bool, ConstantVariable
+            ), "bool should return ConstantVariable"
+            if eq.get_py_value() is True:
+                return ConstantVariable(
+                    res, self.graph, DummyTracker([self, value])
+                )
+            res += 1
+
+        return ConstantVariable(-1, self.graph, DummyTracker([self, value]))
+
+    def max(self):
+        if len(self) == 0:
+            raise ValueError("max() arg is an empty sequence")
+        res = self[0]
+        getitem = BuiltinVariable(
+            operator.getitem, self.graph, DanglingTracker()
+        )
+        for index in range(len(self)):
+            index_value = getitem(self, index)
+            gt = BuiltinVariable(operator.gt, self.graph, DanglingTracker())(
+                index_value, res
+            )
+            if gt.get_py_value() is True:
+                res = index_value
+        return res
+
+    def min(self):
+        if len(self) == 0:
+            raise ValueError("min() arg is an empty sequence")
+        res = self[0]
+        getitem = BuiltinVariable(
+            operator.getitem, self.graph, DanglingTracker()
+        )
+        for index in range(len(self)):
+            index_value = getitem(self, index)
+            lt = BuiltinVariable(operator.lt, self.graph, DanglingTracker())(
+                index_value, res
+            )
+            if lt.get_py_value() is True:
+                res = index_value
+        return res
+
+    def getattr(self, name: str, default=None):
+        from .callable import BuiltinVariable
+
+        if default is not None:
+            raise FallbackError(
+                "default argument for getattr is not implemented"
+            )
+
+        method_name_to_builtin_fn = {
+            "insert": list.insert,
+            "append": list.append,
+            "extend": list.extend,
+            "pop": list.pop,
+            "copy": list.copy,
+            "clear": list.clear,
+            "remove": list.remove,
+            "sort": list.sort,
+            "reverse": list.reverse,
+            "count": list.count,
+            "index": list.index,
+        }
+
+        if name in method_name_to_builtin_fn:
+            builtin_fn = method_name_to_builtin_fn[name]
+            return BuiltinVariable(
+                builtin_fn, self.graph, DanglingTracker()
+            ).bind(self, name)
+        else:
+            raise FallbackError(f"attribute {name} for list is not implemented")
+
+    @VariableFactory.register_from_value()
+    def from_value(value: Any, graph: FunctionGraph, tracker: Tracker):
+        # Note(SigureMo): Why not use isinstance?
+        # Because a user may define a class that inherits from list.
+        # We should convert it to ObjectVariable instead of ListVariable.
+        if type(value) is list:  # noqa: E721
+            return ListVariable(value, graph=graph, tracker=tracker)
+        return None
+
+
+class TupleVariable(ContainerVariable):
+    """
+    TupleVariable is a wrapper for tuple and contains common APIs for tuple methods.
+
+    Args:
+        val_tuple(tuple[VariableBase, ...]): the tuple to wrap
+        graph(FunctionGraph): The FunctionGraph object that this variable is associated with.
+        tracker(Tracker): The Tracker object that tracks the information of this variable.
+ """ + + def __init__( + self, + val_tuple: tuple[VariableBase, ...], + graph: FunctionGraph, + tracker: Tracker, + ): + super().__init__(graph, tracker) + + self.proxy = self.graph.side_effects.get_proxy( + MutableListLikeData, list(val_tuple), self.proxy_getter + ) + self.value = val_tuple + + def getattr(self, name: str, default=None): + from .callable import BuiltinVariable + + if default is not None: + raise FallbackError( + "default argument for getattr is not implemented" + ) + + method_name_to_builtin_fn = { + "count": tuple.count, + "index": tuple.index, + } + if name in method_name_to_builtin_fn: + builtin_fn = method_name_to_builtin_fn[name] + return BuiltinVariable( + builtin_fn, self.graph, DanglingTracker() + ).bind(self, name) + else: + raise FallbackError( + f"attribute {name} for tuple is not implemented" + ) + + def proxy_getter(self, proxy: MutableListLikeData, key: Any): + if key < 0 or key >= len(proxy.original_data): + return MutableListLikeData.Empty() + return VariableFactory.from_value( + proxy.original_data[key], + self.graph, + tracker=GetItemTracker(self, key, changed=False), + ) + + def get_py_value(self, allow_tensor=False): + return tuple( + self[idx].get_py_value(allow_tensor) for idx in range(len(self)) + ) + + def get_py_type(self): + return tuple + + def _reconstruct(self, codegen: PyCodeGen): + size = len(self) + for idx in range(size): + Dispatcher.call(operator.getitem, self, idx).reconstruct(codegen) + codegen.gen_build_tuple(size) + + def get_items(self): + size = len(self) + return [ + Dispatcher.call(operator.getitem, self, idx) for idx in range(size) + ] + + def get_wrapped_items(self): + return tuple(self.get_items()) + + def get_iter(self): + from .iter import SequenceIterVariable + + return SequenceIterVariable(self, self.graph, GetIterTracker(self)) + + @property + def main_info(self) -> dict[str, Any]: + return { + "len": len(self), + } + + def __len__(self): + return self.proxy.length + + def getitem(self, key): + self.graph.add_global_guarded_variable(key) + key = key.get_py_value() + if isinstance(key, int): + res = self.proxy.get(key) + if self.proxy.is_empty(res): + raise InnerError(f"List {self} out of range (index={key})") + return res + elif isinstance(key, slice): + return TupleVariable( + tuple(self.proxy.get_all())[key], + self.graph, + tracker=GetItemTracker(self, key, changed=False), + ) + else: + raise InnerError( + f"Unsupported key type {key.__class__.__name__} for TupleVariable" + ) + + def setitem(self, key, value): + raise InnerError( + f"[{self.__class__.__name__}]: setitem is not allowed." + ) + + def __delitem__(self, key): + return self.delitem(key) + + def delitem(self, key): + raise InnerError( + f"[{self.__class__.__name__}]: delitem is not allowed." 
+ ) + + def concat(self, tuple_): + assert isinstance(tuple_, TupleVariable) + new_tuple_variable = TupleVariable( + tuple(self.proxy.get_all() + tuple_.proxy.get_all()), + self.graph, + DummyTracker([self, tuple_]), + ) + return new_tuple_variable + + def repeat(self, length): + assert isinstance(length, ConstantVariable) + new_tuple_variable = TupleVariable( + tuple(self.proxy.get_all()) * length.value, + self.graph, + DummyTracker([self, length]), + ) + return new_tuple_variable + + def count(self, value: VariableBase): + count: int = 0 + getitem = BuiltinVariable( + operator.getitem, self.graph, DanglingTracker() + ) + for index in range(len(self)): + index_value = getitem(self, index) + if index_value.id == value.id: + count += 1 + continue + eq = BuiltinVariable(operator.eq, self.graph, DanglingTracker())( + index_value, value + ) + eq_bool = BuiltinVariable(bool, self.graph, DanglingTracker())(eq) + assert isinstance( + eq_bool, ConstantVariable + ), "bool should return ConstantVariable" + if eq.get_py_value() is True: + count += 1 + continue + + return ConstantVariable(count, self.graph, DummyTracker([self, value])) + + def index(self, value: VariableBase): + res = 0 + getitem = BuiltinVariable( + operator.getitem, self.graph, DanglingTracker() + ) + for index in range(len(self)): + index_value = getitem(self, index) + if index_value.id == value.id: + return ConstantVariable( + res, self.graph, DummyTracker([self, value]) + ) + eq = BuiltinVariable(operator.eq, self.graph, DanglingTracker())( + index_value, value + ) + eq_bool = BuiltinVariable(bool, self.graph, DanglingTracker())(eq) + assert isinstance( + eq_bool, ConstantVariable + ), "bool should return ConstantVariable" + if eq.get_py_value() is True: + return ConstantVariable( + res, self.graph, DummyTracker([self, value]) + ) + res += 1 + + return ConstantVariable(-1, self.graph, DummyTracker([self, value])) + + @VariableFactory.register_from_value() + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + if type(value) is tuple: + return TupleVariable(value, graph, tracker) + return None + + +class RangeVariable(ContainerVariable): + """ + RangeVariable is a wrapper for range. + + Args: + val_range(range): the range to wrap + graph(FunctionGraph): The FunctionGraph object that this variable is associated with. + tracker(Tracker): The Tracker object that tracks the information of this variable. 
+ """ + + def __init__( + self, + val_range: range, + graph: FunctionGraph, + tracker: Tracker, + ): + super().__init__(graph, tracker) + self.value = val_range + + def get_py_type(self): + return range + + def get_py_value(self, allow_tensor=False): + return self.value + + def getitem(self, key): + self.graph.add_global_guarded_variable(self) + self.graph.add_global_guarded_variable(key) + key = key.get_py_value() + retval = self.value[key] + return ConstantVariable.wrap_literal(retval, self.graph) + + def get_items(self): + size = len(self) + return [self[idx] for idx in range(size)] + + def get_wrapped_items(self): + return self.get_items() + + def get_iter(self): + from .iter import SequenceIterVariable + + return SequenceIterVariable(self, self.graph, GetIterTracker(self)) + + def __len__(self): + return len(self.value) + + def _reconstruct(self, codegen: PyCodeGen): + codegen.gen_load_global("range", push_null=True) + # The start default value is 0, step is 1 + # So we can always construct range with 3 args + codegen.gen_load_const(self.value.start) + codegen.gen_load_const(self.value.stop) + codegen.gen_load_const(self.value.step) + codegen.gen_call_function(3) + + @VariableFactory.register_from_value() + def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): + if type(value) is range: + return RangeVariable(value, graph, tracker) + return None + + @check_guard + def make_stringify_guard(self) -> list[StringifyExpression]: + frame_value_tracer = self.tracker.trace_value_from_frame() + + return [ + StringifyExpression( + "isinstance({0}, range) and " + + f"{{0}}.start == {self.init_value.start} and " + + f"{{0}}.stop == {self.init_value.stop} and " + + f"{{0}}.step == {self.init_value.step}", + [frame_value_tracer], + frame_value_tracer.free_vars, + ) + ] + + @property + def debug_name(self) -> str: + return ":".join( + [ + str(self.value.start) if self.value.start is not None else "", + str(self.value.stop) if self.value.stop is not None else "", + str(self.value.step) if self.value.step is not None else "", + ] + ) + + @debug_name.setter + def debug_name(self, name): + pass + + @property + def main_info(self) -> dict[str, Any]: + return {"value": self.value} + + +class DictVariable(ContainerVariable): + """ + DictVariable is a wrapper for dict and contains common APIs for dict methods + + Args: + val_dict(dict[object, VariableBase]): the dict to wrap + graph(FunctionGraph): The FunctionGraph object that this variable is associated with. + tracker(Tracker): The Tracker object that tracks the information of this variable. 
+ """ + + def __init__( + self, + val_dict: dict[object, VariableBase], + graph: FunctionGraph, + tracker: Tracker, + ): + super().__init__(graph, tracker) + + self.proxy = self.graph.side_effects.get_proxy( + MutableDictLikeData, val_dict, self.proxy_getter + ) + self.value = val_dict + + def proxy_getter(self, proxy: MutableDictLikeData, key: Any): + if key not in proxy.original_data: + return MutableDictLikeData.Empty() + return VariableFactory.from_value( + proxy.original_data[key], + self.graph, + tracker=GetItemTracker(self, key, changed=proxy.has_changed), + ) + + def get_py_value(self, allow_tensor=False): + return { + key: value.get_py_value(allow_tensor) + for key, value in self.proxy.get_all().items() + } + + def get_py_type(self): + return dict + + def _reconstruct(self, codegen: PyCodeGen): + from .basic import ConstantVariable + + size = len(self) + for key in self.proxy.get_all().keys(): + if not isinstance(key, ConstTypes): + raise InnerError( + f"[{self.__class__.__name__}]: recieved {key} as key." + ) + key_var = ConstantVariable.wrap_literal(key, self.graph) + value_var = self[key] + key_var.reconstruct(codegen) + value_var.reconstruct(codegen) + codegen.gen_build_map(size) + + def get_items(self): + items = [] + for key in self.proxy.get_all().keys(): + if not isinstance(key, ConstTypes): + raise InnerError( + f"[{self.__class__.__name__}]: recieved {key} as key." + ) + key_var = VariableFactory.from_value( + key, self.graph, tracker=ConstTracker(key) + ) + value_var = self[key] + items.extend([key_var, value_var]) + return items + + def get_wrapped_items(self): + items = {} + for key in self.proxy.get_all().keys(): + if not isinstance(key, ConstTypes): + raise InnerError( + f"[{self.__class__.__name__}]: recieved {key} as key." + ) + items[key] = self[key] + return items + + def get_iter(self): + return self.keys() + + @property + def main_info(self) -> dict[str, Any]: + return { + "len": len(self), + } + + def __len__(self): + return len(self.proxy.get_all()) + + def get(self, key, default=None): + if isinstance(key, VariableBase): + raise InnerError( + f"[{self.__class__.__name__}]: recieved {key} to get value." + ) + + if default is None: + return Dispatcher.call(operator.getitem, self, key) + + if isinstance(self.proxy.get(key), MutableDictLikeData.Empty): + assert isinstance(default, VariableBase) + return default + + return Dispatcher.call(operator.getitem, self, key) + + def getitem(self, key): + self.graph.add_global_guarded_variable(key) + key = key.get_py_value() + return self.proxy.get(key) + + def setitem(self, key, value): + if isinstance(key, VariableBase): + raise InnerError( + f"[{self.__class__.__name__}]: recieved {key} as key." + ) + + if not isinstance(value, VariableBase): + raise InnerError( + f"[{self.__class__.__name__}]: recieved {value} to set value." + ) + + self.proxy.set(key, value) + self.graph.side_effects.record_proxy_variable(self) + + return ConstantVariable.wrap_literal(None, self.graph) + + def clear(self): + # TODO: Replace with self.proxy.clear() + for key in self.value: + self.delitem(key) + self.graph.side_effects.record_proxy_variable(self) + return ConstantVariable.wrap_literal(None, self.graph) + + def __delitem__(self, key): + return self.delitem(key) + + def delitem(self, key): + if isinstance(key, VariableBase): + raise InnerError( + f"[{self.__class__.__name__}]: recieved {key} as key to delete." 
+            )
+        self.proxy.delete(key)
+        self.graph.side_effects.record_proxy_variable(self)
+        return ConstantVariable.wrap_literal(None, self.graph)
+
+    def keys(self):
+        from .iter import SequenceIterVariable
+
+        raw_list = [
+            ConstantVariable(x, self.graph, ConstTracker(x))
+            for x in self.proxy.get_all().keys()
+        ]
+        key_list = ListVariable(raw_list, self.graph, DummyTracker(raw_list))
+        assert key_list is not None
+        return SequenceIterVariable(
+            key_list, self.graph, DummyTracker([key_list])
+        )
+
+    def values(self):
+        from .iter import SequenceIterVariable
+
+        raw_list = list(self.get_wrapped_items().values())
+        value_list = ListVariable(raw_list, self.graph, DummyTracker([self]))
+        assert value_list is not None
+        return SequenceIterVariable(
+            value_list, self.graph, DummyTracker([value_list])
+        )
+
+    def items(self):
+        from .iter import SequenceIterVariable
+
+        keys = [
+            ConstantVariable(x, self.graph, ConstTracker(x))
+            for x in self.proxy.get_all().keys()
+        ]
+        values = list(self.get_wrapped_items().values())
+        raw_list = list(zip(keys, values))
+        item_list = ListVariable(raw_list, self.graph, DummyTracker([self]))
+        assert item_list is not None
+        return SequenceIterVariable(
+            item_list, self.graph, DummyTracker([item_list])
+        )
+
+    def update(self, data: DictVariable):
+        for key, value in data.proxy.get_all().items():
+            self.setitem(key, value)
+        return ConstantVariable.wrap_literal(None, self.graph)
+
+    def copy(self):
+        new_dict_variable = DictVariable(
+            self.get_wrapped_items(), self.graph, DummyTracker([self])
+        )
+        return new_dict_variable
+
+    def setdefault(self, key, default=None):
+        if isinstance(self.proxy.get(key), MutableDictLikeData.Empty):
+            if default is None:
+                self.setitem(
+                    key, ConstantVariable.wrap_literal(default, self.graph)
+                )
+            else:
+                self.setitem(key, default)
+
+        return Dispatcher.call(operator.getitem, self, key)
+
+    def pop(self, key, default=None):
+        if isinstance(self.proxy.get(key), MutableDictLikeData.Empty):
+            assert isinstance(default, VariableBase)
+            return default
+
+        # default is not None, or key is in dict
+        temp_value = Dispatcher.call(operator.getitem, self, key)
+        self.delitem(key)
+        return temp_value
+
+    def popitem(self):
+        key = self.keys().hold.get_py_value()[-1]
+        value = Dispatcher.call(operator.getitem, self, key)
+        # TODO: key and value should be VariableBase, but key may be an int
+        # assert isinstance(key, VariableBase), key
+        # assert isinstance(value, VariableBase), value
+        new_tuple_variable = TupleVariable(
+            (key, value), self.graph, DummyTracker([self])
+        )
+        self.delitem(key)
+        return new_tuple_variable
+
+    def getattr(self, name: str, default=None):
+        from .callable import BuiltinVariable
+
+        if default is not None:
+            raise FallbackError(
+                "default argument for getattr is not implemented"
+            )
+
+        method_name_to_builtin_fn = {
+            "keys": dict.keys,
+            "values": dict.values,
+            "items": dict.items,
+            "update": dict.update,
+            "setdefault": dict.setdefault,
+            "get": dict.get,
+            "copy": dict.copy,
+            "clear": dict.clear,
+            "pop": dict.pop,
+            "popitem": dict.popitem,
+        }
+
+        if name in method_name_to_builtin_fn:
+            builtin_fn = method_name_to_builtin_fn[name]
+            return BuiltinVariable(
+                builtin_fn, self.graph, DanglingTracker()
+            ).bind(self, name)
+        else:
+            raise FallbackError(f"attribute {name} for dict is not implemented")
+
+    @VariableFactory.register_from_value()
+    def from_value(value: Any, graph: FunctionGraph, tracker: Tracker):
+        if type(value) in (dict, OrderedDict):
+            return DictVariable(value, graph=graph, tracker=tracker)
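All of these container variables funnel mutations through a `MutableListLikeData`/`MutableDictLikeData` proxy and call `record_proxy_variable`, so writes are recorded as side effects rather than applied to the traced objects immediately. A minimal sketch of that record-and-replay idea, using a hypothetical `RecordingDict` that is only an illustration, not an API from this diff:

```python
# Sketch: record mutations against a snapshot instead of applying them
# immediately, then replay them once tracing finishes.
class RecordingDict:
    def __init__(self, original: dict):
        self.original = original           # never mutated during tracing
        self.ops = []                      # recorded side effects

    def set(self, key, value):
        self.ops.append(("set", key, value))

    def delete(self, key):
        self.ops.append(("del", key))

    def apply(self):
        # Replay the recorded side effects onto the real object.
        for op, key, *rest in self.ops:
            if op == "set":
                self.original[key] = rest[0]
            else:
                del self.original[key]

d = {"a": 1}
proxy = RecordingDict(d)
proxy.set("b", 2)
proxy.delete("a")
assert d == {"a": 1}   # nothing applied yet
proxy.apply()
assert d == {"b": 2}   # side effects replayed at graph exit
```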
diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/iter.py b/python/paddle/jit/sot/opcode_translator/executor/variables/iter.py
new file mode 100644
index 00000000000000..82ff8fe2534a74
--- /dev/null
+++ b/python/paddle/jit/sot/opcode_translator/executor/variables/iter.py
@@ -0,0 +1,203 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from ....utils import BreakGraphError, FallbackError
+from ..pycode_generator import PyCodeGen
+from ..tracker import ConstTracker, DummyTracker
+from .base import VariableBase
+from .basic import ConstantVariable
+from .container import ContainerVariable, TupleVariable
+
+if TYPE_CHECKING:
+    from ..function_graph import FunctionGraph
+    from ..tracker import Tracker
+
+
+class IterVariable(VariableBase):
+    """
+    This variable (including its subclasses) should be generated only when simulating the GET_ITER opcode.
+    """
+
+    def __init__(
+        self, obj: VariableBase, graph: FunctionGraph, tracker: Tracker
+    ):
+        super().__init__(graph, tracker)
+        self.hold = obj
+
+    def make_stringify_guard(self):
+        return self.hold.make_stringify_guard()
+
+    def next(self):
+        raise NotImplementedError(f"Can not simulate `next` for {type(self)}")
+
+    def get_iter(self):
+        return self
+
+    def get_hold(self):
+        return self.hold
+
+
+class SequenceIterVariable(IterVariable):
+    """
+    The basic SequenceIterVariable wraps iterators which can be simulated by calling getitem.
+    Currently includes: List | Tuple | Dict (keys) | Range | Tensor | nn.LayerList
+    """
+
+    mutable_attrs = ["idx"]
+
+    def __init__(self, obj, graph: FunctionGraph, tracker: Tracker):
+        super().__init__(obj, graph, tracker)
+        self.idx = 0
+        self.graph.side_effects.record_mutable_variable(self)
+
+    def next(self):
+        # TODO: self.hold should have a __len__ method
+        if self.idx < len(self.hold):
+            val = self.hold[self.idx]
+            self.idx += 1
+            return val
+        else:
+            raise StopIteration()
+
+    def to_list(self) -> list:
+        if self.has_side_effect():
+            raise FallbackError("Cannot convert a used iterator into a list")
+        self.idx = len(self.hold)
+        retval = []
+        for i in range(len(self.hold)):
+            retval.append(self.hold[i])
+        return retval
+
+    def has_side_effect(self) -> bool:
+        return self.idx != 0
+
+    @property
+    def main_info(self) -> dict[str, Any]:
+        return {
+            "idx": self.idx,
+        }
+
+    def _reconstruct(self, codegen: PyCodeGen):
+        if self.has_side_effect():
+            super()._reconstruct(codegen)
+        else:
+            self.hold.reconstruct(codegen)
+            codegen.gen_get_iter()
+
+
+class EnumerateVariable(SequenceIterVariable):
+    """
+    EnumerateVariable holds a SequenceIterVariable and returns an additional index.
+    """
+
+    def __init__(self, val_iterator, graph, tracker):
+        super().__init__(val_iterator, graph, tracker)
+
+    def next(self):
+        val = self.hold.next()
+        idx_var = ConstantVariable(self.idx, self.graph, ConstTracker(self.idx))
+        self.idx += 1
+        return TupleVariable(
+            (idx_var, val), self.graph, DummyTracker([idx_var, val])
+        )
+
+    def to_list(self):
+        values = self.hold.to_list()
+        idx = [
+            ConstantVariable(i, self.graph, ConstTracker(i))
+            for i in range(len(values))
+        ]
+        return list(zip(idx, values))
+
+    def has_side_effect(self) -> bool:
+        return self.hold.has_side_effect() or self.idx != 0
+
+    def _reconstruct(self, codegen: PyCodeGen):
+        if self.has_side_effect():
+            super()._reconstruct(codegen)
+        else:
+            codegen.gen_load_global("enumerate", push_null=True)
+            self.hold.reconstruct(codegen)
+            codegen.gen_call_function(1)
+
+    def get_hold(self):
+        return self.hold.get_hold()
+
+    @staticmethod
+    def from_iterator(value, graph: FunctionGraph | None, tracker: Tracker):
+        iter_variable = value.get_iter()
+        if isinstance(iter_variable, SequenceIterVariable):
+            return EnumerateVariable(iter_variable, graph, tracker)
+        else:
+            return UserDefinedIterVariable(value, graph, tracker)
+
+
+class MapVariable(SequenceIterVariable):
+    """
+    MapVariable holds a SequenceIterVariable and returns an iterable variable after applying the map function.
+    """
+
+    def __init__(self, func, val_iterator, graph, tracker):
+        super().__init__(val_iterator, graph, tracker)
+        self.func = func
+
+    def next(self):
+        return self.func(self.hold.next())
+
+    def to_list(self) -> list:
+        retval = []
+        while True:
+            try:
+                retval.append(self.func(self.hold.next()))
+            except StopIteration:
+                break
+        return retval
+
+    def has_side_effect(self) -> bool:
+        return self.hold.has_side_effect()
+
+    def _reconstruct(self, codegen: PyCodeGen):
+        if self.has_side_effect():
+            super()._reconstruct(codegen)
+        else:
+            codegen.gen_load_global("map", push_null=True)
+            self.func.reconstruct(codegen)
+            self.hold.reconstruct(codegen)
+            codegen.gen_call_function(2)
+
+    @staticmethod
+    def from_iterator(
+        func, value, graph: FunctionGraph | None, tracker: Tracker
+    ):
+        iter_variable = (
+            value.get_iter() if isinstance(value, ContainerVariable) else value
+        )
+
+        if isinstance(iter_variable, IterVariable):
+            return MapVariable(func, iter_variable, graph, tracker)
+        else:
+            return UserDefinedIterVariable(value, graph, tracker)
+
+
+# What UserDefinedIterVariable holds doesn't matter, because using a
+# user-defined iterator will trigger a graph break.
+class UserDefinedIterVariable(IterVariable):
+    def __init__(self, obj, graph, tracker):
+        super().__init__(obj, graph, tracker)
+
+    def next(self):
+        raise BreakGraphError("Break graph when using user defined iterator")
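The iterator variables above simulate `iter`/`next` with nothing more than an integer index over a getitem-able sequence, which is what makes them guardable and reconstructable. A rough standalone sketch of the same idea, using a hypothetical `SimEnumerate` class that is only an illustration, not part of the diff:

```python
# Sketch: simulating enumerate() with an explicit index over a sequence,
# as SequenceIterVariable/EnumerateVariable do.
class SimEnumerate:
    def __init__(self, seq):
        self.seq = seq    # must support len() and indexing (getitem)
        self.idx = 0      # the only mutable state; easy to guard and replay

    def next(self):
        if self.idx >= len(self.seq):
            raise StopIteration
        item = (self.idx, self.seq[self.idx])
        self.idx += 1
        return item

    def has_side_effect(self):
        # Once advanced, the iterator can no longer be reconstructed by
        # simply re-emitting `enumerate(seq)` in the generated bytecode.
        return self.idx != 0

it = SimEnumerate(["a", "b"])
assert it.next() == (0, "a")
assert it.has_side_effect()
```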
diff --git a/python/paddle/jit/sot/opcode_translator/instruction_utils/__init__.py b/python/paddle/jit/sot/opcode_translator/instruction_utils/__init__.py
new file mode 100644
index 00000000000000..5fc71359e93868
--- /dev/null
+++ b/python/paddle/jit/sot/opcode_translator/instruction_utils/__init__.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .instruction_utils import (  # noqa: F401
+    Instruction,
+    calc_offset_from_bytecode_offset,
+    calc_stack_effect,
+    convert_instruction,
+    gen_instr,
+    get_instructions,
+    instrs_info,
+    modify_extended_args,
+    modify_instrs,
+    modify_vars,
+    relocate_jump_target,
+    replace_instr,
+    reset_offset,
+)
+from .opcode_analysis import (  # noqa: F401
+    Space,
+    analysis_inputs,
+    analysis_used_names_with_space,
+)
diff --git a/python/paddle/jit/sot/opcode_translator/instruction_utils/instruction_utils.py b/python/paddle/jit/sot/opcode_translator/instruction_utils/instruction_utils.py
new file mode 100644
index 00000000000000..182ba54279eeff
--- /dev/null
+++ b/python/paddle/jit/sot/opcode_translator/instruction_utils/instruction_utils.py
@@ -0,0 +1,407 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import dataclasses
+import dis
+import sys
+from typing import TYPE_CHECKING, Any
+
+from ...utils import InnerError
+from .opcode_info import ABS_JUMP, ALL_JUMP, REL_BWD_JUMP, REL_JUMP
+
+if TYPE_CHECKING:
+    import types
+
+
+@dataclasses.dataclass
+class Instruction:
+    opcode: int
+    opname: str
+    arg: int | None
+    argval: Any
+    offset: int | None = None
+    starts_line: int | None = None
+    is_jump_target: bool = False
+    jump_to: Instruction | None = None
+    is_generated: bool = True
+
+    # for analysing EXTENDED_ARG
+    first_ex_arg: Instruction | None = None
+    ex_arg_for: Instruction | None = None
+
+    # used in modify_extended_args
+    def __hash__(self):
+        return id(self)
+
+
+def gen_instr(name, arg=None, argval=None, gened=True, jump_to=None):
+    return Instruction(
+        opcode=dis.opmap[name],
+        opname=name,
+        arg=arg,
+        argval=argval,
+        is_generated=gened,
+        jump_to=jump_to,
+    )
+
+
+def convert_instruction(instr: dis.Instruction) -> Instruction:
+    """
+    Converts a disassembled instruction to a customized Instruction object.
+
+    Args:
+        instr (dis.Instruction): The disassembled instruction.
+
+    Returns:
+        Instruction: A customized Instruction object.
+    """
+    return Instruction(
+        instr.opcode,
+        instr.opname,
+        instr.arg,
+        instr.argval,
+        instr.offset,
+        instr.starts_line,
+        instr.is_jump_target,
+        jump_to=None,
+        is_generated=False,
+    )
+
+
+def get_instructions(code: types.CodeType) -> list[Instruction]:
+    """
+    Returns the parsed instructions from the given code object, excluding
+    any `EXTENDED_ARG` opcodes.
+
+    Args:
+        code (types.CodeType): The code object to extract instructions from.
+
+    Returns:
+        list[Instruction]: A list of Instruction objects representing the
+        bytecode instructions in the code object.
+ """ + # instrs do not contain EXTENDED_ARG + instrs = list(map(convert_instruction, dis.get_instructions(code))) + for instr in instrs: + if instr.opname in ALL_JUMP: + origin_jump_target = calc_offset_from_bytecode_offset( + instr.argval, instrs + ) + jump_offset = origin_jump_target + + while instrs[jump_offset].opname == "EXTENDED_ARG": + jump_offset += 1 + + if origin_jump_target != jump_offset: + # copy infos from EXETENDED_ARG to other opcode + + if instrs[origin_jump_target].is_jump_target: + instrs[jump_offset].is_jump_target = instrs[ + origin_jump_target + ].is_jump_target + if instrs[origin_jump_target].starts_line: + instrs[jump_offset].starts_line = instrs[ + origin_jump_target + ].starts_line + + instr.jump_to = instrs[jump_offset] + + # if the origin opcode contains EXTENDED_ARG, it should be like: + # >> EXTENDED_ARG 1 + # XX 388 <- 256 + 132 + # filter all EXTENDED_ARG here + instrs = [x for x in instrs if x.opname != "EXTENDED_ARG"] + return instrs + + +def modify_instrs(instructions: list[Instruction]) -> None: + """ + Modifies the given list of instructions. It contains three steps: + + 1. reset offset + 2. relocate jump target + 3. add EXTENDED_ARG instruction if needed + + Args: + instructions (list): The list of Instruction objects representing bytecode instructions. + + Returns: + None + """ + modify_completed = False + while not modify_completed: + reset_offset(instructions) + relocate_jump_target(instructions) + modify_completed = modify_extended_args(instructions) + + +def reset_offset(instructions: list[Instruction]) -> None: + """ + Resets the offset for each instruction in the list. + + Args: + instructions (list): The list of Instruction objects representing bytecode instructions. + + Returns: + None + """ + from ..executor.pycode_generator import get_instruction_size + + if sys.version_info >= (3, 11): + current_offset = 0 + for instr in instructions: + instr.offset = current_offset + current_offset += get_instruction_size(instr) + return + for idx, instr in enumerate(instructions): + instr.offset = idx * 2 + + +def correct_jump_direction(instr: Instruction, arg: int) -> Instruction: + """ + Corrects the jump direction of the given instruction. + NOTE(zrr1999): In Python 3.11, JUMP_ABSOLUTE is removed, so python generates JUMP_FORWARD or JUMP_BACKWARD instead, + but in for loop breakgraph, we reuse JUMP_BACKWARD to jump forward, so we need to change it to JUMP_FORWARD. + + Args: + instr (Instruction): The instruction to be corrected. + """ + if instr.opname in ABS_JUMP: + instr.arg = arg + return instr + elif instr.opname in REL_JUMP: + if arg < 0: + if instr.opname in REL_BWD_JUMP: + forward_op_name = instr.opname.replace("BACKWARD", "FORWARD") + if forward_op_name not in dis.opmap: + raise InnerError(f"Unknown jump type {instr.opname}") + instr.opname = forward_op_name + instr.opcode = dis.opmap[forward_op_name] + else: # instr.opname in REL_FWD_JUMP + backward_op_name = instr.opname.replace("FORWARD", "BACKWARD") + if backward_op_name not in dis.opmap: + raise InnerError(f"Unknown jump type {instr.opname}") + instr.opname = backward_op_name + instr.opcode = dis.opmap[backward_op_name] + instr.arg = -arg + else: + instr.arg = arg + return instr + else: + raise ValueError(f"unknown jump type: {instr.opname}") + + +def relocate_jump_target(instructions: list[Instruction]) -> None: + """ + If a jump instruction is found, this function will adjust the jump targets based on the presence of EXTENDED_ARG instructions. 
+    If an EXTENDED_ARG instruction exists for the jump target, use its offset as the new target.
+
+    Args:
+        instructions (list): The list of Instruction objects representing bytecode instructions.
+
+    Returns:
+        None
+    """
+    extended_arg = []
+    for instr in instructions:
+        if instr.opname == "EXTENDED_ARG":
+            extended_arg.append(instr)
+            continue
+
+        if instr.opname in ALL_JUMP:
+            assert instr.jump_to is not None
+            assert instr.offset is not None
+            # if the jump target has EXTENDED_ARG, jump to the first EXTENDED_ARG opcode
+            jump_target = (
+                instr.jump_to.offset
+                if instr.jump_to.first_ex_arg is None
+                else instr.jump_to.first_ex_arg.offset
+            )
+            assert jump_target is not None
+
+            if instr.opname in ABS_JUMP:
+                new_arg = jump_target
+            else:  # instr.opname in REL_JUMP
+                new_arg = jump_target - instr.offset - 2
+                if instr.opname in REL_BWD_JUMP:
+                    new_arg = -new_arg
+
+            if sys.version_info >= (3, 10):
+                new_arg //= 2
+            correct_jump_direction(instr, new_arg)
+            assert instr.arg is not None
+            if extended_arg:
+                instr.arg &= 0xFF
+                new_arg = new_arg >> 8
+                for ex in reversed(extended_arg):
+                    ex.arg = new_arg & 0xFF
+                    new_arg = new_arg >> 8
+
+                # if more EXTENDED_ARG instructions would be needed,
+                # fold the remainder into the first EXTENDED_ARG
+                if new_arg > 0:
+                    extended_arg[0].arg += new_arg << 8
+        extended_arg.clear()
+
+
+def modify_extended_args(instructions: list[Instruction]) -> bool:
+    """
+    This function replaces any instruction with an argument greater than or equal to 256 with one or more EXTENDED_ARG instructions.
+
+    Args:
+        instructions (list): The list of Instruction objects representing bytecode instructions.
+
+    Returns:
+        bool: True if the modification is completed, False otherwise.
+    """
+
+    modify_completed = True
+    extend_args_record = {}
+    for instr in instructions:
+        if instr.arg and instr.arg >= 256:  # more than one byte
+            _instrs = [
+                instr
+            ]  # replace instr with _instrs (a list of instrs) later; all operations are recorded in extend_args_record
+            val = instr.arg
+            instr.arg = val & 0xFF
+            val = val >> 8
+            while val > 0:
+                _instrs.append(gen_instr("EXTENDED_ARG", arg=val & 0xFF))
+                val = val >> 8
+
+            extend_args_record.update({instr: list(reversed(_instrs))})
+
+    if extend_args_record:
+        # if a new EXTENDED_ARG was inserted, offsets and jump targets need updating
+        modify_completed = False
+
+        def bind_ex_arg_with_instr(ex_arg, instr):
+            # move opcode info to EXTENDED_ARG
+            ex_arg.starts_line = instr.starts_line
+            instr.starts_line = None
+            ex_arg.is_jump_target = instr.is_jump_target
+            instr.is_jump_target = False
+
+            if instr.ex_arg_for is not None:
+                # instr is also an ex_arg for another instr
+                instr.ex_arg_for.first_ex_arg = ex_arg
+                ex_arg.ex_arg_for = instr.ex_arg_for
+                instr.ex_arg_for = None
+            else:
+                instr.first_ex_arg = ex_arg
+                ex_arg.ex_arg_for = instr
+
+        for key, val in extend_args_record.items():
+            bind_ex_arg_with_instr(val[0], key)
+            replace_instr(instructions, instr=key, new_instr=val)
+
+    return modify_completed
+
+
+def modify_vars(instructions, code_options):
+    co_names = code_options['co_names']
+    co_varnames = code_options['co_varnames']
+    co_freevars = code_options['co_freevars']
+    for instrs in instructions:
+        if instrs.opname == 'LOAD_FAST' or instrs.opname == 'STORE_FAST':
+            assert (
+                instrs.argval in co_varnames
+            ), f"`{instrs.argval}` not in {co_varnames}"
+            instrs.arg = co_varnames.index(instrs.argval)
+        elif instrs.opname == "LOAD_DEREF" or instrs.opname == "STORE_DEREF":
+            if sys.version_info >= (3, 11):
+                namemap = co_varnames + co_freevars
+                assert (
+                    instrs.argval in namemap
+                ), f"`{instrs.argval}` not in {namemap}"
+                instrs.arg = namemap.index(instrs.argval)
+
+
+def calc_offset_from_bytecode_offset(
+    bytecode_offset: int,
+    instructions: list[dis.Instruction] | list[Instruction],
+) -> int:
+    """
+    Calculate the instruction index from a bytecode offset. For Python <= 3.10,
+    every instruction occupies 2 bytes, so the index is simply offset // 2.
+
+    Args:
+        bytecode_offset (int): The bytecode offset of the instruction.
+
+    Returns:
+        int: The index of the instruction in the instruction list.
+    """
+
+    if sys.version_info >= (3, 11):
+        instruction_offsets = [x.offset for x in instructions]
+        return instruction_offsets.index(bytecode_offset)
+    return bytecode_offset // 2
+
+
+def replace_instr(instructions, instr, new_instr):
+    idx = instructions.index(instr)
+    instructions[idx : idx + 1] = new_instr
+
+
+def instrs_info(instrs, mark=None, range=None):
+    ret = []
+    start = -1
+    end = 1000000
+    if mark is not None and range is not None:
+        start = mark - range
+        end = mark + range + 1
+    for idx, instr in enumerate(instrs):
+        if idx < start or idx >= end:
+            continue
+        if instr.starts_line is not None:
+            ret.append("")
+        ret.append(
+            "{line:<8s}{is_jump_target:>2s}{offset:>4d} {opname:<30s}{arg:<4s}{argval:<40s}{mark}".format(
+                line=str(instr.starts_line) if instr.starts_line else "",
+                is_jump_target=">>" if instr.is_jump_target else "  ",
+                offset=instr.offset
+                if instr.offset or instr.offset == 0
+                else -1,
+                opname=instr.opname,
+                arg=str(instr.arg) if instr.arg is not None else "",
+                argval=f"({instr.argval})" if instr.argval else "",
+                mark="",
+            )
+        )
+        if idx == mark:
+            ret[-1] = "\033[31m" + ret[-1] + "\033[0m"
+    return ret
+
+
+def calc_stack_effect(instr: Instruction, *, jump: bool | None = None) -> int:
+    """
+    Gets the stack effect of the given instruction. In Python 3.11, the stack effect of `CALL` is -1,
+    refer to https://github.com/python/cpython/blob/3.11/Python/compile.c#L1123-L1124.
+
+    Args:
+        instr: The instruction.
+
+    Returns:
+        The stack effect of the instruction.
+    """
+    if sys.version_info[:2] == (3, 11):
+        if instr.opname == "PRECALL":
+            return 0
+        elif instr.opname == "CALL":
+            # NOTE(zrr1999): push_n = 1, pop_n = oparg + 2, stack_effect = push_n - pop_n = -oparg - 1
+            assert instr.arg is not None
+            return -instr.arg - 1
+    return dis.stack_effect(instr.opcode, instr.arg, jump=jump)
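The EXTENDED_ARG handling above mirrors how CPython encodes instruction arguments wider than one byte: the low byte stays on the instruction itself and each higher byte goes into a preceding EXTENDED_ARG. A small sanity-check sketch of that split, in plain Python with no SOT dependencies:

```python
# Sketch: splitting an oparg >= 256 into EXTENDED_ARG bytes, using the same
# `& 0xFF` / `>> 8` arithmetic as modify_extended_args above.
def split_oparg(arg: int) -> list[int]:
    low = arg & 0xFF          # byte kept on the instruction itself
    ext = []                  # bytes carried by EXTENDED_ARG instructions
    val = arg >> 8
    while val > 0:
        ext.append(val & 0xFF)
        val >>= 8
    return list(reversed(ext)) + [low]

# 388 = 1 * 256 + 132 -> EXTENDED_ARG 1, then the instruction with arg 132,
# matching the `>> EXTENDED_ARG 1 / XX 388` comment in get_instructions.
assert split_oparg(388) == [1, 132]
assert split_oparg(0x010203) == [1, 2, 3]
```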
diff --git a/python/paddle/jit/sot/opcode_translator/instruction_utils/opcode_analysis.py b/python/paddle/jit/sot/opcode_translator/instruction_utils/opcode_analysis.py
new file mode 100644
index 00000000000000..dcda7558e5a395
--- /dev/null
+++ b/python/paddle/jit/sot/opcode_translator/instruction_utils/opcode_analysis.py
@@ -0,0 +1,217 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import dataclasses
+from enum import Enum
+
+from ...utils import InnerError, OrderedSet
+from .instruction_utils import Instruction
+from .opcode_info import ALL_JUMP, HAS_FREE, HAS_LOCAL, UNCONDITIONAL_JUMP
+
+
+@dataclasses.dataclass
+class State:
+    reads: OrderedSet[str]
+    writes: OrderedSet[str]
+    visited: OrderedSet[int]
+
+
+def is_read_opcode(opname):
+    if opname in [
+        "LOAD_FAST",
+        "LOAD_DEREF",
+        "LOAD_NAME",
+        "LOAD_GLOBAL",
+        "LOAD_CLOSURE",
+    ]:
+        return True
+    if opname in (
+        "DELETE_FAST",
+        "DELETE_DEREF",
+        "DELETE_NAME",
+        "DELETE_GLOBAL",
+    ):
+        return True
+    return False
+
+
+def is_write_opcode(opname):
+    if opname in ["STORE_FAST", "STORE_NAME", "STORE_DEREF", "STORE_GLOBAL"]:
+        return True
+    if opname in (
+        "DELETE_FAST",
+        "DELETE_DEREF",
+        "DELETE_NAME",
+        "DELETE_GLOBAL",
+    ):
+        return True
+    return False
+
+
+def analysis_inputs(
+    instructions: list[Instruction],
+    current_instr_idx: int,
+    stop_instr_idx: int | None = None,
+) -> OrderedSet[str]:
+    """
+    Analyze the inputs of the instructions from current_instr_idx to stop_instr_idx.
+
+    Args:
+        instructions (list[Instruction]): The instructions to analyze.
+        current_instr_idx (int): The index of the current instruction.
+        stop_instr_idx (int | None, optional): The index of the instruction to stop at. Defaults to None.
+            If None, the analysis will stop at the end of the instructions.
+
+    Returns:
+        OrderedSet[str]: The analysis result.
+    """
+    root_state = State(OrderedSet(), OrderedSet(), OrderedSet())
+
+    def fork(
+        state: State, start: int, jump: bool, jump_target: int
+    ) -> OrderedSet[str]:
+        new_start = start + 1 if not jump else jump_target
+        new_state = State(
+            OrderedSet(state.reads),
+            OrderedSet(state.writes),
+            OrderedSet(state.visited),
+        )
+        return walk(new_state, new_start)
+
+    def walk(state: State, start: int) -> OrderedSet[str]:
+        end = len(instructions) if stop_instr_idx is None else stop_instr_idx
+        for i in range(start, end):
+            if i in state.visited:
+                return state.reads
+            state.visited.add(i)
+
+            instr = instructions[i]
+            if instr.opname in HAS_LOCAL | HAS_FREE:
+                if is_read_opcode(instr.opname) and instr.argval not in (
+                    state.writes
+                ):
+                    state.reads.add(instr.argval)
+                elif is_write_opcode(instr.opname):
+                    state.writes.add(instr.argval)
+            elif instr.opname in ALL_JUMP:
+                assert instr.jump_to is not None
+                target_idx = instructions.index(instr.jump_to)
+                # Fork into two branches: jump or fall through
+                jump_branch = fork(state, i, True, target_idx)
+                not_jump_branch = (
+                    fork(state, i, False, target_idx)
+                    if instr.opname not in UNCONDITIONAL_JUMP
+                    else OrderedSet()
+                )
+                return jump_branch | not_jump_branch
+            elif instr.opname == "RETURN_VALUE":
+                return state.reads
+        return state.reads
+
+    return walk(root_state, current_instr_idx)
+
+
+@dataclasses.dataclass
+class SpaceState:
+    reads: dict[str, Space]
+    writes: dict[str, Space]
+    visited: OrderedSet[int]
+
+    def __or__(self, other):
+        reads = {}
+        reads.update(other.reads)
+        reads.update(self.reads)
+        writes = {}
+        writes.update(other.writes)
+        writes.update(self.writes)
+        return SpaceState(reads, writes, OrderedSet())
+
+
+class Space(Enum):
+    locals = 1
+    globals = 2
+    cells = 3
+    all = 4
+
+
+def get_space(opname: str):
+    if "FAST" in opname:
+        return Space.locals
+    elif "GLOBAL" in opname:
+        return Space.globals
+    elif "DEREF" in opname or "CLOSURE" in opname:
+        return Space.cells
+    elif "NAME" in opname:
+        return Space.all
+    else:
+        raise InnerError(f"Unknown space for {opname}")
+
+
+def analysis_used_names_with_space(
+    instructions: list[Instruction],
+    start_instr_idx: int,
+    stop_instr_idx: int | None = None,
+):
+    """
+    Like analysis_inputs, but also records the namespace (Space) each name
+    is read from or written to.
+    """
+    root_state = SpaceState({}, {}, OrderedSet())
+
+    def fork(
+        state: SpaceState, start: int, jump: bool, jump_target: int
+    ) -> SpaceState:
+        new_start = start + 1 if not jump else jump_target
+        new_state = SpaceState(
+            dict(state.reads),
+            dict(state.writes),
+            OrderedSet(state.visited),
+        )
+        return walk(new_state, new_start)
+
+    def walk(state: SpaceState, start: int) -> SpaceState:
+        end = len(instructions) if stop_instr_idx is None else stop_instr_idx
+        for i in range(start, end):
+            if i in state.visited:
+                return state
+            state.visited.add(i)
+
+            instr = instructions[i]
+            if instr.opname in HAS_LOCAL | HAS_FREE:
+                if is_read_opcode(instr.opname) and instr.argval not in (
+                    state.writes
+                ):
+                    space = get_space(instr.opname)
+                    state.reads[instr.argval] = space
+                elif is_write_opcode(instr.opname):
+                    space = get_space(instr.opname)
+                    state.writes[instr.argval] = space
+            elif instr.opname in ALL_JUMP:
+                assert instr.jump_to is not None
+                target_idx = instructions.index(instr.jump_to)
+                # Fork into two branches: jump or fall through
+                jump_branch = fork(state, i, True, target_idx)
+                not_jump_branch = (
+                    fork(state, i, False, target_idx)
+                    if instr.opname not in UNCONDITIONAL_JUMP
+                    else SpaceState({}, {}, OrderedSet())
+                )
+                return jump_branch | not_jump_branch
+            elif instr.opname == "RETURN_VALUE":
+                return state
+        return state
+
+    state = walk(root_state, start_instr_idx)
+    all_used_vars = {}
+    all_used_vars.update(state.writes)
+    all_used_vars.update(state.reads)
+    return all_used_vars
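`analysis_inputs` is a reads-before-writes dataflow pass: a name counts as an input of a bytecode region only if some path reads it before writing it, and branches are explored by forking the state at every jump. A toy illustration of the core rule, using hypothetical mini-instructions rather than the real `Instruction` class:

```python
# Sketch: reads-before-writes over a straight-line region.
# Each toy instruction is (op, name); the real analysis also forks at jumps.
def toy_inputs(instrs):
    reads, writes = [], set()
    for op, name in instrs:
        if op == "LOAD" and name not in writes and name not in reads:
            reads.append(name)      # read before any write -> an input
        elif op == "STORE":
            writes.add(name)        # later loads of `name` are not inputs
    return reads

# `y = x + 1; z = y` needs only `x` from outside the region:
assert toy_inputs(
    [("LOAD", "x"), ("STORE", "y"), ("LOAD", "y"), ("STORE", "z")]
) == ["x"]
```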
+ +import sys +from enum import Enum + +import opcode + +REL_JUMP = {opcode.opname[x] for x in opcode.hasjrel} +REL_BWD_JUMP = {opname for opname in REL_JUMP if "BACKWARD" in opname} +REL_FWD_JUMP = REL_JUMP - REL_BWD_JUMP +ABS_JUMP = {opcode.opname[x] for x in opcode.hasjabs} +HAS_LOCAL = {opcode.opname[x] for x in opcode.haslocal} +HAS_FREE = {opcode.opname[x] for x in opcode.hasfree} +ALL_JUMP = REL_JUMP | ABS_JUMP +UNCONDITIONAL_JUMP = {"JUMP_ABSOLUTE", "JUMP_FORWARD"} +if sys.version_info >= (3, 11): + UNCONDITIONAL_JUMP.add("JUMP_BACKWARD") + + +class JumpDirection(Enum): + FORWARD = "FORWARD" + BACKWARD = "BACKWARD" + + +class PopJumpCond(Enum): + FALSE = "FALSE" + TRUE = "TRUE" + NONE = "NONE" + NOT_NONE = "NOT_NONE" + + +# Cache for some opcodes, it's for Python 3.11+ +# https://github.com/python/cpython/blob/3.11/Include/internal/pycore_opcode.h#L41-L53 +PYOPCODE_CACHE_SIZE = { + "BINARY_SUBSCR": 4, + "STORE_SUBSCR": 1, + "UNPACK_SEQUENCE": 1, + "STORE_ATTR": 4, + "LOAD_ATTR": 4, + "COMPARE_OP": 2, + "LOAD_GLOBAL": 5, + "BINARY_OP": 1, + "LOAD_METHOD": 10, + "PRECALL": 1, + "CALL": 4, +} diff --git a/python/paddle/jit/sot/opcode_translator/skip_files.py b/python/paddle/jit/sot/opcode_translator/skip_files.py new file mode 100644 index 00000000000000..5d5d04e56eca91 --- /dev/null +++ b/python/paddle/jit/sot/opcode_translator/skip_files.py @@ -0,0 +1,180 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
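+
+# How skipping works, in brief: every module listed below is mapped to its
+# file-path prefix via _module_dir() (defined later in this file), and a
+# frame whose co_filename starts with any such prefix is never translated.
+# With a hypothetical stdlib layout:
+#
+# _module_dir(collections) -> "/usr/lib/python3.x/collections/"
+# _module_dir(abc) -> "/usr/lib/python3.x/abc.py"
+# need_skip_path("/usr/lib/python3.x/abc.py") -> True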
+ +import abc +import codecs +import collections +import contextlib +import copy +import copyreg +import dataclasses +import enum +import functools +import importlib +import inspect +import linecache +import logging +import multiprocessing +import operator +import os +import posixpath +import random +import re +import selectors +import signal +import sys +import tempfile +import threading +import tokenize +import traceback +import types +import typing +import unittest +import uuid +import warnings +import weakref + +import _collections_abc +import _weakrefset +import decorator +import google.protobuf +import numpy +import setuptools + +import paddle + +from ..utils import log + +NEED_SKIP_THIRD_PARTIY_MODULES = { + abc, + collections, + contextlib, + copy, + copyreg, + dataclasses, + enum, + functools, + google.protobuf, + importlib, + inspect, + linecache, + logging, + multiprocessing, + numpy, + operator, + os, + posixpath, + random, + re, + selectors, + signal, + tempfile, + threading, + tokenize, + traceback, + types, + typing, + unittest, + weakref, + _collections_abc, + _weakrefset, + decorator, + codecs, + uuid, + setuptools, + warnings, +} + +if sys.version_info < (3, 11): + import sre_compile + import sre_parse + + NEED_SKIP_THIRD_PARTIY_MODULES.add(sre_compile) + NEED_SKIP_THIRD_PARTIY_MODULES.add(sre_parse) + +if sys.version_info < (3, 12): + import distutils + + NEED_SKIP_THIRD_PARTIY_MODULES.add(distutils) + + +def _strip_init_py(s): + return re.sub(r"__init__.py$", "", s) + + +def _module_dir(m: types.ModuleType): + return _strip_init_py(m.__file__) + + +skip_file_names = {_module_dir(m) for m in NEED_SKIP_THIRD_PARTIY_MODULES} + + +sot_path = os.path.dirname(__file__).rpartition(os.sep)[0] + os.sep +paddle_path = sys.modules["paddle"].__file__.rpartition(os.sep)[0] + os.sep + +skip_file_names.add(sot_path) +skip_file_names.add(paddle_path) +skip_file_names.add( + "<frozen importlib", +) +skip_file_names.add("<__array_function__ internals>") + +skip_file_name_re = re.compile( + f"^({'|'.join(map(re.escape, skip_file_names))})" +) + +customed_skip_code = set() + +no_skip_code = {paddle.nn.Sequential.forward.__code__} + + +def need_skip_path(filepath: str) -> bool: + """ + Check if the file should be skipped and not transcribed. + + Args: + filepath: The path of the file to check. + + Returns: + bool: True if the file should be skipped. + """ + if not filepath.startswith("<"): + filepath = os.path.abspath(filepath) + return bool(skip_file_name_re.match(filepath)) + + +def skip_function(function): + customed_skip_code.add(function.__code__) + return function + + +def need_skip(frame): + pycode = frame.f_code + if pycode in no_skip_code: + return False + if pycode in customed_skip_code: + log(3, f"Skip frame by code: {pycode}\n") + return True + filename = pycode.co_filename + if sys.version_info >= (3, 11) and filename.startswith("<frozen"): + # NOTE(SigureMo): In Python 3.11, the core modules essential for + # Python startup are “frozen”. So we need get original filename from + # frame. + # see https://docs.python.org/3/whatsnew/3.11.html#faster-startup for more details. 
+ # This workaround is adapted from pdb.py
+ # https://github.com/python/cpython/blob/3.11/Lib/pdb.py#L1328-L1331
+ _filename = frame.f_globals.get('__file__', None)
+ if isinstance(_filename, str):
+ filename = _filename
+ return need_skip_path(filename)
diff --git a/python/paddle/jit/sot/opcode_translator/transform.py b/python/paddle/jit/sot/opcode_translator/transform.py
new file mode 100644
index 00000000000000..8fcf3cc5a2b72d
--- /dev/null
+++ b/python/paddle/jit/sot/opcode_translator/transform.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import dis
+import sys
+from functools import partial
+
+from ..profiler import EventGuard
+from ..utils import CodeStatus, log, log_do
+from .custom_code import CustomCode
+from .executor.executor_cache import OpcodeExecutorCache
+from .skip_files import need_skip
+
+
+def print_locals(frame):
+ local_key = [
+ key for key in frame.f_locals.keys() if not key.startswith("__")
+ ]
+ print(
+ f"[eval_frame_callback] {frame.f_code.co_name} with locals {local_key}"
+ )
+ print(
+ f"[eval_frame_callback] {' ' * len(frame.f_code.co_name)} with cellvars + freevars: {frame.f_code.co_cellvars + frame.f_code.co_freevars}"
+ )
+
+ def convert_obj(obj):
+ import paddle
+
+ if isinstance(obj, paddle.Tensor):
+ return "Tensor(" + str(obj.shape) + ")"
+ if isinstance(obj, list):
+ return [convert_obj(i) for i in obj]
+ return obj
+
+ for key in local_key:
+ print(
+ f"[eval_frame_callback] {' ' * len(frame.f_code.co_name)} {key} = {convert_obj(frame.f_locals[key])}"
+ )
+
+
+def eval_frame_callback(frame, **kwargs) -> CustomCode:
+ with EventGuard(
+ f"eval_frame_callback: {frame.f_code.co_name}", event_level=2
+ ):
+ # skip generator functions (CO_GENERATOR flag is 0x20)
+ if frame.f_code.co_flags & 0x20 > 0:
+ return CustomCode(None, True)
+
+ # NOTE(SigureMo): Temporary fallback when code has exception handling.
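+ # In Python 3.11+, code objects carry a co_exceptiontable that describes
+ # try/except ranges; translating such code is not supported yet, so the
+ # frame is returned untransformed while eval frame stays enabled.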
+ if sys.version_info >= (3, 11) and frame.f_code.co_exceptiontable: + log( + 3, + f"[eval_frame_callback] {frame.f_code} has co_exceptiontable\n", + ) + return CustomCode(None, False) + + if need_skip(frame): + log(3, f"[eval_frame_callback] skip {frame.f_code}\n") + custom_code = CustomCode(None, False) + new_code = frame.f_code + else: + log( + 2, f"[eval_frame_callback] start to translate: {frame.f_code}\n" + ) + log_do(4, partial(print_locals, frame)) + + log(3, f"[transform] OriginCode: {frame.f_code.co_name}\n") + log_do(3, lambda: dis.dis(frame.f_code)) + + custom_code = OpcodeExecutorCache()(frame, **kwargs) + + if custom_code.code is None: + log( + 3, + "[transform] NewCode (same as origin code): " + + frame.f_code.co_name + + "\n", + ) + new_code = frame.f_code + else: + log( + 3, + "[transform] NewCode: " + custom_code.code.co_name + "\n", + ) + log_do(3, lambda: dis.dis(custom_code.code)) + new_code = custom_code.code + + # just check those codes which need open eval_frame + if ( + custom_code.disable_eval_frame is False + and CodeStatus().is_code_without_graph(new_code) + ): + log( + 3, + "[eval_frame_callback] Code has no graph, block it.\n", + ) + return CustomCode(None, True) + + return custom_code diff --git a/python/paddle/jit/sot/profiler.py b/python/paddle/jit/sot/profiler.py new file mode 100644 index 00000000000000..8315e03dd37f5c --- /dev/null +++ b/python/paddle/jit/sot/profiler.py @@ -0,0 +1,78 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from contextlib import contextmanager +from functools import wraps + +from paddle.framework import core + +_event_level = int(os.environ.get("EVENT_LEVEL", "-1")) + + +class SotProfiler: + def __enter__(self): + self.enable() + + def __exit__(self, exc_type, exc_val, exc_tb): + self.disable() + + def enable(self, tag=None): + core.nvprof_start() + core.nvprof_enable_record_event() + + def disable(self): + core.nvprof_stop() + + +@contextmanager +def EventGuard(event_name, event_level=0): + try: + global _event_level + need_pop = False + if _event_level >= event_level: + core.nvprof_nvtx_push(event_name) + need_pop = True + yield + finally: + if need_pop: + core.nvprof_nvtx_pop() + + +if _event_level == -1: + + @contextmanager + def _EmptyEventGuard(event_name, event_level=0): + yield + + EventGuard = _EmptyEventGuard # noqa: F811 + + +def event_register(event_name, event_level=0): + def event_wrapper(func): + @wraps(func) + def call_with_event(*args, **kwargs): + with EventGuard(event_name, event_level=0): + return func(*args, **kwargs) + + return call_with_event + + def do_nothing(func): + return func + + global _event_level + if _event_level >= event_level: + return event_wrapper + else: + return do_nothing diff --git a/python/paddle/jit/sot/psdb.py b/python/paddle/jit/sot/psdb.py new file mode 100644 index 00000000000000..38fa4d7479e160 --- /dev/null +++ b/python/paddle/jit/sot/psdb.py @@ -0,0 +1,68 @@ +# Copyright (c) 2023 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import builtins +import types +from typing import TYPE_CHECKING, Callable + +if TYPE_CHECKING: + from typing import TypeVar + + from typing_extensions import ParamSpec + + T = TypeVar("T") + P = ParamSpec("P") + +NO_BREAKGRAPH_CODES: set[types.CodeType] = set() +NO_FALLBACK_CODES: set[types.CodeType] = set() + + +def assert_true(input: bool): + assert input + + +def print(*args, **kwargs): + builtins.print("[Dygraph]", *args, **kwargs) + + +def breakpoint(): + import paddle + + old = paddle.framework.core.set_eval_frame(None) + builtins.breakpoint() + paddle.framework.core.set_eval_frame(old) + + +def check_no_breakgraph(fn: Callable[P, T]) -> Callable[P, T]: + NO_BREAKGRAPH_CODES.add(fn.__code__) + return fn + + +def breakgraph(): + pass + + +def check_no_fallback(fn: Callable[P, T]) -> Callable[P, T]: + NO_FALLBACK_CODES.add(fn.__code__) + return fn + + +def fallback(): + pass + + +def in_sot(): + return False diff --git a/python/paddle/jit/sot/symbolic/compile_cache.py b/python/paddle/jit/sot/symbolic/compile_cache.py new file mode 100644 index 00000000000000..7fb01c729b8ebb --- /dev/null +++ b/python/paddle/jit/sot/symbolic/compile_cache.py @@ -0,0 +1,143 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
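+
+# In brief: CompileSIRCache below keys its cache on hash(str(sir)), so two
+# SIRs with the same textual form share one compiled function, and
+# FallbackWrapper memoizes the partial program after the first call.
+# A rough usage sketch, assuming `ctx` is a SymbolicTraceContext that holds
+# a SIR named "SIR_0":
+#
+# compiled = CompileSIRCache()(ctx, "SIR_0", build_strategy=None, backend=None)
+# outputs = compiled(*eager_inputs)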
+ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import paddle + +from ..profiler import EventGuard +from ..utils import ( + Cache, + CodeStatus, + GraphLogger, + Singleton, + StepInfoManager, + log_do, +) +from .interpreter import compile_sir + +if TYPE_CHECKING: + from .symbolic_context import SymbolicTraceContext + + +def clear_eager_tensor_name(output_tensors): + for output_tensor in output_tensors: + output_tensor.name = "" + + +class FallbackWrapper: + """ + Used to store and call static graph methods generated by paddle.jit.to_static + """ + + def __init__(self, compiled_fn, SIR): + self.compiled_fn = compiled_fn + self.partial_program = None + self.concrete_program = None + self.SIR = SIR # for debug + + def __call__(self, *args, **kwargs): + with EventGuard(f"FallbackWrapper: {self.SIR.name}"): + if StepInfoManager().need_back_trace: + CodeStatus().trace_back_frames() + + log_do( + 2, + lambda: print("[FallbackWrapper] start run SIR: \n", self.SIR), + ) + log_do( + 4, + lambda: print( + self.compiled_fn.get_concrete_program(*args, **kwargs)[ + 1 + ].train_program + ), + ) + if self.partial_program is None: + with EventGuard("FallbackWrapper: call compiled_fn"): + outputs = self.compiled_fn(*args, **kwargs) + ( + self.concrete_program, + self.partial_program, + ) = self.compiled_fn.get_concrete_program(*args, **kwargs) + else: + # Speed up Resnet from 0.0068 --> 0.0057 + with EventGuard("FallbackWrapper: call partial_program"): + outputs = self.partial_program(*args, **kwargs) + + clear_eager_tensor_name(outputs) + log_do( + 1, + lambda: GraphLogger().add_subgraph( + self.concrete_program.main_program + ), + ) + log_do( + 4, + lambda: print("[CompileCache] run sir forward success."), + ) + return outputs + + +@Singleton +class CompileSIRCache(Cache): + """ + Cache the compiled function of SIR + """ + + def __init__(self): + super().__init__(weak=False) + + def key_fn(self, context: SymbolicTraceContext, sir_name: str, **kwargs): + """ + generate a hash key for a SIR + + Args: + context: The context to compile + sir_name: The name of the sir to compile + build_strategy: The build strategy to compile + + Returns: + The hash key of the SIR + """ + sir = context.get_sir(sir_name) + # NOTE(dev): Is str(sir) a heavy opearation ? + hash_key = hash(str(sir)) + return hash_key + + def value_fn(self, context: SymbolicTraceContext, sir_name: str, **kwargs): + """ + Generate static graph function + + Args: + context: The context to compile + sir_name: The name of the sir to compile + build_strategy: The build strategy to compile + + Returns: + The static graph function + """ + build_strategy = kwargs.get("build_strategy", None) + backend = kwargs.get("backend", None) + return FallbackWrapper( + paddle.jit.to_static( + compile_sir(context, sir_name), + build_strategy=build_strategy, + backend=backend, + full_graph=True, + ), + context.get_sir(sir_name), + ) diff --git a/python/paddle/jit/sot/symbolic/interpreter.py b/python/paddle/jit/sot/symbolic/interpreter.py new file mode 100644 index 00000000000000..13265bbab4e380 --- /dev/null +++ b/python/paddle/jit/sot/symbolic/interpreter.py @@ -0,0 +1,194 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import paddle +from paddle.utils import to_sequence + +from ..utils import InnerError, map_if, map_if_extend +from .statement_ir import SIRRuntimeCache, Symbol + +if TYPE_CHECKING: + from .statement_ir import Statement, StatementIR + from .symbolic_context import SymbolicTraceContext + + +def replace_symbol( + values: list[Symbol] | list[object], state: dict[str, Symbol] +): + """ + Replaces Symbol objects with their corresponding values. + + Args: + values: A list of values that may contain Symbol objects. + state: A dict mapping Symbol names to their corresponding values. + + Returns: + A new list with Symbol objects replaced by their corresponding values in the state dict. + """ + # deal with list / map etc. + values = map_if_extend( + values, + pred=lambda x: isinstance(x, Symbol), + true_fn=lambda x: state[x.name], + false_fn=lambda x: x, + ) + return values + + +def _append_opstack_between(start, end, stack): + # NOTE(xiongkun): we don't sync for speed. careful!! + # [start, end) + from paddle.framework import core + + op_maker = core.op_proto_and_checker_maker + callstack_attr_name = op_maker.kOpCreationCallstackAttrName() + for op in for_each_ops_between(start, end): + op._set_attr(callstack_attr_name, stack) + + +def for_each_ops_between(start, end): + # NOTE(xiongkun): we don't sync for speed. careful!! + # [start, end) + program = paddle.static.default_main_program() + ops = program.current_block().ops[start:end] + yield from ops + + +def opnum_in_program(): + # NOTE(xiongkun): we don't sync for speed. careful!! + program = paddle.static.default_main_program() + return len(program.current_block().ops) + + +class Interpreter: + """ + Interpreter is used to interpret and execute SIR. + """ + + def __init__(self, symbolic_context: SymbolicTraceContext): + self._context = symbolic_context + + def get_sir(self, name: str) -> StatementIR: + """ + Returns the StatementIR object by given name. + + Args: + name: The name of the StatementIR. + + Returns: + The StatementIR object with the given name. + """ + return self._context.get_sir(name) + + def run_sir(self, name: str, state: dict[str, Symbol]): + """ + Runs the StatementIR with the given name using the provided state. + + Args: + name: The name of the given StatementIR to run. + state: A dict mapping Symbol names to their corresponding values. + + Returns: + A list of the Symbol of the StatementIR after execution. 
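+
+ Note:
+ As a hypothetical example, for a SIR whose single statement is
+ ``api || %out = paddle.add (%x, %y)``, calling ``run_sir`` with
+ ``state = {"%x": t0, "%y": t1}`` binds ``state["%out"]`` to
+ ``paddle.add(t0, t1)`` and returns the values bound to ``SIR.outputs``.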
+ """ + SIR = self.get_sir(name) + for stmt in SIR.statements: + stmt: Statement + before_stmt_opnum = opnum_in_program() + inputs = replace_symbol(stmt.inputs, state) + outs = getattr(self, stmt.type)(stmt, inputs) + + def _set(v, s): + state[s.name] = v + + if len(to_sequence(outs)) != len(to_sequence(stmt.outputs)): + raise InnerError("Number output mismatch, some error happen.") + + _append_opstack_between( + before_stmt_opnum, opnum_in_program() + 1, stmt.stmt_stack + ) + + map_if( + outs, + stmt.outputs, + pred=lambda v, s: isinstance(s, Symbol), + true_fn=lambda v, s: _set(v, s), + false_fn=lambda v, s: None, + ) + # fetch outputs + return replace_symbol(SIR.outputs, state) + + def call(self, stmt: Statement, inputs): + SIR = self.get_sir(stmt.sir_name) + state = prepare_state(SIR, inputs) + return self.run_sir(stmt.sir_name, state) + + def api(self, stmt, inputs): + args, kwargs = inputs + return stmt.api(*args, **kwargs) + + def method(self, stmt, inputs): + args, kwargs = inputs + var = args[0] + return getattr(var, stmt.method)(*args[1:], **kwargs) + + def layer(self, stmt, inputs): + args, kwargs = inputs + layer = stmt.layer() + assert layer is not None, "SIR bound layer is None." + return layer(*args, **kwargs) + + +def compile_sir(context: SymbolicTraceContext, name: str): + """ + Compile a SIR to a new function + + Args: + context: The context to compile + name: The name of the sir to compile + + """ + + @paddle.jit.not_to_static + def wrapper(args): + """ + This function will be decorated by paddle.to_static. + so the args is variables, not eager tensors. + """ + interpreter = Interpreter(context) + SIR = interpreter.get_sir(name) + state = prepare_state(SIR, args) + return interpreter.run_sir(name, state) + + return wrapper + + +def prepare_state(SIR, inputs): + state = {} + + # update free vars if exsits + if SIRRuntimeCache().has_key(SIR.name): # noqa: W601 + free_var_seeker = SIRRuntimeCache().get_free_vars(SIR.name) + if free_var_seeker: + state = free_var_seeker() + + # bind inputs + for sir_inp, inp in zip(SIR.inputs, inputs): + state[sir_inp.name] = inp + + return state diff --git a/python/paddle/jit/sot/symbolic/statement_ir.py b/python/paddle/jit/sot/symbolic/statement_ir.py new file mode 100644 index 00000000000000..11a08f36acd9d0 --- /dev/null +++ b/python/paddle/jit/sot/symbolic/statement_ir.py @@ -0,0 +1,338 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +THIS FILE IS PRIVATE !! + +use interface in symbolic_context.py first. +""" +from __future__ import annotations + +import weakref +from typing import Any, Callable + +import paddle +from paddle.utils import is_sequence, map_structure + +from ..utils import NameGenerator, OrderedSet, Singleton, flatten_extend + + +class Symbol: + """ + Symbol is used to distinguish a string and a `math variable`. 
+ """ + + def __init__(self, name: str): + self.name = name + + def __str__(self): + return self.name + + def __repr__(self): + return str(self) + + def __eq__(self, other): + if isinstance(other, str): + return self.name == other + return self.name == other.name + + def __hash__(self): + return hash(self.name) + + def __deepcopy__(self, memo=None): + return Symbol(self.name) + + +class Statement: + """ + Statement is used to represent a sentence of code for building the neural network model, + which has four types: "call", "api", "method", and "layer". + + Note: + Statement temporarily does not support control flow. + """ + + def __init__( + self, + type: str, + name: str, + inputs: list[Symbol], + outputs: list[Symbol], + stacks: list[str], + ): + assert type in ["call", "api", "method", "layer"] + self.name = name + self.inputs = inputs # (list of Symbols, dict of Symbols) + self.outputs = outputs # list of Symbol | PythonObj + self.stmt_stack = ( + stacks # a list of string to record the source code callstack. + ) + self.type = type + + def __str__(self): + def to_string(inps): + if isinstance(inps, str) or not is_sequence(inps): + return inps.__str__() + inps = (x.__str__() for x in inps) + return ", ".join(inps) + + return "{} || {} = {} ({}) ".format( + self.type + " " * (10 - len(self.type)), + to_string(self.outputs), + self.name, + to_string(self.inputs), + ) + + def __repr__(self): + return self.__str__() + + +class CallStatement(Statement): + def __init__( + self, + name: str, + inputs: list[Symbol], + outputs: list[Symbol], + stacks: list[str], + ): + super().__init__("call", name, inputs, outputs, stacks) + self.sir_name = name + + +class ApiStatement(Statement): + def __init__( + self, + api: Callable, + inputs: list[Symbol], + outputs: list[Symbol], + stacks: list[str], + ): + super().__init__( + "api", "paddle." + api.__name__, inputs, outputs, stacks + ) + self.api = api + + +class MethodStatement(Statement): + def __init__( + self, + name: str, + inputs: list[Symbol], + outputs: list[Symbol], + stacks: list[str], + ): + super().__init__("method", name, inputs, outputs, stacks) + self.method = name + + +class LayerStatement(Statement): + def __init__( + self, + layer: paddle.nn.Layer, + inputs: list[Symbol], + outputs: list[Symbol], + stacks: list[str], + ): + super().__init__( + "layer", layer.__class__.__name__, inputs, outputs, stacks + ) + self.layer = weakref.ref(layer) + + +class StatementIR: + """ + StatementIR is the carrier that records the code for building the neural network model.It is + a representation of a purely computational structure, and does not care about specific values. + The function converted from StatementIR can ensure that it can be turned into a static state. + In this way, we can reuse the original `to_static` function to realize the execution of the static graph. 
+
+ Note:
+ Don't create one by yourself; use StatementIRFactory().create() instead.
+ """
+
+ def __init__(self, name: str):
+ self.name = name
+ self.inputs = [] # list of Symbol | PythonObj
+ self.outputs = [] # list of Symbol | PythonObj
+ self.statements = [] # list of Statement
+
+ def __len__(self):
+ return len(self.statements)
+
+ def __deepcopy__(self, memo=None):
+ new_sir = StatementIR(self.name)
+ new_sir.inputs = list(self.inputs)
+ new_sir.outputs = list(self.outputs)
+ new_sir.statements = list(self.statements)
+ return new_sir
+
+ def add_input(self, input):
+ self.inputs.append(input)
+
+ def add_output(self, output):
+ self.outputs.append(output)
+
+ def add_statement(self, statement):
+ assert isinstance(statement, Statement)
+ self.statements.append(statement)
+
+ def analyse_inputs(self):
+ used_symbols = OrderedSet()
+ generated_symbols = OrderedSet()
+ for stmt in self.statements:
+ for inp in flatten_extend(stmt.inputs):
+ if isinstance(inp, Symbol) and inp not in generated_symbols:
+ used_symbols.add(inp)
+ for out in flatten_extend(stmt.outputs):
+ if isinstance(out, Symbol):
+ generated_symbols.add(out)
+
+ input_symbols = sorted(used_symbols, key=lambda x: x.name)
+ return input_symbols
+
+ def __str__(self):
+ strs = []
+ strs.append("StatementIR: %s" % self.name)
+ strs.append(f" inputs: {map_structure(lambda x: x.name, self.inputs)}")
+ strs.append(
+ f" outputs: {map_structure(lambda x: x.name, self.outputs)}"
+ )
+ strs.append(" statements: ")
+ for stmt in self.statements:
+ strs.append(f" {stmt}")
+ return "\n".join(strs)
+
+ def __repr__(self):
+ return self.__str__()
+
+ def graph_size(self):
+ call_layers = [x for x in self.statements if x.type == "layer"]
+ return len(self.statements) + len(call_layers)
+
+
+@Singleton
+class StatementIRFactory:
+ """
+ It is used to create a StatementIR.
+ """
+
+ def __init__(self):
+ self.cache = {}
+ self.name_generator = NameGenerator("SIR_")
+
+ def __getitem__(self, key):
+ return self.cache[key]
+
+ def create(self, input_name=None):
+ if input_name:
+ name = input_name
+ else:
+ name = self.name_generator.next()
+
+ sir = StatementIR(name)
+ self.cache[name] = sir
+ return sir
+
+ def update(self, stmt_ir):
+ name = stmt_ir.name
+ self.cache[name] = stmt_ir
+
+ def clear(self):
+ want_clear = [
+ key
+ for key in self.cache.keys()
+ if self.name_generator.match_name(key)
+ ]
+ for key in want_clear:
+ del self.cache[key]
+
+
+@Singleton
+class SIRRuntimeCache:
+ """
+ It is used to cache the runtime information of the StatementIR.
+ """
+
+ def __init__(self):
+ self.cache = {}
+ # { name : (inputs, outputs, free_vars) }
+ # inputs : can be used when call_SIR, if free_vars exist
+ # outputs : used to generate new ProxyTensor outputs before fallback
+ # free_vars: (name, function)
+
+ def __getitem__(self, key):
+ return self.cache[key]
+
+ def has_key(self, key: str) -> bool:
+ """
+ has_key is used to check whether the key is in the cache.
+ """ + return key in self.cache.keys() + + def set_origin_inputs(self, key: str, inputs: Any): + """ + Set Cache origin Inputs of the StatementIR + """ + if key in self.cache.keys(): + val = self.cache[key] + self.cache[key] = (inputs, val[1], val[2]) + else: + self.cache[key] = (inputs, None, None) + + def set_origin_outputs(self, key: str, outputs: Any): + """ + Set Cache origin outputs of the StatementIR + """ + if key in self.cache.keys(): + val = self.cache[key] + self.cache[key] = (val[0], outputs, val[2]) + else: + self.cache[key] = (None, outputs, None) + + def set_free_vars(self, key: str, free_vars: Any): + """ + Set Cache free variables of the StatementIR + """ + if key in self.cache.keys(): + val = self.cache[key] + self.cache[key] = (val[0], val[1], free_vars) + else: + self.cache[key] = (None, None, free_vars) + + def get_origin_inputs(self, key: str): + """ + Get the origin inputs of the StatementIR. + """ + if key in self.cache.keys(): + return self.cache[key][0] + else: + return None + + def get_origin_outputs(self, key: str): + """ + Get the origin outputs of the StatementIR. + """ + if key in self.cache.keys(): + return self.cache[key][1] + else: + return None + + def get_free_vars(self, key: str): + """ + Get the free variables of the StatementIR. + """ + if key in self.cache.keys(): + return self.cache[key][2] + else: + return None diff --git a/python/paddle/jit/sot/symbolic/symbolic_context.py b/python/paddle/jit/sot/symbolic/symbolic_context.py new file mode 100644 index 00000000000000..47f40bbcc9ec74 --- /dev/null +++ b/python/paddle/jit/sot/symbolic/symbolic_context.py @@ -0,0 +1,161 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from ..utils import log +from .compile_cache import CompileSIRCache +from .statement_ir import ( + ApiStatement, + CallStatement, + LayerStatement, + MethodStatement, + StatementIR, + StatementIRFactory, + Symbol, +) + + +class SymbolicTraceContext: + """ + SymbolicTraceContext is a context manager, which is used to record the symbolic trace. + + """ + + def __init__(self): + self.reset() + + def reset(self): + """ + Reset the context. + """ + + # TODO(dev): StatementIRFactory is a singleton, but SymbolicTraceContext is not. + # whether will two different SymbolicTraceContext objects be conflict ? + self.statement_factory = StatementIRFactory() + self.sir_stack = [self.statement_factory.create()] + + @property + def TOS(self): + """ + The top SIR of sir_stack. + + Returns: + StatementIR: the top of stack. + """ + + return self.sir_stack[-1] + + def call_SIR(self, sirname, inputs, outputs, stacks): + """ + Call a SIR, which is a subgraph. + """ + + stmt = CallStatement(sirname, inputs, outputs, stacks) + self.TOS.add_statement(stmt) + + def call_API(self, api, inputs, outputs, stacks): + """ + Call a paddle api. + """ + + assert callable(api), "call_API must receive a paddle api." 
+ stmt = ApiStatement(api, inputs, outputs, stacks)
+ self.TOS.add_statement(stmt)
+
+ def call_METHOD(self, method_name, inputs, outputs, stacks):
+ """
+ Call a method of an API. The method here can be a Python or a Paddle method.
+ """
+ assert isinstance(
+ method_name, str
+ ), "call_METHOD must receive the method name as a string."
+ assert isinstance(
+ inputs[0][0], Symbol
+ ), "the first argument of call_METHOD must be a Symbol Variable."
+ stmt = MethodStatement(method_name, inputs, outputs, stacks)
+ self.TOS.add_statement(stmt)
+
+ def call_LAYER(self, layer, inputs, outputs, stacks):
+ """
+ Call a paddle.nn.Layer.
+ """
+ stmt = LayerStatement(layer, inputs, outputs, stacks)
+ self.TOS.add_statement(stmt)
+
+ def get_sir(self, name: str):
+ """
+ Get a SIR from statement_factory.
+
+ Args:
+ name (str): the name of SIR.
+
+ Returns:
+ StatementIR: the SIR.
+ """
+ return self.statement_factory[name]
+
+ def reset_TOS(self):
+ """
+ Reset the TOS.
+ """
+ self.sir_stack.pop()
+ self.sir_stack.append(self.statement_factory.create())
+
+ def replace_TOS(self, sir):
+ """
+ Use a deepcopied SIR to replace the TOS.
+ This function also updates the statement_factory.
+ """
+ self.sir_stack.pop()
+ self.sir_stack.append(sir)
+ self.statement_factory.update(sir)
+
+ def compile_do_nothing(self, ret_vals):
+ """
+ Return a dummy function, which will return an empty list.
+
+ Args:
+ ret_vals (list[Symbol]): the return values of the function.
+ """
+
+ def dummy_func(*args, **kwargs):
+ return []
+
+ # return a do-nothing function together with an empty StatementIR
+ dummy_stmt_ir = StatementIR("dummy_func")
+ dummy_stmt_ir.outputs = []
+ dummy_stmt_ir.inputs = []
+ return dummy_func, dummy_stmt_ir
+
+ def compile_fn(self, ret_vals, **kwargs):
+ """
+ Start compilation and return a Python function that can be converted by to_static without errors.
+ """
+ cur_sir: StatementIR = self.TOS
+ # step0: if no statement, return a dummy function
+ if len(cur_sir.statements) == 0:
+ return self.compile_do_nothing(ret_vals)
+ # step1: analyse sir inputs and outputs
+ cur_sir.inputs = cur_sir.analyse_inputs()
+ # TODO: output analysis
+ cur_sir.outputs = ret_vals
+ log(2, "start subgraph compile and execution.\n")
+ log(2, self.TOS, "\n")
+ # step2: call compile_sir to get a python function; the third cache is triggered here.
+ static_func = CompileSIRCache()(self, cur_sir.name, **kwargs)
+ # step3: GC and reset TOS
+ # self.reset_TOS()
+
+ return static_func, cur_sir
diff --git a/python/paddle/jit/sot/translate.py b/python/paddle/jit/sot/translate.py
new file mode 100644
index 00000000000000..88f569460a5ca0
--- /dev/null
+++ b/python/paddle/jit/sot/translate.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable, TypeVar
+
+import paddle
+
+from .opcode_translator import eval_frame_callback
+from .utils import GraphLogger, StepInfoManager, StepState, log_do
+
+if TYPE_CHECKING:
+ from typing_extensions import ParamSpec
+
+ P = ParamSpec("P")
+ R = TypeVar("R")
+
+
+def symbolic_translate(fn: Callable[P, R], **kwargs) -> Callable[P, R]:
+ """
+ This function is the entry point of PaddleSOT. It sets eval_frame_callback
+ before running the input function to achieve opcode-level translation. The
+ translation process relies on simulated execution, during which information
+ is collected, especially the network code. After the simulated execution
+ completes, the network code is compiled into a static graph Program to
+ improve performance.
+
+ Args:
+ fn: The input function.
+
+ Returns:
+ Callable, The wrapped function.
+
+ Examples:
+ >>> # doctest: +SKIP("Could not get source code of function foo.")
+ >>> import paddle
+ >>> import numpy as np
+ >>> from sot.translate import symbolic_translate
+ >>> def foo(cond: paddle.Tensor, x: paddle.Tensor):
+ ... x += 1
+ ... if cond:
+ ... x += 1
+ ... else:
+ ... x -= 1
+ ... return x
+ >>> symbolic_translate_foo = symbolic_translate(foo)
+ >>> # For the true branch, the output is 2.
+ >>> cond = paddle.to_tensor(True)
+ >>> x = paddle.to_tensor(0)
+ >>> dygraph_out = foo(cond, x)
+ >>> symbolic_translate_out = symbolic_translate_foo(cond, x)
+ >>> dygraph_out
+ Tensor(shape=[], dtype=int64, place=Place(cpu), stop_gradient=True,
+ 2)
+ >>> symbolic_translate_out
+ Tensor(shape=[], dtype=int64, place=Place(cpu), stop_gradient=True,
+ 2)
+ >>> np.testing.assert_allclose(
+ ... dygraph_out.numpy(), symbolic_translate_out.numpy()
+ ... )
+ >>> # For the false branch, the output is 0.
+ >>> cond = paddle.to_tensor(False)
+ >>> dygraph_out = foo(cond, x)
+ >>> symbolic_translate_out = symbolic_translate_foo(cond, x)
+ >>> dygraph_out
+ Tensor(shape=[], dtype=int64, place=Place(cpu), stop_gradient=True,
+ 0)
+ >>> symbolic_translate_out
+ Tensor(shape=[], dtype=int64, place=Place(cpu), stop_gradient=True,
+ 0)
+ >>> np.testing.assert_allclose(
+ ... dygraph_out.numpy(), symbolic_translate_out.numpy()
+ ... )
+
+ """
+
+ def callback(frame):
+ return eval_frame_callback(frame, **kwargs)
+
+ def impl_sot(*args: P.args, **kwargs: P.kwargs) -> R:
+ assert hasattr(
+ fn, "__code__"
+ ), "Target function doesn't have a code object for simulation."
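+ # Mark this SOT step and reset subgraph statistics, then install the
+ # eval frame callback; the finally block below always restores it.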
+ StepInfoManager().sot_step() + GraphLogger().clear() + paddle.framework.core.set_eval_frame(callback) + try: + outs = fn(*args, **kwargs) + except Exception as e: + raise e + finally: + paddle.framework.core.set_eval_frame(None) + + log_do(1, lambda: GraphLogger().print_info()) + return outs + + def impl_dynamic(*args: P.args, **kwargs: P.kwargs) -> R: + outs = fn(*args, **kwargs) + return outs + + def impl(*args: P.args, **kwargs: P.kwargs) -> R: + with StepInfoManager().step_guard(fn.__code__): + state = StepInfoManager().current_state + + if state == StepState.RUN_SOT: + return impl_sot(*args, **kwargs) + elif state == StepState.RUN_DYN: + return impl_dynamic(*args, **kwargs) + elif state == StepState.COLLECT_INFO: + return StepInfoManager().collect_info( + impl_dynamic, impl_sot, *args, **kwargs + ) + + return impl diff --git a/python/paddle/jit/sot/utils/__init__.py b/python/paddle/jit/sot/utils/__init__.py new file mode 100644 index 00000000000000..a1f26ea622772b --- /dev/null +++ b/python/paddle/jit/sot/utils/__init__.py @@ -0,0 +1,62 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .code_status import CodeStatus # noqa: F401 +from .exceptions import ( # noqa: F401 + BreakGraphError, + FallbackError, + InnerError, + inner_error_default_handler, +) +from .magic_methods import magic_method_builtin_dispatch # noqa: F401 +from .paddle_api_config import ( # noqa: F401 + is_break_graph_tensor_methods, + is_inplace_api, + paddle_tensor_methods, +) +from .utils import ( # noqa: F401 + Cache, + GraphLogger, + NameGenerator, + OrderedSet, + ResumeFnNameFactory, + Singleton, + SotUndefinedVar, + StepInfoManager, + StepState, + cost_model, + count_if, + current_tmp_name_records, + execute_time, + flatten_extend, + get_unbound_method, + hashable, + in_paddle_module, + is_break_graph_api, + is_builtin_fn, + is_clean_code, + is_paddle_api, + is_strict_mode, + list_contain_by_id, + list_find_index_by_id, + log, + log_do, + map_if, + map_if_extend, + meta_str, + min_graph_size, + no_eval_frame, + show_trackers, + tmp_name_guard, +) diff --git a/python/paddle/jit/sot/utils/code_status.py b/python/paddle/jit/sot/utils/code_status.py new file mode 100644 index 00000000000000..007e77f6340041 --- /dev/null +++ b/python/paddle/jit/sot/utils/code_status.py @@ -0,0 +1,90 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
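+
+# State machine in brief: every code object starts in UNKNOW. Each
+# is_code_without_graph() query on an UNKNOW code bumps a counter; after the
+# tenth query without a graph the code is demoted to WITHOUT_GRAPH and later
+# queries return True (transform.py then blocks such frames), while
+# trace_back_frames() promotes every caller found in the code map to
+# WITH_GRAPH. A rough illustration (foo is any plain, hypothetical function):
+#
+# status = CodeStatus()
+# for _ in range(10):
+# assert not status.is_code_without_graph(foo.__code__)
+# assert status.is_code_without_graph(foo.__code__) # the 11th query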
+ +import inspect +from enum import Enum + +import paddle + +from .utils import Singleton, log + + +class CodeState(Enum): + UNKNOW = 1 + WITH_GRAPH = 2 + WITHOUT_GRAPH = 3 + + +class CodeInfo: + def __init__(self): + self.state = CodeState.UNKNOW + self.counter = 0 + + def __repr__(self): + return f"state: {self.state}, counter: {self.counter}" + + +@Singleton +class CodeStatus: + WITH_GRAPH_API = [ + paddle.nn.Layer.__call__.__code__, + paddle.nn.Layer._dygraph_call_func.__code__, + ] + + def __init__(self): + self.code_map = {} + self.setup_code_map() + + def setup_code_map(self): + for code in self.WITH_GRAPH_API: + info = CodeInfo() + info.state = CodeState.WITH_GRAPH + self.code_map[code] = info + + def clear(self): + self.code_map.clear() + self.setup_code_map() + + def is_code_without_graph(self, code): + if code not in self.code_map: + info = CodeInfo() + self.code_map[code] = info + else: + info = self.code_map[code] + + if info.state == CodeState.WITHOUT_GRAPH: + return True + if info.state == CodeState.UNKNOW: + info.counter += 1 + if info.counter >= 10: + log( + 3, + f"[CodeStatus] Switch state to WITHOUT_GRAPH for {code}\n", + ) + info.state = CodeState.WITHOUT_GRAPH + return False + + def trace_back_frames(self): + frame = inspect.currentframe() + while frame.f_back is not None: + frame = frame.f_back + code = frame.f_code + if code in self.code_map: + info = self.code_map[code] + if info.state != CodeState.WITH_GRAPH: + log( + 3, + f"[CodeStatus] Switch state to WITH_GRAPH for {code}\n", + ) + info.state = CodeState.WITH_GRAPH diff --git a/python/paddle/jit/sot/utils/exceptions.py b/python/paddle/jit/sot/utils/exceptions.py new file mode 100644 index 00000000000000..ff26f4ee2ba107 --- /dev/null +++ b/python/paddle/jit/sot/utils/exceptions.py @@ -0,0 +1,64 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import traceback + + +class SotErrorBase(Exception): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + from ..opcode_translator.breakpoint import BreakpointManager + + BreakpointManager().on_event(f"{self.__class__.__name__}") + + def print(self): + lines = traceback.format_tb(self.__traceback__) + print("".join(lines)) + + +class InnerError(SotErrorBase): + pass + + +class HasNoAttributeError(InnerError): + pass + + +class FallbackError(SotErrorBase): + def __init__(self, msg, disable_eval_frame=False): + super().__init__(msg) + self.disable_eval_frame = disable_eval_frame + + +# raise in inline function call strategy. 
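+# Roughly: BreakGraphError asks the executor to break the current graph, so
+# the subgraph captured so far is compiled and the remaining bytecode is
+# resumed outside it, while FallbackError (above) abandons translation of
+# the whole frame.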
+class BreakGraphError(SotErrorBase): + pass + + +def inner_error_default_handler(func, message_fn): + """Wrap function and an error handling function and throw an InnerError.""" + + def impl(*args, **kwargs): + try: + return func(*args, **kwargs) + except Exception as e: + message = message_fn(*args, **kwargs) + origin_exception_message = "\n".join( + traceback.format_exception(type(e), e, e.__traceback__) + ) + raise InnerError( + f"{message}.\nOrigin Exception is: \n {origin_exception_message}" + ) from e + + return impl diff --git a/python/paddle/jit/sot/utils/magic_methods.py b/python/paddle/jit/sot/utils/magic_methods.py new file mode 100644 index 00000000000000..56b20abdb05419 --- /dev/null +++ b/python/paddle/jit/sot/utils/magic_methods.py @@ -0,0 +1,130 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import operator +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Callable + +from .utils import hashable + +if TYPE_CHECKING: + BinaryOp = Callable[[Any, Any], Any] + UnaryOp = Callable[[Any], Any] + + +INPLACE_BINARY_OPS_TO_MAGIC_NAMES: dict[BinaryOp, tuple[str, BinaryOp]] = { + # inplace op fn: (magic name, non-inplace op fn) + operator.iadd: ("__iadd__", operator.add), + operator.iand: ("__iand__", operator.and_), + operator.iconcat: ("__iconcat__", operator.concat), + operator.ifloordiv: ("__ifloordiv__", operator.floordiv), + operator.ilshift: ("__ilshift__", operator.lshift), + operator.imatmul: ("__imatmul__", operator.matmul), + operator.imod: ("__imod__", operator.mod), + operator.imul: ("__imul__", operator.mul), + operator.ior: ("__ior__", operator.or_), + operator.ipow: ("__ipow__", operator.pow), + operator.irshift: ("__irshift__", operator.rshift), + operator.isub: ("__isub__", operator.sub), + operator.itruediv: ("__itruediv__", operator.truediv), + operator.ixor: ("__ixor__", operator.xor), +} + +NON_INPLACE_BINARY_OPS_TO_MAGIC_NAMES: dict[ + BinaryOp, tuple[str, str | None] +] = { + # op fn: (magic name, reverse magic name) + operator.add: ("__add__", "__radd__"), + operator.and_: ("__and__", "__rand__"), + operator.contains: ("__contains__", None), + operator.delitem: ("__delitem__", None), + operator.eq: ("__eq__", "__eq__"), + operator.floordiv: ("__floordiv__", "__rfloordiv__"), + operator.ge: ("__ge__", "__le__"), + operator.getitem: ("__getitem__", None), + operator.gt: ("__gt__", "__lt__"), + operator.le: ("__le__", "__ge__"), + operator.lshift: ("__lshift__", "__rlshift__"), + operator.lt: ("__lt__", "__gt__"), + operator.matmul: ("__matmul__", "__rmatmul__"), + operator.mod: ("__mod__", "__rmod__"), + operator.mul: ("__mul__", "__rmul__"), + operator.ne: ("__ne__", "__ne__"), + operator.or_: ("__or__", "__ror__"), + operator.pow: ("__pow__", "__rpow__"), + operator.rshift: ("__rshift__", "__rrshift__"), + operator.sub: ("__sub__", "__rsub__"), + operator.truediv: ("__truediv__", "__rtruediv__"), + operator.xor: ("__xor__", 
"__rxor__"), +} + +UNARY_OPS_TO_MAGIC_NAMES: dict[UnaryOp, str] = { + operator.neg: "__neg__", + operator.invert: "__invert__", + operator.pos: "__pos__", + operator.abs: "__abs__", + operator.index: "__index__", + operator.inv: "__inv__", + operator.invert: "__invert__", + operator.not_: "__not__", + operator.pos: "__pos__", + operator.truth: "__bool__", + bool: "__bool__", + abs: "__abs__", + float: "__float__", + len: "__len__", + int: "__int__", +} +# TODO(SigureMo): support any, all, sum + + +INPLACE_BINARY_OPS = set(INPLACE_BINARY_OPS_TO_MAGIC_NAMES.keys()) +NON_INPLACE_BINARY_OPS = set(NON_INPLACE_BINARY_OPS_TO_MAGIC_NAMES.keys()) +BINARY_OPS = INPLACE_BINARY_OPS | NON_INPLACE_BINARY_OPS +UNARY_OPS = set(UNARY_OPS_TO_MAGIC_NAMES.keys()) + + +@dataclass +class MagicMethod: + name: str + is_inplace: bool = False + is_reverse: bool = False + + +def magic_method_builtin_dispatch(fn: BinaryOp | UnaryOp) -> list[MagicMethod]: + if not hashable(fn): + return [] + if fn in INPLACE_BINARY_OPS: + inplace_magic_name, non_inplace_op = INPLACE_BINARY_OPS_TO_MAGIC_NAMES[ + fn + ] + return [ + MagicMethod(inplace_magic_name, is_inplace=True) + ] + magic_method_builtin_dispatch(non_inplace_op) + elif fn in NON_INPLACE_BINARY_OPS: + magic_name, reverse_magic_name = NON_INPLACE_BINARY_OPS_TO_MAGIC_NAMES[ + fn + ] + magic_methods = [MagicMethod(magic_name)] + if reverse_magic_name is not None: + magic_methods.append( + MagicMethod(reverse_magic_name, is_reverse=True) + ) + return magic_methods + elif fn in UNARY_OPS: + magic_name = UNARY_OPS_TO_MAGIC_NAMES[fn] + return [MagicMethod(magic_name)] + return [] diff --git a/python/paddle/jit/sot/utils/paddle_api_config.py b/python/paddle/jit/sot/utils/paddle_api_config.py new file mode 100644 index 00000000000000..06852d186a76c5 --- /dev/null +++ b/python/paddle/jit/sot/utils/paddle_api_config.py @@ -0,0 +1,102 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import inspect
+
+import paddle
+
+
+def is_inplace_api(func):
+ inplace_apis = {paddle.static.setitem}
+ return func in inplace_apis
+
+
+def get_tensor_methods():
+ return [
+ member_name
+ for member_name, member in inspect.getmembers(paddle.static.Variable)
+ if inspect.isfunction(member)
+ ]
+
+
+def get_paddle_api():
+ modules = [
+ paddle,
+ paddle.nn.functional,
+ paddle.linalg,
+ paddle.signal,
+ paddle.fft,
+ paddle.vision.ops,
+ ]
+ special_paddle_apis = [paddle.tensor.fill_constant]
+ non_operator_related_apis = [
+ paddle.in_dynamic_mode,
+ paddle.save,
+ paddle.load,
+ paddle.get_cuda_rng_state,
+ paddle.set_rng_state,
+ paddle.set_cuda_rng_state,
+ paddle.get_rng_state,
+ paddle.set_default_dtype,
+ paddle.check_shape,
+ paddle.summary,
+ paddle.finfo,
+ paddle.iinfo,
+ paddle.enable_static,
+ paddle.disable_static,
+ paddle.is_grad_enabled,
+ ]
+ # TODO: users should not call static_apis, but we need to use them, so add static_apis here temporarily
+ static_apis = [paddle.static.setitem, paddle.static.accuracy]
+ paddle_api_list = []
+ for module in modules:
+ for fn_name in getattr(module, "__all__", []):
+ fn = getattr(module, fn_name)
+ if inspect.isfunction(fn):
+ paddle_api_list.append(fn)
+ return list(
+ set(special_paddle_apis)
+ | set(static_apis)
+ | set(paddle_api_list) - set(non_operator_related_apis)
+ )
+
+
+paddle_tensor_methods = get_tensor_methods()
+paddle_api_list = get_paddle_api()
+
+# TODO(Aurelius84): It seems that we use it to judge 'in_paddle_module()'.
+# But what does 'is_paddle_module' really mean? Are all paddle.xx submodules
+# considered paddle modules?
+paddle_api_module_prefix = {
+ "paddle.nn.functional",
+ "paddle.nn.layer.activation",
+}
+
+break_graph_set = set()
+
+
+break_graph_tensor_method = {
+ 'register_hook',
+ 'numpy',
+ 'clear_gradient',
+ # TODO: Browse all possible functions and make prior judgments.
+}
+
+
+def is_break_graph_tensor_methods(method_name):
+ return method_name in break_graph_tensor_method
+
+
+def add_break_graph_apis(apis: list):
+ break_graph_set.update(apis)
diff --git a/python/paddle/jit/sot/utils/utils.py b/python/paddle/jit/sot/utils/utils.py
new file mode 100644
index 00000000000000..912ae7dec2692c
--- /dev/null
+++ b/python/paddle/jit/sot/utils/utils.py
@@ -0,0 +1,730 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
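+
+# Two helpers defined below, sketched with a hypothetical Registry class:
+#
+# @Singleton
+# class Registry:
+#     pass
+#
+# assert Registry() is Registry() # always the same instance
+#
+# gen = NameGenerator("var_")
+# assert (gen.next(), gen.next()) == ("var_0", "var_1")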
+ +from __future__ import annotations + +import builtins +import inspect +import os +import time +import types +import weakref +from collections import OrderedDict +from contextlib import contextmanager +from enum import Enum +from typing import Any, Generic, Iterable, Iterator, TypeVar +from weakref import WeakValueDictionary + +import numpy as np + +import paddle +from paddle.framework import Program +from paddle.utils import flatten, map_structure + +from .paddle_api_config import ( + break_graph_set, + paddle_api_list, + paddle_api_module_prefix, +) + +T = TypeVar("T") + + +def cost_model(): + return os.environ.get("COST_MODEL", "False") == "True" + + +def min_graph_size(): + return int(os.environ.get("MIN_GRAPH_SIZE", 10)) + + +class Singleton(Generic[T]): + def __init__(self, cls: type[T]): + self._cls = cls + self._instance = {} + + def __call__(self) -> T: + if self._cls not in self._instance: + self._instance[self._cls] = self._cls() + return self._instance[self._cls] + + +class NameGenerator: + def __init__(self, prefix): + self.counter = 0 + self.prefix = prefix + + def next(self): + name = self.prefix + str(self.counter) + self.counter += 1 + return name + + def match_name(self, name: str) -> bool: + return name.startswith(self.prefix) + + +_tmp_name_records = None + + +class TmpNameRecords: + def __init__(self): + self.name_generator = NameGenerator(prefix="_sot_tmp_") + self.tmp_names_record = OrderedDict() + + def next_name(self): + return self.name_generator.next() + + def add_tmp_var(self, expr): + if expr in self.tmp_names_record: + return self.tmp_names_record[expr] + else: + tmp_name = self.next_name() + self.tmp_names_record[expr] = tmp_name + return tmp_name + + +@contextmanager +def tmp_name_guard(): + global _tmp_name_records + old = _tmp_name_records + _tmp_name_records = TmpNameRecords() + yield + _tmp_name_records = old + + +def current_tmp_name_records(): + global _tmp_name_records + return _tmp_name_records + + +@Singleton +class ResumeFnNameFactory: + def __init__(self) -> None: + self.gen = NameGenerator('resume_') + + def next(self): + name = self.gen.next() + return name + + +def log(level, *args): + cur_level = int(os.environ.get("SOT_LOG_LEVEL", "0")) + if level <= cur_level: + print(*args, end="") + + +def log_do(level, fn): + cur_level = int(os.environ.get("SOT_LOG_LEVEL", "0")) + if level <= cur_level: + fn() + + +def no_eval_frame(func): + def no_eval_frame_func(*args, **kwargs): + old_cb = paddle.framework.core.set_eval_frame(None) + try: + retval = func(*args, **kwargs) + except: + raise + finally: + paddle.framework.core.set_eval_frame(old_cb) + return retval + + return no_eval_frame_func + + +def is_paddle_api(func): + if isinstance(func, paddle.nn.Layer): # ignore all the classes + return False + if hasattr(func, "__self__"): # ignore all the methods + return False + if inspect.isclass( + func + ): # paddle.Tensor should not be wrapped, but how about other situations? 
+ return False + return in_paddle_module(func) or func in paddle_api_list + + +def is_builtin_fn(fn): + special_builtin_fns = [weakref.ref] + if fn in special_builtin_fns: + return True + if isinstance(fn, types.BuiltinFunctionType): + return True + for member_name, member in inspect.getmembers(builtins): + if member is fn and isinstance(member, type): + return True + return False + + +def in_paddle_module(func): + if hasattr(func, "__module__"): + module_str = func.__module__ + if module_str is None: + return False + log(5, "find paddle function with __module__: ", module_str, "\n") + if hasattr(func, "__name__"): + log( + 5, " with __name__ : ", func.__name__, "\n" + ) + log(5, " with results : ") + for prefix in paddle_api_module_prefix: + if module_str.startswith(prefix): + log(5, " True\n") + return True + log(5, " False\n") + return False + + +def is_break_graph_api(func): + return func in break_graph_set + + +def map_if(*structures, pred, true_fn, false_fn): + def replace(*args): + if pred(*args): + return true_fn(*args) + return false_fn(*args) + + return map_structure(replace, *structures) + + +def flatten_extend(structure): + for item in flatten(structure): + if isinstance(item, slice): + yield item.start + yield item.stop + yield item.step + else: + yield item + + +def map_if_extend(structure, pred, true_fn, false_fn): + """support extended structures like slice and SliceVariable""" + + def wrapped_pred(x): + if isinstance(x, slice): + return True + return pred(x) + + def wrapped_true_fn(x): + if isinstance(x, (slice)): + l = [x.start, x.stop, x.step] + l = map_if_extend(l, pred, true_fn, false_fn) + return slice(*l) + return true_fn(x) + + return map_if( + structure, pred=wrapped_pred, true_fn=wrapped_true_fn, false_fn=false_fn + ) + + +def count_if(*structures, pred): + def is_true(*args): + if pred(*args): + return 1 + return 0 + + return sum(flatten(map_structure(is_true, *structures))) + + +class Cache: + def __init__(self, weak=False): + if not weak: + self.cache = {} + else: + self.cache = WeakValueDictionary() + self.hit_num = 0 + + def __call__(self, *args, **kwargs): + cache_key = self.key_fn(*args, **kwargs) + if cache_key is None: + return self.value_fn(*args, **kwargs) + if cache_key in self.cache: + log(5, "cache hit: ", cache_key, "\n") + self.hit_num += 1 + return self.cache[cache_key] + value = self.value_fn(*args, **kwargs) + self.cache[cache_key] = value + return value + + def clear(self): + self.cache.clear() + self.hit_num = 0 + + def key_fn(self, *args, **kwargs): + raise NotImplementedError() + + def value_fn(self, *args, **kwargs): + raise NotImplementedError() + + +def execute_time(func): + def wrapper(*args, **kwargs): + start_time = time.time() + result = func(*args, **kwargs) + end_time = time.time() + execution_time = end_time - start_time + print("Execute time:", execution_time) + return result + + return wrapper + + +def meta_str(shape, dtype, stop_gradient): + return f"(shape: {shape}, dtype: {dtype}, stop_gradient: {stop_gradient})" + + +def is_strict_mode(): + return os.environ.get("STRICT_MODE", "0") == "1" + + +def show_trackers() -> str | None: + return os.environ.get("SHOW_TRACKERS", None) + + +def is_clean_code() -> bool: + return os.environ.get('CLEAN_CODE', "False") == "True" + + +def list_find_index_by_id(li: list[Any], item: Any) -> int: + return [id(it) for it in li].index(id(item)) + + +def list_contain_by_id(li: list[Any], item: Any) -> int: + return id(item) in [id(it) for it in li] + + +def get_unbound_method(obj, name): + # 
+    # TODO(dev): Consider the case of patching methods to instances
+    return getattr(obj.__class__, name)
+
+
+@Singleton
+class GraphLogger:
+    graph_num: int
+    op_num: int
+    graphs: list[Program]
+    ops: list[list[paddle.base.framework.Operator]]
+
+    def __init__(self):
+        self.clear()
+
+    def clear(self):
+        self.graph_num = 0
+        self.op_num = 0
+        self.graphs = []
+        self.ops = []
+
+    def get_graph_num(self):
+        return self.graph_num
+
+    def get_op_num(self):
+        return self.op_num
+
+    def add_subgraph(self, program: Program):
+        self.graph_num += 1
+        self.graphs.append(program)
+
+        for block in program.blocks:
+            sub_op = []
+            for op in block.ops:
+                self.op_num += 1
+                sub_op.append(op)
+            self.ops.append(sub_op)
+
+    def add_subgraph_info(self, strs):
+        for i in range(len(self.graphs)):
+            strs.append(
+                "------------------------------------------------------"
+            )
+            strs.append(f"subgraph {i}, OpNum: {len(self.ops[i])}")
+            strs.append(f"{self.graphs[i]}")
+
+    def __str__(self):
+        strs = []
+        strs.append("---------------- PaddleSOT graph info ----------------")
+        strs.append(f"SubgraphNum: {self.get_graph_num()}")
+        strs.append(f"OpNum: {self.get_op_num()}")
+
+        # We can display every subgraph info
+        log_do(5, lambda: self.add_subgraph_info(strs))
+
+        strs.append("---------------- PaddleSOT graph info ----------------")
+        return "\n".join(strs)
+
+    def __repr__(self):
+        return self.__str__()
+
+    def print_info(self):
+        print(self)
+
+
+@Singleton
+class SotUndefinedVar:
+    pass
+
+
+def hashable(obj):
+    try:
+        hash(obj)
+        return True
+    except TypeError:
+        return False
+
+
+class OrderedSet(Generic[T]):
+    """
+    A set that preserves the order of insertion.
+    """
+
+    _data: dict[T, None]
+
+    def __init__(self, items: Iterable[T] | None = None):
+        """
+        Examples:
+            >>> s = OrderedSet([1, 2, 3])
+            >>> s
+            OrderedSet(1, 2, 3)
+            >>> s = OrderedSet()
+            >>> s
+            OrderedSet()
+        """
+        self._data = dict.fromkeys(items) if items is not None else {}
+
+    def __iter__(self) -> Iterator[T]:
+        """
+        Examples:
+            >>> s = OrderedSet([1, 2, 3])
+            >>> for item in s:
+            ...     print(item)
+            1
+            2
+            3
+        """
+        return iter(self._data)
+
+    def __or__(self, other: OrderedSet[T]) -> OrderedSet[T]:
+        """
+        Union two sets.
+
+        Args:
+            other: Another set to be unioned.
+
+        Returns:
+            The union of two sets.
+
+        Examples:
+            >>> s1 = OrderedSet([1, 2, 3])
+            >>> s2 = OrderedSet([2, 3, 4])
+            >>> s1 | s2
+            OrderedSet(1, 2, 3, 4)
+        """
+        return OrderedSet(list(self) + list(other))
+
+    def __ior__(self, other: OrderedSet[T]):
+        """
+        Union two sets in place.
+
+        Args:
+            other: Another set to be unioned.
+
+        Examples:
+            >>> s1 = OrderedSet([1, 2, 3])
+            >>> s2 = OrderedSet([2, 3, 4])
+            >>> s1 |= s2
+            >>> s1
+            OrderedSet(1, 2, 3, 4)
+        """
+        self._data.update(dict.fromkeys(other))
+        return self
+
+    def __and__(self, other: OrderedSet[T]) -> OrderedSet[T]:
+        """
+        Intersect two sets.
+
+        Args:
+            other: Another set to be intersected.
+
+        Returns:
+            The intersection of two sets.
+
+        Examples:
+            >>> s1 = OrderedSet([1, 2, 3])
+            >>> s2 = OrderedSet([2, 3, 4])
+            >>> s1 & s2
+            OrderedSet(2, 3)
+        """
+        return OrderedSet([item for item in self if item in other])
+
+    def __iand__(self, other: OrderedSet[T]):
+        """
+        Intersect two sets in place.
+
+        Args:
+            other: Another set to be intersected.
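+
+        Returns:
+            The updated set (``self``).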
+ + Examples: + >>> s1 = OrderedSet([1, 2, 3]) + >>> s2 = OrderedSet([2, 3, 4]) + >>> s1 &= s2 + >>> s1 + OrderedSet(2, 3) + """ + self._data = {item: None for item in self if item in other} + return self + + def __sub__(self, other: OrderedSet[T]) -> OrderedSet[T]: + """ + Subtract two sets. + + Args: + other: Another set to be subtracted. + + Returns: + The subtraction of two sets. + + Examples: + >>> s1 = OrderedSet([1, 2, 3]) + >>> s2 = OrderedSet([2, 3, 4]) + >>> s1 - s2 + OrderedSet(1) + """ + return OrderedSet([item for item in self if item not in other]) + + def __isub__(self, other: OrderedSet[T]): + """ + Subtract two sets in place. + + Args: + other: Another set to be subtracted. + + Examples: + >>> s1 = OrderedSet([1, 2, 3]) + >>> s2 = OrderedSet([2, 3, 4]) + >>> s1 -= s2 + >>> s1 + OrderedSet(1) + """ + self._data = {item: None for item in self if item not in other} + return self + + def add(self, item: T): + """ + Add an item to the set. + + Args: + item: The item to be added. + + Examples: + >>> s = OrderedSet([1, 2, 3]) + >>> s.add(4) + >>> s + OrderedSet(1, 2, 3, 4) + """ + self._data.setdefault(item) + + def remove(self, item: T): + """ + Remove an item from the set. + + Args: + item: The item to be removed. + + Examples: + >>> s = OrderedSet([1, 2, 3]) + >>> s.remove(2) + >>> s + OrderedSet(1, 3) + """ + del self._data[item] + + def __contains__(self, item: T) -> bool: + """ + Examples: + >>> s = OrderedSet([1, 2, 3]) + >>> 1 in s + True + >>> 4 in s + False + """ + return item in self._data + + def __len__(self) -> int: + """ + Examples: + >>> s = OrderedSet([1, 2, 3]) + >>> len(s) + 3 + """ + return len(self._data) + + def __bool__(self) -> bool: + """ + Examples: + >>> s = OrderedSet([1, 2, 3]) + >>> bool(s) + True + >>> s = OrderedSet() + >>> bool(s) + False + """ + return bool(self._data) + + def __eq__(self, other: object) -> bool: + """ + Examples: + >>> s1 = OrderedSet([1, 2, 3]) + >>> s2 = OrderedSet([1, 2, 3]) + >>> s1 == s2 + True + >>> s3 = OrderedSet([3, 2, 1]) + >>> s1 == s3 + False + """ + if not isinstance(other, OrderedSet): + return NotImplemented + return list(self) == list(other) + + def __repr__(self) -> str: + data_repr = ", ".join(map(repr, self._data)) + return f"OrderedSet({data_repr})" + + +class StepState(Enum): + COLLECT_INFO = 1 + RUN_SOT = 2 + RUN_DYN = 3 + + +class StepInfo: + REQUIRED_DYN_INFOS = 10 + REQUIRED_SOT_INFOS = 10 + + USED_DYN_INFOS = 5 + + COLLECT_INFO_MAX_STEP = 50 + CV_BOUNDARY = 0.1 + + BACK_TRACE_STEPS = 20 + + def __init__(self): + self.step_count = -1 + self.state = ( + StepState.COLLECT_INFO if cost_model() else StepState.RUN_SOT + ) + self.dyn_time_costs = [] + self.avg_dyn_time = 0 + self.sot_time_costs = [] + self.sot_step = -1 + + def add_dynamic_time_info(self, time_cost): + self.dyn_time_costs.append(time_cost) + if len(self.dyn_time_costs) == self.REQUIRED_DYN_INFOS: + self.avg_dyn_time = np.mean( + self.dyn_time_costs[-self.USED_DYN_INFOS :] + ) + + def add_sot_time_info(self, time_cost, current_code): + self.sot_time_costs.append(time_cost) + if len(self.sot_time_costs) == self.REQUIRED_SOT_INFOS: + avg_sot_time = np.mean(self.sot_time_costs) + log( + 1, + f"[Cost Model] sot: {avg_sot_time}, dyn: {self.avg_dyn_time}\n", + ) + if avg_sot_time < self.avg_dyn_time: + log(1, f"[Cost Model] Switch to RUN_SOT: {current_code} \n") + self.state = StepState.RUN_SOT + elif ( + self.step_count > self.COLLECT_INFO_MAX_STEP + or np.std(self.sot_time_costs) / avg_sot_time < self.CV_BOUNDARY + ): + log(1, f"[Cost Model] Switch 
to RUN_DYN: {current_code}\n")
+                self.state = StepState.RUN_DYN
+            else:
+                log(1, f"[Cost Model] Decision delayed: {current_code}\n")
+            self.sot_time_costs.clear()
+
+    def need_back_trace(self):
+        return self.step_count < self.BACK_TRACE_STEPS
+
+    def need_dynamic_info(self):
+        return len(self.dyn_time_costs) < self.REQUIRED_DYN_INFOS
+
+
+@Singleton
+class StepInfoManager:
+    def __init__(self):
+        self.step_record = {}
+        self.current_code = None
+        self.current_step_info = None
+
+    @contextmanager
+    def step_guard(self, code):
+        try:
+            old_code = self.current_code
+            old_info = self.current_step_info
+
+            self.current_code = code
+            if code not in self.step_record:
+                self.step_record[code] = StepInfo()
+            self.current_step_info = self.step_record[code]
+
+            self.current_step_info.step_count += 1
+
+            log(
+                2,
+                f"[Cost Model] New step start, current state is {self.current_state}\n",
+            )
+            yield
+        finally:
+            self.current_code = old_code
+            self.current_step_info = old_info
+
+    def sot_step(self):
+        self.current_step_info.sot_step += 1
+
+    def collect_info(self, impl_dynamic, impl_sot, /, *args, **kwargs):
+        if self.current_step_info.need_dynamic_info():
+            start_time = time.perf_counter()
+            outs = impl_dynamic(*args, **kwargs)
+            time_cost = time.perf_counter() - start_time
+            self.current_step_info.add_dynamic_time_info(time_cost)
+        else:
+            start_time = time.perf_counter()
+            outs = impl_sot(*args, **kwargs)
+            time_cost = time.perf_counter() - start_time
+            self.current_step_info.add_sot_time_info(
+                time_cost, self.current_code
+            )
+        return outs
+
+    @property
+    def need_back_trace(self):
+        return self.current_step_info.need_back_trace()
+
+    @property
+    def current_step(self):
+        return self.current_step_info.step_count
+
+    @property
+    def current_state(self):
+        return self.current_step_info.state
+
+    def clear(self):
+        self.step_record.clear()
+        self.current_code = None
+        # `current_step` is a read-only property; reset the step info instead.
+        self.current_step_info = None
diff --git a/python/paddle/jit/translated_layer.py b/python/paddle/jit/translated_layer.py
index 766e72e0553e87..b5590e3194eef5 100644
--- a/python/paddle/jit/translated_layer.py
+++ b/python/paddle/jit/translated_layer.py
@@ -512,6 +512,11 @@ def _preprocess(self, program_desc):
 
     @switch_to_static_graph
     def _append_scale_to_output(self, program):
+        # 0. scale doesn't support bool outputs, so skip appending scale if any output is bool
+        for out_desc in self._output_descs:
+            if out_desc.dtype() == core.VarDesc.VarType.BOOL:
+                return
+
         # 1. append scale & save var
         scale_output_vars = []
         with framework.program_guard(program):
diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py
index 9239f68a731743..4c2d6c00b9f0d1 100644
--- a/python/paddle/linalg.py
+++ b/python/paddle/linalg.py
@@ -12,34 +12,36 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .tensor import inverse as inv # noqa: F401 -from .tensor.linalg import cholesky # noqa: F401 -from .tensor.linalg import cholesky_solve # noqa: F401 -from .tensor.linalg import cond # noqa: F401 -from .tensor.linalg import corrcoef # noqa: F401 -from .tensor.linalg import cov # noqa: F401 -from .tensor.linalg import det # noqa: F401 -from .tensor.linalg import eig # noqa: F401 -from .tensor.linalg import eigh # noqa: F401 -from .tensor.linalg import eigvals # noqa: F401 -from .tensor.linalg import eigvalsh # noqa: F401 -from .tensor.linalg import lu # noqa: F401 -from .tensor.linalg import lu_unpack # noqa: F401 -from .tensor.linalg import matrix_power # noqa: F401 -from .tensor.linalg import matrix_rank # noqa: F401 -from .tensor.linalg import multi_dot # noqa: F401 -from .tensor.linalg import norm # noqa: F401 -from .tensor.linalg import pca_lowrank # noqa: F401 -from .tensor.linalg import pinv # noqa: F401 -from .tensor.linalg import qr # noqa: F401 -from .tensor.linalg import slogdet # noqa: F401 -from .tensor.linalg import solve # noqa: F401 -from .tensor.linalg import svd # noqa: F401 -from .tensor.linalg import triangular_solve # noqa: F401 -from .tensor.linalg import lstsq +from .tensor import inverse as inv +from .tensor.linalg import ( + cholesky, + cholesky_solve, + cond, + corrcoef, + cov, + det, + eig, + eigh, + eigvals, + eigvalsh, + lstsq, + lu, + lu_unpack, + matrix_power, + matrix_rank, + multi_dot, + norm, + pca_lowrank, + pinv, + qr, + slogdet, + solve, + svd, + triangular_solve, +) __all__ = [ - 'cholesky', # noqa + 'cholesky', 'norm', 'cond', 'cov', diff --git a/python/paddle/metric/__init__.py b/python/paddle/metric/__init__.py index 60dff58ec48bfe..4cede829766f34 100644 --- a/python/paddle/metric/__init__.py +++ b/python/paddle/metric/__init__.py @@ -12,14 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .metrics import Metric # noqa: F401 -from .metrics import Accuracy # noqa: F401 -from .metrics import Precision # noqa: F401 -from .metrics import Recall # noqa: F401 -from .metrics import Auc # noqa: F401 -from .metrics import accuracy # noqa: F401 +from .metrics import Accuracy, Auc, Metric, Precision, Recall, accuracy -__all__ = [ # noqa +__all__ = [ 'Metric', 'Accuracy', 'Precision', diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index dbef7079c1bf35..1ef27639abd132 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -207,7 +207,7 @@ def weight_norm(*args): return utils.weight_norm(*args) -__all__ = [ # noqa +__all__ = [ 'BatchNorm', 'CELU', 'GroupNorm', diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py index 5fda0adff5efa8..13742ae6d9be82 100644 --- a/python/paddle/nn/clip.py +++ b/python/paddle/nn/clip.py @@ -965,7 +965,7 @@ def set_gradient_clip(clip, param_list=None, program=None): It can be a list of parameter or a list of parameter's name. Default None, meaning that all parameters in the program will be included. program (Program, optional): The program where parameters are located. - Default None, meaning that using :ref:`api_base_default_main_program` . + Default None, meaning that using :ref:`api_paddle_static_default_main_program` . 
Returns: None diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 87f2eabba1f59e..608587becd9522 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -140,7 +140,7 @@ from .flash_attention import scaled_dot_product_attention from .flash_attention import sdp_kernel -__all__ = [ # noqa +__all__ = [ 'celu', 'conv1d', 'conv1d_transpose', diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index f15c9f280db611..c74748793a4e9d 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -764,13 +764,9 @@ def relu(x, name=None): [0., 0., 1.]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.relu(x) else: - if paddle.framework.in_dynamic_or_pir_mode(): - # Below code will be removed after we can generate IR api automatically - return paddle._pir_ops.relu(x) - check_variable_and_dtype( x, 'x', ['float16', 'uint16', 'float32', 'float64'], 'relu' ) @@ -1281,7 +1277,7 @@ def softplus(x, beta=1, threshold=20, name=None): \end{cases} Parameters: - x (Tensor): The input Tensor with data type float32, float64. + x (Tensor): The input Tensor with data type float32, float64, complex64, complex128. beta (float, optional): The value of :math:`\beta` for softplus. Default is 1 threshold (float, optional): The value of :math:`\varepsilon` for softplus. Default is 20 name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. @@ -1306,7 +1302,17 @@ def softplus(x, beta=1, threshold=20, name=None): return _C_ops.softplus(x, beta, threshold) else: check_variable_and_dtype( - x, 'x', ['float16', 'uint16', 'float32', 'float64'], 'softplus' + x, + 'x', + [ + 'float16', + 'uint16', + 'float32', + 'float64', + 'complex64', + 'complex128', + ], + 'softplus', ) helper = LayerHelper('softplus', **locals()) out = helper.create_variable_for_type_inference(x.dtype) diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 28341db5588aed..62050410b9c1a8 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1196,7 +1196,7 @@ def get_attrs(prog, dropout_prob, is_test, seed): # get mask shape input_shape = x.shape - if not in_dynamic_or_pir_mode(): + if not in_dynamic_mode(): input_shape_tensor = paddle.shape(x) drop_axes = [axis] if isinstance(axis, int) else list(axis) if min(drop_axes) < 0 or max(drop_axes) > len(input_shape) - 1: @@ -1212,7 +1212,7 @@ def get_attrs(prog, dropout_prob, is_test, seed): ) ) mask_shape = [1] * len(input_shape) - if not in_dynamic_or_pir_mode(): + if not in_dynamic_mode(): for i in drop_axes: mask_shape[i] = input_shape_tensor[i] else: @@ -1658,10 +1658,16 @@ def pad(x, pad, mode='constant', value=0.0, data_format="NCHW", name=None): paddings = pad pad_value = value - if in_dynamic_or_pir_mode(): + if in_dynamic_mode(): out = _C_ops.pad(x, paddings, float(pad_value)) return out + if in_pir_mode(): + if isinstance(pad_value, paddle.pir.OpResult): + return _C_ops.pad(x, paddings, pad_value) + else: + return _C_ops.pad(x, paddings, float(pad_value)) + check_variable_and_dtype( x, 'x', diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 6caf0370366f4d..138146f376aeeb 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -13,7 +13,7 @@ # limitations under the 
License. from paddle import _C_ops, _legacy_C_ops, get_flags, in_dynamic_mode -from paddle.base.framework import _global_flags +from paddle.base.framework import _global_flags, in_dynamic_or_pir_mode from paddle.device import ( get_all_custom_device_type, is_compiled_with_cuda, @@ -126,7 +126,7 @@ def _conv_nd( name=None, ): # Due to the poor performance of NHWC, we transpose the input to NCHW. - if in_dynamic_mode() and op_type == "conv2d": + if in_dynamic_or_pir_mode() and op_type == "conv2d": pre_bias = _C_ops.conv2d( x, weight, @@ -155,7 +155,7 @@ def _conv_nd( else: return pre_bias - if in_dynamic_mode() and op_type == "depthwise_conv2d": + if in_dynamic_or_pir_mode() and op_type == "depthwise_conv2d": pre_bias = _C_ops.depthwise_conv2d( x, weight, @@ -174,7 +174,7 @@ def _conv_nd( else: return pre_bias - if in_dynamic_mode() and op_type == "conv3d": + if in_dynamic_or_pir_mode() and op_type == "conv3d": pre_bias = _C_ops.conv3d( x, weight, diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 704eb880c516c2..ed77c07ffdb457 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -194,7 +194,7 @@ def batch_norm( else: trainable_statistics = not use_global_stats - if in_dygraph_mode(): + if in_dynamic_or_pir_mode(): batch_norm_out, _, _, _, _, _ = _C_ops.batch_norm( x, running_mean, diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 6f111c61cb5071..6e53daa02cddb8 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -15,7 +15,11 @@ import numpy as np from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode -from paddle.base.framework import Variable, in_dygraph_mode +from paddle.base.framework import ( + Variable, + in_dygraph_mode, + in_dynamic_or_pir_mode, +) from ...base.data_feeder import check_type, check_variable_and_dtype from ...base.layer_helper import LayerHelper @@ -372,7 +376,7 @@ def avg_pool2d( padding, 2, channel_last, ceil_mode=ceil_mode ) - if in_dygraph_mode(): + if in_dynamic_or_pir_mode(): output = _C_ops.pool2d( x, kernel_size, @@ -1254,7 +1258,7 @@ def max_pool2d( "When setting return_mask to true, data_format must be set to NCHW in API:max_pool2d" ) - if in_dygraph_mode(): + if in_dynamic_or_pir_mode(): if return_mask: output = _C_ops.max_pool2d_with_index( x, kernel_size, stride, padding, False, False @@ -1647,8 +1651,9 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): elif _contain_var(output_size): output_size = _convert_to_tensor_list(output_size) - if in_dygraph_mode(): - x = x._use_gpudnn(False) + if in_dynamic_or_pir_mode(): + if in_dygraph_mode(): + x = x._use_gpudnn(False) return _C_ops.pool2d( x, output_size, @@ -1662,7 +1667,6 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): True, "EXPLICIT", ) - else: l_type = 'pool2d' check_variable_and_dtype( diff --git a/python/paddle/nn/initializer/__init__.py b/python/paddle/nn/initializer/__init__.py index adc81e5bbfd5d0..c1e0866ad8f067 100644 --- a/python/paddle/nn/initializer/__init__.py +++ b/python/paddle/nn/initializer/__init__.py @@ -45,7 +45,7 @@ from .kaiming import MSRAInitializer # noqa: F401 from .assign import NumpyArrayInitializer # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'Bilinear', 'Constant', 'KaimingUniform', diff --git a/python/paddle/nn/initializer/normal.py b/python/paddle/nn/initializer/normal.py index c1bcb89f676f72..3a05bbed121f36 100644 --- 
a/python/paddle/nn/initializer/normal.py +++ b/python/paddle/nn/initializer/normal.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle import _C_ops +from paddle import _C_ops, pir from ...base import core, framework, unique_name from ...base.data_feeder import check_variable_and_dtype -from ...base.framework import _current_expected_place, in_dygraph_mode +from ...base.framework import ( + _current_expected_place, + in_dygraph_mode, + in_pir_mode, +) from .initializer import Initializer __all__ = [] @@ -54,7 +58,7 @@ def forward(self, var, block=None): """ block = self._check_block(block) - assert isinstance(block, framework.Block) + assert isinstance(block, (framework.Block, pir.Block)) check_variable_and_dtype( var, @@ -78,7 +82,17 @@ def forward(self, var, block=None): ) out_var._share_underline_tensor_to(var) return None - + elif in_pir_mode(): + place = _current_expected_place() + out_var = _C_ops.gaussian( + var.shape, + self._mean, + self._std_dev, + self._seed, + var.dtype, + place, + ) + return out_var else: op = block.append_op( type="gaussian_random", diff --git a/python/paddle/nn/layer/layers.py b/python/paddle/nn/layer/layers.py index 204023378b5d33..791b5549ee7a2d 100644 --- a/python/paddle/nn/layer/layers.py +++ b/python/paddle/nn/layer/layers.py @@ -26,7 +26,7 @@ from paddle.base import core, framework, unique_name from paddle.base.core import VarDesc from paddle.base.dygraph import no_grad -from paddle.base.dygraph.base import in_declarative_mode # noqa F401 +from paddle.base.dygraph.base import in_declarative_mode # noqa: F401 from paddle.base.dygraph.base import ( _convert_into_variable, in_to_static_mode, diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 9944a4b4811267..a690daab0ef211 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -37,7 +37,13 @@ from ...base import dygraph_utils from ...base.data_feeder import check_variable_and_dtype -from ...framework import ParamAttr, _global_flags, get_default_dtype, no_grad +from ...framework import ( + ParamAttr, + _global_flags, + get_default_dtype, + in_dynamic_or_pir_mode, + no_grad, +) from .. 
import functional as F
 from ..functional import batch_norm, instance_norm, layer_norm
 from ..initializer import Constant, Normal
@@ -1076,7 +1082,7 @@ def __init__(
 
         self._trainable_statistics = trainable_statistics
 
     def forward(self, input):
-        if in_dynamic_mode():
+        if in_dynamic_or_pir_mode():
             batch_norm_out, t1, t2, t3, t4, _ = _C_ops.batch_norm(
                 input,
                 self._mean,
@@ -1092,9 +1098,13 @@ def forward(self, input):
             )
             if self._act is None:
                 return batch_norm_out
-            return dygraph_utils._append_activation_in_dygraph(
-                batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn
-            )
+            if in_dynamic_mode():
+                return dygraph_utils._append_activation_in_dygraph(
+                    batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn
+                )
+            else:
+                act_op = getattr(_C_ops, self._act)
+                return act_op(batch_norm_out)
         else:
             # create output
             # mean and mean_out share the same memory
diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py
index aa7f6d91edbfa1..8dc383d83c510f 100644
--- a/python/paddle/nn/layer/rnn.py
+++ b/python/paddle/nn/layer/rnn.py
@@ -777,28 +777,65 @@ def __init__(
                 )
             )
         std = 1.0 / math.sqrt(hidden_size)
-        self.weight_ih = self.create_parameter(
-            (hidden_size, input_size),
-            weight_ih_attr,
-            default_initializer=I.Uniform(-std, std),
-        )
-        self.weight_hh = self.create_parameter(
-            (hidden_size, hidden_size),
-            weight_hh_attr,
-            default_initializer=I.Uniform(-std, std),
-        )
-        self.bias_ih = self.create_parameter(
-            (hidden_size,),
-            bias_ih_attr,
-            is_bias=True,
-            default_initializer=I.Uniform(-std, std),
-        )
-        self.bias_hh = self.create_parameter(
-            (hidden_size,),
-            bias_hh_attr,
-            is_bias=True,
-            default_initializer=I.Uniform(-std, std),
-        )
+        if weight_ih_attr is not False:
+            self.weight_ih = self.create_parameter(
+                (hidden_size, input_size),
+                weight_ih_attr,
+                default_initializer=I.Uniform(-std, std),
+            )
+        else:
+            self.weight_ih = self.create_parameter(
+                (hidden_size, input_size),
+                None,
+                default_initializer=I.Constant(1.0),
+            )
+            self.weight_ih.stop_gradient = True
+
+        if weight_hh_attr is not False:
+            self.weight_hh = self.create_parameter(
+                (hidden_size, hidden_size),
+                weight_hh_attr,
+                default_initializer=I.Uniform(-std, std),
+            )
+        else:
+            self.weight_hh = self.create_parameter(
+                (hidden_size, hidden_size),
+                None,
+                default_initializer=I.Constant(1.0),
+            )
+            self.weight_hh.stop_gradient = True
+
+        if bias_ih_attr is not False:
+            self.bias_ih = self.create_parameter(
+                (hidden_size,),
+                bias_ih_attr,
+                is_bias=True,
+                default_initializer=I.Uniform(-std, std),
+            )
+        else:
+            self.bias_ih = self.create_parameter(
+                (hidden_size,),
+                None,
+                is_bias=True,
+                default_initializer=I.Constant(0.0),
+            )
+            self.bias_ih.stop_gradient = True
+
+        if bias_hh_attr is not False:
+            self.bias_hh = self.create_parameter(
+                (hidden_size,),
+                bias_hh_attr,
+                is_bias=True,
+                default_initializer=I.Uniform(-std, std),
+            )
+        else:
+            self.bias_hh = self.create_parameter(
+                (hidden_size,),
+                None,
+                is_bias=True,
+                default_initializer=I.Constant(0.0),
+            )
+            self.bias_hh.stop_gradient = True
 
         self.input_size = input_size
         self.hidden_size = hidden_size
@@ -935,28 +972,62 @@ def __init__(
                 )
             )
         std = 1.0 / math.sqrt(hidden_size)
-        self.weight_ih = self.create_parameter(
-            (4 * hidden_size, input_size),
-            weight_ih_attr,
-            default_initializer=I.Uniform(-std, std),
-        )
-        self.weight_hh = self.create_parameter(
-            (4 * hidden_size, hidden_size),
-            weight_hh_attr,
-            default_initializer=I.Uniform(-std, std),
-        )
-        self.bias_ih = self.create_parameter(
-            (4 * hidden_size,),
-            bias_ih_attr,
-            is_bias=True,
-
default_initializer=I.Uniform(-std, std), - ) - self.bias_hh = self.create_parameter( - (4 * hidden_size,), - bias_hh_attr, - is_bias=True, - default_initializer=I.Uniform(-std, std), - ) + if weight_ih_attr is not False: + self.weight_ih = self.create_parameter( + (4 * hidden_size, input_size), + weight_ih_attr, + default_initializer=I.Uniform(-std, std), + ) + else: + self.weight_ih = self.create_parameter( + (4 * hidden_size, input_size), + None, + default_initializer=I.Constant(1.0), + ) + self.weight_ih.stop_gradient = True + if weight_hh_attr is not False: + self.weight_hh = self.create_parameter( + (4 * hidden_size, hidden_size), + weight_hh_attr, + default_initializer=I.Uniform(-std, std), + ) + else: + self.weight_hh = self.create_parameter( + (4 * hidden_size, hidden_size), + None, + default_initializer=I.Constant(1.0), + ) + self.weight_hh.stop_gradient = True + if bias_ih_attr is not False: + self.bias_ih = self.create_parameter( + (4 * hidden_size,), + bias_ih_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std), + ) + else: + self.bias_ih = self.create_parameter( + (4 * hidden_size,), + None, + is_bias=True, + default_initializer=I.Constant(0.0), + ) + self.bias_ih.stop_gradient = True + if bias_hh_attr is not False: + self.bias_hh = self.create_parameter( + (4 * hidden_size,), + bias_hh_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std), + ) + else: + self.bias_hh = self.create_parameter( + (4 * hidden_size,), + None, + is_bias=True, + default_initializer=I.Constant(0.0), + ) + self.bias_hh.stop_gradient = True self.hidden_size = hidden_size self.input_size = input_size @@ -1094,28 +1165,64 @@ def __init__( ) ) std = 1.0 / math.sqrt(hidden_size) - self.weight_ih = self.create_parameter( - (3 * hidden_size, input_size), - weight_ih_attr, - default_initializer=I.Uniform(-std, std), - ) - self.weight_hh = self.create_parameter( - (3 * hidden_size, hidden_size), - weight_hh_attr, - default_initializer=I.Uniform(-std, std), - ) - self.bias_ih = self.create_parameter( - (3 * hidden_size,), - bias_ih_attr, - is_bias=True, - default_initializer=I.Uniform(-std, std), - ) - self.bias_hh = self.create_parameter( - (3 * hidden_size,), - bias_hh_attr, - is_bias=True, - default_initializer=I.Uniform(-std, std), - ) + if weight_ih_attr is not False: + self.weight_ih = self.create_parameter( + (3 * hidden_size, input_size), + weight_ih_attr, + default_initializer=I.Uniform(-std, std), + ) + else: + self.weight_ih = self.create_parameter( + (3 * hidden_size, input_size), + None, + default_initializer=I.Constant(1.0), + ) + self.weight_ih.stop_gradient = True + if weight_hh_attr is not False: + self.weight_hh = self.create_parameter( + (3 * hidden_size, hidden_size), + weight_hh_attr, + default_initializer=I.Uniform(-std, std), + ) + else: + self.weight_hh = self.create_parameter( + (3 * hidden_size, hidden_size), + None, + default_initializer=I.Constant(1.0), + ) + self.weight_hh.stop_gradient = True + + if bias_ih_attr is not False: + self.bias_ih = self.create_parameter( + (3 * hidden_size,), + bias_ih_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std), + ) + else: + self.bias_ih = self.create_parameter( + (3 * hidden_size,), + None, + is_bias=True, + default_initializer=I.Constant(0.0), + ) + self.bias_ih.stop_gradient = True + + if bias_hh_attr is not False: + self.bias_hh = self.create_parameter( + (3 * hidden_size,), + bias_hh_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std), + ) + else: + self.bias_hh = self.create_parameter( + 
(3 * hidden_size,),
+                None,
+                is_bias=True,
+                default_initializer=I.Constant(0.0),
+            )
+            self.bias_hh.stop_gradient = True
 
         self.hidden_size = hidden_size
         self.input_size = input_size
diff --git a/python/paddle/nn/quant/__init__.py b/python/paddle/nn/quant/__init__.py
index 4962aacb4a5bd3..85d9650ce400f6 100644
--- a/python/paddle/nn/quant/__init__.py
+++ b/python/paddle/nn/quant/__init__.py
@@ -25,8 +25,15 @@
 from .quantized_linear import weight_only_linear  # noqa: F401
 from .quantized_linear import llm_int8_linear  # noqa: F401
 from .quantized_linear import weight_quantize  # noqa: F401
+from .quantized_linear import weight_dequantize  # noqa: F401
 from .quant_layers import QuantStub  # noqa: F401
 from . import qat
 from .stub import Stub
 
-__all__ = ["Stub", "weight_only_linear", "llm_int8_linear", "weight_quantize"]
+__all__ = [
+    "Stub",
+    "weight_only_linear",
+    "llm_int8_linear",
+    "weight_quantize",
+    "weight_dequantize",
+]
diff --git a/python/paddle/nn/quant/quantized_linear.py b/python/paddle/nn/quant/quantized_linear.py
index 803135ff9f5a6a..8f962da6b6766c 100644
--- a/python/paddle/nn/quant/quantized_linear.py
+++ b/python/paddle/nn/quant/quantized_linear.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 from paddle import _C_ops
+from paddle.base.data_feeder import check_dtype
+from paddle.base.framework import convert_np_dtype_to_dtype_
 from paddle.framework import LayerHelper, in_dynamic_mode
 
 
@@ -22,25 +24,26 @@ def weight_quantize(x, algo="weight_only_int8"):
 
     Args:
         x (Tensor): The input Tensor to be quantized, the data type is float16 or bfloat16.
-        algo (str|None): The algo that is x will be apply, must be one of 'weight_only_int8',
+        algo (str): The algorithm to be applied to x, must be one of 'weight_only_int8',
            'weight_only_int4' and 'llm.int8', default: 'weight_only_int8'.
 
     Returns:
-        out (Tensor): The Tensor which is the quantitative results, the data type is the same as that of x.
+        out (Tensor): The quantized result Tensor, the data type is int8, the shape is the transpose of x's shape.
        scale (Tensor): The scale Tensor which is the scale of pre-channel, the data type is float32.
 
     Examples:
        .. code-block:: python
 
-            import paddle
-            import numpy as np
-            from paddle.nn.quant import weight_quantize
-
-            paddle.device.set_device("cpu")
-            x = np.random.randn(64, 32).astype('float16')
-            x = paddle.to_tensor(x, dtype=paddle.float16, place=paddle.CPUPlace())
-            out, scale = weight_quantize(x, algo='weight_only_int8')
-            print(out.shape) # [32, 64]
-            print(scale.shape) # [32]
+            >>> # doctest: +SKIP('No testing required')
+            >>> import paddle
+            >>> from paddle.nn.quant import weight_quantize
+
+            >>> paddle.seed(2023)
+            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
+            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
+            >>> print(out.shape)
+            [32, 64]
+            >>> print(scale.shape)
+            [32]
     """
     if in_dynamic_mode():
@@ -60,6 +63,52 @@ def weight_quantize(x, algo="weight_only_int8"):
     return (out, scale)
 
 
+def weight_dequantize(x, scale, algo="weight_only_int8", out_dtype='float16'):
+    """
+    Dequantization function for weight_only and llm.int8's weight.
+
+    Args:
+        x (Tensor): The input Tensor to be dequantized, the data type is int8.
+        scale (Tensor): The scale Tensor which is the output of weight_quantize, the data type is float32.
+        algo (str): The algorithm to be applied to x, must be one of 'weight_only_int8',
+            'weight_only_int4' and 'llm.int8', default: 'weight_only_int8'.
+        out_dtype (str|np.dtype): The output Tensor's data type, must be one of 'float16' and 'bfloat16', default: 'float16'.
+
+    Returns:
+        out (Tensor): The dequantized result Tensor, the data type is float16 or bfloat16, the shape is the transpose of x's shape.
+
+    Examples:
+        .. code-block:: python
+
+            >>> # doctest: +SKIP('No testing required')
+            >>> import paddle
+            >>> from paddle.nn.quant import weight_quantize, weight_dequantize
+
+            >>> paddle.seed(2023)
+            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
+            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
+            >>> x_dequant = weight_dequantize(out, scale)
+    """
+    check_dtype(
+        out_dtype, 'out_dtype', ['float16', 'bfloat16'], 'weight_dequantize'
+    )
+    out_dtype = convert_np_dtype_to_dtype_(out_dtype)
+    if in_dynamic_mode():
+        return _C_ops.weight_dequantize(x, scale, algo, out_dtype)
+    else:
+        type = "weight_dequantize"
+        helper = LayerHelper(type, **locals())
+        out = helper.create_variable_for_type_inference(out_dtype)
+
+        helper.append_op(
+            type=type,
+            inputs={"x": x, "scale": scale},
+            outputs={'out': out},
+            attrs={"algo": algo, "out_dtype": out_dtype},
+        )
+        return out
+
+
 def weight_only_linear(
     x,
     weight,
@@ -84,17 +133,18 @@ def weight_only_linear(
     Examples:
         .. code-block:: python
 
-            # required: gpu
-            import paddle
-            from paddle.nn.quant import weight_only_linear
-
-            x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
-            weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
-            scale = paddle.randn([32], dtype='float32')
-            bias = paddle.cast(paddle.randn([32]), dtype='float16')
-            if paddle.device.cuda.get_device_capability()[0] >= 8:
-                out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8')
-                print(out.shape) # [1, 2, 32]
+            >>> # doctest: +SKIP('No testing required')
+            >>> import paddle
+            >>> from paddle.nn.quant import weight_only_linear
+
+            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
+            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
+            >>> scale = paddle.randn([32], dtype='float32')
+            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
+            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
+            ...     out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8')
+            ...     print(out.shape)
+            [1, 2, 32]
     """
     if in_dynamic_mode():
         out = _C_ops.weight_only_linear(
             x,
@@ -102,6 +152,9 @@ def weight_only_linear(
         )
         return out
     else:
+        check_dtype(
+            weight_dtype, 'weight_dtype', ['int8', 'int4'], 'weight_only_linear'
+        )
         type = "weight_only_linear"
         helper = LayerHelper(type, **locals())
         dtype = x.dtype
@@ -111,7 +164,7 @@ def weight_only_linear(
             'weight': [weight],
             'weight_scale': [weight_scale],
         }
-        if bias:
+        if bias is not None:
             inputs["bias"] = [bias]
 
         attrs = {'weight_dtype': weight_dtype}
@@ -151,17 +204,18 @@ def llm_int8_linear(
     Examples:
        ..
code-block:: python - # required: gpu - import paddle - from paddle.nn.quant import llm_int8_linear - - x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16') - weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8') - scale = paddle.randn([32], dtype='float32') - bias = paddle.cast(paddle.randn([32]), dtype='float16') - if paddle.device.cuda.get_device_capability()[0] >= 8: - out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0) - print(out.shape) # [1, 2, 32] + >>> # doctest: +SKIP('No testing required') + >>> import paddle + >>> from paddle.nn.quant import llm_int8_linear + + >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16') + >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8') + >>> scale = paddle.randn([32], dtype='float32') + >>> bias = paddle.cast(paddle.randn([32]), dtype='float16') + >>> if paddle.device.cuda.get_device_capability()[0] >= 8: + ... out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0) + ... print(out.shape) + [1, 2, 32] """ if in_dynamic_mode(): out = _C_ops.llm_int8_linear(x, weight, bias, weight_scale, threshold) diff --git a/python/paddle/nn/quant/stub.py b/python/paddle/nn/quant/stub.py index 487db44a09b39e..7e75889a4a037c 100644 --- a/python/paddle/nn/quant/stub.py +++ b/python/paddle/nn/quant/stub.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + """ Define stub used in quantization.""" from paddle.nn import Layer diff --git a/python/paddle/nn/utils/__init__.py b/python/paddle/nn/utils/__init__.py index d1645deb905a98..2d255055d8cf56 100644 --- a/python/paddle/nn/utils/__init__.py +++ b/python/paddle/nn/utils/__init__.py @@ -22,7 +22,7 @@ from .clip_grad_norm_ import clip_grad_norm_ # noqa: F401 from .clip_grad_value_ import clip_grad_value_ # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'weight_norm', 'remove_weight_norm', 'spectral_norm', diff --git a/python/paddle/nn/utils/transform_parameters.py b/python/paddle/nn/utils/transform_parameters.py index 7cb628565cff95..8db65d61bb5bac 100644 --- a/python/paddle/nn/utils/transform_parameters.py +++ b/python/paddle/nn/utils/transform_parameters.py @@ -121,6 +121,7 @@ def parameters_to_vector(parameters, name=None): ) for i, param in enumerate(parameters): _inplace_reshape_dygraph(param, origin_shapes[i]) + out.stop_gradient = False return out diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index 7d9737dc7da1f0..af865739052736 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -25,7 +25,7 @@ from .lbfgs import LBFGS # noqa: F401 from . import lr # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'Optimizer', 'Adagrad', 'Adam', diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 6fb777447f8a1a..37a46f53707f11 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -28,7 +28,7 @@ ) from paddle.base.layer_helper import LayerHelper -__all__ = [ # noqa +__all__ = [ 'LRScheduler', 'NoamDecay', 'PiecewiseDecay', @@ -45,6 +45,7 @@ 'MultiplicativeDecay', 'OneCycleLR', 'CyclicLR', + 'LinearLR', ] @@ -2229,6 +2230,125 @@ def get_lr(self): return lr +class LinearLR(LRScheduler): + r""" + Set the learning rate according to linear scheduler. + The learning rate will be firstly multiplied by start_factor and linearly increase to end learning rate. 
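+    The multiplying factor at epoch ``t`` is
+    ``start_factor + (end_factor - start_factor) * min(t, total_steps) / total_steps``,
+    so the learning rate changes linearly from ``start_factor * learning_rate``
+    to ``end_factor * learning_rate`` within ``total_steps`` steps and is kept
+    constant afterwards.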
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        total_steps (int): Number of iterations over which the learning rate reaches the end learning rate.
+        start_factor (float): Start learning rate is defined by `start_factor * learning_rate`. Default: 1./3.
+        end_factor (float): End learning rate is defined by `end_factor * learning_rate`. Default: 1.0.
+        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, meaning the initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``.
+
+    Returns:
+        ``LinearLR`` instance to schedule learning rate.
+
+    Examples:
+        .. code-block:: python
+            :name: code-dynamic
+
+            >>> # Example1: train on default dynamic graph mode
+            >>> import paddle
+            >>> import numpy as np
+
+            >>> linear = paddle.nn.Linear(10, 10)
+            >>> scheduler = paddle.optimizer.lr.LinearLR(learning_rate=0.5, total_steps=5, verbose=True)
+            >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
+            >>> for epoch in range(5):
+            ...     for batch_id in range(20):
+            ...         x = paddle.uniform([10, 10])
+            ...         out = linear(x)
+            ...         loss = paddle.mean(out)
+            ...         loss.backward()
+            ...         sgd.step()
+            ...         sgd.clear_gradients()
+            ...     scheduler.step()
+
+        .. code-block:: python
+            :name: code-static
+
+            >>> # Example2: train on static graph mode
+            >>> import paddle
+            >>> import numpy as np
+            >>> paddle.enable_static()
+            >>> main_prog = paddle.static.Program()
+            >>> start_prog = paddle.static.Program()
+            >>> with paddle.static.program_guard(main_prog, start_prog):
+            ...     x = paddle.static.data(name='x', shape=[None, 4, 5])
+            ...     y = paddle.static.data(name='y', shape=[None, 4, 5])
+            ...     z = paddle.static.nn.fc(x, 100)
+            ...     loss = paddle.mean(z)
+            ...     scheduler = paddle.optimizer.lr.LinearLR(learning_rate=0.5,
+            ...         total_steps=5, verbose=True)
+            ...     sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+            ...     sgd.minimize(loss)
+            ...
+            >>> exe = paddle.static.Executor()
+            >>> exe.run(start_prog)
+            >>> for epoch in range(5):
+            ...     for batch_id in range(20):
+            ...         out = exe.run(
+            ...             main_prog,
+            ...             feed={
+            ...                 'x': np.random.randn(3, 4, 5).astype('float32'),
+            ...                 'y': np.random.randn(3, 4, 5).astype('float32')
+            ...             },
+            ...             fetch_list=loss.name)
+            ...     scheduler.step()
+    """
+
+    def __init__(
+        self,
+        learning_rate,
+        total_steps,
+        start_factor=1.0 / 3,
+        end_factor=1.0,
+        last_epoch=-1,
+        verbose=False,
+    ):
+        if start_factor > 1.0 or start_factor <= 0:
+            raise ValueError(
+                "`start_factor` must be greater than 0 and less than or equal to 1, but got {}".format(
+                    start_factor
+                )
+            )
+
+        if end_factor > 1.0 or end_factor < 0:
+            raise ValueError(
+                "`end_factor` must be greater than or equal to 0 and less than or equal to 1, but got {}".format(
+                    end_factor
+                )
+            )
+
+        if total_steps <= 0:
+            raise ValueError(
+                f"`total_steps` must be greater than 0, but got {total_steps}"
+            )
+
+        self.start_factor = start_factor
+        self.end_factor = end_factor
+        self.total_steps = total_steps
+
+        super().__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        if self.last_epoch == 0:
+            return self.base_lr * self.start_factor
+        elif self.last_epoch > self.total_steps:
+            return self.last_lr
+        else:
+            base_lr = self.total_steps * self.start_factor
+            cur_factor = self.end_factor - self.start_factor
+            factor = 1.0 + cur_factor / (
+                base_lr + (self.last_epoch - 1) * cur_factor
+            )
+            return self.last_lr * factor
+
+
 def autoincreased_step_counter(counter_name=None, begin=1, step=1):
     """
     :api_attr: Static Graph
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 2e1314a3a1536d..f25a5bf9f771b3 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -312,11 +312,11 @@ def state_dict(self):
         Examples:
             .. code-block:: python
 
-                import paddle
-                emb = paddle.nn.Embedding(10, 10)
+                >>> import paddle
+                >>> emb = paddle.nn.Embedding(10, 10)
 
-                adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
-                state_dict = adam.state_dict()
+                >>> adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
+                >>> state_dict = adam.state_dict()
         '''
 
         state_dict = {}
@@ -1243,7 +1243,7 @@ def backward(
             loss (Tensor): ``loss`` tensor to run optimizations.
             startup_program (Program, optional): :ref:`api_paddle_static_Program` for
                 initializing parameters in ``parameters``. The default value
-                is None, at this time :ref:`api_base_default_startup_program` will be used.
+                is None, at this time :ref:`api_paddle_static_default_startup_program` will be used.
             parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update
                 to minimize ``loss``. The default value is None, at this time all parameters
                 will be updated.
@@ -1306,6 +1306,16 @@ def backward(
             parameter_list = parameters if parameters else self._parameter_list
             with paddle.static.program_guard(program, startup_program):
                 if in_pir_mode():
+                    if parameter_list is None:
+                        # all parameters will be updated.
+                        program_all_params = (
+                            program.global_block().all_parameters()
+                        )
+                        parameter_list = [
+                            param
+                            for param in program_all_params
+                            if param.stop_gradient is False
+                        ]
                     params_grads = []
                     grads = paddle.autograd.ir_backward.grad(
                         loss, parameter_list, no_grad_vars=act_no_grad_set
@@ -1604,7 +1614,7 @@ def minimize(
             loss (Tensor): A ``Tensor`` containing the value to minimize.
             startup_program (Program, optional): :ref:`api_paddle_static_Program` for
                 initializing parameters in ``parameters``. The default value
-                is None, at this time :ref:`api_base_default_startup_program` will be used.
+                is None, at this time :ref:`api_paddle_static_default_startup_program` will be used.
             parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update
                 to minimize ``loss``. The default value is None, at this time all parameters
                 will be updated.
diff --git a/python/paddle/pir/__init__.py b/python/paddle/pir/__init__.py
index 39b8c71ca5a2f4..145eb103918bf2 100644
--- a/python/paddle/pir/__init__.py
+++ b/python/paddle/pir/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle.base.libpaddle.pir import (
+from paddle.base.libpaddle.pir import (  # noqa: F401
     Program,
     Block,
     Operation,
@@ -22,9 +22,10 @@
     fake_op_result,
     is_fake_op_result,
     Type,
-)  # noqa: F401
-from paddle.base.libpaddle.pir import (
+)
+from paddle.base.libpaddle.pir import (  # noqa: F401
     translate_to_new_ir,
+    translate_to_new_ir_with_param_map,
     set_global_program,
     set_insertion_point,
     reset_insertion_point_to_start,
@@ -32,8 +33,10 @@
     check_unregistered_ops,
     register_paddle_dialect,
     PassManager,
-)  # noqa: F401
+)
 
 from . import core
+from .math_op_patch import monkey_patch_opresult
+
 
 __all__ = []
diff --git a/python/paddle/pir/math_op_patch.py b/python/paddle/pir/math_op_patch.py
new file mode 100644
index 00000000000000..6f0acfaedbbbb1
--- /dev/null
+++ b/python/paddle/pir/math_op_patch.py
@@ -0,0 +1,417 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import warnings
+
+from paddle.base.libpaddle import DataType
+
+from . import OpResult
+
+_already_patch_opresult = False
+
+_supported_int_dtype_ = [
+    DataType.BOOL,
+    DataType.UINT8,
+    DataType.INT8,
+    DataType.INT16,
+    DataType.INT32,
+    DataType.INT64,
+]
+
+
+def create_tensor_with_batchsize(ref_var, value, dtype):
+    assert isinstance(ref_var, OpResult)
+    value = float(value)
+    batch_dim = -1
+    out_shape = []
+    for i, d in enumerate(ref_var.shape):
+        if d < 0:
+            if batch_dim < 0:
+                batch_dim = i
+                out_shape.append(d)
+            else:
+                out_shape.append(1)
+        else:
+            out_shape.append(d)
+    assert batch_dim != -1
+    from paddle import _C_ops
+    from paddle.framework import core
+
+    out = _C_ops.full_batch_size_like(
+        ref_var, out_shape, dtype, value, batch_dim, batch_dim, core.Place()
+    )
+    out.stop_gradient = True
+
+    return out
+
+
+def monkey_patch_opresult():
+    def safe_get_dtype(var):
+        try:
+            dtype = var.dtype
+        except Exception as e:
+            raise ValueError("Cannot get data type from var") from e
+        return dtype
+
+    def place(self):
+        """
+        OpResult doesn't have a 'place' interface in static graph mode,
+        but this interface can greatly facilitate dy2static,
+        so we give a warning here and return None.
+        """
+        warnings.warn(
+            "OpResult does not have a 'place' interface in pir graph mode, try not to use it. None will be returned."
+        )
+
+    @property
+    def _ndim(self):
+        """
+        Returns the dimension of current OpResult
+
+        Returns:
+            the dimension
+
+        Examples:
+            .. code-block:: python
+
+                >>> import paddle
+
+                >>> paddle.enable_static()
+
+                >>> # create a static OpResult
+                >>> x = paddle.static.data(name='x', shape=[3, 2, 1])
+                >>> # print the dimension of the OpResult
+                >>> print(x.ndim)
+                3
+        """
+        return len(self.shape)
+
+    def ndimension(self):
+        """
+        Returns the dimension of current OpResult
+
+        Returns:
+            the dimension
+
+        Examples:
+            .. code-block:: python
+
+                >>> import paddle
+
+                >>> paddle.enable_static()
+
+                >>> # create a static OpResult
+                >>> x = paddle.static.data(name='x', shape=[3, 2, 1])
+                >>> # print the dimension of the OpResult
+                >>> print(x.ndimension())
+                3
+        """
+        return len(self.shape)
+
+    def dim(self):
+        """
+        Returns the dimension of current OpResult
+
+        Returns:
+            the dimension
+
+        Examples:
+            .. code-block:: python
+
+                >>> import paddle
+
+                >>> paddle.enable_static()
+
+                >>> # create a static OpResult
+                >>> x = paddle.static.data(name='x', shape=[3, 2, 1])
+                >>> # print the dimension of the OpResult
+                >>> print(x.dim())
+                3
+        """
+        return len(self.shape)
+
+    def _item(self):
+        """
+        In order to be compatible with the item interface introduced by the dynamic graph, it does nothing but return self.
+        It checks that the input must be a 1-D OpResult.
+        """
+        if len(self.shape) > 1:
+            raise TypeError(
+                f"Required input var should be 1-D OpResult, but received {self.shape}"
+            )
+        return self
+
+    def astype(self, dtype):
+        """
+        **Notes**:
+
+        Cast an OpResult to a specified data type.
+
+        Args:
+
+            self(OpResult): The source OpResult
+
+            dtype: The target data type
+
+        Returns:
+            OpResult: OpResult with new dtype
+
+        Examples:
+            In Static Graph Mode:
+
+            .. code-block:: python
+
+                >>> import paddle
+                >>> paddle.enable_static()
+                >>> startup_prog = paddle.static.Program()
+                >>> main_prog = paddle.static.Program()
+                >>> with paddle.static.program_guard(startup_prog, main_prog):
+                ...     original_value = paddle.static.data(name = "new_value", shape=[2,2], dtype='float32')
+                ...     new_value = original_value.astype('int64')
+                ...     print("new value's dtype is: {}".format(new_value.dtype))
+                ...
+                new value's dtype is: paddle.int64
+
+        """
+        from paddle import _C_ops
+
+        if not isinstance(dtype, DataType):
+            dtype = paddle.pir.core.convert_np_dtype_to_dtype_(dtype)
+        return _C_ops.cast(self, dtype)
+
+    def _scalar_add_(var, value):
+        return paddle.scale(var, 1.0, value)
+
+    def _scalar_sub_(var, value):
+        return paddle.scale(var, 1.0, -value)
+
+    def _scalar_rsub_(var, value):
+        return paddle.scale(var, -1.0, value)
+
+    def _scalar_mul_(var, value):
+        return paddle.scale(var, value, 0.0)
+
+    def _scalar_div_(var, value):
+        return paddle.scale(var, 1.0 / value, 0.0)
+
+    def _binary_creator_(
+        method_name,
+        python_api,
+        reverse=False,
+        scalar_method=None,
+    ):
+        def __impl__(self, other_var):
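+            # Overview: (1) fast-path Python scalars via `paddle.scale` when
+            # possible, (2) otherwise materialize the scalar as an OpResult,
+            # (3) unify dtypes and call `python_api`.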
+            # 1. scalar exists cases
+            # we need to combine the tensor.dtype and scalar.dtype and cast to the correct type
+            if isinstance(other_var, float):
+                # in all cases(+, -, *, /, **, //, %), we need cast tensor.dtype to float
+                if self.dtype in _supported_int_dtype_:
+                    self = astype(self, DataType.FLOAT32)
+                # here use `scale` replace `elementwise` to get better performance
+                # but only +, -, *, / can use this method
+                if scalar_method is not None:
+                    return scalar_method(self, other_var)
+            elif isinstance(other_var, int):
+                # in all cases(+, -, *, /, **, //, %), we can cast it to float
+                # because the output tensor.dtype depend on the type of input tensor
+                other_var = float(other_var)
+                # division is a special case
+                # NOTE(chenweihang): because we cast tensor to float32 instead float64,
+                # the division result can only guarantee the numerical accuracy of 6 digits
+                # after the decimal point. The result of numpy calculation is of float64 type,
+                # so the calculation result here and the calculation result of numpy are
+                # different after 6 decimal point. If necessary, we can also use float64 here.
+                # torch's behavior here is consistent with ours
+                if (
+                    python_api == paddle.divide
+                    and self.dtype in _supported_int_dtype_
+                ):
+                    self = paddle.cast(self, DataType.FLOAT32)
+                # here use `scale` replace `elementwise` to get better performance
+                # but only +, -, *, / can use this method
+                if scalar_method is not None:
+                    return scalar_method(self, other_var)
+            else:
+                # do nothing
+                pass
+
+            # 2. create OpResult for scalar
+            lhs_dtype = safe_get_dtype(self)
+            other_var_opresult = other_var
+            if not isinstance(other_var, OpResult):
+                if reverse:
+                    for elem in self.shape:
+                        if elem < 0:
+                            other_var_opresult = create_tensor_with_batchsize(
+                                self, other_var, lhs_dtype
+                            )
+
+                            break
+                    else:
+                        # when break is not triggered, enter the else branch
+                        other_var_opresult = paddle.fill_constant(
+                            self.shape,
+                            lhs_dtype,
+                            other_var,
+                        )
+                else:
+                    # add fill_op to current_block
+                    other_var_opresult = paddle.fill_constant(
+                        [],
+                        lhs_dtype,
+                        other_var,
+                    )
+
+            # 3. unify right var type to left var
+            rhs_dtype = safe_get_dtype(other_var_opresult)
+            if lhs_dtype != rhs_dtype:
+                other_var_opresult = paddle.cast(other_var_opresult, lhs_dtype)
+            if reverse:
+                tmp = self
+                self = other_var_opresult
+                other_var_opresult = tmp
+
+            if (
+                python_api == paddle.divide
+            ) and self.dtype in _supported_int_dtype_:
+                self = paddle.cast(self, DataType.FLOAT32)
+                other_var_opresult = paddle.cast(
+                    other_var_opresult, DataType.FLOAT32
+                )
+
+            out = python_api(self, other_var_opresult)
+            return out
+
+        __impl__.__doc__ = """
+        Args:
+            self(OpResult): left hand OpResult
+            other_var(OpResult|float|int): right hand OpResult
+
+        Returns:
+            OpResult
+        """
+        __impl__.__name__ = method_name
+        return __impl__
+
+    import paddle
+
+    opresult_methods = [
+        ('place', place),
+        ('item', _item),
+        ('dim', dim),
+        ('ndimension', ndimension),
+        ('ndim', _ndim),
+        ('astype', astype),
+        (
+            '__add__',
+            _binary_creator_('__add__', paddle.tensor.add, False, _scalar_add_),
+        ),
+        # a+b == b+a. Do not need to reverse explicitly
+        (
+            '__radd__',
+            _binary_creator_(
+                '__radd__', paddle.tensor.add, False, _scalar_add_
+            ),
+        ),
+        (
+            '__sub__',
+            _binary_creator_(
+                '__sub__', paddle.tensor.subtract, False, _scalar_sub_
+            ),
+        ),
+        (
+            '__rsub__',
+            _binary_creator_(
+                '__rsub__', paddle.tensor.subtract, True, _scalar_rsub_
+            ),
+        ),
+        (
+            '__mul__',
+            _binary_creator_(
+                '__mul__', paddle.tensor.multiply, False, _scalar_mul_
+            ),
+        ),
+        # a*b == b*a.
Do not need to reverse explicitly + ( + '__rmul__', + _binary_creator_( + '__rmul__', paddle.tensor.multiply, False, _scalar_mul_ + ), + ), + ( + '__div__', + _binary_creator_( + '__div__', paddle.tensor.divide, False, _scalar_div_ + ), + ), + ( + '__truediv__', + _binary_creator_( + '__truediv__', paddle.tensor.divide, False, _scalar_div_ + ), + ), + ( + '__rdiv__', + _binary_creator_('__rdiv__', paddle.tensor.divide, True, None), + ), + ( + '__rtruediv__', + _binary_creator_('__rtruediv__', paddle.tensor.divide, True, None), + ), + ( + '__pow__', + _binary_creator_('__pow__', paddle.tensor.pow, False, None), + ), + ( + '__rpow__', + _binary_creator_('__rpow__', paddle.tensor.pow, True, None), + ), + ( + '__floordiv__', + _binary_creator_( + '__floordiv__', paddle.tensor.floor_divide, False, None + ), + ), + ( + '__mod__', + _binary_creator_('__mod__', paddle.tensor.remainder, False, None), + ), + ( + '__matmul__', + _binary_creator_('__matmul__', paddle.tensor.matmul, False, None), + ), + ] + + global _already_patch_opresult + if not _already_patch_opresult: + for method in opresult_methods: + method_name = method[0] + method_impl = method[1] + setattr(OpResult, method_name, method_impl) + + # Handling Tensor Methods + import paddle.tensor + + for method_name in paddle.tensor.tensor_method_func: + if hasattr(OpResult, method_name): + continue + method_impl = getattr(paddle.tensor, method_name, None) + if method_impl: + setattr(OpResult, method_name, method_impl) + + # Handling __getitem__ + from ..base.variable_index import _getitem_static + + OpResult.__getitem__ = _getitem_static + + _already_patch_opresult = True diff --git a/python/paddle/pir_utils.py b/python/paddle/pir_utils.py index 28d261b0155fce..a2b5244cad7c53 100644 --- a/python/paddle/pir_utils.py +++ b/python/paddle/pir_utils.py @@ -13,6 +13,8 @@ # limitations under the License. 
+from functools import wraps + import paddle @@ -64,9 +66,16 @@ def _switch_to_pir(self): {"FLAGS_enable_new_ir_in_executor": True} ) paddle.pir.register_paddle_dialect() - paddle.static.Program = paddle.pir.Program + paddle.base.Program = paddle.pir.Program paddle.base.program_guard = paddle.pir.core.program_guard + # paddle.base.default_main_program = ( + # paddle.pir.core.default_main_program + # ) + # paddle.base.default_startup_program = ( + # paddle.pir.core.default_startup_program + # ) + paddle.static.Program = paddle.pir.Program paddle.static.program_guard = paddle.pir.core.program_guard paddle.static.default_main_program = ( paddle.pir.core.default_main_program @@ -82,9 +91,14 @@ def _switch_to_old_ir(self): paddle.framework.set_flags( {"FLAGS_enable_new_ir_in_executor": False} ) - paddle.static.Program = self.old_Program + paddle.base.Program = self.old_Program paddle.base.program_guard = self.old_program_guard + # paddle.base.default_main_program = self.old_default_main_program + # paddle.base.default_startup_program = ( + # self.old_default_startup_program + # ) + paddle.static.Program = self.old_Program paddle.static.program_guard = self.old_program_guard paddle.static.default_main_program = self.old_default_main_program paddle.static.default_startup_program = ( @@ -95,3 +109,13 @@ def _switch_to_old_ir(self): "IrGuard._switch_to_old_ir only work when paddle.framework.in_pir_mode() is false, \ please set FLAGS_enable_pir_api = false" ) + + +def test_with_pir_api(func): + @wraps(func) + def impl(*args, **kwargs): + func(*args, **kwargs) + with IrGuard(): + func(*args, **kwargs) + + return impl diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 5cf44a3efc7c08..57c4abec6d8d0e 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -79,7 +79,7 @@ from ..base.framework import program_guard # noqa: F401 from ..base.framework import Program # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'append_backward', 'gradients', 'Executor', diff --git a/python/paddle/static/amp/fp16_lists.py b/python/paddle/static/amp/fp16_lists.py index 3023628e9a3892..06630039ca8772 100644 --- a/python/paddle/static/amp/fp16_lists.py +++ b/python/paddle/static/amp/fp16_lists.py @@ -253,7 +253,6 @@ def _update_list(self): 'uniform_random', 'uniform_random_batch_size_like', 'gaussian_random', - 'gaussian_random_batch_size_like', 'slice', 'rank', 'scale', diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py index d144f87ec32cb9..f3693e1501c408 100755 --- a/python/paddle/static/nn/__init__.py +++ b/python/paddle/static/nn/__init__.py @@ -60,7 +60,7 @@ from .control_flow import cond from .static_pylayer import static_pylayer -__all__ = [ # noqa +__all__ = [ 'fc', 'batch_norm', 'bilinear_tensor_product', diff --git a/python/paddle/static/nn/control_flow.py b/python/paddle/static/nn/control_flow.py index 76fa477ca8403c..45f6a7c4bdb7fc 100644 --- a/python/paddle/static/nn/control_flow.py +++ b/python/paddle/static/nn/control_flow.py @@ -1344,7 +1344,7 @@ def check_ret_none(seq_true, seq_false, seq_names): def merge_every_var_list(false_vars, true_vars, name): return map_structure(partial(merge_func, name), false_vars, true_vars) - merged_output = list( + merged_output_fns = list( map( merge_every_var_list, _to_sequence_except_dict(false_output), @@ -1352,6 +1352,7 @@ def merge_every_var_list(false_vars, true_vars, name): _to_sequence_except_dict(return_names), ) ) + merged_output = map_structure(lambda fn: fn(), 
merged_output_fns) merged_output = pack_sequence_as(false_output, flatten(merged_output)) return merged_output @@ -1469,13 +1470,7 @@ def select_input_with_buildin_type(inputs, mask, name): false_var, true_var = inputs - if isinstance(false_var, UndefinedVar) and isinstance( - true_var, UndefinedVar - ): - """None -> UndefinedVar, so the real value is a [None, UndefinedVar] or [None, None], we just return None.""" - return None - - if isinstance(false_var, Variable) and isinstance(true_var, Variable): + def start_select_input(): try: return select_input(inputs, mask) except Exception as e: @@ -1483,11 +1478,20 @@ def select_input_with_buildin_type(inputs, mask, name): f"Exceptions throwed while doing select_input on {name}:\n{e}" ) + if isinstance(false_var, UndefinedVar) and isinstance( + true_var, UndefinedVar + ): + """None -> UndefinedVar, so the real value is a [None, UndefinedVar] or [None, None], we just return None.""" + return lambda: None + + if isinstance(false_var, Variable) and isinstance(true_var, Variable): + return start_select_input + elif isinstance(false_var, support_ret_buildin_type) and isinstance( false_var, type(true_var) ): if false_var == true_var: - return false_var + return lambda: false_var else: inputs = [ to_static_variable(false_var), @@ -1514,12 +1518,6 @@ def select_input_with_buildin_type(inputs, mask, name): isinstance(true_var, UndefinedVar) and isinstance(false_var, (Variable,) + support_ret_buildin_type) ): - - def create_var_if_not_undefined_var(a): - if isinstance(a, UndefinedVar): - return a - return to_static_variable(a) - true_var, false_var = to_static_variable(true_var), to_static_variable( false_var ) @@ -1531,12 +1529,7 @@ def create_var_if_not_undefined_var(a): type(false_var), type(true_var) ) ) - try: - return select_input(inputs, mask) - except Exception as e: - raise RuntimeError( - f"Exceptions throwed while doing select_input on {name}:\n{e}" - ) + return start_select_input def _is_sequence_except_dict(x): @@ -1711,7 +1704,7 @@ def Print( check_variable_and_dtype( input, 'input', - ['float32', 'float64', 'int32', 'int64', 'bool'], + ['uint16', 'float16', 'float32', 'float64', 'int32', 'int64', 'bool'], 'paddle.static.Print', ) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 83546b8f577972..66fbfe7d7ec350 100644 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -329,6 +329,8 @@ from .math import polygamma_ # noqa: F401 from .math import renorm # noqa: F401 from .math import renorm_ # noqa: F401 +from .math import hypot # noqa: F401 +from .math import hypot_ # noqa: F401 from .math import combinations # noqa: F401 from .random import multinomial # noqa: F401 @@ -382,7 +384,7 @@ from ..signal import stft # noqa: F401 # this list used in math_op_patch.py for _binary_creator_ -tensor_method_func = [ # noqa +tensor_method_func = [ 'create_parameter', 'create_tensor', 'matmul', @@ -465,6 +467,8 @@ 'sum', 'nan_to_num', 'nan_to_num_', + 'hypot', + 'hypot_', 'nansum', 'nanmean', 'count_nonzero', @@ -715,37 +719,6 @@ 'acosh_', 'asinh_', 'diag', - 'eye', - 'linspace', - 'fill_constant', - 'ones', - 'ones_like', - 'zeros', - 'zeros_like', - 'arange', - 'full', - 'full_like', - 'meshgrid', - 'empty', - 'empty_like', - 'complex', - 'eigh', - 'standard_normal', - 'normal', - 'uniform', - 'randn', - 'rand', - 'randint', - 'randint_like', - 'randperm', - 'poisson', - 'searchsorted', - 'set_printoptions', - 'array_length', - 'array_read', - 'array_write', - 'create_array', - 'einsum', 
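# The block of entries removed above consists mostly of free functions (creation, random, array and printing helpers) rather than Tensor-first methods, so they are dropped from the list used to bind methods onto Tensor/OpResult.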
'normal_', "combinations", ] diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 918b5f2c01e9cf..e4f0ea824e3a41 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -36,6 +36,7 @@ from ..framework import ( LayerHelper, _current_expected_place, + _current_expected_place_, _get_paddle_place, convert_np_dtype_to_dtype_, core, @@ -651,10 +652,11 @@ def _handle_np_dtype(ndarray, dtype): def _to_tensor_static(data, dtype=None, stop_gradient=None): - if isinstance(data, Variable): + if isinstance(data, (Variable, paddle.pir.OpResult)): output = data if dtype is not None and dtype != data.dtype: output = paddle.cast(output, dtype) + else: if isinstance(data, np.number): # Special case for numpy scalars data = np.array(data) @@ -692,6 +694,9 @@ def _to_tensor_static(data, dtype=None, stop_gradient=None): # fix numpy default dtype if data.dtype in ['float16', 'float32', 'float64']: data = data.astype(paddle.get_default_dtype()) + # Windows default type is 'int32', while Linux/Mac is 'int64'. Unify them. + elif data.dtype in ['int32']: + data = data.astype("int64") if dtype: target_dtype = dtype @@ -701,6 +706,8 @@ def _to_tensor_static(data, dtype=None, stop_gradient=None): target_dtype = paddle.get_default_dtype() target_dtype = convert_dtype(target_dtype) + if data.dtype == "int16": + data = data.astype("int32") output = assign(data) if convert_dtype(output.dtype) != target_dtype: @@ -782,8 +789,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): """ place = _get_paddle_place(place) if place is None: - place = _current_expected_place() - + place = _current_expected_place_() if in_dynamic_mode(): return _to_tensor_non_static(data, dtype, place, stop_gradient) @@ -791,7 +797,6 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): else: re_exp = re.compile(r'[(](.+?)[)]', re.S) place_str = re.findall(re_exp, str(place))[0] - with paddle.static.device_guard(place_str): return _to_tensor_static(data, dtype, stop_gradient) @@ -884,24 +889,34 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): place = _current_expected_place() if force_cpu: place = core.CPUPlace() - if isinstance(shape, (list, tuple)): - shape = paddle.utils.convert_shape_to_list(shape) if not isinstance(dtype, (core.VarDesc.VarType, core.DataType)): dtype = convert_np_dtype_to_dtype_(dtype) + if in_dynamic_mode(): + value = float(value) + if isinstance(shape, (list, tuple)): + shape = paddle.utils.convert_shape_to_list(shape) + + else: + if isinstance(shape, (list, tuple)): + if paddle.utils._contain_var(shape): + shape = paddle.utils.get_int_tensor_list(shape, place) + elif isinstance(shape, paddle.pir.OpResult): + pass + else: + raise TypeError("Shape only supports OpResult, list, or tuple.") + if out is None: - value = float(value) if in_dynamic_mode() else value out = _C_ops.full(shape, value, dtype, place) out.stop_gradient = True return out if out is not None: - value = float(value) if in_dynamic_mode() else value - # final state mode is support out is not None. 
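The new static-graph branch above normalizes a shape list that mixes plain ints with integer Tensors. The dygraph API already accepts the same mix, which this small usage sketch (not part of the patch) illustrates:

import paddle

# One dimension given as a 1-D integer Tensor, the rest as plain ints;
# the _contain_var/get_int_tensor_list branch above handles exactly this
# mixed form for the static-graph path.
dim = paddle.full(shape=[1], fill_value=3, dtype="int32")
out = paddle.full(shape=[2, dim], fill_value=1.5, dtype="float32")
print(out.shape)  # [2, 3]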
_C_ops.full_(out, shape, value, dtype, place) out.stop_gradient = True return out + else: attrs = {'force_cpu': force_cpu} dtype = convert_dtype(dtype) @@ -2264,22 +2279,16 @@ def convert_scalar(x): ) dtype = core.DataType.FLOAT32 - if dtype == core.VarDesc.VarType.BOOL or dtype == core.DataType.BOOL: + if dtype in [core.VarDesc.VarType.BOOL, core.DataType.BOOL]: value_name = "bool_values" values = [int(v) for v in input.flat] - elif ( - dtype == core.VarDesc.VarType.FP32 or dtype == core.DataType.FLOAT32 - ): + elif dtype in [core.VarDesc.VarType.FP32, core.DataType.FLOAT32]: value_name = "fp32_values" values = [float(v) for v in input.flat] - elif ( - dtype == core.VarDesc.VarType.INT32 or dtype == core.DataType.INT32 - ): + elif dtype in [core.VarDesc.VarType.INT32, core.DataType.INT32]: value_name = "int32_values" values = [int(v) for v in input.flat] - elif ( - dtype == core.VarDesc.VarType.INT64 or dtype == core.DataType.INT64 - ): + elif dtype in [core.VarDesc.VarType.INT64, core.DataType.INT64]: value_name = "int64_values" values = [int(v) for v in input.flat] else: diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py index 02ab66eb1da2ad..30574b93baf48a 100644 --- a/python/paddle/tensor/layer_function_generator.py +++ b/python/paddle/tensor/layer_function_generator.py @@ -27,6 +27,7 @@ convert_np_dtype_to_dtype_, core, in_dynamic_mode, + in_dynamic_or_pir_mode, ) __all__ = [] @@ -266,7 +267,7 @@ def generate_activation_fn(op_type): op_proto = OpProtoHolder.instance().get_op_proto(op_type) def func(x, name=None): - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): if hasattr(_C_ops, op_type): op = getattr(_C_ops, op_type) return op(x) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 12d19e7f7b98ef..71016a2208c154 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1143,7 +1143,7 @@ def dot(x, y, name=None): [32, 64]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.dot(x, y) else: op_type = 'dot' @@ -2621,7 +2621,7 @@ def eig(x, name=None): (-0.21026138961315155+0j)]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.eig(x) else: check_variable_and_dtype( @@ -2692,7 +2692,7 @@ def eigvals(x, name=None): ) ) - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.eigvals(x) else: check_variable_and_dtype( diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 0deeefcc15c745..9b50993b891667 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -718,7 +718,7 @@ def greater_than(x, y, name=None): Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True, [False, False, True ]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.greater_than(x, y) else: check_variable_and_dtype( @@ -807,7 +807,7 @@ def less_equal(x, y, name=None): Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True, [True , True , False]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.less_equal(x, y) else: check_variable_and_dtype( @@ -896,7 +896,7 @@ def less_than(x, y, name=None): Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True, [False, True , False]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.less_than(x, y) else: check_variable_and_dtype( @@ -985,7 +985,7 @@ def not_equal(x, y, name=None): Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True, 
[False, True , True ]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.not_equal(x, y) else: check_variable_and_dtype( diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index e6f1484db154c9..ae61880c997bed 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -112,15 +112,15 @@ def tensor_array_to_tensor(input, axis=1, use_stack=False, name=None): Examples: .. code-block:: python - import numpy - import paddle - x0 = paddle.assign(numpy.random.rand(2, 2).astype("float32")) - x1 = paddle.assign(numpy.random.rand(2, 2).astype("float32")) - i = paddle.full(shape=[1], dtype="int64", fill_value=0) - array = paddle.tensor.array.create_array(dtype='float32') - paddle.tensor.array.array_write(x0, i, array) - paddle.tensor.array.array_write(x1, i + 1, array) - output, output_index = paddle.tensor.manipulation.tensor_array_to_tensor(input=array) + >>> import numpy + >>> import paddle + >>> x0 = paddle.assign(numpy.random.rand(2, 2).astype("float32")) + >>> x1 = paddle.assign(numpy.random.rand(2, 2).astype("float32")) + >>> i = paddle.full(shape=[1], dtype="int64", fill_value=0) + >>> array = paddle.tensor.array.create_array(dtype='float32') + >>> paddle.tensor.array.array_write(x0, i, array) + >>> paddle.tensor.array.array_write(x1, i + 1, array) + >>> output, output_index = paddle.tensor.manipulation.tensor_array_to_tensor(input=array) """ if in_dynamic_mode(): assert isinstance( @@ -175,10 +175,10 @@ def cast(x, dtype): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([2, 3, 4], 'float64') - y = paddle.cast(x, 'uint8') + >>> x = paddle.to_tensor([2, 3, 4], 'float64') + >>> y = paddle.cast(x, 'uint8') """ if not isinstance(dtype, (core.VarDesc.VarType, core.DataType)): dtype = convert_np_dtype_to_dtype_(dtype) @@ -295,24 +295,24 @@ def slice(input, axes, starts, ends): Examples: .. code-block:: python - import paddle - - input = paddle.rand(shape=[4, 5, 6], dtype='float32') - # example 1: - # attr starts is a list which doesn't contain tensor. - axes = [0, 1, 2] - starts = [-3, 0, 2] - ends = [3, 2, 4] - sliced_1 = paddle.slice(input, axes=axes, starts=starts, ends=ends) - # sliced_1 is input[1:3, 0:2, 2:4]. - - # example 2: - # attr starts is a list which contain tensor. - minus_3 = paddle.full([1], -3, "int32") - sliced_2 = paddle.slice(input, axes=axes, starts=[minus_3, 0, 2], ends=ends) - # sliced_2 is input[1:3, 0:2, 2:4]. + >>> import paddle + + >>> input = paddle.rand(shape=[4, 5, 6], dtype='float32') + >>> # example 1: + >>> # attr starts is a list which doesn't contain tensor. + >>> axes = [0, 1, 2] + >>> starts = [-3, 0, 2] + >>> ends = [3, 2, 4] + >>> sliced_1 = paddle.slice(input, axes=axes, starts=starts, ends=ends) + >>> # sliced_1 is input[1:3, 0:2, 2:4]. + + >>> # example 2: + >>> # attr starts is a list which contain tensor. + >>> minus_3 = paddle.full([1], -3, "int32") + >>> sliced_2 = paddle.slice(input, axes=axes, starts=[minus_3, 0, 2], ends=ends) + >>> # sliced_2 is input[1:3, 0:2, 2:4]. """ - if in_dynamic_or_pir_mode(): + if in_dynamic_mode(): attrs = () starts_tensor = None ends_tensor = None @@ -357,6 +357,38 @@ def slice(input, axes, starts, ends): infer_flags = [-1 for i in range(len(axes))] return _C_ops.slice(input, axes, starts, ends, infer_flags, []) + elif in_pir_mode(): + if not isinstance(starts, (list, tuple, paddle.pir.OpResult)): + raise ValueError( + "Input starts must be an OpResult, python list or tuple." 
+ ) + if not isinstance(ends, (list, tuple, paddle.pir.OpResult)): + raise ValueError( + "Input ends must be an OpResult, python list or tuple." + ) + infer_flags = [1 for i in range(len(axes))] + # starts + if isinstance(starts, paddle.pir.OpResult): + starts.stop_gradient = True + infer_flags = [-1 for i in range(len(axes))] + elif isinstance(starts, (list, tuple)): + if paddle.utils._contain_var(starts): + for i, dim in enumerate(starts): + if isinstance(dim, paddle.pir.OpResult): + infer_flags[i] = -1 + starts = paddle.utils.get_int_tensor_list(starts) + + # ends + if isinstance(ends, paddle.pir.OpResult): + ends.stop_gradient = True + infer_flags = [-1 for i in range(len(axes))] + elif isinstance(ends, (list, tuple)): + if paddle.utils._contain_var(ends): + for i, dim in enumerate(ends): + if isinstance(dim, paddle.pir.OpResult): + infer_flags[i] = -1 + ends = paddle.utils.get_int_tensor_list(ends) + return _C_ops.slice(input, axes, starts, ends, infer_flags, []) else: if not isinstance(starts, (list, tuple, Variable)): raise ValueError( @@ -467,12 +499,12 @@ def transpose(x, perm, name=None): .. code-block:: python - import paddle + >>> import paddle - x = paddle.randn([2, 3, 4]) - x_transposed = paddle.transpose(x, perm=[1, 0, 2]) - print(x_transposed.shape) - # [3L, 2L, 4L] + >>> x = paddle.randn([2, 3, 4]) + >>> x_transposed = paddle.transpose(x, perm=[1, 0, 2]) + >>> print(x_transposed.shape) + [3, 2, 4] """ if in_dynamic_or_pir_mode(): @@ -544,9 +576,9 @@ def unstack(x, axis=0, num=None): Examples: .. code-block:: python - import paddle - x = paddle.ones(name='x', shape=[2, 3, 5], dtype='float32') # create a tensor with shape=[2, 3, 5] - y = paddle.unstack(x, axis=1) # unstack with second axis, which results 3 tensors with shape=[2, 5] + >>> import paddle + >>> x = paddle.ones(name='x', shape=[2, 3, 5], dtype='float32') # create a tensor with shape=[2, 3, 5] + >>> y = paddle.unstack(x, axis=1) # unstack with second axis, which results 3 tensors with shape=[2, 5] """ if not (-x.ndim <= axis < x.ndim): @@ -617,14 +649,15 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): Examples: .. code-block:: python - import paddle - label = paddle.to_tensor([[16], [1]], "int64") - shard_label = paddle.shard_index(input=label, - index_num=20, - nshards=2, - shard_id=0) - print(shard_label) - # [[-1], [1]] + >>> import paddle + >>> label = paddle.to_tensor([[16], [1]], "int64") + >>> shard_label = paddle.shard_index(input=label, + ... index_num=20, + ... nshards=2, + ... shard_id=0) + >>> print(shard_label.numpy()) + [[-1] + [ 1]] """ if in_dynamic_mode(): return _C_ops.shard_index( @@ -716,29 +749,29 @@ def crop(x, shape=None, offsets=None, name=None): .. code-block:: python - import paddle - x = paddle.to_tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - # x.shape = [3, 3] - # x = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - - # shape can be a 1-D Tensor or list or tuple. - shape = paddle.to_tensor([2, 2], dtype='int32') - # shape = [2, 2] - # shape = (2, 2) - out = paddle.crop(x, shape) - # out.shape = [2, 2] - # out = [[1,2], [4,5]] - - # offsets can be a 1-D Tensor or list or tuple. 
- offsets = paddle.to_tensor([0, 1], dtype='int32') - # offsets = [1, 0] - # offsets = (1, 1) - out = paddle.crop(x, shape, offsets) - # out.shape = [2, 2] - # if offsets = [0, 0], out = [[1,2], [4,5]] - # if offsets = [0, 1], out = [[2,3], [5,6]] - # if offsets = [1, 0], out = [[4,5], [7,8]] - # if offsets = [1, 1], out = [[5,6], [8,9]] + >>> import paddle + >>> x = paddle.to_tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> # x.shape = [3, 3] + >>> # x = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + + >>> # shape can be a 1-D Tensor or list or tuple. + >>> shape = paddle.to_tensor([2, 2], dtype='int32') + >>> # shape = [2, 2] + >>> # shape = (2, 2) + >>> out = paddle.crop(x, shape) + >>> # out.shape = [2, 2] + >>> # out = [[1,2], [4,5]] + + >>> # offsets can be a 1-D Tensor or list or tuple. + >>> offsets = paddle.to_tensor([0, 1], dtype='int32') + >>> # offsets = [1, 0] + >>> # offsets = (1, 1) + >>> out = paddle.crop(x, shape, offsets) + >>> # out.shape = [2, 2] + >>> # if offsets = [0, 0], out = [[1,2], [4,5]] + >>> # if offsets = [0, 1], out = [[2,3], [5,6]] + >>> # if offsets = [1, 0], out = [[4,5], [7,8]] + >>> # if offsets = [1, 1], out = [[5,6], [8,9]] """ @@ -873,12 +906,13 @@ def fill_(x, value): Examples: .. code-block:: python - import paddle + >>> import paddle - tensor = paddle.to_tensor([0, 1, 2, 3, 4]) + >>> tensor = paddle.to_tensor([0, 1, 2, 3, 4]) - tensor.fill_(0) - print(tensor.tolist()) #[0, 0, 0, 0, 0] + >>> tensor.fill_(0) + >>> print(tensor.tolist()) + [0, 0, 0, 0, 0] """ if not isinstance(value, (float, int)): @@ -906,12 +940,13 @@ def zero_(x): Examples: .. code-block:: python - import paddle + >>> import paddle - tensor = paddle.to_tensor([0, 1, 2, 3, 4]) + >>> tensor = paddle.to_tensor([0, 1, 2, 3, 4]) - tensor.zero_() - print(tensor.tolist()) #[0, 0, 0, 0, 0] + >>> tensor.zero_() + >>> print(tensor.tolist()) + [0, 0, 0, 0, 0] """ return _C_ops.fill_(x, 0.0) @@ -937,10 +972,12 @@ def fill_diagonal_(x, value, offset=0, wrap=False, name=None): Examples: .. code-block:: python - import paddle - x = paddle.ones((4, 3)) * 2 - x.fill_diagonal_(1.0) - print(x.tolist()) #[[1.0, 2.0, 2.0], [2.0, 1.0, 2.0], [2.0, 2.0, 1.0], [2.0, 2.0, 2.0]] + + >>> import paddle + >>> x = paddle.ones((4, 3)) * 2 + >>> x.fill_diagonal_(1.0) + >>> print(x.tolist()) + [[1.0, 2.0, 2.0], [2.0, 1.0, 2.0], [2.0, 2.0, 1.0], [2.0, 2.0, 2.0]] """ if in_dynamic_mode(): if len(x.shape) == 2: @@ -1003,12 +1040,13 @@ def fill_diagonal_tensor_(x, y, offset=0, dim1=0, dim2=1, name=None): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.ones((4, 3)) * 2 - y = paddle.ones((3,)) - x.fill_diagonal_tensor_(y) - print(x.tolist()) #[[1.0, 2.0, 2.0], [2.0, 1.0, 2.0], [2.0, 2.0, 1.0], [2.0, 2.0, 2.0]] + >>> x = paddle.ones((4, 3)) * 2 + >>> y = paddle.ones((3,)) + >>> x.fill_diagonal_tensor_(y) + >>> print(x.tolist()) + [[1.0, 2.0, 2.0], [2.0, 1.0, 2.0], [2.0, 2.0, 1.0], [2.0, 2.0, 2.0]] """ return _fill_diagonal_tensor_impl( @@ -1034,12 +1072,13 @@ def fill_diagonal_tensor(x, y, offset=0, dim1=0, dim2=1, name=None): Examples: .. 
code-block:: python - import paddle + >>> import paddle - x = paddle.ones((4, 3)) * 2 - y = paddle.ones((3,)) - nx = x.fill_diagonal_tensor(y) - print(nx.tolist()) #[[1.0, 2.0, 2.0], [2.0, 1.0, 2.0], [2.0, 2.0, 1.0], [2.0, 2.0, 2.0]] + >>> x = paddle.ones((4, 3)) * 2 + >>> y = paddle.ones((3,)) + >>> nx = x.fill_diagonal_tensor(y) + >>> print(nx.tolist()) + [[1.0, 2.0, 2.0], [2.0, 1.0, 2.0], [2.0, 2.0, 1.0], [2.0, 2.0, 2.0]] """ return _fill_diagonal_tensor_impl( @@ -1065,14 +1104,16 @@ def tolist(x): Examples: .. code-block:: python - import paddle + >>> import paddle - t = paddle.to_tensor([0,1,2,3,4]) - expectlist = t.tolist() - print(expectlist) #[0, 1, 2, 3, 4] + >>> t = paddle.to_tensor([0,1,2,3,4]) + >>> expectlist = t.tolist() + >>> print(expectlist) + [0, 1, 2, 3, 4] - expectlist = paddle.tolist(t) - print(expectlist) #[0, 1, 2, 3, 4] + >>> expectlist = paddle.tolist(t) + >>> print(expectlist) + [0, 1, 2, 3, 4] """ # TODO(zhouwei): will remove 0-D Tensor.numpy() hack @@ -1099,28 +1140,36 @@ def concat(x, axis=0, name=None): Examples: .. code-block:: python - import paddle - - x1 = paddle.to_tensor([[1, 2, 3], - [4, 5, 6]]) - x2 = paddle.to_tensor([[11, 12, 13], - [14, 15, 16]]) - x3 = paddle.to_tensor([[21, 22], - [23, 24]]) - zero = paddle.full(shape=[1], dtype='int32', fill_value=0) - # When the axis is negative, the real axis is (axis + Rank(x)) - # As follow, axis is -1, Rank(x) is 2, the real axis is 1 - out1 = paddle.concat(x=[x1, x2, x3], axis=-1) - out2 = paddle.concat(x=[x1, x2], axis=0) - out3 = paddle.concat(x=[x1, x2], axis=zero) - # out1 - # [[ 1 2 3 11 12 13 21 22] - # [ 4 5 6 14 15 16 23 24]] - # out2 out3 - # [[ 1 2 3] - # [ 4 5 6] - # [11 12 13] - # [14 15 16]] + >>> import paddle + + >>> x1 = paddle.to_tensor([[1, 2, 3], + ... [4, 5, 6]]) + >>> x2 = paddle.to_tensor([[11, 12, 13], + ... [14, 15, 16]]) + >>> x3 = paddle.to_tensor([[21, 22], + ... [23, 24]]) + >>> zero = paddle.full(shape=[1], dtype='int32', fill_value=0) + >>> # When the axis is negative, the real axis is (axis + Rank(x)) + >>> # As follow, axis is -1, Rank(x) is 2, the real axis is 1 + >>> out1 = paddle.concat(x=[x1, x2, x3], axis=-1) + >>> out2 = paddle.concat(x=[x1, x2], axis=0) + >>> out3 = paddle.concat(x=[x1, x2], axis=zero) + >>> print(out1) + Tensor(shape=[2, 8], dtype=int64, place=Place(cpu), stop_gradient=True, + [[1 , 2 , 3 , 11, 12, 13, 21, 22], + [4 , 5 , 6 , 14, 15, 16, 23, 24]]) + >>> print(out2) + Tensor(shape=[4, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[1 , 2 , 3 ], + [4 , 5 , 6 ], + [11, 12, 13], + [14, 15, 16]]) + >>> print(out3) + Tensor(shape=[4, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[1 , 2 , 3 ], + [4 , 5 , 6 ], + [11, 12, 13], + [14, 15, 16]]) """ input = x if in_dynamic_or_pir_mode(): @@ -1227,12 +1276,12 @@ def broadcast_tensors(input, name=None): Examples: .. 
code-block:: python - import paddle - x1 = paddle.rand([1, 2, 3, 4]).astype('float32') - x2 = paddle.rand([1, 2, 1, 4]).astype('float32') - x3 = paddle.rand([1, 1, 3, 1]).astype('float32') - out1, out2, out3 = paddle.broadcast_tensors(input=[x1, x2, x3]) - # out1, out2, out3: tensors broadcasted from x1, x2, x3 with shape [1,2,3,4] + >>> import paddle + >>> x1 = paddle.rand([1, 2, 3, 4]).astype('float32') + >>> x2 = paddle.rand([1, 2, 1, 4]).astype('float32') + >>> x3 = paddle.rand([1, 1, 3, 1]).astype('float32') + >>> out1, out2, out3 = paddle.broadcast_tensors(input=[x1, x2, x3]) + >>> # out1, out2, out3: tensors broadcasted from x1, x2, x3 with shape [1,2,3,4] """ num_inputs = len(input) @@ -1337,15 +1386,29 @@ def flip(x, axis, name=None): Examples: .. code-block:: python - import paddle - - image_shape=(3, 2, 2) - img = paddle.arange(image_shape[0] * image_shape[1] * image_shape[2]).reshape(image_shape) - tmp = paddle.flip(img, [0,1]) - print(tmp) # [[[10,11],[8, 9]], [[6, 7],[4, 5]], [[2, 3],[0, 1]]] - - out = paddle.flip(tmp,-1) - print(out) # [[[11,10],[9, 8]], [[7, 6],[5, 4]], [[3, 2],[1, 0]]] + >>> import paddle + + >>> image_shape=(3, 2, 2) + >>> img = paddle.arange(image_shape[0] * image_shape[1] * image_shape[2]).reshape(image_shape) + >>> tmp = paddle.flip(img, [0,1]) + >>> print(tmp) + Tensor(shape=[3, 2, 2], dtype=int64, place=Place(cpu), stop_gradient=True, + [[[10, 11], + [8 , 9 ]], + [[6 , 7 ], + [4 , 5 ]], + [[2 , 3 ], + [0 , 1 ]]]) + + >>> out = paddle.flip(tmp,-1) + >>> print(out) + Tensor(shape=[3, 2, 2], dtype=int64, place=Place(cpu), stop_gradient=True, + [[[11, 10], + [9 , 8 ]], + [[7 , 6 ], + [5 , 4 ]], + [[3 , 2 ], + [1 , 0 ]]]) """ if isinstance(axis, int): axis = [axis] @@ -1397,38 +1460,38 @@ def rot90(x, k=1, axes=[0, 1], name=None): Examples: .. code-block:: python - import paddle - - data = paddle.arange(4) - data = paddle.reshape(data, (2, 2)) - print(data) - #[[0, 1], - # [2, 3]] - - y = paddle.rot90(data, 1, [0, 1]) - print(y) - #[[1, 3], - # [0, 2]] - - y= paddle.rot90(data, -1, [0, 1]) - print(y) - #[[2, 0], - # [3, 1]] - - data2 = paddle.arange(8) - data2 = paddle.reshape(data2, (2,2,2)) - print(data2) - #[[[0, 1], - # [2, 3]], - # [[4, 5], - # [6, 7]]] - - y = paddle.rot90(data2, 1, [1, 2]) - print(y) - #[[[1, 3], - # [0, 2]], - # [[5, 7], - # [4, 6]]] + >>> import paddle + + >>> data = paddle.arange(4) + >>> data = paddle.reshape(data, (2, 2)) + >>> print(data.numpy()) + [[0 1] + [2 3]] + + >>> y = paddle.rot90(data, 1, [0, 1]) + >>> print(y.numpy()) + [[1 3] + [0 2]] + + >>> y= paddle.rot90(data, -1, [0, 1]) + >>> print(y.numpy()) + [[2 0] + [3 1]] + + >>> data2 = paddle.arange(8) + >>> data2 = paddle.reshape(data2, (2,2,2)) + >>> print(data2.numpy()) + [[[0 1] + [2 3]] + [[4 5] + [6 7]]] + + >>> y = paddle.rot90(data2, 1, [1, 2]) + >>> print(y.numpy()) + [[[1 3] + [0 2]] + [[5 7] + [4 6]]] """ helper = LayerHelper("rot90", **locals()) @@ -1535,21 +1598,24 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): .. 
code-block:: python - import paddle + >>> import paddle - image_shape=(2, 3, 4, 4) + >>> image_shape=(2, 3, 4, 4) - x = paddle.arange(end=image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]) - img = paddle.reshape(x, image_shape) + >>> x = paddle.arange(end=image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]) + >>> img = paddle.reshape(x, image_shape) - out = paddle.flatten(img, start_axis=1, stop_axis=2) - # out shape is [2, 12, 4] + >>> out = paddle.flatten(img, start_axis=1, stop_axis=2) + >>> print(out.shape) + [2, 12, 4] - # out shares data with img in dygraph mode - img[0, 0, 0, 0] = -1 - print(out[0, 0, 0]) # [-1] + >>> # out shares data with img in dygraph mode + >>> img[0, 0, 0, 0] = -1 + >>> print(out[0, 0, 0]) + Tensor(shape=[], dtype=int64, place=Place(cpu), stop_gradient=True, + -1) """ - if not (isinstance(x, Variable)): + if not (isinstance(x, (Variable, paddle.pir.OpResult))): raise ValueError("The input x should be a Tensor") x_dim = len(x.shape) @@ -1586,7 +1652,7 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): if start_axis > stop_axis: raise ValueError("The stop_axis should be larger than stat_axis") - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.flatten(x, start_axis, stop_axis) else: check_variable_and_dtype( @@ -1676,26 +1742,26 @@ def roll(x, shifts, axis=None, name=None): Examples: .. code-block:: python - import paddle - - x = paddle.to_tensor([[1.0, 2.0, 3.0], - [4.0, 5.0, 6.0], - [7.0, 8.0, 9.0]]) - out_z1 = paddle.roll(x, shifts=1) - print(out_z1) - #[[9. 1. 2.] - # [3. 4. 5.] - # [6. 7. 8.]] - out_z2 = paddle.roll(x, shifts=1, axis=0) - print(out_z2) - #[[7. 8. 9.] - # [1. 2. 3.] - # [4. 5. 6.]] - out_z3 = paddle.roll(x, shifts=1, axis=1) - print(out_z3) - #[[3. 1. 2.] - # [6. 4. 5.] - # [9. 7. 8.]] + >>> import paddle + + >>> x = paddle.to_tensor([[1.0, 2.0, 3.0], + ... [4.0, 5.0, 6.0], + ... [7.0, 8.0, 9.0]]) + >>> out_z1 = paddle.roll(x, shifts=1) + >>> print(out_z1.numpy()) + [[9. 1. 2.] + [3. 4. 5.] + [6. 7. 8.]] + >>> out_z2 = paddle.roll(x, shifts=1, axis=0) + >>> print(out_z2.numpy()) + [[7. 8. 9.] + [1. 2. 3.] + [4. 5. 6.]] + >>> out_z3 = paddle.roll(x, shifts=1, axis=1) + >>> print(out_z3.numpy()) + [[3. 1. 2.] + [6. 4. 5.] + [9. 7. 8.]] """ origin_shape = x.shape if type(shifts) == int: @@ -1819,28 +1885,32 @@ def stack(x, axis=0, name=None): Returns: Tensor, The stacked tensor with same data type as input. - Example: + Examples: .. 
code-block:: python - import paddle - - x1 = paddle.to_tensor([[1.0, 2.0]]) - x2 = paddle.to_tensor([[3.0, 4.0]]) - x3 = paddle.to_tensor([[5.0, 6.0]]) - - out = paddle.stack([x1, x2, x3], axis=0) - print(out.shape) # [3, 1, 2] - print(out) - # [[[1., 2.]], - # [[3., 4.]], - # [[5., 6.]]] - - out = paddle.stack([x1, x2, x3], axis=-2) - print(out.shape) # [1, 3, 2] - print(out) - # [[[1., 2.], - # [3., 4.], - # [5., 6.]]] + >>> import paddle + + >>> x1 = paddle.to_tensor([[1.0, 2.0]]) + >>> x2 = paddle.to_tensor([[3.0, 4.0]]) + >>> x3 = paddle.to_tensor([[5.0, 6.0]]) + + >>> out = paddle.stack([x1, x2, x3], axis=0) + >>> print(out.shape) + [3, 1, 2] + >>> print(out) + Tensor(shape=[3, 1, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[1., 2.]], + [[3., 4.]], + [[5., 6.]]]) + + >>> out = paddle.stack([x1, x2, x3], axis=-2) + >>> print(out.shape) + [1, 3, 2] + >>> print(out) + Tensor(shape=[1, 3, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[1., 2.], + [3., 4.], + [5., 6.]]]) """ axis = 0 if axis is None else axis @@ -1926,34 +1996,46 @@ def split(x, num_or_sections, axis=0, name=None): Returns: list(Tensor), The list of segmented Tensors. - Example: + Examples: .. code-block:: python - import paddle - - # x is a Tensor of shape [3, 9, 5] - x = paddle.rand([3, 9, 5]) - - out0, out1, out2 = paddle.split(x, num_or_sections=3, axis=1) - print(out0.shape) # [3, 3, 5] - print(out1.shape) # [3, 3, 5] - print(out2.shape) # [3, 3, 5] - - out0, out1, out2 = paddle.split(x, num_or_sections=[2, 3, 4], axis=1) - print(out0.shape) # [3, 2, 5] - print(out1.shape) # [3, 3, 5] - print(out2.shape) # [3, 4, 5] - - out0, out1, out2 = paddle.split(x, num_or_sections=[2, 3, -1], axis=1) - print(out0.shape) # [3, 2, 5] - print(out1.shape) # [3, 3, 5] - print(out2.shape) # [3, 4, 5] - - # axis is negative, the real axis is (rank(x) + axis)=1 - out0, out1, out2 = paddle.split(x, num_or_sections=3, axis=-2) - print(out0.shape) # [3, 3, 5] - print(out1.shape) # [3, 3, 5] - print(out2.shape) # [3, 3, 5] + >>> import paddle + + >>> # x is a Tensor of shape [3, 9, 5] + >>> x = paddle.rand([3, 9, 5]) + + >>> out0, out1, out2 = paddle.split(x, num_or_sections=3, axis=1) + >>> print(out0.shape) + [3, 3, 5] + >>> print(out1.shape) + [3, 3, 5] + >>> print(out2.shape) + [3, 3, 5] + + >>> out0, out1, out2 = paddle.split(x, num_or_sections=[2, 3, 4], axis=1) + >>> print(out0.shape) + [3, 2, 5] + >>> print(out1.shape) + [3, 3, 5] + >>> print(out2.shape) + [3, 4, 5] + + >>> out0, out1, out2 = paddle.split(x, num_or_sections=[2, 3, -1], axis=1) + >>> print(out0.shape) + [3, 2, 5] + >>> print(out1.shape) + [3, 3, 5] + >>> print(out2.shape) + [3, 4, 5] + + >>> # axis is negative, the real axis is (rank(x) + axis)=1 + >>> out0, out1, out2 = paddle.split(x, num_or_sections=3, axis=-2) + >>> print(out0.shape) + [3, 3, 5] + >>> print(out1.shape) + [3, 3, 5] + >>> print(out2.shape) + [3, 3, 5] """ input = x dim = axis @@ -2109,24 +2191,32 @@ def vsplit(x, num_or_sections, name=None): Returns: list[Tensor], The list of segmented Tensors. - Example: + Examples: .. 
code-block:: python - import paddle - - # x is a Tensor of shape [8, 6, 7] - x = paddle.rand([8, 6, 7]) - out0, out1 = paddle.vsplit(x, num_or_sections=2) - print(out0.shape) # [4, 6, 7] - print(out1.shape) # [4, 6, 7] - out0, out1, out2 = paddle.vsplit(x, num_or_sections=[1, 3, 4]) - print(out0.shape) # [1, 6, 7] - print(out1.shape) # [3, 6, 7] - print(out2.shape) # [4, 6, 7] - out0, out1, out2 = paddle.vsplit(x, num_or_sections=[2, 3, -1]) - print(out0.shape) # [2, 6, 7] - print(out1.shape) # [3, 6, 7] - print(out2.shape) # [3, 6, 7] + >>> import paddle + + >>> # x is a Tensor of shape [8, 6, 7] + >>> x = paddle.rand([8, 6, 7]) + >>> out0, out1 = paddle.vsplit(x, num_or_sections=2) + >>> print(out0.shape) + [4, 6, 7] + >>> print(out1.shape) + [4, 6, 7] + >>> out0, out1, out2 = paddle.vsplit(x, num_or_sections=[1, 3, 4]) + >>> print(out0.shape) + [1, 6, 7] + >>> print(out1.shape) + [3, 6, 7] + >>> print(out2.shape) + [4, 6, 7] + >>> out0, out1, out2 = paddle.vsplit(x, num_or_sections=[2, 3, -1]) + >>> print(out0.shape) + [2, 6, 7] + >>> print(out1.shape) + [3, 6, 7] + >>> print(out2.shape) + [3, 6, 7] """ if x.ndim < 2: raise ValueError( @@ -2195,17 +2285,21 @@ def squeeze(x, axis=None, name=None): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.rand([5, 1, 10]) - output = paddle.squeeze(x, axis=1) + >>> x = paddle.rand([5, 1, 10]) + >>> output = paddle.squeeze(x, axis=1) - print(x.shape) # [5, 1, 10] - print(output.shape) # [5, 10] + >>> print(x.shape) + [5, 1, 10] + >>> print(output.shape) + [5, 10] - # output shares data with x in dygraph mode - x[0, 0, 0] = 10. - print(output[0, 0]) # [10.] + >>> # output shares data with x in dygraph mode + >>> x[0, 0, 0] = 10. + >>> print(output[0, 0]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 10.) """ if axis is None: @@ -2217,7 +2311,18 @@ def squeeze(x, axis=None, name=None): input = x axes = axis - if in_dynamic_or_pir_mode(): + if in_dynamic_mode(): + return _C_ops.squeeze(input, axes) + elif in_pir_mode(): + if isinstance(axes, int): + axes = [axes] + if isinstance(axes, paddle.pir.OpResult): + axes.stop_gradient = True + elif isinstance(axes, (list, tuple)): + if paddle.utils._contain_var(axes): + axes = paddle.utils.get_int_tensor_list( + axes, default_dtype='int64' + ) return _C_ops.squeeze(input, axes) else: helper = LayerHelper("squeeze", **locals()) @@ -2317,40 +2422,40 @@ def unique_consecutive( - counts (Tensor), the counts of the every unique consecutive element in the input tensor. counts is provided only if return_counts is True. - Example: + Examples: .. 
code-block:: python - import paddle - - x = paddle.to_tensor([1, 1, 2, 2, 3, 1, 1, 2]) - output = paddle.unique_consecutive(x) # - print(output) - # Tensor(shape=[5], dtype=int64, place=Place(gpu:0), stop_gradient=True, - # [1, 2, 3, 1, 2]) - - _, inverse, counts = paddle.unique_consecutive(x, return_inverse=True, return_counts=True) - print(inverse) - # Tensor(shape=[8], dtype=int64, place=Place(gpu:0), stop_gradient=True, - # [0, 0, 1, 1, 2, 3, 3, 4]) - print(counts) - # Tensor(shape=[5], dtype=int64, place=Place(gpu:0), stop_gradient=True, - # [2, 2, 1, 2, 1]) - - x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3], [2, 1, 3]]) - output = paddle.unique_consecutive(x, axis=0) # - print(output) - # Tensor(shape=[3, 3], dtype=int64, place=Place(gpu:0), stop_gradient=True, - # [[2, 1, 3], - # [3, 0, 1], - # [2, 1, 3]]) - - x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3], [2, 1, 3]]) - output = paddle.unique_consecutive(x, axis=0) # - print(output) - # Tensor(shape=[3, 3], dtype=int64, place=Place(gpu:0), stop_gradient=True, - # [[2, 1, 3], - # [3, 0, 1], - # [2, 1, 3]]) + >>> import paddle + + >>> x = paddle.to_tensor([1, 1, 2, 2, 3, 1, 1, 2]) + >>> output = paddle.unique_consecutive(x) # + >>> print(output) + Tensor(shape=[5], dtype=int64, place=Place(cpu), stop_gradient=True, + [1, 2, 3, 1, 2]) + + >>> _, inverse, counts = paddle.unique_consecutive(x, return_inverse=True, return_counts=True) + >>> print(inverse) + Tensor(shape=[8], dtype=int64, place=Place(cpu), stop_gradient=True, + [0, 0, 1, 1, 2, 3, 3, 4]) + >>> print(counts) + Tensor(shape=[5], dtype=int64, place=Place(cpu), stop_gradient=True, + [2, 2, 1, 2, 1]) + + >>> x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3], [2, 1, 3]]) + >>> output = paddle.unique_consecutive(x, axis=0) # + >>> print(output) + Tensor(shape=[3, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[2, 1, 3], + [3, 0, 1], + [2, 1, 3]]) + + >>> x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3], [2, 1, 3]]) + >>> output = paddle.unique_consecutive(x, axis=0) # + >>> print(output) + Tensor(shape=[3, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[2, 1, 3], + [3, 0, 1], + [2, 1, 3]]) """ if axis is None: @@ -2449,43 +2554,43 @@ def unique( Examples: .. 
code-block:: python - import paddle - - x = paddle.to_tensor([2, 3, 3, 1, 5, 3]) - unique = paddle.unique(x) - print(unique) - # Tensor(shape=[4], dtype=int64, place=Place(gpu:0), stop_gradient=True, - # [1, 2, 3, 5]) - - _, indices, inverse, counts = paddle.unique(x, return_index=True, return_inverse=True, return_counts=True) - print(indices) - # Tensor(shape=[4], dtype=int64, place=Place(gpu:0), stop_gradient=True, - # [3, 0, 1, 4]) - print(inverse) - # Tensor(shape=[6], dtype=int64, place=Place(gpu:0), stop_gradient=True, - # [1, 2, 2, 0, 3, 2]) - print(counts) - # Tensor(shape=[4], dtype=int64, place=Place(gpu:0), stop_gradient=True, - # [1, 1, 3, 1]) - - x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3]]) - unique = paddle.unique(x) - print(unique) - # Tensor(shape=[4], dtype=int64, place=Place(gpu:0), stop_gradient=True, - # [0, 1, 2, 3]) - - unique = paddle.unique(x, axis=0) - print(unique) - # Tensor(shape=[2, 3], dtype=int64, place=Place(gpu:0), stop_gradient=True, - # [[2, 1, 3], - # [3, 0, 1]]) + >>> import paddle + + >>> x = paddle.to_tensor([2, 3, 3, 1, 5, 3]) + >>> unique = paddle.unique(x) + >>> print(unique) + Tensor(shape=[4], dtype=int64, place=Place(cpu), stop_gradient=True, + [1, 2, 3, 5]) + + >>> _, indices, inverse, counts = paddle.unique(x, return_index=True, return_inverse=True, return_counts=True) + >>> print(indices) + Tensor(shape=[4], dtype=int64, place=Place(cpu), stop_gradient=True, + [3, 0, 1, 4]) + >>> print(inverse) + Tensor(shape=[6], dtype=int64, place=Place(cpu), stop_gradient=True, + [1, 2, 2, 0, 3, 2]) + >>> print(counts) + Tensor(shape=[4], dtype=int64, place=Place(cpu), stop_gradient=True, + [1, 1, 3, 1]) + + >>> x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3]]) + >>> unique = paddle.unique(x) + >>> print(unique) + Tensor(shape=[4], dtype=int64, place=Place(cpu), stop_gradient=True, + [0, 1, 2, 3]) + + >>> unique = paddle.unique(x, axis=0) + >>> print(unique) + Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[2, 1, 3], + [3, 0, 1]]) """ if axis is None: axis = [] else: axis = [axis] attr_dtype = convert_np_dtype_to_dtype_(dtype) - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): out, indices, inverse, counts = _C_ops.unique( x, return_index, return_inverse, return_counts, axis, attr_dtype ) @@ -2584,31 +2689,41 @@ def unsqueeze(x, axis, name=None): Examples: .. code-block:: python - import paddle - - x = paddle.rand([5, 10]) - print(x.shape) # [5, 10] - - out1 = paddle.unsqueeze(x, axis=0) - print(out1.shape) # [1, 5, 10] - - out2 = paddle.unsqueeze(x, axis=[0, 2]) - print(out2.shape) # [1, 5, 1, 10] - - axis = paddle.to_tensor([0, 1, 2]) - out3 = paddle.unsqueeze(x, axis=axis) - print(out3.shape) # [1, 1, 1, 5, 10] - - # out1, out2, out3 share data with x in dygraph mode - x[0, 0] = 10. - print(out1[0, 0, 0]) # [10.] - print(out2[0, 0, 0, 0]) # [10.] - print(out3[0, 0, 0, 0, 0]) # [10.] + >>> import paddle + + >>> x = paddle.rand([5, 10]) + >>> print(x.shape) + [5, 10] + + >>> out1 = paddle.unsqueeze(x, axis=0) + >>> print(out1.shape) + [1, 5, 10] + + >>> out2 = paddle.unsqueeze(x, axis=[0, 2]) + >>> print(out2.shape) + [1, 5, 1, 10] + + >>> axis = paddle.to_tensor([0, 1, 2]) + >>> out3 = paddle.unsqueeze(x, axis=axis) + >>> print(out3.shape) + [1, 1, 1, 5, 10] + + >>> # out1, out2, out3 share data with x in dygraph mode + >>> x[0, 0] = 10. + >>> print(out1[0, 0, 0]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 10.) 
+ >>> print(out2[0, 0, 0, 0]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 10.) + >>> print(out3[0, 0, 0, 0, 0]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 10.) """ input = x axes = axis - if in_dynamic_or_pir_mode(): + if in_dynamic_mode(): if isinstance(axes, int): axes = [axes] elif isinstance(axes, Variable): @@ -2619,6 +2734,17 @@ def unsqueeze(x, axis, name=None): for item in axes ] return _C_ops.unsqueeze(input, axes) + elif in_pir_mode(): + if isinstance(axes, int): + axes = [axes] + if isinstance(axes, paddle.pir.OpResult): + axes.stop_gradient = True + elif isinstance(axes, (list, tuple)): + if paddle.utils._contain_var(axes): + axes = paddle.utils.get_int_tensor_list( + axes, default_dtype='int64' + ) + return _C_ops.unsqueeze(input, axes) else: check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze') check_variable_and_dtype( @@ -2727,12 +2853,15 @@ def gather(x, index, axis=None, name=None): .. code-block:: python - import paddle + >>> import paddle - input = paddle.to_tensor([[1,2],[3,4],[5,6]]) - index = paddle.to_tensor([0,1]) - output = paddle.gather(input, index, axis=0) - # expected output: [[1,2],[3,4]] + >>> input = paddle.to_tensor([[1,2],[3,4],[5,6]]) + >>> index = paddle.to_tensor([0,1]) + >>> output = paddle.gather(input, index, axis=0) + >>> print(output) + Tensor(shape=[2, 2], dtype=int64, place=Place(cpu), stop_gradient=True, + [[1, 2], + [3, 4]]) """ if axis is None: axis = 0 @@ -2793,24 +2922,24 @@ def unbind(input, axis=0): Returns: list(Tensor), The list of segmented Tensor variables. - Example: + Examples: .. code-block:: python - import paddle + >>> import paddle - # input is a Tensor which shape is [3, 4, 5] - input = paddle.rand([3, 4, 5]) + >>> # input is a Tensor which shape is [3, 4, 5] + >>> input = paddle.rand([3, 4, 5]) - [x0, x1, x2] = paddle.unbind(input, axis=0) - # x0.shape [4, 5] - # x1.shape [4, 5] - # x2.shape [4, 5] + >>> [x0, x1, x2] = paddle.unbind(input, axis=0) + >>> # x0.shape [4, 5] + >>> # x1.shape [4, 5] + >>> # x2.shape [4, 5] - [x0, x1, x2, x3] = paddle.unbind(input, axis=1) - # x0.shape [3, 5] - # x1.shape [3, 5] - # x2.shape [3, 5] - # x3.shape [3, 5] + >>> [x0, x1, x2, x3] = paddle.unbind(input, axis=1) + >>> # x0.shape [3, 5] + >>> # x1.shape [3, 5] + >>> # x2.shape [3, 5] + >>> # x3.shape [3, 5] """ if not isinstance(axis, (int)): raise TypeError( @@ -2870,26 +2999,27 @@ def scatter(x, index, updates, overwrite=True, name=None): .. 
code-block:: python :name: code-example1 - import paddle - #input: - x = paddle.to_tensor([[1, 1], [2, 2], [3, 3]], dtype='float32') - index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') - # shape of updates should be the same as x - # shape of updates with dim > 1 should be the same as input - updates = paddle.to_tensor([[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') - overwrite = False - # calculation: - if not overwrite: - for i in range(len(index)): - x[index[i]] = paddle.zeros([2]) - for i in range(len(index)): - if (overwrite): - x[index[i]] = updates[i] - else: - x[index[i]] += updates[i] - # output: - out = paddle.to_tensor([[3, 3], [6, 6], [1, 1]]) - out.shape # [3, 2] + >>> import paddle + >>> #input: + >>> x = paddle.to_tensor([[1, 1], [2, 2], [3, 3]], dtype='float32') + >>> index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') + >>> # shape of updates should be the same as x + >>> # shape of updates with dim > 1 should be the same as input + >>> updates = paddle.to_tensor([[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') + >>> overwrite = False + >>> # calculation: + >>> if not overwrite: + ... for i in range(len(index)): + ... x[index[i]] = paddle.zeros([2]) + >>> for i in range(len(index)): + ... if (overwrite): + ... x[index[i]] = updates[i] + ... else: + ... x[index[i]] += updates[i] + >>> # output: + >>> out = paddle.to_tensor([[3, 3], [6, 6], [1, 1]]) + >>> print(out.shape) + [3, 2] **NOTICE**: The order in which updates are applied is nondeterministic, so the output will be nondeterministic if index contains duplicates. @@ -2907,31 +3037,33 @@ def scatter(x, index, updates, overwrite=True, name=None): Examples: .. code-block:: python - import paddle - - x = paddle.to_tensor([[1, 1], [2, 2], [3, 3]], dtype='float32') - index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') - updates = paddle.to_tensor([[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') - - output1 = paddle.scatter(x, index, updates, overwrite=False) - # [[3., 3.], - # [6., 6.], - # [1., 1.]] - - output2 = paddle.scatter(x, index, updates, overwrite=True) - # CPU device: - # [[3., 3.], - # [4., 4.], - # [1., 1.]] - # GPU device maybe have two results because of the repeated numbers in index - # result 1: - # [[3., 3.], - # [4., 4.], - # [1., 1.]] - # result 2: - # [[3., 3.], - # [2., 2.], - # [1., 1.]] + >>> import paddle + + >>> x = paddle.to_tensor([[1, 1], [2, 2], [3, 3]], dtype='float32') + >>> index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') + >>> updates = paddle.to_tensor([[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') + + >>> output1 = paddle.scatter(x, index, updates, overwrite=False) + >>> print(output1) + Tensor(shape=[3, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[3., 3.], + [6., 6.], + [1., 1.]]) + + >>> output2 = paddle.scatter(x, index, updates, overwrite=True) + >>> # CPU device: + >>> # [[3., 3.], + >>> # [4., 4.], + >>> # [1., 1.]] + >>> # GPU device maybe have two results because of the repeated numbers in index + >>> # result 1: + >>> # [[3., 3.], + >>> # [4., 4.], + >>> # [1., 1.]] + >>> # result 2: + >>> # [[3., 3.], + >>> # [2., 2.], + >>> # [1., 1.]] """ if in_dynamic_mode(): return _C_ops.scatter(x, index, updates, overwrite) @@ -3020,17 +3152,17 @@ def scatter_nd_add(x, index, updates, name=None): .. 
code-block:: python - import paddle + >>> import paddle - x = paddle.rand(shape=[3, 5, 9, 10], dtype='float32') - updates = paddle.rand(shape=[3, 9, 10], dtype='float32') - index = paddle.to_tensor([[1, 1], - [0, 1], - [1, 3]], dtype='int64') + >>> x = paddle.rand(shape=[3, 5, 9, 10], dtype='float32') + >>> updates = paddle.rand(shape=[3, 9, 10], dtype='float32') + >>> index = paddle.to_tensor([[1, 1], + ... [0, 1], + ... [1, 3]], dtype='int64') - output = paddle.scatter_nd_add(x, index, updates) - print(output.shape) - # [3, 5, 9, 10] + >>> output = paddle.scatter_nd_add(x, index, updates) + >>> print(output.shape) + [3, 5, 9, 10] """ if in_dynamic_mode(): return _C_ops.scatter_nd_add(x, index, updates) @@ -3077,15 +3209,15 @@ def scatter_nd(index, updates, shape, name=None): .. code-block:: python - import paddle + >>> import paddle - index = paddle.to_tensor([[1, 1], - [0, 1], - [1, 3]], dtype="int64") - updates = paddle.rand(shape=[3, 9, 10], dtype='float32') - shape = [3, 5, 9, 10] + >>> index = paddle.to_tensor([[1, 1], + ... [0, 1], + ... [1, 3]], dtype="int64") + >>> updates = paddle.rand(shape=[3, 9, 10], dtype='float32') + >>> shape = [3, 5, 9, 10] - output = paddle.scatter_nd(index, updates, shape) + >>> output = paddle.scatter_nd(index, updates, shape) """ return scatter_nd_add(zeros(shape, updates.dtype), index, updates, name) @@ -3108,22 +3240,22 @@ def chunk(x, chunks, axis=0, name=None): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.rand([3, 9, 5]) + >>> x = paddle.rand([3, 9, 5]) - out0, out1, out2 = paddle.chunk(x, chunks=3, axis=1) - # out0.shape [3, 3, 5] - # out1.shape [3, 3, 5] - # out2.shape [3, 3, 5] + >>> out0, out1, out2 = paddle.chunk(x, chunks=3, axis=1) + >>> # out0.shape [3, 3, 5] + >>> # out1.shape [3, 3, 5] + >>> # out2.shape [3, 3, 5] - # axis is negative, the real axis is (rank(x) + axis) which real - # value is 1. - out0, out1, out2 = paddle.chunk(x, chunks=3, axis=-2) - # out0.shape [3, 3, 5] - # out1.shape [3, 3, 5] - # out2.shape [3, 3, 5] + >>> # axis is negative, the real axis is (rank(x) + axis) which real + >>> # value is 1. + >>> out0, out1, out2 = paddle.chunk(x, chunks=3, axis=-2) + >>> # out0.shape [3, 3, 5] + >>> # out1.shape [3, 3, 5] + >>> # out2.shape [3, 3, 5] """ check_type(chunks, 'chunks', (int), 'chunk') return split(x, num_or_sections=chunks, axis=axis, name=name) @@ -3149,46 +3281,42 @@ def tile(x, repeat_times, name=None): Examples: .. 
code-block:: python - import paddle - - data = paddle.to_tensor([1, 2, 3], dtype='int32') - out = paddle.tile(data, repeat_times=[2, 1]) - print(out) - # Tensor(shape=[2, 3], dtype=int32, place=Place(gpu:0), stop_gradient=True, - # [[1, 2, 3], - # [1, 2, 3]]) - - out = paddle.tile(data, repeat_times=(2, 2)) - print(out) - # Tensor(shape=[2, 6], dtype=int32, place=Place(gpu:0), stop_gradient=True, - # [[1, 2, 3, 1, 2, 3], - # [1, 2, 3, 1, 2, 3]]) - - repeat_times = paddle.to_tensor([1, 2], dtype='int32') - out = paddle.tile(data, repeat_times=repeat_times) - print(out) - # Tensor(shape=[1, 6], dtype=int32, place=Place(gpu:0), stop_gradient=True, - # [[1, 2, 3, 1, 2, 3]]) + >>> import paddle + + >>> data = paddle.to_tensor([1, 2, 3], dtype='int32') + >>> out = paddle.tile(data, repeat_times=[2, 1]) + >>> print(out) + Tensor(shape=[2, 3], dtype=int32, place=Place(cpu), stop_gradient=True, + [[1, 2, 3], + [1, 2, 3]]) + + >>> out = paddle.tile(data, repeat_times=(2, 2)) + >>> print(out) + Tensor(shape=[2, 6], dtype=int32, place=Place(cpu), stop_gradient=True, + [[1, 2, 3, 1, 2, 3], + [1, 2, 3, 1, 2, 3]]) + + >>> repeat_times = paddle.to_tensor([1, 2], dtype='int32') + >>> out = paddle.tile(data, repeat_times=repeat_times) + >>> print(out) + Tensor(shape=[1, 6], dtype=int32, place=Place(cpu), stop_gradient=True, + [[1, 2, 3, 1, 2, 3]]) """ - if in_dynamic_or_pir_mode(): - if isinstance(repeat_times, core.eager.Tensor): - assert ( - repeat_times.ndim == 1 - ), "Only support ndim == 1 while repeat_times is a Tensor." - repeat_times = repeat_times.tolist() - return _C_ops.tile(x, repeat_times) - else: + def check_input(x, repeat_times): check_type( - repeat_times, 'repeat_times', (list, tuple, Variable), 'tile' + repeat_times, + 'repeat_times', + (list, tuple, Variable, paddle.pir.OpResult), + 'tile', ) - if isinstance(repeat_times, Variable): + if isinstance(repeat_times, (Variable, paddle.pir.OpResult)): assert ( - repeat_times.numel() == 1 - ), 'repeat_times must be a Tensor with one element.' + len(repeat_times.shape) == 1 + ), 'repeat_times must be a Tensor with ndim == 1.' else: for elem in repeat_times: - if isinstance(elem, Variable): + if isinstance(elem, (Variable, paddle.pir.OpResult)): assert ( elem.numel() == 1 ), 'Elements in repeat_times must be Tensor with one element or integers.' @@ -3219,10 +3347,24 @@ def tile(x, repeat_times, name=None): "some_var.stop_gradient == True supporting some_var is the input." ) - helper = LayerHelper('tile', **locals()) + if in_dynamic_mode(): + if isinstance(repeat_times, core.eager.Tensor): + assert ( + repeat_times.ndim == 1 + ), "Only support ndim == 1 while repeat_times is a Tensor." + repeat_times = repeat_times.tolist() - inputs = {"X": [x]} - attrs = {} + return _C_ops.tile(x, repeat_times) + elif in_pir_mode(): + check_input(x, repeat_times) + if isinstance(repeat_times, (list, tuple)): + if paddle.utils._contain_var(repeat_times): + repeat_times = paddle.utils._convert_to_tensor_list( + repeat_times + ) + return _C_ops.tile(x, repeat_times) + else: + check_input(x, repeat_times) def get_attr_repeat_times(list_repeat_times): attrs_repeat_times = [] @@ -3236,6 +3378,11 @@ def get_attr_repeat_times(list_repeat_times): ), "All elements in repeat_times must be positive for tile." 
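# Only plain positive ints are collected into attrs_repeat_times here; a Tensor element of repeat_times cannot be baked into this int attribute, so its runtime value is supplied through the RepeatTimes tensor input set up below.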
return attrs_repeat_times + helper = LayerHelper('tile', **locals()) + + inputs = {"X": [x]} + attrs = {} + if isinstance(repeat_times, Variable): repeat_times.stop_gradient = True inputs['RepeatTimes'] = repeat_times @@ -3273,15 +3420,15 @@ def expand_as(x, y, name=None): Examples: .. code-block:: python - import paddle + >>> import paddle - data_x = paddle.to_tensor([1, 2, 3], 'int32') - data_y = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], 'int32') - out = paddle.expand_as(data_x, data_y) - print(out) - # Tensor(shape=[2, 3], dtype=int32, place=Place(gpu:0), stop_gradient=True, - # [[1, 2, 3], - # [1, 2, 3]]) + >>> data_x = paddle.to_tensor([1, 2, 3], 'int32') + >>> data_y = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], 'int32') + >>> out = paddle.expand_as(data_x, data_y) + >>> print(out) + Tensor(shape=[2, 3], dtype=int32, place=Place(cpu), stop_gradient=True, + [[1, 2, 3], + [1, 2, 3]]) """ if in_dynamic_mode(): return _C_ops.expand_as(x, None, y.shape) @@ -3343,12 +3490,14 @@ def broadcast_to(x, shape, name=None): Examples: .. code-block:: python - import paddle + >>> import paddle - data = paddle.to_tensor([1, 2, 3], dtype='int32') - out = paddle.broadcast_to(data, shape=[2, 3]) - print(out) - # [[1, 2, 3], [1, 2, 3]] + >>> data = paddle.to_tensor([1, 2, 3], dtype='int32') + >>> out = paddle.broadcast_to(data, shape=[2, 3]) + >>> print(out) + Tensor(shape=[2, 3], dtype=int32, place=Place(cpu), stop_gradient=True, + [[1, 2, 3], + [1, 2, 3]]) """ if in_dynamic_mode(): return _C_ops.expand(x, shape) @@ -3445,12 +3594,14 @@ def expand(x, shape, name=None): Examples: .. code-block:: python - import paddle + >>> import paddle - data = paddle.to_tensor([1, 2, 3], dtype='int32') - out = paddle.expand(data, shape=[2, 3]) - print(out) - # [[1, 2, 3], [1, 2, 3]] + >>> data = paddle.to_tensor([1, 2, 3], dtype='int32') + >>> out = paddle.expand(data, shape=[2, 3]) + >>> print(out) + Tensor(shape=[2, 3], dtype=int32, place=Place(cpu), stop_gradient=True, + [[1, 2, 3], + [1, 2, 3]]) """ if in_dynamic_or_pir_mode(): return _C_ops.expand(x, shape) @@ -3563,27 +3714,28 @@ def reshape(x, shape, name=None): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.rand([2, 4, 6], dtype="float32") - positive_four = paddle.full([1], 4, "int32") + >>> x = paddle.rand([2, 4, 6], dtype="float32") + >>> positive_four = paddle.full([1], 4, "int32") - out = paddle.reshape(x, [-1, 0, 3, 2]) - print(out) - # the shape is [2,4,3,2]. + >>> out = paddle.reshape(x, [-1, 0, 3, 2]) + >>> print(out.shape) + [2, 4, 3, 2] - out = paddle.reshape(x, shape=[positive_four, 12]) - print(out) - # the shape of out_2 is [4, 12]. + >>> out = paddle.reshape(x, shape=[positive_four, 12]) + >>> print(out.shape) + [4, 12] - shape_tensor = paddle.to_tensor([8, 6], dtype=paddle.int32) - out = paddle.reshape(x, shape=shape_tensor) - print(out.shape) - # the shape is [8, 6]. - # out shares data with x in dygraph mode - x[0, 0, 0] = 10. - print(out[0, 0]) - # the value is [10.] + >>> shape_tensor = paddle.to_tensor([8, 6], dtype=paddle.int32) + >>> out = paddle.reshape(x, shape=shape_tensor) + >>> print(out.shape) + [8, 6] + >>> # out shares data with x in dygraph mode + >>> x[0, 0, 0] = 10. + >>> print(out[0, 0]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 10.) """ @@ -3797,13 +3949,16 @@ def gather_nd(x, index, name=None): .. 
code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([[[1, 2], [3, 4], [5, 6]], - [[7, 8], [9, 10], [11, 12]]]) - index = paddle.to_tensor([[0, 1]]) + >>> x = paddle.to_tensor([[[1, 2], [3, 4], [5, 6]], + ... [[7, 8], [9, 10], [11, 12]]]) + >>> index = paddle.to_tensor([[0, 1]]) - output = paddle.gather_nd(x, index) #[[3, 4]] + >>> output = paddle.gather_nd(x, index) + >>> print(output) + Tensor(shape=[1, 2], dtype=int64, place=Place(cpu), stop_gradient=True, + [[3, 4]]) """ if in_dynamic_or_pir_mode(): @@ -3907,22 +4062,22 @@ def strided_slice(x, axes, starts, ends, strides, name=None): Examples: .. code-block:: python - import paddle - x = paddle.zeros(shape=[3,4,5,6], dtype="float32") - # example 1: - # attr starts is a list which doesn't contain Tensor. - axes = [1, 2, 3] - starts = [-3, 0, 2] - ends = [3, 2, 4] - strides_1 = [1, 1, 1] - strides_2 = [1, 1, 2] - sliced_1 = paddle.strided_slice(x, axes=axes, starts=starts, ends=ends, strides=strides_1) - # sliced_1 is x[:, 1:3:1, 0:2:1, 2:4:1]. - # example 2: - # attr starts is a list which contain tensor Tensor. - minus_3 = paddle.full(shape=[1], fill_value=-3, dtype='int32') - sliced_2 = paddle.strided_slice(x, axes=axes, starts=[minus_3, 0, 2], ends=ends, strides=strides_2) - # sliced_2 is x[:, 1:3:1, 0:2:1, 2:4:2]. + >>> import paddle + >>> x = paddle.zeros(shape=[3,4,5,6], dtype="float32") + >>> # example 1: + >>> # attr starts is a list which doesn't contain Tensor. + >>> axes = [1, 2, 3] + >>> starts = [-3, 0, 2] + >>> ends = [3, 2, 4] + >>> strides_1 = [1, 1, 1] + >>> strides_2 = [1, 1, 2] + >>> sliced_1 = paddle.strided_slice(x, axes=axes, starts=starts, ends=ends, strides=strides_1) + >>> # sliced_1 is x[:, 1:3:1, 0:2:1, 2:4:1]. + >>> # example 2: + >>> # attr starts is a list which contain tensor Tensor. + >>> minus_3 = paddle.full(shape=[1], fill_value=-3, dtype='int32') + >>> sliced_2 = paddle.strided_slice(x, axes=axes, starts=[minus_3, 0, 2], ends=ends, strides=strides_2) + >>> # sliced_2 is x[:, 1:3:1, 0:2:1, 2:4:2]. """ if in_dynamic_mode(): return _C_ops.strided_slice(x, axes, starts, ends, strides) @@ -4096,74 +4251,85 @@ def tensordot(x, y, axes=2, name=None): Examples: .. code-block:: python - import paddle - - data_type = 'float64' - - # For two 2-d tensor x and y, the case axes=0 is equivalent to outer product. - # Note that tensordot supports empty axis sequence, so all the axes=0, axes=[], axes=[[]], and axes=[[],[]] are equivalent cases. - x = paddle.arange(4, dtype=data_type).reshape([2, 2]) - y = paddle.arange(4, dtype=data_type).reshape([2, 2]) - z = paddle.tensordot(x, y, axes=0) - # z = [[[[0., 0.], - # [0., 0.]], - # - # [[0., 1.], - # [2., 3.]]], - # - # - # [[[0., 2.], - # [4., 6.]], - # - # [[0., 3.], - # [6., 9.]]]] - - - # For two 1-d tensor x and y, the case axes=1 is equivalent to inner product. - x = paddle.arange(10, dtype=data_type) - y = paddle.arange(10, dtype=data_type) - z1 = paddle.tensordot(x, y, axes=1) - z2 = paddle.dot(x, y) - # z1 = z2 = 285. - - - # For two 2-d tensor x and y, the case axes=1 is equivalent to matrix multiplication. - x = paddle.arange(6, dtype=data_type).reshape([2, 3]) - y = paddle.arange(12, dtype=data_type).reshape([3, 4]) - z1 = paddle.tensordot(x, y, axes=1) - z2 = paddle.matmul(x, y) - # z1 = z2 = [[20., 23., 26., 29.], - # [56., 68., 80., 92.]] - - - # When axes is a 1-d int list, x and y will be contracted along the same given axes. 
- # Note that axes=[1, 2] is equivalent to axes=[[1, 2]], axes=[[1, 2], []], axes=[[1, 2], [1]], and axes=[[1, 2], [1, 2]]. - x = paddle.arange(24, dtype=data_type).reshape([2, 3, 4]) - y = paddle.arange(36, dtype=data_type).reshape([3, 3, 4]) - z = paddle.tensordot(x, y, axes=[1, 2]) - # z = [[506. , 1298., 2090.], - # [1298., 3818., 6338.]] - - - # When axes is a list containing two 1-d int list, the first will be applied to x and the second to y. - x = paddle.arange(60, dtype=data_type).reshape([3, 4, 5]) - y = paddle.arange(24, dtype=data_type).reshape([4, 3, 2]) - z = paddle.tensordot(x, y, axes=([1, 0], [0, 1])) - # z = [[4400., 4730.], - # [4532., 4874.], - # [4664., 5018.], - # [4796., 5162.], - # [4928., 5306.]] - - - # Thanks to the support of axes expansion, axes=[[0, 1, 3, 4], [1, 0, 3, 4]] can be abbreviated as axes= [[0, 1, 3, 4], [1, 0]]. - x = paddle.arange(720, dtype=data_type).reshape([2, 3, 4, 5, 6]) - y = paddle.arange(720, dtype=data_type).reshape([3, 2, 4, 5, 6]) - z = paddle.tensordot(x, y, axes=[[0, 1, 3, 4], [1, 0]]) - # z = [[23217330., 24915630., 26613930., 28312230.], - # [24915630., 26775930., 28636230., 30496530.], - # [26613930., 28636230., 30658530., 32680830.], - # [28312230., 30496530., 32680830., 34865130.]] + >>> import paddle + + >>> data_type = 'float64' + + >>> # For two 2-d tensor x and y, the case axes=0 is equivalent to outer product. + >>> # Note that tensordot supports empty axis sequence, so all the axes=0, axes=[], axes=[[]], and axes=[[],[]] are equivalent cases. + >>> x = paddle.arange(4, dtype=data_type).reshape([2, 2]) + >>> y = paddle.arange(4, dtype=data_type).reshape([2, 2]) + >>> z = paddle.tensordot(x, y, axes=0) + >>> print(z) + Tensor(shape=[2, 2, 2, 2], dtype=float64, place=Place(cpu), stop_gradient=True, + [[[[0., 0.], + [0., 0.]], + [[0., 1.], + [2., 3.]]], + [[[0., 2.], + [4., 6.]], + [[0., 3.], + [6., 9.]]]]) + + >>> # For two 1-d tensor x and y, the case axes=1 is equivalent to inner product. + >>> x = paddle.arange(10, dtype=data_type) + >>> y = paddle.arange(10, dtype=data_type) + >>> z1 = paddle.tensordot(x, y, axes=1) + >>> z2 = paddle.dot(x, y) + >>> print(z1) + Tensor(shape=[], dtype=float64, place=Place(cpu), stop_gradient=True, + 285.) + >>> print(z2) + Tensor(shape=[], dtype=float64, place=Place(cpu), stop_gradient=True, + 285.) + + + >>> # For two 2-d tensor x and y, the case axes=1 is equivalent to matrix multiplication. + >>> x = paddle.arange(6, dtype=data_type).reshape([2, 3]) + >>> y = paddle.arange(12, dtype=data_type).reshape([3, 4]) + >>> z1 = paddle.tensordot(x, y, axes=1) + >>> z2 = paddle.matmul(x, y) + >>> print(z1) + Tensor(shape=[2, 4], dtype=float64, place=Place(cpu), stop_gradient=True, + [[20., 23., 26., 29.], + [56., 68., 80., 92.]]) + >>> print(z2) + Tensor(shape=[2, 4], dtype=float64, place=Place(cpu), stop_gradient=True, + [[20., 23., 26., 29.], + [56., 68., 80., 92.]]) + + >>> # When axes is a 1-d int list, x and y will be contracted along the same given axes. + >>> # Note that axes=[1, 2] is equivalent to axes=[[1, 2]], axes=[[1, 2], []], axes=[[1, 2], [1]], and axes=[[1, 2], [1, 2]]. + >>> x = paddle.arange(24, dtype=data_type).reshape([2, 3, 4]) + >>> y = paddle.arange(36, dtype=data_type).reshape([3, 3, 4]) + >>> z = paddle.tensordot(x, y, axes=[1, 2]) + >>> print(z) + Tensor(shape=[2, 3], dtype=float64, place=Place(cpu), stop_gradient=True, + [[506. 
, 1298., 2090.], + [1298., 3818., 6338.]]) + + >>> # When axes is a list containing two 1-d int list, the first will be applied to x and the second to y. + >>> x = paddle.arange(60, dtype=data_type).reshape([3, 4, 5]) + >>> y = paddle.arange(24, dtype=data_type).reshape([4, 3, 2]) + >>> z = paddle.tensordot(x, y, axes=([1, 0], [0, 1])) + >>> print(z) + Tensor(shape=[5, 2], dtype=float64, place=Place(cpu), stop_gradient=True, + [[4400., 4730.], + [4532., 4874.], + [4664., 5018.], + [4796., 5162.], + [4928., 5306.]]) + + >>> # Thanks to the support of axes expansion, axes=[[0, 1, 3, 4], [1, 0, 3, 4]] can be abbreviated as axes= [[0, 1, 3, 4], [1, 0]]. + >>> x = paddle.arange(720, dtype=data_type).reshape([2, 3, 4, 5, 6]) + >>> y = paddle.arange(720, dtype=data_type).reshape([3, 2, 4, 5, 6]) + >>> z = paddle.tensordot(x, y, axes=[[0, 1, 3, 4], [1, 0]]) + >>> print(z) + Tensor(shape=[4, 4], dtype=float64, place=Place(cpu), stop_gradient=True, + [[23217330., 24915630., 26613930., 28312230.], + [24915630., 26775930., 28636230., 30496530.], + [26613930., 28636230., 30658530., 32680830.], + [28312230., 30496530., 32680830., 34865130.]]) """ op_type = 'tensordot' input_dtype = ['float16', 'float32', 'float64'] @@ -4287,14 +4453,13 @@ def as_complex(x, name=None): Examples: .. code-block:: python - import paddle - x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2]) - y = paddle.as_complex(x) - print(y) - - # Tensor(shape=[2, 3], dtype=complex64, place=Place(gpu:0), stop_gradient=True, - # [[1j , (2+3j) , (4+5j) ], - # [(6+7j) , (8+9j) , (10+11j)]]) + >>> import paddle + >>> x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2]) + >>> y = paddle.as_complex(x) + >>> print(y) + Tensor(shape=[2, 3], dtype=complex64, place=Place(cpu), stop_gradient=True, + [[1j , (2+3j) , (4+5j) ], + [(6+7j) , (8+9j) , (10+11j)]]) """ if in_dynamic_mode(): return _C_ops.as_complex(x) @@ -4334,20 +4499,18 @@ def as_real(x, name=None): Examples: .. code-block:: python - import paddle - x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2]) - y = paddle.as_complex(x) - z = paddle.as_real(y) - print(z) - - # Tensor(shape=[2, 3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[[0. , 1. ], - # [2. , 3. ], - # [4. , 5. ]], - - # [[6. , 7. ], - # [8. , 9. ], - # [10., 11.]]]) + >>> import paddle + >>> x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2]) + >>> y = paddle.as_complex(x) + >>> z = paddle.as_real(y) + >>> print(z) + Tensor(shape=[2, 3, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[0. , 1. ], + [2. , 3. ], + [4. , 5. ]], + [[6. , 7. ], + [8. , 9. ], + [10., 11.]]]) """ if in_dynamic_mode(): return _C_ops.as_real(x) @@ -4384,20 +4547,29 @@ def repeat_interleave(x, repeats, axis=None, name=None): Examples: .. 
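note::
            ``repeats`` can be a 1-D Tensor giving one count per element along ``axis`` (first example) or a single int applied to every element; with ``axis=None`` the input is flattened first (last example).

        .. 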
code-block:: python - import paddle - - x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) - repeats = paddle.to_tensor([3, 2, 1], dtype='int32') - - paddle.repeat_interleave(x, repeats, 1) - # [[1, 1, 1, 2, 2, 3], - # [4, 4, 4, 5, 5, 6]] - - paddle.repeat_interleave(x, 2, 0) - # [[1, 2, 3], [1, 2, 3], [4, 5, 6], [4, 5, 6]] - - paddle.repeat_interleave(x, 2, None) - # [1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6] + >>> import paddle + + >>> x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) + >>> repeats = paddle.to_tensor([3, 2, 1], dtype='int32') + + >>> out = paddle.repeat_interleave(x, repeats, 1) + >>> print(out) + Tensor(shape=[2, 6], dtype=int64, place=Place(cpu), stop_gradient=True, + [[1, 1, 1, 2, 2, 3], + [4, 4, 4, 5, 5, 6]]) + + >>> out = paddle.repeat_interleave(x, 2, 0) + >>> print(out) + Tensor(shape=[4, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[1, 2, 3], + [1, 2, 3], + [4, 5, 6], + [4, 5, 6]]) + + >>> out = paddle.repeat_interleave(x, 2, None) + >>> print(out) + Tensor(shape=[12], dtype=int64, place=Place(cpu), stop_gradient=True, + [1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]) """ if axis is None: @@ -4452,15 +4624,17 @@ def moveaxis(x, source, destination, name=None): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.ones([3, 2, 4]) - paddle.moveaxis(x, [0, 1], [1, 2]).shape - # [4, 3, 2] + >>> x = paddle.ones([3, 2, 4]) + >>> outshape = paddle.moveaxis(x, [0, 1], [1, 2]).shape + >>> print(outshape) + [4, 3, 2] - x = paddle.ones([2, 3]) - paddle.moveaxis(x, 0, 1).shape # equivalent to paddle.t(x) - # [3, 2] + >>> x = paddle.ones([2, 3]) + >>> outshape = paddle.moveaxis(x, 0, 1).shape # equivalent to paddle.t(x) + >>> print(outshape) + [3, 2] """ src = [source] if isinstance(source, int) else source dst = [destination] if isinstance(destination, int) else destination @@ -4587,14 +4761,15 @@ def take_along_axis(arr, indices, axis): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([[1, 2, 3], [4, 5, 6], [7,8,9]]) - index = paddle.to_tensor([[0]]) - axis = 0 - result = paddle.take_along_axis(x, index, axis) - print(result) - # [[1, 2, 3]] + >>> x = paddle.to_tensor([[1, 2, 3], [4, 5, 6], [7,8,9]]) + >>> index = paddle.to_tensor([[0]]) + >>> axis = 0 + >>> result = paddle.take_along_axis(x, index, axis) + >>> print(result) + Tensor(shape=[1, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[1, 2, 3]]) """ if len(arr.shape) != len(indices.shape): raise ValueError( @@ -4664,16 +4839,17 @@ def put_along_axis(arr, indices, values, axis, reduce='assign'): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([[10, 30, 20], [60, 40, 50]]) - index = paddle.to_tensor([[0]]) - value = 99 - axis = 0 - result = paddle.put_along_axis(x, index, value, axis) - print(result) - # [[99, 99, 99], - # [60, 40, 50]] + >>> x = paddle.to_tensor([[10, 30, 20], [60, 40, 50]]) + >>> index = paddle.to_tensor([[0]]) + >>> value = 99 + >>> axis = 0 + >>> result = paddle.put_along_axis(x, index, value, axis) + >>> print(result) + Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[99, 99, 99], + [60, 40, 50]]) """ if len(arr.shape) != len(indices.shape): @@ -4766,18 +4942,19 @@ def index_add(x, index, axis, value, name=None): Examples: .. 
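note::
            Rows of ``value`` are added into ``x`` at the positions named by ``index`` along ``axis``; this out-of-place form returns a new Tensor, while ``index_add_`` further down updates ``x`` in place.

        .. 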
code-block:: python - # required: gpu - import paddle - - input_tensor = paddle.to_tensor(paddle.ones((3, 3)), dtype="float32") - index = paddle.to_tensor([0, 2], dtype="int32") - value = paddle.to_tensor([[1, 1, 1], [1, 1, 1]], dtype="float32") - outplace_res = paddle.index_add(input_tensor, index, 0, value) - print(outplace_res) - # Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[2., 2., 2.], - # [1., 1., 1.], - # [2., 2., 2.]]) + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + + >>> input_tensor = paddle.to_tensor(paddle.ones((3, 3)), dtype="float32") + >>> index = paddle.to_tensor([0, 2], dtype="int32") + >>> value = paddle.to_tensor([[1, 1, 1], [1, 1, 1]], dtype="float32") + >>> outplace_res = paddle.index_add(input_tensor, index, 0, value) + >>> print(outplace_res) + Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + [[2., 2., 2.], + [1., 1., 1.], + [2., 2., 2.]]) """ if in_dynamic_mode(): return _C_ops.index_add(x, index, value, axis) @@ -4826,18 +5003,19 @@ def index_add_(x, index, axis, value, name=None): Examples: .. code-block:: python - # required: gpu - import paddle - - input_tensor = paddle.to_tensor(paddle.ones((3, 3)), dtype="float32") - index = paddle.to_tensor([0, 2], dtype="int32") - value = paddle.to_tensor([[1, 1], [1, 1], [1, 1]], dtype="float32") - inplace_res = paddle.index_add_(input_tensor, index, 1, value) - print(inplace_res) - # Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[2., 1., 2.], - # [2., 1., 2.], - # [2., 1., 2.]]) + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + + >>> input_tensor = paddle.to_tensor(paddle.ones((3, 3)), dtype="float32") + >>> index = paddle.to_tensor([0, 2], dtype="int32") + >>> value = paddle.to_tensor([[1, 1], [1, 1], [1, 1]], dtype="float32") + >>> inplace_res = paddle.index_add_(input_tensor, index, 1, value) + >>> print(inplace_res) + Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + [[2., 1., 2.], + [2., 1., 2.], + [2., 1., 2.]]) """ return _C_ops.index_add_(x, index, value, axis) @@ -4863,25 +5041,25 @@ def index_put_(x, indices, value, accumulate=False, name=None): Examples: .. code-block:: python - import paddle - - x = paddle.zeros([3, 3]) - value = paddle.ones([3]) - ix1 = paddle.to_tensor([0,1,2]) - ix2 = paddle.to_tensor([1,2,1]) - indices=(ix1,ix2) - - out = paddle.index_put_(x,indices,value) - print(x) - # Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[0., 1., 0.], - # [0., 0., 1.], - # [0., 1., 0.]]) - print(out) - # Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[0., 1., 0.], - # [0., 0., 1.], - # [0., 1., 0.]]) + >>> import paddle + + >>> x = paddle.zeros([3, 3]) + >>> value = paddle.ones([3]) + >>> ix1 = paddle.to_tensor([0,1,2]) + >>> ix2 = paddle.to_tensor([1,2,1]) + >>> indices=(ix1,ix2) + + >>> out = paddle.index_put_(x,indices,value) + >>> print(x) + Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0., 1., 0.], + [0., 0., 1.], + [0., 1., 0.]]) + >>> print(out) + Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0., 1., 0.], + [0., 0., 1.], + [0., 1., 0.]]) """ return _C_ops.index_put_(x, indices, value, accumulate) @@ -4894,25 +5072,25 @@ def index_put(x, indices, value, accumulate=False, name=None): Examples: .. 
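note::
            Unlike ``index_put_`` above, this out-of-place form leaves ``x`` untouched; the example prints both ``x`` (still all zeros) and ``out`` to make that visible.

        .. 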
code-block:: python - import paddle - - x = paddle.zeros([3, 3]) - value = paddle.ones([3]) - ix1 = paddle.to_tensor([0,1,2]) - ix2 = paddle.to_tensor([1,2,1]) - indices=(ix1,ix2) - - out = paddle.index_put(x,indices,value) - print(x) - # Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[0., 0., 0.], - # [0., 0., 0.], - # [0., 0., 0.]]) - print(out) - # Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[0., 1., 0.], - # [0., 0., 1.], - # [0., 1., 0.]]) + >>> import paddle + + >>> x = paddle.zeros([3, 3]) + >>> value = paddle.ones([3]) + >>> ix1 = paddle.to_tensor([0,1,2]) + >>> ix2 = paddle.to_tensor([1,2,1]) + >>> indices=(ix1,ix2) + + >>> out = paddle.index_put(x,indices,value) + >>> print(x) + Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]) + >>> print(out) + Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0., 1., 0.], + [0., 0., 1.], + [0., 1., 0.]]) """ if in_dynamic_mode(): return _C_ops.index_put(x, indices, value, accumulate) @@ -4965,28 +5143,28 @@ def unflatten(x, axis, shape, name=None): Examples: .. code-block:: python - import paddle - - x = paddle.randn(shape=[4, 6, 8]) - shape = [2, 3] - axis = 1 - res = paddle.unflatten(x, axis, shape) - print(res.shape) - # [4, 2, 3, 8] - - x = paddle.randn(shape=[4, 6, 8]) - shape = (-1, 2) - axis = -1 - res = paddle.unflatten(x, axis, shape) - print(res.shape) - # [4, 6, 4, 2] - - x = paddle.randn(shape=[4, 6, 8]) - shape = paddle.to_tensor([2, 2]) - axis = 0 - res = paddle.unflatten(x, axis, shape) - print(res.shape) - # [2, 2, 6, 8] + >>> import paddle + + >>> x = paddle.randn(shape=[4, 6, 8]) + >>> shape = [2, 3] + >>> axis = 1 + >>> res = paddle.unflatten(x, axis, shape) + >>> print(res.shape) + [4, 2, 3, 8] + + >>> x = paddle.randn(shape=[4, 6, 8]) + >>> shape = (-1, 2) + >>> axis = -1 + >>> res = paddle.unflatten(x, axis, shape) + >>> print(res.shape) + [4, 6, 4, 2] + + >>> x = paddle.randn(shape=[4, 6, 8]) + >>> shape = paddle.to_tensor([2, 2]) + >>> axis = 0 + >>> res = paddle.unflatten(x, axis, shape) + >>> print(res.shape) + [2, 2, 6, 8] """ # determine whether the input axis is valid. @@ -5035,15 +5213,15 @@ def as_strided(x, shape, stride, offset=0, name=None): Examples: .. code-block:: python - import paddle - paddle.base.set_flags({"FLAGS_use_stride_kernel": True}) + >>> import paddle + >>> paddle.base.set_flags({"FLAGS_use_stride_kernel": True}) - x = paddle.rand([2, 4, 6], dtype="float32") + >>> x = paddle.rand([2, 4, 6], dtype="float32") - out = paddle.as_strided(x, [8, 6], [6, 1]) - print(out) - # the shape is [8, 6]. - # the stride is [6, 1]. + >>> out = paddle.as_strided(x, [8, 6], [6, 1]) + >>> print(out.shape) + [8, 6] + >>> # the stride is [6, 1]. """ return _C_ops.as_strided(x, shape, stride, offset) @@ -5067,22 +5245,24 @@ def view(x, shape_or_dtype, name=None): Examples: .. 
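note::
            ``view`` requires the stride kernels enabled through ``FLAGS_use_stride_kernel``. When ``shape_or_dtype`` is a dtype, the underlying buffer is reinterpreted rather than reshaped, which is why viewing float32 data as uint8 below turns shape [2, 4, 6] into [2, 4, 24].

        .. 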
code-block:: python - import paddle - paddle.base.set_flags({"FLAGS_use_stride_kernel": True}) + >>> import paddle + >>> paddle.base.set_flags({"FLAGS_use_stride_kernel": True}) - x = paddle.rand([2, 4, 6], dtype="float32") + >>> x = paddle.rand([2, 4, 6], dtype="float32") - out = paddle.view(x, [8, 6]) - print(out) + >>> out = paddle.view(x, [8, 6]) + >>> print(out.shape) + [8, 6] + >>> import paddle + >>> paddle.base.set_flags({"FLAGS_use_stride_kernel": True}) - import paddle - paddle.base.set_flags({"FLAGS_use_stride_kernel": True}) + >>> x = paddle.rand([2, 4, 6], dtype="float32") - x = paddle.rand([2, 4, 6], dtype="float32") + >>> out = paddle.view(x, "uint8") + >>> print(out.shape) + [2, 4, 24] - out = paddle.view(x, "uint8") - print(out) """ if isinstance(shape_or_dtype, (list, tuple)): return _C_ops.view_shape(x, shape_or_dtype) @@ -5111,14 +5291,15 @@ def view_as(x, other, name=None): Examples: .. code-block:: python - import paddle - paddle.base.set_flags({"FLAGS_use_stride_kernel": True}) + >>> import paddle + >>> paddle.base.set_flags({"FLAGS_use_stride_kernel": True}) - x = paddle.rand([2, 4, 6], dtype="float32") - y = paddle.rand([8, 6], dtype="float32") + >>> x = paddle.rand([2, 4, 6], dtype="float32") + >>> y = paddle.rand([8, 6], dtype="float32") - out = paddle.view_as(x, y) - print(out) + >>> out = paddle.view_as(x, y) + >>> print(out.shape) + [8, 6] """ return _C_ops.view_shape(x, other.shape) @@ -5144,13 +5325,16 @@ def unfold(x, axis, size, step, name=None): Examples: .. code-block:: python - import paddle - paddle.base.set_flags({"FLAGS_use_stride_kernel": True}) + >>> import paddle + >>> paddle.base.set_flags({"FLAGS_use_stride_kernel": True}) - x = paddle.arange(9, dtype="float64") + >>> x = paddle.arange(9, dtype="float64") - out = paddle.unfold(x, 0, 2, 4) - print(out) # [[0, 1], [4, 5]] + >>> out = paddle.unfold(x, 0, 2, 4) + >>> print(out) + Tensor(shape=[2, 2], dtype=float64, place=Place(cpu), stop_gradient=True, + [[0., 1.], + [4., 5.]]) """ return _C_ops.tensor_unfold(x, axis, size, step) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 12b91b6ea7ad39..c2ff998b55d140 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -171,7 +171,7 @@ def log(x, name=None): [[0.69314718, 1.09861231, 1.38629436], [1.94591010, 2.07944155, 2.19722462]]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.log(x) else: check_variable_and_dtype( @@ -941,7 +941,7 @@ def floor_divide(x, y, name=None): [2, 0, 2, 2]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.floor_divide(x, y) else: return _elementwise_op(LayerHelper('elementwise_floordiv', **locals())) @@ -998,7 +998,7 @@ def remainder(x, y, name=None): [0, 3, 2, 1]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.remainder(x, y) else: return _elementwise_op(LayerHelper('elementwise_mod', **locals())) @@ -1226,7 +1226,7 @@ def maximum(x, y, name=None): Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, [5. , 3. , inf.]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.maximum(x, y) else: return _elementwise_op(LayerHelper('elementwise_max', **locals())) @@ -1288,7 +1288,7 @@ def minimum(x, y, name=None): Tensor(shape=[3], dtype=float64, place=Place(cpu), stop_gradient=True, [ 1. , -inf., 5. 
]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.minimum(x, y) else: return _elementwise_op(LayerHelper('elementwise_min', **locals())) @@ -1955,7 +1955,7 @@ def add_n(inputs, name=None): [14., 16., 18.]]) """ if in_dynamic_or_pir_mode(): - if isinstance(inputs, Variable): + if isinstance(inputs, (Variable, paddle.pir.OpResult)): inputs = [inputs] return _C_ops.add_n(inputs) else: @@ -2937,7 +2937,7 @@ def min(x, axis=None, keepdim=False, name=None): [0., 0.]]]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.min(x, axis, keepdim) else: reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) @@ -3509,7 +3509,7 @@ def clip(x, min=None, max=None, name=None): min_ = float(np.finfo(np.float32).min) max_ = float(np.finfo(np.float32).max) - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): if isinstance(min, Variable): min = min.item(0) if isinstance(max, Variable): @@ -4565,7 +4565,7 @@ def sign(x, name=None): Returns sign of every element in `x`: 1 for positive, -1 for negative and 0 for zero. Args: - x (Tensor): The input tensor. The data type can be float16, float32 or float64. + x (Tensor): The input tensor. The data type can be int8, int16, int32, int64, float16, float32 or float64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -4586,7 +4586,19 @@ def sign(x, name=None): return _C_ops.sign(x) else: check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'uint16'], 'sign' + x, + 'x', + [ + 'int8', + 'int16', + 'int32', + 'int64', + 'float16', + 'float32', + 'float64', + 'uint16', + ], + 'sign', ) helper = LayerHelper("sign", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -4669,7 +4681,7 @@ def increment(x, value=1.0, name=None): [1.]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.increment_(x, value) else: check_variable_and_dtype( @@ -4833,7 +4845,7 @@ def any(x, axis=None, keepdim=False, name=None): [True]]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.any(x, axis, keepdim) else: reduce_all, axis = _get_reduce_axis(axis, x) @@ -6934,24 +6946,72 @@ def ldexp_(x, y, name=None): return paddle.multiply_(x, paddle.pow(two, y)) +def hypot(x, y, name=None): + """ + Calculate the length of the hypotenuse of a right-angle triangle. The equation is: + + .. math:: + out = {\\sqrt{x^2 + y^2}} + + Args: + x (Tensor): The input Tensor, the data type is float32, float64, int32 or int64. + y (Tensor): The input Tensor, the data type is float32, float64, int32 or int64. + name (str, optional): Name for the operation (optional, default is None).For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): An N-D Tensor. If x, y have different shapes and are "broadcastable", the resulting tensor shape is the shape of x and y after broadcasting. If x, y have the same shape, its shape is the same as x and y. And the data type is float32 or float64. + + Examples: + + .. 
code-block:: python + + >>> import paddle + + >>> x = paddle.to_tensor([3], dtype='float32') + >>> y = paddle.to_tensor([4], dtype='float32') + >>> res = paddle.hypot(x, y) + >>> print(res) + Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + [5.]) + + """ + if not isinstance(x, (paddle.Tensor, Variable)): + raise TypeError(f"x must be tensor type, but got {type(x)}") + if not isinstance(y, (paddle.Tensor, Variable)): + raise TypeError(f"y must be tensor type, but got {type(y)}") + + out = (paddle.pow(x, 2) + paddle.pow(y, 2)).sqrt() + return out + + +@inplace_apis_in_dygraph_only +def hypot_(x, y, name=None): + r""" + Inplace version of ``hypot`` API, the output Tensor will be inplaced with input ``x``. + Please refer to :ref:`api_paddle_hypot`. + """ + if not isinstance(x, (paddle.Tensor, Variable)): + raise TypeError(f"x must be tensor type, but got {type(x)}") + if not isinstance(y, (paddle.Tensor, Variable)): + raise TypeError(f"y must be tensor type, but got {type(y)}") + + out = x.pow_(2).add_(y.pow(2)).sqrt_() + return out + + def combinations(x, r=2, with_replacement=False, name=None): """ Compute combinations of length r of the given tensor. The behavior is similar to python’s itertools.combinations when with_replacement is set to False, and itertools.combinations_with_replacement when with_replacement is set to True. - Args: x (Tensor): 1-D input Tensor, the data type is float16, float32, float64, int32 or int64. r (int, optional): number of elements to combine, default value is 2. with_replacement (bool, optional): whether to allow duplication in combination, default value is False. name (str, optional): Name for the operation (optional, default is None).For more information, please refer to :ref:`api_guide_Name`. - Returns: out (Tensor): tensor concatenated by combinations, same dtype with x - Examples: - .. code-block:: python - >>> import paddle >>> x = paddle.to_tensor([1, 2, 3], dtype='int32') >>> res = paddle.combinations(x) @@ -6960,7 +7020,6 @@ def combinations(x, r=2, with_replacement=False, name=None): [[1, 2], [1, 3], [2, 3]]) - """ if len(x.shape) != 1: raise TypeError(f"Expect a 1-D vector, but got x shape {x.shape}") diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py index 9fbc9d16baa661..beea2e7d904250 100644 --- a/python/paddle/tensor/ops.py +++ b/python/paddle/tensor/ops.py @@ -525,7 +525,7 @@ def ceil(x, name=None): Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, [-0., -0., 1. , 1. 
]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.ceil(x) else: check_variable_and_dtype( @@ -564,7 +564,7 @@ def cos(x, name=None): Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, [0.92106098, 0.98006660, 0.99500418, 0.95533651]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.cos(x) else: check_variable_and_dtype( @@ -754,7 +754,7 @@ def floor(x, name=None): Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, [-1., -1., 0., 0.]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.floor(x) else: check_variable_and_dtype( @@ -839,7 +839,7 @@ def round(x, name=None): Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, [-1., -0., 1., 2.]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.round(x) else: check_variable_and_dtype( @@ -916,7 +916,7 @@ def sigmoid(x, name=None): Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, [0.40131235, 0.45016602, 0.52497917, 0.57444251]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.sigmoid(x) else: check_variable_and_dtype( @@ -963,7 +963,7 @@ def sin(x, name=None): Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, [-0.38941833, -0.19866933, 0.09983342, 0.29552022]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.sin(x) else: check_variable_and_dtype( @@ -1057,7 +1057,7 @@ def sqrt(x, name=None): Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, [0.31622776, 0.44721359, 0.54772258, 0.63245553]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.sqrt(x) else: check_variable_and_dtype( @@ -1097,7 +1097,7 @@ def square(x, name=None): Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, [0.16000001, 0.04000000, 0.01000000, 0.09000000]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.square(x) else: check_variable_and_dtype( diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index f87e669cf198ef..479e7a7ea09cc0 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -796,6 +796,10 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): if in_dynamic_or_pir_mode(): shape = paddle.utils.convert_shape_to_list(shape) + if in_pir_mode() and paddle.utils._contain_var(shape): + shape = paddle.utils.get_int_tensor_list( + shape, _current_expected_place() + ) return _C_ops.uniform( shape, dtype, diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 7a9a0981cebb06..8fd2473231f931 100755 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -59,42 +59,45 @@ def argsort(x, axis=-1, descending=False, name=None): .. code-block:: python - import paddle - - x = paddle.to_tensor([[[5,8,9,5], - [0,0,1,7], - [6,9,2,4]], - [[5,2,4,2], - [4,7,7,9], - [1,7,0,6]]], - dtype='float32') - out1 = paddle.argsort(x, axis=-1) - out2 = paddle.argsort(x, axis=0) - out3 = paddle.argsort(x, axis=1) - - print(out1) - #[[[0 3 1 2] - # [0 1 2 3] - # [2 3 0 1]] - # [[1 3 2 0] - # [0 1 2 3] - # [2 0 3 1]]] - - print(out2) - #[[[0 1 1 1] - # [0 0 0 0] - # [1 1 1 0]] - # [[1 0 0 0] - # [1 1 1 1] - # [0 0 0 1]]] - - print(out3) - #[[[1 1 1 2] - # [0 0 2 0] - # [2 2 0 1]] - # [[2 0 2 0] - # [1 1 0 2] - # [0 2 1 1]]] + >>> import paddle + + >>> x = paddle.to_tensor([[[5,8,9,5], + ... [0,0,1,7], + ... [6,9,2,4]], + ... [[5,2,4,2], + ... [4,7,7,9], + ... 
[1,7,0,6]]], + ... dtype='float32') + >>> out1 = paddle.argsort(x, axis=-1) + >>> out2 = paddle.argsort(x, axis=0) + >>> out3 = paddle.argsort(x, axis=1) + + >>> print(out1) + Tensor(shape=[2, 3, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + [[[0, 3, 1, 2], + [0, 1, 2, 3], + [2, 3, 0, 1]], + [[1, 3, 2, 0], + [0, 1, 2, 3], + [2, 0, 3, 1]]]) + + >>> print(out2) + Tensor(shape=[2, 3, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + [[[0, 1, 1, 1], + [0, 0, 0, 0], + [1, 1, 1, 0]], + [[1, 0, 0, 0], + [1, 1, 1, 1], + [0, 0, 0, 1]]]) + + >>> print(out3) + Tensor(shape=[2, 3, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + [[[1, 1, 1, 2], + [0, 0, 2, 0], + [2, 2, 0, 1]], + [[2, 0, 2, 0], + [1, 1, 0, 2], + [0, 2, 1, 1]]]) """ if in_dynamic_mode(): _, ids = _C_ops.argsort(x, axis, descending) @@ -154,24 +157,27 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): Examples: .. code-block:: python - import paddle - - x = paddle.to_tensor([[5,8,9,5], - [0,0,1,7], - [6,9,2,4]]) - out1 = paddle.argmax(x) - print(out1) # 2 - out2 = paddle.argmax(x, axis=0) - print(out2) - # [2, 2, 0, 1] - out3 = paddle.argmax(x, axis=-1) - print(out3) - # [2, 3, 1] - out4 = paddle.argmax(x, axis=0, keepdim=True) - print(out4) - # [[2, 2, 0, 1]] + >>> import paddle + + >>> x = paddle.to_tensor([[5,8,9,5], + ... [0,0,1,7], + ... [6,9,2,4]]) + >>> out1 = paddle.argmax(x) + >>> print(out1.numpy()) + 2 + >>> out2 = paddle.argmax(x, axis=0) + >>> print(out2.numpy()) + [2 2 0 1] + >>> out3 = paddle.argmax(x, axis=-1) + >>> print(out3.numpy()) + [2 3 1] + >>> out4 = paddle.argmax(x, axis=0, keepdim=True) + >>> print(out4.numpy()) + [[2 2 0 1]] """ - if axis is not None and not isinstance(axis, (int, Variable)): + if axis is not None and not isinstance( + axis, (int, Variable, paddle.pir.OpResult) + ): raise TypeError( "The type of 'axis' must be int or Tensor or None in argmax, but received %s." % (type(axis)) @@ -188,7 +194,7 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): flatten = True axis = 0 - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.argmax(x, axis, keepdim, flatten, var_dtype) else: helper = LayerHelper("argmax", **locals()) @@ -244,24 +250,27 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): Examples: .. code-block:: python - import paddle - - x = paddle.to_tensor([[5,8,9,5], - [0,0,1,7], - [6,9,2,4]]) - out1 = paddle.argmin(x) - print(out1) # 4 - out2 = paddle.argmin(x, axis=0) - print(out2) - # [1, 1, 1, 2] - out3 = paddle.argmin(x, axis=-1) - print(out3) - # [0, 0, 2] - out4 = paddle.argmin(x, axis=0, keepdim=True) - print(out4) - # [[1, 1, 1, 2]] + >>> import paddle + + >>> x = paddle.to_tensor([[5,8,9,5], + ... [0,0,1,7], + ... [6,9,2,4]]) + >>> out1 = paddle.argmin(x) + >>> print(out1.numpy()) + 4 + >>> out2 = paddle.argmin(x, axis=0) + >>> print(out2.numpy()) + [1 1 1 2] + >>> out3 = paddle.argmin(x, axis=-1) + >>> print(out3.numpy()) + [0 0 2] + >>> out4 = paddle.argmin(x, axis=0, keepdim=True) + >>> print(out4.numpy()) + [[1 1 1 2]] """ - if axis is not None and not isinstance(axis, (int, Variable)): + if axis is not None and not isinstance( + axis, (int, Variable, paddle.pir.OpResult) + ): raise TypeError( "The type of 'axis' must be int or Tensor or None in argmin, but received %s." 
% (type(axis)) @@ -278,7 +287,7 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): flatten = True axis = 0 - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.argmin(x, axis, keepdim, flatten, var_dtype) else: helper = LayerHelper("argmin", **locals()) @@ -331,20 +340,22 @@ def index_select(x, index, axis=0, name=None): Examples: .. code-block:: python - import paddle - - x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [9.0, 10.0, 11.0, 12.0]]) - index = paddle.to_tensor([0, 1, 1], dtype='int32') - out_z1 = paddle.index_select(x=x, index=index) - #[[1. 2. 3. 4.] - # [5. 6. 7. 8.] - # [5. 6. 7. 8.]] - out_z2 = paddle.index_select(x=x, index=index, axis=1) - #[[ 1. 2. 2.] - # [ 5. 6. 6.] - # [ 9. 10. 10.]] + >>> import paddle + + >>> x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], + ... [5.0, 6.0, 7.0, 8.0], + ... [9.0, 10.0, 11.0, 12.0]]) + >>> index = paddle.to_tensor([0, 1, 1], dtype='int32') + >>> out_z1 = paddle.index_select(x=x, index=index) + >>> print(out_z1.numpy()) + [[1. 2. 3. 4.] + [5. 6. 7. 8.] + [5. 6. 7. 8.]] + >>> out_z2 = paddle.index_select(x=x, index=index, axis=1) + >>> print(out_z2.numpy()) + [[ 1. 2. 2.] + [ 5. 6. 6.] + [ 9. 10. 10.]] """ if in_dynamic_mode(): @@ -405,35 +416,43 @@ def nonzero(x, as_tuple=False): .. code-block:: python - import paddle - - x1 = paddle.to_tensor([[1.0, 0.0, 0.0], - [0.0, 2.0, 0.0], - [0.0, 0.0, 3.0]]) - x2 = paddle.to_tensor([0.0, 1.0, 0.0, 3.0]) - out_z1 = paddle.nonzero(x1) - print(out_z1) - #[[0 0] - # [1 1] - # [2 2]] - out_z1_tuple = paddle.nonzero(x1, as_tuple=True) - for out in out_z1_tuple: - print(out) - #[[0] - # [1] - # [2]] - #[[0] - # [1] - # [2]] - out_z2 = paddle.nonzero(x2) - print(out_z2) - #[[1] - # [3]] - out_z2_tuple = paddle.nonzero(x2, as_tuple=True) - for out in out_z2_tuple: - print(out) - #[[1] - # [3]] + >>> import paddle + + >>> x1 = paddle.to_tensor([[1.0, 0.0, 0.0], + ... [0.0, 2.0, 0.0], + ... [0.0, 0.0, 3.0]]) + >>> x2 = paddle.to_tensor([0.0, 1.0, 0.0, 3.0]) + >>> out_z1 = paddle.nonzero(x1) + >>> print(out_z1) + Tensor(shape=[3, 2], dtype=int64, place=Place(cpu), stop_gradient=True, + [[0, 0], + [1, 1], + [2, 2]]) + + >>> out_z1_tuple = paddle.nonzero(x1, as_tuple=True) + >>> for out in out_z1_tuple: + ... print(out) + Tensor(shape=[3, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[0], + [1], + [2]]) + Tensor(shape=[3, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[0], + [1], + [2]]) + + >>> out_z2 = paddle.nonzero(x2) + >>> print(out_z2) + Tensor(shape=[2, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[1], + [3]]) + + >>> out_z2_tuple = paddle.nonzero(x2, as_tuple=True) + >>> for out in out_z2_tuple: + ... print(out) + Tensor(shape=[2, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[1], + [3]]) """ list_out = [] @@ -503,41 +522,41 @@ def sort(x, axis=-1, descending=False, name=None): .. code-block:: python - import paddle - - x = paddle.to_tensor([[[5,8,9,5], - [0,0,1,7], - [6,9,2,4]], - [[5,2,4,2], - [4,7,7,9], - [1,7,0,6]]], - dtype='float32') - out1 = paddle.sort(x=x, axis=-1) - out2 = paddle.sort(x=x, axis=0) - out3 = paddle.sort(x=x, axis=1) - print(out1) - #[[[5. 5. 8. 9.] - # [0. 0. 1. 7.] - # [2. 4. 6. 9.]] - # [[2. 2. 4. 5.] - # [4. 7. 7. 9.] - # [0. 1. 6. 7.]]] - print(out2) - #[[[5. 2. 4. 2.] - # [0. 0. 1. 7.] - # [1. 7. 0. 4.]] - # [[5. 8. 9. 5.] - # [4. 7. 7. 9.] - # [6. 9. 2. 6.]]] - print(out3) - #[[[0. 0. 1. 4.] - # [5. 8. 2. 5.] - # [6. 9. 9. 7.]] - # [[1. 2. 0. 2.] - # [4. 7. 4. 6.] 
- # [5. 7. 7. 9.]]] + >>> import paddle + + >>> x = paddle.to_tensor([[[5,8,9,5], + ... [0,0,1,7], + ... [6,9,2,4]], + ... [[5,2,4,2], + ... [4,7,7,9], + ... [1,7,0,6]]], + ... dtype='float32') + >>> out1 = paddle.sort(x=x, axis=-1) + >>> out2 = paddle.sort(x=x, axis=0) + >>> out3 = paddle.sort(x=x, axis=1) + >>> print(out1.numpy()) + [[[5. 5. 8. 9.] + [0. 0. 1. 7.] + [2. 4. 6. 9.]] + [[2. 2. 4. 5.] + [4. 7. 7. 9.] + [0. 1. 6. 7.]]] + >>> print(out2.numpy()) + [[[5. 2. 4. 2.] + [0. 0. 1. 7.] + [1. 7. 0. 4.]] + [[5. 8. 9. 5.] + [4. 7. 7. 9.] + [6. 9. 2. 6.]]] + >>> print(out3.numpy()) + [[[0. 0. 1. 4.] + [5. 8. 2. 5.] + [6. 9. 9. 7.]] + [[1. 2. 0. 2.] + [4. 7. 4. 6.] + [5. 7. 7. 9.]]] """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): outs, _ = _C_ops.argsort(x, axis, descending) return outs else: @@ -576,16 +595,16 @@ def mode(x, axis=-1, keepdim=False, name=None): .. code-block:: python - import paddle + >>> import paddle - tensor = paddle.to_tensor([[[1,2,2],[2,3,3]],[[0,5,5],[9,9,0]]], dtype=paddle.float32) - res = paddle.mode(tensor, 2) - print(res) - # (Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 3.], - # [5., 9.]]), Tensor(shape=[2, 2], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [[2, 2], - # [2, 1]])) + >>> tensor = paddle.to_tensor([[[1,2,2],[2,3,3]],[[0,5,5],[9,9,0]]], dtype=paddle.float32) + >>> res = paddle.mode(tensor, 2) + >>> print(res) + (Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[2., 3.], + [5., 9.]]), Tensor(shape=[2, 2], dtype=int64, place=Place(cpu), stop_gradient=True, + [[2, 2], + [2, 1]])) """ if in_dynamic_mode(): @@ -638,20 +657,21 @@ def where(condition, x=None, y=None, name=None): .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([0.9383, 0.1983, 3.2, 1.2]) - y = paddle.to_tensor([1.0, 1.0, 1.0, 1.0]) + >>> x = paddle.to_tensor([0.9383, 0.1983, 3.2, 1.2]) + >>> y = paddle.to_tensor([1.0, 1.0, 1.0, 1.0]) - out = paddle.where(x>1, x, y) - print(out) - #out: [1.0, 1.0, 3.2, 1.2] + >>> out = paddle.where(x>1, x, y) + >>> print(out) + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + [1. , 1. , 3.20000005, 1.20000005]) - out = paddle.where(x>1) - print(out) - #out: (Tensor(shape=[2, 1], dtype=int64, place=CPUPlace, stop_gradient=True, - # [[2], - # [3]]),) + >>> out = paddle.where(x>1) + >>> print(out) + (Tensor(shape=[2, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[2], + [3]]),) """ if np.isscalar(x): x = paddle.full([1], x, np.array([x]).dtype.name) @@ -792,41 +812,41 @@ def index_sample(x, index): .. code-block:: python - import paddle - - x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [9.0, 10.0, 11.0, 12.0]], dtype='float32') - index = paddle.to_tensor([[0, 1, 2], - [1, 2, 3], - [0, 0, 0]], dtype='int32') - target = paddle.to_tensor([[100, 200, 300, 400], - [500, 600, 700, 800], - [900, 1000, 1100, 1200]], dtype='int32') - out_z1 = paddle.index_sample(x, index) - print(out_z1) - #[[1. 2. 3.] - # [6. 7. 8.] - # [9. 9. 9.]] - - # Use the index of the maximum value by topk op - # get the value of the element of the corresponding index in other tensors - top_value, top_index = paddle.topk(x, k=2) - out_z2 = paddle.index_sample(target, top_index) - print(top_value) - #[[ 4. 3.] - # [ 8. 7.] - # [12. 
11.]] - - print(top_index) - #[[3 2] - # [3 2] - # [3 2]] - - print(out_z2) - #[[ 400 300] - # [ 800 700] - # [1200 1100]] + >>> import paddle + + >>> x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], + ... [5.0, 6.0, 7.0, 8.0], + ... [9.0, 10.0, 11.0, 12.0]], dtype='float32') + >>> index = paddle.to_tensor([[0, 1, 2], + ... [1, 2, 3], + ... [0, 0, 0]], dtype='int32') + >>> target = paddle.to_tensor([[100, 200, 300, 400], + ... [500, 600, 700, 800], + ... [900, 1000, 1100, 1200]], dtype='int32') + >>> out_z1 = paddle.index_sample(x, index) + >>> print(out_z1.numpy()) + [[1. 2. 3.] + [6. 7. 8.] + [9. 9. 9.]] + + >>> # Use the index of the maximum value by topk op + >>> # get the value of the element of the corresponding index in other tensors + >>> top_value, top_index = paddle.topk(x, k=2) + >>> out_z2 = paddle.index_sample(target, top_index) + >>> print(top_value.numpy()) + [[ 4. 3.] + [ 8. 7.] + [12. 11.]] + + >>> print(top_index.numpy()) + [[3 2] + [3 2] + [3 2]] + + >>> print(out_z2.numpy()) + [[ 400 300] + [ 800 700] + [1200 1100]] """ if in_dynamic_mode(): @@ -881,16 +901,17 @@ def masked_select(x, mask, name=None): .. code-block:: python - import paddle - - x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [9.0, 10.0, 11.0, 12.0]]) - mask = paddle.to_tensor([[True, False, False, False], - [True, True, False, False], - [True, False, False, False]]) - out = paddle.masked_select(x, mask) - #[1.0 5.0 6.0 9.0] + >>> import paddle + + >>> x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], + ... [5.0, 6.0, 7.0, 8.0], + ... [9.0, 10.0, 11.0, 12.0]]) + >>> mask = paddle.to_tensor([[True, False, False, False], + ... [True, True, False, False], + ... [True, False, False, False]]) + >>> out = paddle.masked_select(x, mask) + >>> print(out.numpy()) + [1. 5. 6. 9.] """ if in_dynamic_mode(): @@ -941,25 +962,45 @@ def topk(x, k, axis=None, largest=True, sorted=True, name=None): .. 
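code-block:: python

            >>> # illustrative addition: largest=False selects the smallest entries instead
            >>> import paddle
            >>> value, indices = paddle.topk(paddle.to_tensor([1, 4, 5, 7]), k=1, largest=False)
            >>> print(value)
            Tensor(shape=[1], dtype=int64, place=Place(cpu), stop_gradient=True,
            [1])
            >>> print(indices)
            Tensor(shape=[1], dtype=int64, place=Place(cpu), stop_gradient=True,
            [0])

        .. 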
code-block:: python - import paddle - - data_1 = paddle.to_tensor([1, 4, 5, 7]) - value_1, indices_1 = paddle.topk(data_1, k=1) - print(value_1) # [7] - print(indices_1) # [3] - - data_2 = paddle.to_tensor([[1, 4, 5, 7], [2, 6, 2, 5]]) - value_2, indices_2 = paddle.topk(data_2, k=1) - print(value_2) # [[7], [6]] - print(indices_2) # [[3], [1]] - - value_3, indices_3 = paddle.topk(data_2, k=1, axis=-1) - print(value_3) # [[7], [6]] - print(indices_3) # [[3], [1]] - - value_4, indices_4 = paddle.topk(data_2, k=1, axis=0) - print(value_4) # [[2, 6, 5, 7]] - print(indices_4) # [[1, 1, 0, 0]] + >>> import paddle + + >>> data_1 = paddle.to_tensor([1, 4, 5, 7]) + >>> value_1, indices_1 = paddle.topk(data_1, k=1) + >>> print(value_1) + Tensor(shape=[1], dtype=int64, place=Place(cpu), stop_gradient=True, + [7]) + >>> print(indices_1) + Tensor(shape=[1], dtype=int64, place=Place(cpu), stop_gradient=True, + [3]) + + >>> data_2 = paddle.to_tensor([[1, 4, 5, 7], [2, 6, 2, 5]]) + >>> value_2, indices_2 = paddle.topk(data_2, k=1) + >>> print(value_2) + Tensor(shape=[2, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[7], + [6]]) + >>> print(indices_2) + Tensor(shape=[2, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[3], + [1]]) + + >>> value_3, indices_3 = paddle.topk(data_2, k=1, axis=-1) + >>> print(value_3) + Tensor(shape=[2, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[7], + [6]]) + >>> print(indices_3) + Tensor(shape=[2, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[3], + [1]]) + + >>> value_4, indices_4 = paddle.topk(data_2, k=1, axis=0) + >>> print(value_4) + Tensor(shape=[1, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + [[2, 6, 5, 7]]) + >>> print(indices_4) + Tensor(shape=[1, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + [[1, 1, 0, 0]]) """ @@ -1014,30 +1055,30 @@ def bucketize(x, sorted_sequence, out_int32=False, right=False, name=None): .. 
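note::
            ``right=False`` returns the lower bound (first valid insertion point) and ``right=True`` the upper bound; that is why the value 8 maps to index 2 in ``out1`` but to index 3 in ``out2`` below.

        .. 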
code-block:: python - import paddle - - sorted_sequence = paddle.to_tensor([2, 4, 8, 16], dtype='int32') - x = paddle.to_tensor([[0, 8, 4, 16], [-1, 2, 8, 4]], dtype='int32') - out1 = paddle.bucketize(x, sorted_sequence) - print(out1) - # Tensor(shape=[2, 4], dtype=int64, place=CPUPlace, stop_gradient=True, - # [[0, 2, 1, 3], - # [0, 0, 2, 1]]) - out2 = paddle.bucketize(x, sorted_sequence, right=True) - print(out2) - # Tensor(shape=[2, 4], dtype=int64, place=CPUPlace, stop_gradient=True, - # [[0, 3, 2, 4], - # [0, 1, 3, 2]]) - out3 = x.bucketize(sorted_sequence) - print(out3) - # Tensor(shape=[2, 4], dtype=int64, place=CPUPlace, stop_gradient=True, - # [[0, 2, 1, 3], - # [0, 0, 2, 1]]) - out4 = x.bucketize(sorted_sequence, right=True) - print(out4) - # Tensor(shape=[2, 4], dtype=int64, place=CPUPlace, stop_gradient=True, - # [[0, 3, 2, 4], - # [0, 1, 3, 2]]) + >>> import paddle + + >>> sorted_sequence = paddle.to_tensor([2, 4, 8, 16], dtype='int32') + >>> x = paddle.to_tensor([[0, 8, 4, 16], [-1, 2, 8, 4]], dtype='int32') + >>> out1 = paddle.bucketize(x, sorted_sequence) + >>> print(out1) + Tensor(shape=[2, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + [[0, 2, 1, 3], + [0, 0, 2, 1]]) + >>> out2 = paddle.bucketize(x, sorted_sequence, right=True) + >>> print(out2) + Tensor(shape=[2, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + [[0, 3, 2, 4], + [0, 1, 3, 2]]) + >>> out3 = x.bucketize(sorted_sequence) + >>> print(out3) + Tensor(shape=[2, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + [[0, 2, 1, 3], + [0, 0, 2, 1]]) + >>> out4 = x.bucketize(sorted_sequence, right=True) + >>> print(out4) + Tensor(shape=[2, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + [[0, 3, 2, 4], + [0, 1, 3, 2]]) """ check_variable_and_dtype( @@ -1074,27 +1115,27 @@ def searchsorted( .. code-block:: python - import paddle - - sorted_sequence = paddle.to_tensor([[1, 3, 5, 7, 9, 11], - [2, 4, 6, 8, 10, 12]], dtype='int32') - values = paddle.to_tensor([[3, 6, 9, 10], [3, 6, 9, 10]], dtype='int32') - out1 = paddle.searchsorted(sorted_sequence, values) - print(out1) - # Tensor(shape=[2, 4], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [[1, 3, 4, 5], - # [1, 2, 4, 4]]) - out2 = paddle.searchsorted(sorted_sequence, values, right=True) - print(out2) - # Tensor(shape=[2, 4], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [[2, 3, 5, 5], - # [1, 3, 4, 5]]) - sorted_sequence_1d = paddle.to_tensor([1, 3, 5, 7, 9, 11, 13]) - out3 = paddle.searchsorted(sorted_sequence_1d, values) - print(out3) - # Tensor(shape=[2, 4], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [[1, 3, 4, 5], - # [1, 3, 4, 5]]) + >>> import paddle + + >>> sorted_sequence = paddle.to_tensor([[1, 3, 5, 7, 9, 11], + ... 
[2, 4, 6, 8, 10, 12]], dtype='int32') + >>> values = paddle.to_tensor([[3, 6, 9, 10], [3, 6, 9, 10]], dtype='int32') + >>> out1 = paddle.searchsorted(sorted_sequence, values) + >>> print(out1) + Tensor(shape=[2, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + [[1, 3, 4, 5], + [1, 2, 4, 4]]) + >>> out2 = paddle.searchsorted(sorted_sequence, values, right=True) + >>> print(out2) + Tensor(shape=[2, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + [[2, 3, 5, 5], + [1, 3, 4, 5]]) + >>> sorted_sequence_1d = paddle.to_tensor([1, 3, 5, 7, 9, 11, 13]) + >>> out3 = paddle.searchsorted(sorted_sequence_1d, values) + >>> print(out3) + Tensor(shape=[2, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + [[1, 3, 4, 5], + [1, 3, 4, 5]]) """ if in_dynamic_mode(): @@ -1146,23 +1187,28 @@ def kthvalue(x, k, axis=None, keepdim=False, name=None): .. code-block:: python - import paddle - - x = paddle.randn((2,3,2)) - # Tensor(shape=[2, 3, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[ 0.22954939, -0.01296274], - # [ 1.17135799, -0.34493217], - # [-0.19550551, -0.17573971]], - # - # [[ 0.15104349, -0.93965352], - # [ 0.14745511, 0.98209465], - # [ 0.10732264, -0.55859774]]]) - y = paddle.kthvalue(x, 2, 1) - # (Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[ 0.22954939, -0.17573971], - # [ 0.14745511, -0.55859774]]), Tensor(shape=[2, 2], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [[0, 2], - # [1, 2]])) + >>> import paddle + + >>> x = paddle.randn((2,3,2)) + >>> print(x) + >>> # doctest: +SKIP('Different environments yield different output.') + Tensor(shape=[2, 3, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[ 0.11855337, -0.30557564], + [-0.09968963, 0.41220093], + [ 1.24004936, 1.50014710]], + [[ 0.08612321, -0.92485696], + [-0.09276631, 1.15149164], + [-1.46587241, 1.22873247]]]) + >>> # doctest: -SKIP + >>> y = paddle.kthvalue(x, 2, 1) + >>> print(y) + >>> # doctest: +SKIP('Different environments yield different output.') + (Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[ 0.11855337, 0.41220093], + [-0.09276631, 1.15149164]]), Tensor(shape=[2, 2], dtype=int64, place=Place(cpu), stop_gradient=True, + [[0, 1], + [1, 1]])) + >>> # doctest: -SKIP """ if in_dynamic_mode(): if axis is not None: diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index 6e173545a2767b..97b8268fb6fe5d 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -126,15 +126,11 @@ def _format_item(np_var, max_width=0, signed=False): or np_var.dtype == np.float16 ): if DEFAULT_PRINT_OPTIONS.sci_mode: - item_str = f'{{:.{DEFAULT_PRINT_OPTIONS.precision}e}}'.format( - np_var - ) + item_str = f'{np_var:.{DEFAULT_PRINT_OPTIONS.precision}e}' elif np.ceil(np_var) == np_var: item_str = f'{np_var:.0f}.' 
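+                # non-integral values: fall through to fixed-point at the configured precision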
else: - item_str = f'{{:.{DEFAULT_PRINT_OPTIONS.precision}f}}'.format( - np_var - ) + item_str = f'{np_var:.{DEFAULT_PRINT_OPTIONS.precision}f}' else: item_str = f'{np_var}' diff --git a/python/paddle/text/__init__.py b/python/paddle/text/__init__.py index fbfa0c3fe2e028..378ed13431d86b 100644 --- a/python/paddle/text/__init__.py +++ b/python/paddle/text/__init__.py @@ -21,7 +21,7 @@ from .datasets import WMT14 # noqa: F401 from .datasets import WMT16 # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'Conll05st', 'Imdb', 'Imikolov', diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index 630af97f280f50..18697fdc25bfec 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -37,6 +37,7 @@ from .layers_utils import padding_to_same_structure # noqa: F401 from .layers_utils import assert_same_structure # noqa: F401 from .layers_utils import get_shape_tensor_inputs # noqa: F401 +from .layers_utils import get_int_tensor_list # noqa: F401 from .layers_utils import convert_shape_to_list # noqa: F401 from .layers_utils import check_shape # noqa: F401 from .layers_utils import try_set_static_shape_tensor # noqa: F401 @@ -53,4 +54,4 @@ from .layers_utils import _contain_var # noqa: F401 from .layers_utils import _convert_to_tensor_list # noqa: F401 -__all__ = ['deprecated', 'run_check', 'require_version', 'try_import'] # noqa +__all__ = ['deprecated', 'run_check', 'require_version', 'try_import'] diff --git a/python/paddle/utils/cpp_extension/__init__.py b/python/paddle/utils/cpp_extension/__init__.py index 9ad431f00a65fc..96d55bea663c5f 100644 --- a/python/paddle/utils/cpp_extension/__init__.py +++ b/python/paddle/utils/cpp_extension/__init__.py @@ -22,7 +22,7 @@ from .extension_utils import get_build_directory # noqa: F401 from .extension_utils import load_op_meta_info_and_register_op # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'CppExtension', 'CUDAExtension', 'load', diff --git a/python/paddle/utils/layers_utils.py b/python/paddle/utils/layers_utils.py index d6de149dbd148b..f8d98c5a084626 100644 --- a/python/paddle/utils/layers_utils.py +++ b/python/paddle/utils/layers_utils.py @@ -18,10 +18,18 @@ from uuid import uuid4 from weakref import WeakKeyDictionary +import numpy as np + import paddle +from paddle.pir.core import convert_np_dtype_to_dtype_ from ..base.data_feeder import check_dtype, convert_dtype -from ..base.framework import Block, Variable, in_dygraph_mode +from ..base.framework import ( + Block, + Variable, + _current_expected_place, + in_dygraph_mode, +) def convert_to_list(value, n, name, dtype=int): @@ -68,7 +76,9 @@ def convert_to_list(value, n, name, dtype=int): + str(value) ) for single_value in value_list: - assert not isinstance(single_value, Variable), ( + assert not isinstance( + single_value, (Variable, paddle.pir.OpResult) + ), ( "Required numerical type with '%s', but received Tensor." 
% dtype ) @@ -378,6 +388,29 @@ def _contain_var(list_or_tuple): return False +def get_int_tensor_list( + ele_list, place=_current_expected_place(), default_dtype='int64' +): + int_tensor_list = [] + for ele in ele_list: + if isinstance(ele, paddle.pir.OpResult): + ele.stop_gradient = True + if convert_dtype(ele.dtype) != default_dtype: + ele = paddle.cast(x=ele, dtype=default_dtype) + if ele.shape == []: + ele = paddle.reshape(ele, [-1]) + int_tensor_list.append(ele) + else: + temp_out = paddle.full( + [1], + ele, + convert_np_dtype_to_dtype_(np.dtype(default_dtype)), + place, + ) + int_tensor_list.append(temp_out) + return int_tensor_list + + def get_shape_tensor_inputs(inputs, attrs, shape, op_type): from paddle.tensor import fill_constant diff --git a/python/paddle/utils/unique_name.py b/python/paddle/utils/unique_name.py index bfd26da255fa18..f34109b18ec0ba 100644 --- a/python/paddle/utils/unique_name.py +++ b/python/paddle/utils/unique_name.py @@ -17,4 +17,4 @@ from ..base.unique_name import guard # noqa: F401 from ..base.unique_name import switch # noqa: F401 -__all__ = ['generate', 'switch', 'guard'] # noqa +__all__ = ['generate', 'switch', 'guard'] diff --git a/python/paddle/vision/__init__.py b/python/paddle/vision/__init__.py index a2a782c03599b8..cc70de710bf903 100644 --- a/python/paddle/vision/__init__.py +++ b/python/paddle/vision/__init__.py @@ -112,4 +112,4 @@ from .transforms import adjust_hue # noqa: F401 from .transforms import normalize # noqa: F401 -__all__ = ['set_image_backend', 'get_image_backend', 'image_load'] # noqa +__all__ = ['set_image_backend', 'get_image_backend', 'image_load'] diff --git a/python/paddle/vision/datasets/__init__.py b/python/paddle/vision/datasets/__init__.py index 970c8cfcae86a1..a7464275eb671f 100644 --- a/python/paddle/vision/datasets/__init__.py +++ b/python/paddle/vision/datasets/__init__.py @@ -21,7 +21,7 @@ from .cifar import Cifar100 # noqa: F401 from .voc2012 import VOC2012 # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'DatasetFolder', 'ImageFolder', 'MNIST', diff --git a/python/paddle/vision/models/__init__.py b/python/paddle/vision/models/__init__.py index 08f559bd440c9b..bf9fa0bec02880 100644 --- a/python/paddle/vision/models/__init__.py +++ b/python/paddle/vision/models/__init__.py @@ -64,7 +64,7 @@ from .shufflenetv2 import shufflenet_v2_x2_0 # noqa: F401 from .shufflenetv2 import shufflenet_v2_swish # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'ResNet', 'resnet18', 'resnet34', diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index d38f81a57ede9b..5a8b433cea52ef 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -26,7 +26,7 @@ from ..nn import BatchNorm2D, Conv2D, Layer, ReLU, Sequential from ..nn.initializer import Normal -__all__ = [ # noqa +__all__ = [ 'yolo_loss', 'yolo_box', 'prior_box', diff --git a/python/paddle/vision/transforms/__init__.py b/python/paddle/vision/transforms/__init__.py index 890e4b8982714d..3e2d39c5a88f5a 100644 --- a/python/paddle/vision/transforms/__init__.py +++ b/python/paddle/vision/transforms/__init__.py @@ -51,7 +51,7 @@ from .functional import normalize # noqa: F401 from .functional import erase # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'BaseTransform', 'Compose', 'Resize', diff --git a/python/setup.py.in b/python/setup.py.in index 39d256306bf9a6..10cbd7d54a86d0 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -429,6 +429,14 @@ packages=['paddle', 'paddle.framework', 'paddle.jit', 'paddle.jit.dy2static', + 
'paddle.jit.newir_dy2static', + 'paddle.jit.sot', + 'paddle.jit.sot.opcode_translator', + 'paddle.jit.sot.opcode_translator.executor', + 'paddle.jit.sot.opcode_translator.executor.variables', + 'paddle.jit.sot.opcode_translator.instruction_utils', + 'paddle.jit.sot.symbolic', + 'paddle.jit.sot.utils', 'paddle.inference', 'paddle.inference.contrib', 'paddle.inference.contrib.utils', diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index e29dcf6c294462..b7866aa46109d3 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -2,8 +2,8 @@ PyGithub coverage==5.5 pycrypto ; platform_system != "Windows" mock -gym==0.25.2 -pygame==2.1.0 +gym==0.26.2 +pygame==2.5.2 hypothesis opencv-python<=4.2.0.32 visualdl==2.5.3 @@ -17,5 +17,5 @@ librosa==0.8.1 parameterized wandb>=0.13 xlsxwriter==3.0.9 -xdoctest -ubelt # just for xdoctest +xdoctest==1.1.1 +ubelt==1.3.3 # just for xdoctest diff --git a/setup.py b/setup.py index 221e0a0770e062..e12d676cb8a5f2 100644 --- a/setup.py +++ b/setup.py @@ -1425,6 +1425,13 @@ def get_setup_parameters(): 'paddle.jit', 'paddle.jit.dy2static', 'paddle.jit.newir_dy2static', + 'paddle.jit.sot', + 'paddle.jit.sot.opcode_translator', + 'paddle.jit.sot.opcode_translator.executor', + 'paddle.jit.sot.opcode_translator.executor.variables', + 'paddle.jit.sot.opcode_translator.instruction_utils', + 'paddle.jit.sot.symbolic', + 'paddle.jit.sot.utils', 'paddle.inference', 'paddle.inference.contrib', 'paddle.inference.contrib.utils', diff --git a/test/amp/test_amp_api.py b/test/amp/test_amp_api.py index 3f9f13d3b420ba..9f0d31e86310ea 100644 --- a/test/amp/test_amp_api.py +++ b/test/amp/test_amp_api.py @@ -289,7 +289,7 @@ def test_op_called_as_expected(self): func = SimpleModelIncludeSetValue() func = paddle.amp.decorate(func, level='O2') - func = paddle.jit.to_static(func) + func = paddle.jit.to_static(func, full_graph=True) input = paddle.randn((2, 3)) with paddle.amp.auto_cast(level='O2'): diff --git a/test/amp/test_amp_decorate.py b/test/amp/test_amp_decorate.py index f956d37c63b39c..13b3b7fdd4d0f6 100644 --- a/test/amp/test_amp_decorate.py +++ b/test/amp/test_amp_decorate.py @@ -78,6 +78,44 @@ def forward(self, inputs): return x + +class LayerNorm2D(paddle.nn.LayerNorm): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, x): + x = x.transpose([0, 2, 3, 1]) + x = super().forward(x) + return x.transpose([0, 3, 1, 2]) + + +class CustomLayer(paddle.nn.Layer): + def __init__( + self, input_channel, hidden_size, fp16_conv=True, fp16_linear=True + ): + super().__init__() + self.conv = ConvBNLayer(input_channel, 8, 3) + self.linear = paddle.nn.Linear(8, hidden_size) + self.layernorm = paddle.nn.Sequential( + LayerNorm2D(hidden_size), + LayerNorm2D(hidden_size), + ) + self.fp16_conv = fp16_conv + self.fp16_linear = fp16_linear + + def forward(self, inputs): + with paddle.amp.auto_cast(enable=self.fp16_conv): + if not self.fp16_conv: + inputs = inputs.astype('float32') + x = self.conv(inputs) + with paddle.amp.auto_cast(enable=self.fp16_linear): + if not self.fp16_linear: + x = x.astype('float32') + x = self.linear(x) + x = F.relu(x) + x = self.layernorm(x) + return x + + @unittest.skipIf( not core.is_compiled_with_cuda() or paddle.device.cuda.get_device_capability()[0] < 7.0, @@ -167,6 +205,22 @@ def test_excluded_layers_attr_none(self): fp16_layers=[model.conv._conv, model.linear], ) + def test_excluded_layers_custom_layer(self): + if not paddle.amp.is_float16_supported(): 
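+            # low-precision decorate paths need device float16 support; skip otherwise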
return + model = CustomLayer(4, 8) + model = paddle.amp.decorate( + models=model, + level='O2', + dtype='bfloat16', + excluded_layers=[paddle.nn.LayerNorm, paddle.nn.BatchNorm], + ) + with paddle.amp.auto_cast(level='O2'): + out = model(paddle.rand(shape=[2, 4, 8, 8], dtype='float32')) + self.check_results( + fp32_layers=[model.layernorm, model.conv._batch_norm], + ) + if __name__ == '__main__': unittest.main() diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt index 52ba882bc3e2a6..8700ab2e070744 100644 --- a/test/auto_parallel/CMakeLists.txt +++ b/test/auto_parallel/CMakeLists.txt @@ -119,6 +119,10 @@ if(WITH_DISTRIBUTE AND WITH_GPU) test_semi_auto_parallel_single_strategy) set_tests_properties(test_semi_auto_parallel_single_strategy PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120) + py_test_modules(test_semi_auto_parallel_hybrid_strategy MODULES + test_semi_auto_parallel_hybrid_strategy) + set_tests_properties(test_semi_auto_parallel_hybrid_strategy + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120) py_test_modules(test_gpt_with_newir MODULES test_gpt_with_newir) set_tests_properties(test_gpt_with_newir PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100) diff --git a/test/auto_parallel/semi_auto_parallel_for_elementwise.py b/test/auto_parallel/semi_auto_parallel_for_elementwise.py new file mode 100644 index 00000000000000..24bf0c8be9e88b --- /dev/null +++ b/test/auto_parallel/semi_auto_parallel_for_elementwise.py @@ -0,0 +1,185 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
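+ +# Exercises unary (square, relu) and binary (add, subtract) elementwise ops +# with sharded and broadcast inputs, comparing dist-tensor outputs and grads +# against their local counterparts.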
+ +import os + +import numpy as np + +import paddle +import paddle.distributed as dist +import paddle.nn.functional as F + + +class TestElementwiseApiForSemiAutoParallel: + def __init__(self): + self._dtype = os.getenv("dtype") + self._backend = os.getenv("backend") + self._seed = eval(os.getenv("seed")) + self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) + + paddle.seed(self._seed) + np.random.seed(self._seed) + + def check_tensor_eq(self, a, b): + np1 = a.numpy() + np2 = b.numpy() + np.testing.assert_allclose(np1, np2, rtol=1e-05, verbose=True) + + def test_unary_body(self, x_shape, out_shape, x_specs, unary_func): + x = paddle.randn(x_shape, self._dtype) + x.stop_gradient = False + + x_dist_attr = dist.DistAttr(mesh=self._mesh, sharding_specs=x_specs) + + dist_x = dist.shard_tensor(x, dist_attr=x_dist_attr) + dist_x.stop_gradient = False + + dist_out = unary_func(dist_x) + out = unary_func(x) + self.check_tensor_eq(out, dist_out) + + dist_out.backward() + out.backward() + self.check_tensor_eq(x.grad, dist_x.grad) + + def test_binary_body( + self, x_shape, y_shape, out_shape, x_specs, y_specs, binary_func + ): + x = paddle.randn(x_shape, self._dtype) + y = paddle.randn(y_shape, self._dtype) + x.stop_gradient = False + y.stop_gradient = False + + x_dist_attr = dist.DistAttr(mesh=self._mesh, sharding_specs=x_specs) + y_dist_attr = dist.DistAttr(mesh=self._mesh, sharding_specs=y_specs) + + dist_x = dist.shard_tensor(x, dist_attr=x_dist_attr) + dist_y = dist.shard_tensor(y, dist_attr=y_dist_attr) + dist_x.stop_gradient = False + dist_y.stop_gradient = False + + dist_out = binary_func(dist_x, dist_y) + out = binary_func(x, y) + self.check_tensor_eq(out, dist_out) + + dist_out.backward() + out.backward() + self.check_tensor_eq(x.grad, dist_x.grad) + self.check_tensor_eq(y.grad, dist_y.grad) + + def test_add_x_shard(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, None], + binary_func=paddle.add, + ) + + def test_sub_x_shard(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, None], + binary_func=paddle.subtract, + ) + + def test_add_x_shard_broadcast(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[2, 16, 32], + out_shape=[2, 16, 32], + x_specs=['x', None], + y_specs=[None, None, None], + binary_func=paddle.add, + ) + + def test_add_x_y_shard(self): + if self._backend == "cpu": + return + + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, 'x'], + binary_func=paddle.add, + ) + + def test_add_x_y_shard_broadcast(self): + if self._backend == "cpu": + return + + self.test_binary_body( + x_shape=[4, 16, 32], + y_shape=[16, 32], + out_shape=[4, 16, 32], + x_specs=['x', None, None], + y_specs=[None, None], + binary_func=paddle.add, + ) + + def test_sub_x_y_shard_broadcast(self): + if self._backend == "cpu": + return + + self.test_binary_body( + x_shape=[4, 16, 32], + y_shape=[16, 32], + out_shape=[4, 16, 32], + x_specs=['x', None, None], + y_specs=[None, None], + binary_func=paddle.subtract, + ) + + def test_square_x_shard(self): + self.test_unary_body( + x_shape=[4, 16], + out_shape=[4, 16], + x_specs=['x', None], + unary_func=paddle.square, + ) + + def test_relu_x_shard(self): + self.test_unary_body( + x_shape=[4, 16], + out_shape=[4, 16], + x_specs=['x', None], + unary_func=F.relu, + ) + + def run_test_case(self): + if self._backend ==
"cpu": + paddle.set_device("cpu") + elif self._backend == "gpu": + paddle.set_device("gpu:" + str(dist.get_rank())) + else: + raise ValueError("Only support cpu or gpu backend.") + + self.test_add_x_shard() + self.test_add_x_shard_broadcast() + self.test_add_x_y_shard() + self.test_add_x_y_shard_broadcast() + self.test_sub_x_shard() + self.test_sub_x_y_shard_broadcast() + self.test_square_x_shard() + self.test_relu_x_shard() + + +if __name__ == '__main__': + TestElementwiseApiForSemiAutoParallel().run_test_case() diff --git a/test/auto_parallel/semi_auto_parallel_for_matmul.py b/test/auto_parallel/semi_auto_parallel_for_matmul.py index bba31234ed80b2..279062f483058f 100644 --- a/test/auto_parallel/semi_auto_parallel_for_matmul.py +++ b/test/auto_parallel/semi_auto_parallel_for_matmul.py @@ -24,11 +24,9 @@ class TestMatmulApiForSemiAutoParallel: def __init__(self): self._dtype = os.getenv("dtype") self._backend = os.getenv("backend") + self._seed = eval(os.getenv("seed")) self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) - paddle.seed(2023) - np.random.seed(2023) - def check_tensor_eq(self, a, b): np1 = a.numpy() np2 = b.numpy() @@ -37,6 +35,9 @@ def test_body( self, x_shape, y_shape, x_specs, y_specs, trans_x=False, trans_y=False ): + paddle.seed(self._seed) + np.random.seed(self._seed) + x_np = np.random.random(size=x_shape).astype(self._dtype) y_np = np.random.random(size=y_shape).astype(self._dtype) x = paddle.to_tensor(x_np) diff --git a/test/auto_parallel/semi_auto_parallel_for_replicated_spmd.py b/test/auto_parallel/semi_auto_parallel_for_replicated_spmd.py index 87a171091c9619..3ca9baac5b5082 100644 --- a/test/auto_parallel/semi_auto_parallel_for_replicated_spmd.py +++ b/test/auto_parallel/semi_auto_parallel_for_replicated_spmd.py @@ -18,17 +18,17 @@ import paddle import paddle.distributed as dist -import paddle.nn.functional as F class TestReplicatedSPmdApiForSemiAutoParallel: def __init__(self): self._dtype = os.getenv("dtype") self._backend = os.getenv("backend") + self._seed = eval(os.getenv("seed")) self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) - paddle.seed(2023) - np.random.seed(2023) + paddle.seed(self._seed) + np.random.seed(self._seed) def check_tensor_eq(self, a, b): np1 = a.numpy() @@ -49,28 +49,25 @@ def create_local_and_dist_tensor_pair(self, np_array, sharding_specs): return local_t, dist_t # input: phi::Tensor - # output: phi::Tensor - def test_relu(self): - x = np.random.random(size=[4, 4]).astype(self._dtype) + # output: std::vector<phi::Tensor> + def test_unbind(self): + x = np.random.random(size=[2, 8]).astype("float32") local_in, dist_in = self.create_local_and_dist_tensor_pair( x, ['x', None] ) - local_out = F.relu(local_in) - dist_out = F.relu(dist_in) - np.testing.assert_equal( - dist_out.dist_attr.dims_mapping, [-1, -1], verbose=True - ) - self.check_tensor_eq(local_out, dist_out) + local_out1, local_out2 = paddle.unbind(local_in, axis=0) + dist_out1, dist_out2 = paddle.unbind(dist_in, axis=0) + self.check_tensor_eq(local_out1, dist_out1) + self.check_tensor_eq(local_out2, dist_out2) + + local_out = paddle.add(local_out1, local_out2) + dist_out = paddle.add(dist_out1, dist_out2) - # test backward local_out.backward() dist_out.backward() - np.testing.assert_equal(dist_in.grad._local_shape, [2, 4], verbose=True) - np.testing.assert_equal( - dist_in.grad.dist_attr.dims_mapping, [0, -1], verbose=True - ) self.check_tensor_eq(local_in.grad, dist_in.grad) + # multiple operators def test_mse_loss(self): x =
np.random.random(size=[4, 4]).astype(self._dtype) y = np.random.random(size=[4]).astype(self._dtype) @@ -103,8 +100,8 @@ def run_test_case(self): else: raise ValueError("Only support cpu or gpu backend.") - self.test_relu() self.test_mse_loss() + self.test_unbind() if __name__ == '__main__': diff --git a/test/auto_parallel/semi_auto_parallel_simple_net.py b/test/auto_parallel/semi_auto_parallel_simple_net.py index 1e0b1a92859fc8..fb7d0b4406697d 100644 --- a/test/auto_parallel/semi_auto_parallel_simple_net.py +++ b/test/auto_parallel/semi_auto_parallel_simple_net.py @@ -28,19 +28,19 @@ # TODO(chenweihang): update to MLP Layer later class DemoNet(nn.Layer): - def __init__(self, np_w0, np_w1): + def __init__(self, np_w0, np_w1, param_suffix=""): super().__init__() self.w0 = self.create_parameter( shape=[IMAGE_SIZE, IMAGE_SIZE], attr=paddle.framework.ParamAttr( - name="demo_weight_1", + name="demo_weight_1" + param_suffix, initializer=paddle.nn.initializer.Assign(np_w0), ), ) self.w1 = self.create_parameter( shape=[IMAGE_SIZE, CLASS_NUM], attr=paddle.framework.ParamAttr( - name="nemo_weight_2", + name="nemo_weight_2" + param_suffix, initializer=paddle.nn.initializer.Assign(np_w1), ), ) @@ -52,81 +52,101 @@ def forward(self, x): class DPDemoNet(nn.Layer): - def __init__(self, np_w0, np_w1, mesh): + def __init__(self, np_w0, np_w1, mesh, param_suffix=""): super().__init__() - self.replicate_dist_attr = dist.DistAttr( - mesh=mesh, sharding_specs=[None, None] + self.mesh = mesh + self.w0 = self.create_parameter( + shape=[IMAGE_SIZE, IMAGE_SIZE], + attr=paddle.framework.ParamAttr( + name="dp_demo_weight_1" + param_suffix, + initializer=paddle.nn.initializer.Assign(np_w0), + ), ) - self.shard_axis0_dist_attr = dist.DistAttr( - mesh=mesh, sharding_specs=['x', None] + self.w1 = self.create_parameter( + shape=[IMAGE_SIZE, CLASS_NUM], + attr=paddle.framework.ParamAttr( + name="dp_nemo_weight_2" + param_suffix, + initializer=paddle.nn.initializer.Assign(np_w1), + ), ) + + def forward(self, x): + y = paddle.matmul( + dist.shard_tensor( + x, + dist_attr=dist.DistAttr( + mesh=self.mesh, sharding_specs=['x', None] + ), + ), + self.w0, + ) + z = paddle.matmul(y, self.w1) + return z + + +class MPDemoNet(nn.Layer): + def __init__(self, np_w0, np_w1, mesh, param_suffix=""): + super().__init__() self.w0 = dist.shard_tensor( self.create_parameter( shape=[IMAGE_SIZE, IMAGE_SIZE], attr=paddle.framework.ParamAttr( - name="dp_demo_weight_1", + name="mp_demo_weight_1" + param_suffix, initializer=paddle.nn.initializer.Assign(np_w0), ), ), - dist_attr=self.replicate_dist_attr, + dist_attr=dist.DistAttr(mesh=mesh, sharding_specs=[None, 'x']), ) self.w1 = dist.shard_tensor( self.create_parameter( shape=[IMAGE_SIZE, CLASS_NUM], attr=paddle.framework.ParamAttr( - name="dp_nemo_weight_2", + name="mp_nemo_weight_2" + param_suffix, initializer=paddle.nn.initializer.Assign(np_w1), ), ), - dist_attr=self.replicate_dist_attr, + dist_attr=dist.DistAttr(mesh=mesh, sharding_specs=['x', None]), ) def forward(self, x): - y = paddle.matmul( - dist.shard_tensor(x, dist_attr=self.shard_axis0_dist_attr), - self.w0, - ) + y = paddle.matmul(x, self.w0) z = paddle.matmul(y, self.w1) return z -class MPDemoNet(nn.Layer): - def __init__(self, np_w0, np_w1, mesh): +class PPDemoNet(nn.Layer): + def __init__(self, np_w0, np_w1, mesh0, mesh1): super().__init__() - self.replicate_dist_attr = dist.DistAttr( - mesh=mesh, sharding_specs=[None, None] - ) - self.shard_axis0_dist_attr = dist.DistAttr( - mesh=mesh, sharding_specs=['x', None] + 
self.replicate_dist_attr0 = dist.DistAttr( + mesh=mesh0, sharding_specs=[None, None] ) - self.shard_axis1_dist_attr = dist.DistAttr( - mesh=mesh, sharding_specs=['x', None] + self.replicate_dist_attr1 = dist.DistAttr( + mesh=mesh1, sharding_specs=[None, None] ) self.w0 = dist.shard_tensor( self.create_parameter( shape=[IMAGE_SIZE, IMAGE_SIZE], attr=paddle.framework.ParamAttr( - name="mp_demo_weight_1", + name="pp_demo_weight_0", initializer=paddle.nn.initializer.Assign(np_w0), ), ), - dist_attr=self.shard_axis1_dist_attr, + dist_attr=self.replicate_dist_attr0, ) self.w1 = dist.shard_tensor( self.create_parameter( shape=[IMAGE_SIZE, CLASS_NUM], attr=paddle.framework.ParamAttr( - name="mp_nemo_weight_2", + name="pp_nemo_weight_1", initializer=paddle.nn.initializer.Assign(np_w1), ), ), - dist_attr=self.shard_axis0_dist_attr, + dist_attr=self.replicate_dist_attr1, ) def forward(self, x): - y = paddle.matmul( - dist.shard_tensor(x, dist_attr=self.replicate_dist_attr), self.w0 - ) + y = paddle.matmul(x, self.w0) + y = dist.reshard(y, dist_attr=self.replicate_dist_attr1) z = paddle.matmul(y, self.w1) return z @@ -135,7 +155,10 @@ class TestSimpleNetForSemiAutoParallel: def __init__(self): self._dtype = os.getenv("dtype") self._backend = os.getenv("backend") + self._seed = eval(os.getenv("seed")) self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) + self._pp_mesh0 = dist.ProcessMesh([0], dim_names=["x"]) + self._pp_mesh1 = dist.ProcessMesh([1], dim_names=["x"]) paddle.set_device(self._backend) @@ -144,8 +167,8 @@ def __init__(self): self.init_single_card_net_result() def init_input_data(self): - paddle.seed(2023) - np.random.seed(2023) + paddle.seed(self._seed) + np.random.seed(self._seed) self.image = np.random.random([BATCH_SIZE, IMAGE_SIZE]).astype( 'float32' @@ -155,25 +178,22 @@ def init_input_data(self): self.w1 = np.random.random([IMAGE_SIZE, CLASS_NUM]).astype('float32') # TODO(chenweihang): optimizer cannot run auto-parallel now - def run_dynamic(self, layer, parallel=False): + # TODO(GhostScreaming): support pp backward later. 
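+ # When is_pp is True, run_dynamic returns only the forward loss and skips + # backward, since pipeline-parallel backward is not supported yet.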
+ def run_dynamic(self, layer, is_pp=False): # create loss loss_fn = nn.MSELoss() # run forward and backward image = paddle.to_tensor(self.image) out = layer(image) - label = ( - dist.shard_tensor( - self.label, - dist_attr=dist.DistAttr( - mesh=self._mesh, sharding_specs=[None, None] - ), - ) - if parallel is True - else paddle.to_tensor(self.label) - ) + + label = paddle.to_tensor(self.label) loss = loss_fn(out, label) - loss.backward() - return loss, layer.w0.grad, layer.w1.grad + + if is_pp: + return loss, None, None + else: + loss.backward() + return loss, layer.w0.grad, layer.w1.grad def init_single_card_net_result(self): self.base_loss, self.base_w0_grad, self.base_w1_grad = self.run_dynamic( @@ -187,7 +207,7 @@ def check_tensor_eq(self, a, b): def test_dp_demo_net(self): self.dp_loss, self.dp_w0_grad, self.dp_w1_grad = self.run_dynamic( - DPDemoNet(self.w0, self.w1, self._mesh), parallel=True + DPDemoNet(self.w0, self.w1, self._mesh) ) self.check_tensor_eq(self.dp_loss, self.base_loss) self.check_tensor_eq(self.dp_w0_grad, self.base_w0_grad) @@ -195,15 +215,31 @@ def test_mp_demo_net(self): self.mp_loss, self.mp_w0_grad, self.mp_w1_grad = self.run_dynamic( - MPDemoNet(self.w0, self.w1, self._mesh), parallel=True + MPDemoNet(self.w0, self.w1, self._mesh) ) self.check_tensor_eq(self.mp_loss, self.base_loss) self.check_tensor_eq(self.mp_w0_grad, self.base_w0_grad) self.check_tensor_eq(self.mp_w1_grad, self.base_w1_grad) + # TODO(GhostScreaming): support pp backward later. + def test_pp_demo_net(self): + # Send/Recv operators don't support CPU now. + if self._backend != "gpu": + return + self.mp_loss, _, _ = self.run_dynamic( + PPDemoNet(self.w0, self.w1, self._pp_mesh0, self._pp_mesh1), + is_pp=True, + ) + rank = dist.get_rank() + if rank == 1: + self.check_tensor_eq(self.mp_loss, self.base_loss) + else: + pass + def run_test_case(self): self.test_dp_demo_net() self.test_mp_demo_net() + self.test_pp_demo_net() if __name__ == '__main__': diff --git a/test/auto_parallel/semi_auto_parallel_simple_net_amp.py b/test/auto_parallel/semi_auto_parallel_simple_net_amp.py new file mode 100644 index 00000000000000..3a170240631627 --- /dev/null +++ b/test/auto_parallel/semi_auto_parallel_simple_net_amp.py @@ -0,0 +1,122 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
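+ +# Runs the DemoNet variants from semi_auto_parallel_simple_net under AMP O1 +# (auto_cast only) and O2 (decorate plus GradScaler), comparing losses and +# grads against the single-card baseline.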
+ +import os + +from semi_auto_parallel_simple_net import ( + DemoNet, + DPDemoNet, + MPDemoNet, + TestSimpleNetForSemiAutoParallel, +) + +import paddle +import paddle.distributed as dist +from paddle import nn + + +class TestSimpleNetWithAmpForSemiAutoParallel(TestSimpleNetForSemiAutoParallel): + def __init__(self): + self._dtype = os.getenv("dtype") + self._backend = os.getenv("backend") + self._seed = eval(os.getenv("seed")) + self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) + + paddle.set_device(self._backend) + self.init_input_data() + self.init_single_card_net_result() + + def run_dynamic_amp(self, layer, level='O1'): + if level == 'O2': + layer = paddle.amp.decorate(models=layer, level='O2') + # create loss + loss_fn = nn.MSELoss() + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + # run forward and backward + image = paddle.to_tensor(self.image) + + with paddle.amp.auto_cast(level=level): + out = layer(image) + label = paddle.to_tensor(self.label) + loss = loss_fn(out, label) + + scaled = scaler.scale(loss) + scaled.backward() + return loss, layer.w0.grad, layer.w1.grad + + def init_single_card_net_result(self): + ( + self.base_loss_o1, + self.base_w0_grad_o1, + self.base_w1_grad_o1, + ) = self.run_dynamic_amp(DemoNet(self.w0, self.w1, 'O1'), 'O1') + ( + self.base_loss_o2, + self.base_w0_grad_o2, + self.base_w1_grad_o2, + ) = self.run_dynamic_amp(DemoNet(self.w0, self.w1, 'O2'), 'O2') + + def test_dp_demo_net(self): + ( + self.dp_loss_o1, + self.dp_w0_grad_o1, + self.dp_w1_grad_o1, + ) = self.run_dynamic_amp( + DPDemoNet(self.w0, self.w1, self._mesh, 'O1'), 'O1' + ) + self.check_tensor_eq(self.dp_loss_o1, self.base_loss_o1) + self.check_tensor_eq(self.dp_w0_grad_o1, self.base_w0_grad_o1) + self.check_tensor_eq(self.dp_w1_grad_o1, self.base_w1_grad_o1) + + ( + self.dp_loss_o2, + self.dp_w0_grad_o2, + self.dp_w1_grad_o2, + ) = self.run_dynamic_amp( + DPDemoNet(self.w0, self.w1, self._mesh, 'O2'), 'O2' + ) + self.check_tensor_eq(self.dp_loss_o2, self.base_loss_o2) + self.check_tensor_eq(self.dp_w0_grad_o2, self.base_w0_grad_o2) + self.check_tensor_eq(self.dp_w1_grad_o2, self.base_w1_grad_o2) + + def test_mp_demo_net(self): + ( + self.mp_loss_o1, + self.mp_w0_grad_o1, + self.mp_w1_grad_o1, + ) = self.run_dynamic_amp( + MPDemoNet(self.w0, self.w1, self._mesh, 'O1'), 'O1' + ) + self.check_tensor_eq(self.mp_loss_o1, self.base_loss_o1) + self.check_tensor_eq(self.mp_w0_grad_o1, self.base_w0_grad_o1) + self.check_tensor_eq(self.mp_w1_grad_o1, self.base_w1_grad_o1) + + ( + self.mp_loss_o2, + self.mp_w0_grad_o2, + self.mp_w1_grad_o2, + ) = self.run_dynamic_amp( + MPDemoNet(self.w0, self.w1, self._mesh, 'O2'), 'O2' + ) + self.check_tensor_eq(self.mp_loss_o2, self.base_loss_o2) + self.check_tensor_eq(self.mp_w0_grad_o2, self.base_w0_grad_o2) + self.check_tensor_eq(self.mp_w1_grad_o2, self.base_w1_grad_o2) + + def run_test_case(self): + self.test_dp_demo_net() + self.test_mp_demo_net() + + +if __name__ == '__main__': + TestSimpleNetWithAmpForSemiAutoParallel().run_test_case() diff --git a/test/auto_parallel/semi_auto_parallel_simple_net_hybrid.py b/test/auto_parallel/semi_auto_parallel_simple_net_hybrid.py new file mode 100644 index 00000000000000..90532a647812ad --- /dev/null +++ b/test/auto_parallel/semi_auto_parallel_simple_net_hybrid.py @@ -0,0 +1,96 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from semi_auto_parallel_simple_net import ( + CLASS_NUM, + IMAGE_SIZE, + TestSimpleNetForSemiAutoParallel, +) + +import paddle +import paddle.distributed as dist +from paddle import nn + + +class DPAndMPDemoNet(nn.Layer): + def __init__(self, np_w0, np_w1, mesh): + super().__init__() + self.mesh = mesh + self.w0 = dist.shard_tensor( + self.create_parameter( + shape=[IMAGE_SIZE, IMAGE_SIZE], + attr=paddle.framework.ParamAttr( + name="dmp_demo_weight_1", + initializer=paddle.nn.initializer.Assign(np_w0), + ), + ), + dist_attr=dist.DistAttr(mesh=mesh, sharding_specs=[None, 'y']), + ) + self.w1 = dist.shard_tensor( + self.create_parameter( + shape=[IMAGE_SIZE, CLASS_NUM], + attr=paddle.framework.ParamAttr( + name="dmp_nemo_weight_2", + initializer=paddle.nn.initializer.Assign(np_w1), + ), + ), + dist_attr=dist.DistAttr(mesh=mesh, sharding_specs=['y', None]), + ) + + def forward(self, x): + y = paddle.matmul( + dist.shard_tensor( + x, + dist_attr=dist.DistAttr( + mesh=self.mesh, sharding_specs=['x', None] + ), + ), + self.w0, + ) + z = paddle.matmul(y, self.w1) + return z + + +class TestSimpleNetHybridStrategyForSemiAutoParallel( + TestSimpleNetForSemiAutoParallel +): + def __init__(self): + self._dtype = os.getenv("dtype") + self._backend = os.getenv("backend") + self._seed = eval(os.getenv("seed")) + self._mesh = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]) + + paddle.set_device(self._backend) + + self.init_input_data() + self.init_single_card_net_result() + + def test_dp_mp_demo_net(self): + ( + self.dp_mp_loss, + self.dp_mp_w0_grad, + self.dp_mp_w1_grad, + ) = self.run_dynamic(DPAndMPDemoNet(self.w0, self.w1, self._mesh)) + self.check_tensor_eq(self.dp_mp_loss, self.base_loss) + self.check_tensor_eq(self.dp_mp_w0_grad, self.base_w0_grad) + self.check_tensor_eq(self.dp_mp_w1_grad, self.base_w1_grad) + + def run_test_case(self): + self.test_dp_mp_demo_net() + + +if __name__ == '__main__': + TestSimpleNetHybridStrategyForSemiAutoParallel().run_test_case() diff --git a/test/auto_parallel/spmd_rules/CMakeLists.txt b/test/auto_parallel/spmd_rules/CMakeLists.txt index cf034e33678aa1..c1f7e895e0486f 100644 --- a/test/auto_parallel/spmd_rules/CMakeLists.txt +++ b/test/auto_parallel/spmd_rules/CMakeLists.txt @@ -18,6 +18,7 @@ if(WITH_DISTRIBUTE) py_test_modules(test_default_data_parallel_rule MODULES test_default_data_parallel_rule) py_test_modules(test_layer_norm_rule MODULES test_layer_norm_rule) + py_test_modules(test_flatten_rule MODULES test_flatten_rule) # End of unittests WITH single card WITHOUT timeout endif() diff --git a/test/auto_parallel/spmd_rules/test_flatten_rule.py b/test/auto_parallel/spmd_rules/test_flatten_rule.py new file mode 100644 index 00000000000000..599b2ddf4bf958 --- /dev/null +++ b/test/auto_parallel/spmd_rules/test_flatten_rule.py @@ -0,0 +1,398 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from collections import OrderedDict + +from paddle.distributed.auto_parallel.static.dist_attribute import ( + DistTensorSpec, + TensorDistAttr, +) +from paddle.distributed.fleet import auto +from paddle.framework import core + + +class TestFlattenSPMDRule(unittest.TestCase): + def setUp(self): + self.rule = core.get_phi_spmd_rule("flatten") + + x_shape = [8, 16, 8, 24] + process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + + x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.dims_mapping = [-1, -1, -1, -1] + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + self.attrs = OrderedDict() + + def test_flatten_infer_forward(self): + # shape: [8, 16, 8, 24] --> [8, 16 * 8, 24] + # dims_mapping: [0, -1, -1, 1] --> [0, -1, -1, 1] [ 0, -1, 1] + self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1, 1]) + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = 2 + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 1) + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [0, -1, -1, 1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, 1]) + + # shape: [8, 16, 8, 24] --> [8, 16 * 8, 24] + # dims_mapping: [-1, 0, -1, 1] --> [-1, 0, -1, 1] [ -1, 0, 1] + self.x_dist_tensor_spec.set_dims_mapping([-1, 0, -1, 1]) + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = 2 + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, 0, -1, 1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, 0, 1]) + + # shape: [8, 16, 8, 24] --> [8, 16 * 8, 24] + # dims_mapping: [-1, -1, 1, 0] --> [-1, -1, -1, 0] [ -1, -1, 0] + self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 1, 0]) + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = 2 + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, 0] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, -1, 0]) + + # shape: [8, 16, 8, 24] --> [8 * 16 * 8 * 24] + # dims_mapping: [-1, 0, 1, -1] --> [-1, -1, -1, -1] [ -1] + self.x_dist_tensor_spec.set_dims_mapping([-1, 0, 1, -1]) + self.attrs['start_axis'] = 0 + self.attrs['stop_axis'] = -1 + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = 
result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, -1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1]) + + # shape: [8, 16, 8, 24] --> [8 * 16 * 8 * 24] + # dims_mapping: [0, -1, -1, 1] --> [0, -1, -1, -1] [ 0] + self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1, 1]) + self.attrs['start_axis'] = 0 + self.attrs['stop_axis'] = -1 + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [0, -1, -1, -1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0]) + + # shape: [8, 16, 8, 24] --> [8 * 16 * 8 * 24] + # dims_mapping: [1, 0, -1, -1] --> [1, -1, -1, -1] [ 1] + self.x_dist_tensor_spec.set_dims_mapping([1, 0, -1, -1]) + self.attrs['start_axis'] = 0 + self.attrs['stop_axis'] = -1 + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [1, -1, -1, -1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [1]) + + # shape: [8, 16, 8, 24] --> [8, 16 * 8 * 24] + # dims_mapping: [-1, -1, 0, 1] --> [-1, -1, -1, -1] [-1, -1] + self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 0, 1]) + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = -1 + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, -1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, -1]) + + # shape: [8, 16, 8, 24] --> [8, 16 * 8 * 24] + # dims_mapping: [-1, 0, -1, 1] --> [-1, 0, -1, -1] [-1, 0] + self.x_dist_tensor_spec.set_dims_mapping([-1, 0, -1, 1]) + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = -1 + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, 0, -1, -1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, 0]) + + # shape: [8, 16, 8, 24] --> [8, 16 * 8 * 24] + # dims_mapping: [0, 1, -1, -1] --> [0, 1, -1, -1] [0, 1] + self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1, -1]) + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = -1 + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [0, 1, -1, -1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, 1]) + + def test_flatten_infer_backward(self): + process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + + output_tensor_dist_attr = TensorDistAttr() + output_tensor_dist_attr.dims_mapping = [-1, -1, -1] + 
output_tensor_dist_attr.process_mesh = process_mesh + self.output_dist_tensor_spec = DistTensorSpec( + [8, 16 * 8, 24], output_tensor_dist_attr + ) + + # shape: [8, 16, 8, 24] --> [8, 16 * 8, 24] (input --> output) + # dims_mapping: [0, -1, 1] --> [0, -1, -1, 1], [0, -1, 1] (output --> input, output) + self.output_dist_tensor_spec.shape = [8, 16 * 8, 24] + self.output_dist_tensor_spec.set_dims_mapping([0, -1, 1]) + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = 2 + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.output_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 1) + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [0, -1, -1, 1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, 1]) + + # shape: [8, 16, 8, 24] --> [8, 16 * 8, 24] (input --> output) + # dims_mapping: [0, 1, -1] --> [0, 1, -1, -1], [0, 1, -1] (output --> input, output) + self.output_dist_tensor_spec.shape = [8, 16 * 8, 24] + self.output_dist_tensor_spec.set_dims_mapping([0, 1, -1]) + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = 2 + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.output_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [0, 1, -1, -1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, 1, -1]) + + # shape: [8, 16, 8, 24] --> [8, 16 * 8, 24] (input --> output) + # dims_mapping: [-1, 0, 1] --> [-1, 0, -1, 1], [-1, 0, 1] (output --> input, output) + self.output_dist_tensor_spec.shape = [8, 16 * 8, 24] + self.output_dist_tensor_spec.set_dims_mapping([-1, 0, 1]) + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = 2 + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.output_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, 0, -1, 1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, 0, 1]) + + # shape: [8, 16, 8, 24] --> [8 * 16 * 8 * 24] (input --> output) + # dims_mapping: [-1] --> [-1, -1, -1, -1], [-1] (output --> input, output) + self.output_dist_tensor_spec.shape = [8 * 16 * 8 * 24] + self.output_dist_tensor_spec.set_dims_mapping([-1]) + self.attrs['start_axis'] = 0 + self.attrs['stop_axis'] = -1 + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.output_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, -1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1]) + + # shape: [8, 16, 8, 24] --> [8 * 16 * 8 * 24] (input --> output) + # dims_mapping: [0] --> [0, -1, -1, -1], [0] (output --> input, output) + self.output_dist_tensor_spec.shape = [8 * 16 * 8 * 24] + self.output_dist_tensor_spec.set_dims_mapping([0]) + self.attrs['start_axis'] = 0 + 
self.attrs['stop_axis'] = -1 + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.output_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [0, -1, -1, -1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0]) + + # shape: [8, 16, 8, 24] --> [8 * 16 * 8 * 24] (input --> output) + # dims_mapping: [1] --> [1, -1, -1, -1], [1] (output --> input, output) + self.output_dist_tensor_spec.shape = [8 * 16 * 8 * 24] + self.output_dist_tensor_spec.set_dims_mapping([1]) + self.attrs['start_axis'] = 0 + self.attrs['stop_axis'] = -1 + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.output_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [1, -1, -1, -1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [1]) + + # shape: [8, 16, 8, 24] --> [8, 16 * 8 * 24] (input --> output) + # dims_mapping: [-1, -1] --> [-1, -1, -1, -1], [-1, -1] (output --> input, output) + self.output_dist_tensor_spec.shape = [8, 16 * 8 * 24] + self.output_dist_tensor_spec.set_dims_mapping([-1, -1]) + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = -1 + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.output_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, -1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, -1]) + + # shape: [8, 16, 8, 24] --> [8, 16 * 8 * 24] (input --> output) + # dims_mapping: [0, -1] --> [0, -1, -1, -1], [0, -1] (output --> input, output) + self.output_dist_tensor_spec.shape = [8, 16 * 8 * 24] + self.output_dist_tensor_spec.set_dims_mapping([0, -1]) + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = -1 + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.output_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [0, -1, -1, -1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1]) + + # shape: [8, 16, 8, 24] --> [8, 16 * 8 * 24] (input --> output) + # dims_mapping: [0, 1] --> [0, 1, -1, -1], [0, 1] (output --> input, output) + self.output_dist_tensor_spec.shape = [8, 16 * 8 * 24] + self.output_dist_tensor_spec.set_dims_mapping([0, 1]) + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = -1 + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.output_dist_tensor_spec, + self.attrs['start_axis'], + self.attrs['stop_axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [0, 1, -1, -1] + ) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, 1]) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/auto_parallel/test_api_dist_branch.py 
b/test/auto_parallel/test_api_dist_branch.py index 8880aac9d261ba..99a5d5be878d9a 100644 --- a/test/auto_parallel/test_api_dist_branch.py +++ b/test/auto_parallel/test_api_dist_branch.py @@ -113,45 +113,29 @@ def test_concat_for_dist_tensor(self): self.check_tensor_eq(local_in2.grad, dist_in2.grad) self.check_tensor_eq(local_in3.grad, dist_in3.grad) - # input: std::vector<phi::Tensor> - # output: std::vector<phi::Tensor> - def test_broadcast_tensors_for_dist_tensor(self): - x1 = np.random.random(size=[4, 4]).astype("float32") - x2 = np.random.random(size=[4, 4]).astype("float32") - local_in1, dist_in1 = self.create_local_and_dist_tensor_pair(x1) - local_in2, dist_in2 = self.create_local_and_dist_tensor_pair(x2) - - local_out1, local_out2 = paddle.broadcast_tensors( - [local_in1, local_in2] - ) - dist_out1, dist_out2 = paddle.broadcast_tensors([dist_in1, dist_in2]) - self.check_tensor_eq(local_out1, dist_out1) - self.check_tensor_eq(local_out2, dist_out2) - - local_out = paddle.concat([local_out1, local_out2]) - dist_out = paddle.concat([dist_out1, dist_out2]) - - local_out.backward() - dist_out.backward() - self.check_tensor_eq(local_in1.grad, dist_in1.grad) - self.check_tensor_eq(local_in2.grad, dist_in2.grad) - - # input: phi::Tensor - # output: std::vector<phi::Tensor> - def test_unbind_for_dist_tensor(self): - x = np.random.random(size=[2, 8]).astype("float32") - local_in, dist_in = self.create_local_and_dist_tensor_pair(x) - local_out1, local_out2 = paddle.unbind(local_in, axis=0) - dist_out1, dist_out2 = paddle.unbind(dist_in, axis=0) - self.check_tensor_eq(local_out1, dist_out1) - self.check_tensor_eq(local_out2, dist_out2) - - local_out = paddle.concat([local_out1, local_out2]) - dist_out = paddle.concat([dist_out1, dist_out2]) - - local_out.backward() - dist_out.backward() - self.check_tensor_eq(local_in.grad, dist_in.grad) + # TODO(GhostScreaming): Support paddle.concat backward later. 
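+ # The broadcast_tensors case below stays commented out until then.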
+ # # input: std::vector<phi::Tensor> + # # output: std::vector<phi::Tensor> + # def test_broadcast_tensors_for_dist_tensor(self): + # x1 = np.random.random(size=[4, 4]).astype("float32") + # x2 = np.random.random(size=[4, 4]).astype("float32") + # local_in1, dist_in1 = self.create_local_and_dist_tensor_pair(x1) + # local_in2, dist_in2 = self.create_local_and_dist_tensor_pair(x2) + + # local_out1, local_out2 = paddle.broadcast_tensors( + # [local_in1, local_in2] + # ) + # dist_out1, dist_out2 = paddle.broadcast_tensors([dist_in1, dist_in2]) + # self.check_tensor_eq(local_out1, dist_out1) + # self.check_tensor_eq(local_out2, dist_out2) + + # local_out = paddle.concat([local_out1, local_out2]) + # dist_out = paddle.concat([dist_out1, dist_out2]) + + # local_out.backward() + # dist_out.backward() + # self.check_tensor_eq(local_in1.grad, dist_in1.grad) + # self.check_tensor_eq(local_in2.grad, dist_in2.grad) # input: paddle::optional<phi::Tensor> # output: phi::Tensor diff --git a/test/auto_parallel/test_dist_op_cost.py b/test/auto_parallel/test_dist_op_cost.py index 6477d8646bca6c..7d84c35cf3ea38 100644 --- a/test/auto_parallel/test_dist_op_cost.py +++ b/test/auto_parallel/test_dist_op_cost.py @@ -122,6 +122,7 @@ def test_dist_op_cost_part2(self): def make_program(): main_program = paddle.static.Program() start_program = paddle.static.Program() + mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]) with paddle.static.program_guard(main_program, start_program): x = paddle.static.data(name='x', shape=[4], dtype='float32') x.stop_gradient = True @@ -129,13 +130,11 @@ def make_program(): name="label", shape=[8, 1], dtype='float32' ) label.stop_gradient = True - auto.shard_tensor( - x, auto.ProcessMesh([0, 1], dim_names=["x"]), ["x"] - ) + auto.shard_tensor(x, mesh, ["x"]) auto.shard_tensor( label, - auto.ProcessMesh([0, 1], dim_names=["x"]), + mesh, ["x", None], ) # embedding @@ -150,8 +149,8 @@ def make_program(): W = main_program.global_block().vars[op.input("W")[0]] auto.shard_tensor( W, - auto.ProcessMesh([0, 1], dim_names=["x"]), - ["x", None], + mesh, + ["y", None], ) out = paddle.transpose(out, [1, 0]) # [8, 2] [-1, 0] @@ -161,7 +160,7 @@ def make_program(): ) # [2, 8] [0, -1] auto.shard_tensor( param1, - auto.ProcessMesh([0, 1], dim_names=["x"]), + mesh, ["x", None], ) param2 = paddle.create_parameter( @@ -169,8 +168,8 @@ def make_program(): ) # [8, 4] [-1, 0] auto.shard_tensor( param2, - auto.ProcessMesh([0, 1], dim_names=["x"]), - [None, "x"], + mesh, + [None, "y"], ) out1 = paddle.matmul(out, param1) # [8, 8] [-1, -1] tmp_param = paddle.create_parameter( @@ -178,7 +177,7 @@ def make_program(): ) # [8, 8] [-1, -1] auto.shard_tensor( param2, - auto.ProcessMesh([0, 1], dim_names=["x"]), + mesh, [None, None], ) tmp_out = paddle.matmul(out1, tmp_param) @@ -206,7 +205,7 @@ def make_program(): main_program, dist_context = parallelizer(make_program, 0) ops = main_program.global_block().ops cluster = Cluster() - cluster.gen_default_config_cluster(device_count=2) + cluster.gen_default_config_cluster(device_count=4) for idx, op in enumerate(ops): if op.type != "shape" and op.type != "slice": dist_op = dist_context.get_dist_op_for_program(op) @@ -231,6 +230,7 @@ def test_dist_op_cost_part3(self): def make_program(): main_program = paddle.static.Program() start_program = paddle.static.Program() + mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]) with paddle.static.program_guard(main_program, start_program): x = paddle.static.data(name='x', shape=[4], dtype='float32') 
x.stop_gradient = True @@ -238,13 +238,11 @@ def make_program(): name="label", shape=[8, 1], dtype='float32' ) label.stop_gradient = True - auto.shard_tensor( - x, auto.ProcessMesh([0, 1], dim_names=["x"]), ["x"] - ) + auto.shard_tensor(x, mesh, ["x"]) auto.shard_tensor( label, - auto.ProcessMesh([0, 1], dim_names=["x"]), + mesh, ["x", None], ) # embedding @@ -259,8 +257,8 @@ def make_program(): W = main_program.global_block().vars[op.input("W")[0]] auto.shard_tensor( W, - auto.ProcessMesh([0, 1], dim_names=["x"]), - ["x", None], + mesh, + ["y", None], ) out = paddle.transpose(out, [1, 0]) # [8, 2] [-1, 0] @@ -270,7 +268,7 @@ def make_program(): ) # [2, 8] [0, -1] auto.shard_tensor( param1, - auto.ProcessMesh([0, 1], dim_names=["x"]), + mesh, ["x", None], ) param2 = paddle.create_parameter( @@ -278,8 +276,8 @@ def make_program(): ) # [8, 4] [-1, 0] auto.shard_tensor( param2, - auto.ProcessMesh([0, 1], dim_names=["x"]), - [None, "x"], + mesh, + [None, "y"], ) out1 = paddle.matmul(out, param1) # [8, 8] [-1, -1] tmp_param = paddle.create_parameter( @@ -287,7 +285,7 @@ def make_program(): ) # [8, 8] [-1, -1] auto.shard_tensor( param2, - auto.ProcessMesh([0, 1], dim_names=["x"]), + mesh, [None, None], ) @@ -316,7 +314,7 @@ def make_program(): main_program, dist_context = parallelizer(make_program, 0) ops = main_program.global_block().ops cluster = Cluster() - cluster.gen_default_config_cluster(device_count=2) + cluster.gen_default_config_cluster(device_count=4) for idx, op in enumerate(ops): if op.type != "shape" and op.type != "slice": dist_op = dist_context.get_dist_op_for_program(op) @@ -341,6 +339,7 @@ def test_dist_op_cost_part4(self): def make_program(): main_program = paddle.static.Program() start_program = paddle.static.Program() + mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]) with paddle.static.program_guard(main_program, start_program): x = paddle.static.data(name='x', shape=[4], dtype='float32') x.stop_gradient = True @@ -348,12 +347,10 @@ def make_program(): name="label", shape=[8, 1], dtype='float32' ) label.stop_gradient = True - auto.shard_tensor( - x, auto.ProcessMesh([0, 1], dim_names=["x"]), ["x"] - ) + auto.shard_tensor(x, mesh, ["x"]) auto.shard_tensor( label, - auto.ProcessMesh([0, 1], dim_names=["x"]), + mesh, ["x", None], ) # embedding @@ -368,8 +365,8 @@ def make_program(): W = main_program.global_block().vars[op.input("W")[0]] auto.shard_tensor( W, - auto.ProcessMesh([0, 1], dim_names=["x"]), - ["x", None], + mesh, + ["y", None], ) out = paddle.transpose(out, [1, 0]) # [8, 2] [-1, 0] @@ -379,7 +376,7 @@ def make_program(): ) # [2, 8] [0, -1] auto.shard_tensor( param1, - auto.ProcessMesh([0, 1], dim_names=["x"]), + mesh, ["x", None], ) param2 = paddle.create_parameter( @@ -387,8 +384,8 @@ def make_program(): ) # [8, 4] [-1, 0] auto.shard_tensor( param2, - auto.ProcessMesh([0, 1], dim_names=["x"]), - [None, "x"], + mesh, + [None, "y"], ) out1 = paddle.matmul(out, param1) # [8, 8] [-1, -1] @@ -397,7 +394,7 @@ def make_program(): ) # [8, 8] [-1, -1] auto.shard_tensor( param2, - auto.ProcessMesh([0, 1], dim_names=["x"]), + mesh, [None, None], ) @@ -425,7 +422,7 @@ def make_program(): main_program, dist_context = parallelizer(make_program, 0) ops = main_program.global_block().ops cluster = Cluster() - cluster.gen_default_config_cluster(device_count=2) + cluster.gen_default_config_cluster(device_count=4) for idx, op in enumerate(ops): if op.type != "shape" and op.type != "slice": dist_op = dist_context.get_dist_op_for_program(op) diff --git 
a/test/auto_parallel/test_dist_reshape.py b/test/auto_parallel/test_dist_reshape.py index adeb8ee906f0b2..e73e7166b58366 100644 --- a/test/auto_parallel/test_dist_reshape.py +++ b/test/auto_parallel/test_dist_reshape.py @@ -66,7 +66,7 @@ def test_dist_reshape_mp2(self): for idx, op in enumerate(ops): op_dist_attr = dist_context.get_op_dist_attr_for_program(op) assert op_dist_attr.impl_type == "reshape2" - assert op_dist_attr.impl_idx == idx + assert op_dist_attr.impl_idx == 0 if op_dist_attr.impl_idx == 2: assert op.desc.attr('shape')[0] == 2 diff --git a/test/auto_parallel/test_dist_split.py b/test/auto_parallel/test_dist_split.py index 131d6d4d845f9b..9a6db49c9b7541 100644 --- a/test/auto_parallel/test_dist_split.py +++ b/test/auto_parallel/test_dist_split.py @@ -61,7 +61,7 @@ def test_dist_split_dp2(self): dist_main_prog, dist_context = parallelizer(make_program_dp2, rank) ops = dist_main_prog.global_block().ops op_dist_attr = dist_context.get_op_dist_attr_for_program(ops[0]) - assert op_dist_attr.impl_type == "split" + assert op_dist_attr.impl_type == "default" assert op_dist_attr.impl_idx == 0 diff --git a/test/auto_parallel/test_semi_auto_parallel_basic.py b/test/auto_parallel/test_semi_auto_parallel_basic.py index 3fe98e4d087441..8040b97d43ac94 100644 --- a/test/auto_parallel/test_semi_auto_parallel_basic.py +++ b/test/auto_parallel/test_semi_auto_parallel_basic.py @@ -23,7 +23,7 @@ def setUp(self): num_of_devices=2, timeout=120, ) - self._default_envs = {"dtype": "float32"} + self._default_envs = {"dtype": "float32", "seed": "2023"} self._changeable_envs = {"backend": ["cpu", "gpu"]} def test_matmul_api(self): @@ -36,6 +36,16 @@ def test_matmul_api(self): user_defined_envs=envs, ) + def test_elementwise_api(self): + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_for_elementwise.py", + user_defined_envs=envs, + ) + def test_several_replicated_spmd_api(self): envs_list = test_base.gen_product_envs_list( self._default_envs, self._changeable_envs diff --git a/test/auto_parallel/test_semi_auto_parallel_hybrid_strategy.py b/test/auto_parallel/test_semi_auto_parallel_hybrid_strategy.py new file mode 100644 index 00000000000000..eefc47d6967163 --- /dev/null +++ b/test/auto_parallel/test_semi_auto_parallel_hybrid_strategy.py @@ -0,0 +1,43 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
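+ +# Launches semi_auto_parallel_simple_net_hybrid.py across two nodes with two +# devices each; setUp below explains why only the cpu backend runs in CI.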
+ +import unittest + +import collective.test_communication_api_base as test_base + + +class TestSemiAutoParallelHybridStrategy(test_base.CommunicationTestDistBase): + def setUp(self): + super().setUp(num_of_devices=2, timeout=120, nnode=2) + self._default_envs = { + "dtype": "float32", + "seed": "2023", + } + # this test needs to be run in a 4-card environment, but our CI only supports + # 2-card distributed tests, so skip the gpu test for now + self._changeable_envs = {"backend": ["cpu"]} + + def test_simple_net_hybrid_strategy(self): + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_simple_net_hybrid.py", + user_defined_envs=envs, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/auto_parallel/test_semi_auto_parallel_single_strategy.py b/test/auto_parallel/test_semi_auto_parallel_single_strategy.py index 5c30f8b5954be5..03b31f70a9e9b3 100644 --- a/test/auto_parallel/test_semi_auto_parallel_single_strategy.py +++ b/test/auto_parallel/test_semi_auto_parallel_single_strategy.py @@ -19,9 +19,13 @@ class TestSemiAutoParallelSingleStrategy(test_base.CommunicationTestDistBase): def setUp(self): - super().setUp(num_of_devices=2, timeout=120) + super().setUp( + num_of_devices=2, + timeout=120, + ) self._default_envs = { "dtype": "float32", + "seed": "2023", } self._changeable_envs = {"backend": ["cpu", "gpu"]} @@ -35,6 +39,17 @@ def test_simple_net_single_strategy(self): user_defined_envs=envs, ) + def test_simple_net_single_strategy_with_amp(self): + self._changeable_envs = {"backend": ["gpu"]} + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_simple_net_amp.py", + user_defined_envs=envs, + ) + if __name__ == "__main__": unittest.main() diff --git a/test/auto_parallel/test_shard_tensor_api.py b/test/auto_parallel/test_shard_tensor_api.py index 5e59a7c9480e4c..fa1a19596d71b8 100644 --- a/test/auto_parallel/test_shard_tensor_api.py +++ b/test/auto_parallel/test_shard_tensor_api.py @@ -133,7 +133,7 @@ def test_static_mode(self): class TestShardTensorStaticDy2Static(unittest.TestCase): def test_dy2static(self): - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def func(): mesh = dist.ProcessMesh( [[0, 1, 2, 3], [4, 5, 6, 7]], dim_names=["x", "y"] ) diff --git a/test/autograd/CMakeLists.txt b/test/autograd/CMakeLists.txt index d4f03dc9c548c2..592517cb8e3da2 100644 --- a/test/autograd/CMakeLists.txt +++ b/test/autograd/CMakeLists.txt @@ -15,16 +15,8 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) endforeach() -set(STRIED_TESTS test_autograd_dynamic) - -foreach(STRIED_TEST ${STRIED_TESTS}) - py_test_modules(${STRIED_TEST}_with_stride MODULES ${STRIED_TEST} ENVS - FLAGS_use_stride_kernel=true) -endforeach() - set_tests_properties(test_autograd_dynamic PROPERTIES TIMEOUT 100) set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 200) -set_tests_properties(test_autograd_dynamic_with_stride PROPERTIES TIMEOUT 100) set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 160) set_tests_properties(test_minimize PROPERTIES TIMEOUT 60) if(NOT WIN32) diff --git a/test/autograd/test_autograd_functional_dynamic.py b/test/autograd/test_autograd_functional_dynamic.py index 02c4e61748d0af..f46e7a35c10d80 100644 --- a/test/autograd/test_autograd_functional_dynamic.py +++
b/test/autograd/test_autograd_functional_dynamic.py @@ -145,9 +145,9 @@ def check_results(self, ref, res): class TestVJP(TestAutogradFunctional): def func_vjp_i1o1(self): test_cases = [ - [reduce, 'A'], # noqa - [reduce_dim, 'A'], # noqa - ] # noqa + [reduce, 'A'], + [reduce_dim, 'A'], + ] for f, inputs in test_cases: vjp, grad = self.gen_test_pairs(f, inputs) vjp_result, grad_result = vjp(), grad() @@ -155,9 +155,9 @@ def func_vjp_i1o1(self): def func_vjp_i2o1(self): test_cases = [ - [matmul, ['A', 'B']], # noqa - [mul, ['b', 'c']], # noqa - ] # noqa + [matmul, ['A', 'B']], + [mul, ['b', 'c']], + ] for f, inputs in test_cases: vjp, grad = self.gen_test_pairs(f, inputs) vjp_result, grad_result = vjp(), grad() @@ -165,8 +165,8 @@ def func_vjp_i2o1(self): def func_vjp_i2o2(self): test_cases = [ - [o2, ['A', 'A']], # noqa - ] # noqa + [o2, ['A', 'A']], + ] for f, inputs in test_cases: inputs = self.gen_inputs(inputs) v = make_v(f, inputs) @@ -176,8 +176,8 @@ def func_vjp_i2o2(self): def func_vjp_i2o2_omitting_v(self): test_cases = [ - [o2, ['A', 'A']], # noqa - ] # noqa + [o2, ['A', 'A']], + ] for f, inputs in test_cases: inputs = self.gen_inputs(inputs) vjp, grad = self.gen_test_pairs(f, inputs) @@ -187,7 +187,7 @@ def func_vjp_i2o2_omitting_v(self): def func_vjp_nested(self): x = self.gen_input('a') test_cases = [ - [nested(x), 'a'], # noqa + [nested(x), 'a'], ] for f, inputs in test_cases: vjp, grad = self.gen_test_pairs(f, inputs) @@ -274,9 +274,9 @@ def jac(grad_fn, f, inputs): class TestJVP(TestAutogradFunctional): def func_jvp_i1o1(self): test_cases = [ - [reduce, 'A'], # noqa - [reduce_dim, 'A'], # noqa - ] # noqa + [reduce, 'A'], + [reduce_dim, 'A'], + ] for f, inputs in test_cases: inputs = self.gen_inputs(inputs) forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) @@ -284,9 +284,9 @@ def func_jvp_i1o1(self): self.check_results(forward_jac, reverse_jac) def func_jvp_i2o1(self): - test_cases = [ # noqa - [matmul, ['A', 'B']], # noqa - ] # noqa + test_cases = [ + [matmul, ['A', 'B']], + ] for f, inputs in test_cases: inputs = self.gen_inputs(inputs) forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) @@ -294,9 +294,9 @@ def func_jvp_i2o1(self): self.check_results(forward_jac, reverse_jac) def func_jvp_i2o2(self): - test_cases = [ # noqa - [o2, ['A', 'A']], # noqa - ] # noqa + test_cases = [ + [o2, ['A', 'A']], + ] for f, inputs in test_cases: inputs = self.gen_inputs(inputs) forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) @@ -304,9 +304,9 @@ def func_jvp_i2o2(self): self.check_results(forward_jac, reverse_jac) def func_jvp_i2o2_omitting_v(self): - test_cases = [ # noqa - [o2, ['A', 'A']], # noqa - ] # noqa + test_cases = [ + [o2, ['A', 'A']], + ] for f, inputs in test_cases: inputs = self.gen_inputs(inputs) results_omitting_v = paddle.incubate.autograd.jvp(f, inputs) diff --git a/test/cinn/CMakeLists.txt b/test/cinn/CMakeLists.txt index ca9989b745826d..3158c4372d8fdb 100644 --- a/test/cinn/CMakeLists.txt +++ b/test/cinn/CMakeLists.txt @@ -274,4 +274,40 @@ if(WITH_GPU) WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) endforeach() + file( + GLOB CINN_RUNTIME_TEST + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "runtime/test_*.py") + + foreach(runtime_test_name ${EXCLUDE_RUNTIME}) + list(REMOVE_ITEM CINN_RUNTIME_TEST runtime/${runtime_test_name}.py) + endforeach() + + foreach(runtime_test_name ${CINN_RUNTIME_TEST}) + string(REGEX REPLACE ".py" "" runtime_test_name ${runtime_test_name}) + add_test( + NAME ${runtime_test_name} + COMMAND + ${CMAKE_COMMAND} -E env + 
PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH} + python3 ${CMAKE_CURRENT_SOURCE_DIR}/${runtime_test_name}.py + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + endforeach() + + file( + GLOB CINN_IR_TEST + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "ir/test_*.py") + + foreach(ir_test_name ${CINN_IR_TEST}) + string(REGEX REPLACE ".py" "" ir_test_name ${ir_test_name}) + add_test( + NAME ${ir_test_name} + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH} + python3 ${CMAKE_CURRENT_SOURCE_DIR}/${ir_test_name}.py + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + endforeach() + endif() diff --git a/test/cinn/ir/test_llir_constructor.py b/test/cinn/ir/test_llir_constructor.py new file mode 100644 index 00000000000000..05c44e8935dfbd --- /dev/null +++ b/test/cinn/ir/test_llir_constructor.py @@ -0,0 +1,36 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from cinn import ir, lang, to_cinn_llir +from cinn.runtime.data_array import DataArray + + +def test_call_extern(): + @to_cinn_llir + def call_sinh(A: DataArray((1, 4, 256, 512)), B: DataArray((1, 4, 256))): + for i1 in range(1): + for j1 in range(4): + for k1 in range(256): + with ir.ScheduleBlockContext("init") as init: + vi, vj, vk = ir.AxisMap("SSS", [i1, j1, k1]) + B[vi, vj, vk] = lang.call_extern( + "sinh", [A[vi, vi, vj, vk]], {} + ) + + str(call_sinh) + + +if __name__ == "__main__": + test_call_extern() diff --git a/test/cinn/ir/test_llir_schedule_bind.py b/test/cinn/ir/test_llir_schedule_bind.py new file mode 100644 index 00000000000000..5be0ddf95ae172 --- /dev/null +++ b/test/cinn/ir/test_llir_schedule_bind.py @@ -0,0 +1,68 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
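Note: the CMake glob above auto-registers every test/cinn/runtime/test_*.py and test/cinn/ir/test_*.py, so none of the new LLIR test files that follow need a per-test CMakeLists entry. They all share one skeleton, sketched here with the same cinn bindings the tests import (an illustration only, not part of the patch; whether a 1-D DataArray annotation is accepted exactly like the N-D ones used in the tests is an assumption):

from cinn import ir, to_cinn_llir
from cinn.runtime.data_array import DataArray


@to_cinn_llir
def copy_kernel(A: DataArray((16,)), B: DataArray((16,))):
    for i in range(16):
        # every statement lives in a named schedule block; AxisMap ties the
        # Python loop variable to a block iteration axis ("S" = spatial)
        with ir.ScheduleBlockContext("B"):
            v0 = ir.AxisMap("S", [i])
            B[v0] = A[v0]


# str() of the decorated function dumps the lowered LLIR, which is all
# test_llir_constructor.py above checks for lang.call_extern.
print(copy_kernel)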
+from test.cinn.utils.testing import assert_llir_equal + +from cinn import ir, to_cinn_llir +from cinn.runtime.data_array import DataArray +from cinn.schedule import IRSchedule as sch + + +def test_bind_reduce(): + @to_cinn_llir + def reduce_sum(A: DataArray((1, 4, 256, 512)), B: DataArray((1, 4, 256))): + for i1 in range(1): + for j1 in range(4): + for k1 in range(256): + with ir.ScheduleBlockContext("init") as init: + vi, vj, vk = ir.AxisMap("SSS", [i1, j1, k1]) + B[vi, vj, vk] = 0.0 + for l1 in range(512): + with ir.ScheduleBlockContext("B"): + sch.bind(i1, "blockIdx.x") + sch.bind(j1, "threadIdx.y") + sch.bind(k1, "threadIdx.x") + vi1, vj1, vk1, vl1 = ir.AxisMap( + "SSSR", [i1, j1, k1, l1] + ) + B[vi1, vj1, vk1] = ( + B[vi1, vj1, vk1] + A[vi1, vj1, vk1, vl1] + ) + + @to_cinn_llir + def reduce_sum_expected( + A: DataArray((1, 4, 256, 512)), B: DataArray((1, 4, 256)) + ): + for i1 in range(1): + for j1 in range(4): + for k1 in range(256): + with ir.ScheduleBlockContext("init") as init: + vi, vj, vk = ir.AxisMap("SSS", [i1, j1, k1]) + B[vi, vj, vk] = 0.0 + for l1 in range(512): + with ir.ScheduleBlockContext("B"): + vi1, vj1, vk1, vl1 = ir.AxisMap( + "SSSR", [i1, j1, k1, l1] + ) + B[vi1, vj1, vk1] = ( + B[vi1, vj1, vk1] + A[vi1, vj1, vk1, vl1] + ) + sch.bind(init.i1, "blockIdx.x") + sch.bind(init.j1, "threadIdx.y") + sch.bind(init.k1, "threadIdx.x") + + assert_llir_equal(reduce_sum, reduce_sum_expected) + + +if __name__ == "__main__": + test_bind_reduce() diff --git a/test/cinn/ir/test_llir_schedule_cache_read_write.py b/test/cinn/ir/test_llir_schedule_cache_read_write.py new file mode 100644 index 00000000000000..85badc819f8f55 --- /dev/null +++ b/test/cinn/ir/test_llir_schedule_cache_read_write.py @@ -0,0 +1,73 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
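The test above exercises two sch.bind call styles: binding the live loop variables from inside the block, and binding afterwards through the handles a named block exposes (init.i1 and friends). A condensed sketch of the second style, reusing only APIs already present in these tests:

from cinn import ir, to_cinn_llir
from cinn.runtime.data_array import DataArray
from cinn.schedule import IRSchedule as sch


@to_cinn_llir
def scale(A: DataArray((4, 256)), B: DataArray((4, 256))):
    for j in range(4):
        for k in range(256):
            with ir.ScheduleBlockContext("B") as blk:
                vj, vk = ir.AxisMap("SS", [j, k])
                B[vj, vk] = A[vj, vk] * 2.0
    # bind through the block handle after the nest, as reduce_sum_expected
    # does with init.i1 / init.j1 / init.k1
    sch.bind(blk.j, "threadIdx.y")
    sch.bind(blk.k, "threadIdx.x")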
+ +from test.cinn.utils.testing import assert_llir_equal + +from cinn import ir, to_cinn_llir +from cinn.runtime.data_array import DataArray +from cinn.schedule import IRSchedule as sch + + +def test_cache_read_elementwise(): + @to_cinn_llir + def elementwise_add_cache_read( + X: DataArray((128, 128)), + Y: DataArray((128, 128)), + A: DataArray((128, 128)), + ): + for i in range(128): + for j in range(128): + with ir.ScheduleBlockContext("A") as A_block: + i1, j1 = ir.AxisMap("SS", [i, j]) + A[i1, j1] = X[i1, j1] * 2.0 + for i3 in range(128): + for j3 in range(128): + with ir.ScheduleBlockContext("B") as B_block: + i1, j1 = ir.AxisMap("SS", [i3, j3]) + Y[i1, j1] = -A[i1, j1] + 3.0 + + cached_a = sch.cache_read(A_block.block, 0, "global") + cached_b = sch.cache_read(B_block.block, 0, "local") + + assert_llir_equal(elementwise_add_cache_read, elementwise_add_cache_read) + + +def test_cache_write_elementwise(): + @to_cinn_llir + def elementwise_add_cache_write( + X: DataArray((128, 128)), + Y: DataArray((128, 128)), + A: DataArray((128, 128)), + ): + for i in range(128): + for j in range(128): + with ir.ScheduleBlockContext("A") as A_block: + i1, j1 = ir.AxisMap("SS", [i, j]) + A[i1, j1] = X[i1, j1] * 2.0 + for i3 in range(128): + for j3 in range(128): + with ir.ScheduleBlockContext("B") as B_block: + i1, j1 = ir.AxisMap("SS", [i3, j3]) + Y[i1, j1] = -A[i1, j1] + 3.0 + + cached_a = sch.cache_write(A_block.block, 0, "global") + cached_b = sch.cache_write(B_block.block, 0, "local") + + # TODO(6clc): core dump + # assert_llir_equal(elementwise_add_cache_write, elementwise_add_cache_write) + + +if __name__ == "__main__": + test_cache_read_elementwise() + test_cache_write_elementwise() diff --git a/test/cinn/ir/test_llir_schedule_compute_at.py b/test/cinn/ir/test_llir_schedule_compute_at.py new file mode 100644 index 00000000000000..0f82786935b411 --- /dev/null +++ b/test/cinn/ir/test_llir_schedule_compute_at.py @@ -0,0 +1,111 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
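Operationally, cache_read/cache_write only insert a staging copy at the requested memory scope; the computed values are unchanged, which is why the test can compare the scheduled function against itself. A loose NumPy analogy (standalone, not part of the patch):

import numpy as np

X = np.random.rand(128, 128).astype("float32")
A = X * 2.0            # block "A"
A_staged = A.copy()    # staging buffer added by cache_read(B_block.block, 0, ...)
Y = -A_staged + 3.0    # block "B" now reads through the cache
assert np.array_equal(Y, -(X * 2.0) + 3.0)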
+ +from test.cinn.utils.testing import assert_llir_equal + +from cinn import ir, to_cinn_llir +from cinn.runtime.data_array import DataArray +from cinn.schedule import IRSchedule as sch + + +def test_compute_at_elementwise(): + @to_cinn_llir + def elementwise_add( + X: DataArray((128, 128)), + Y: DataArray((128, 128)), + A: DataArray((128, 128)), + ): + for i in range(128): + for j in range(128): + with ir.ScheduleBlockContext("A") as A_block: + i1, j1 = ir.AxisMap("SS", [i, j]) + A[i1, j1] = X[i1, j1] * 2.0 + for i in range(128): + for j in range(128): + with ir.ScheduleBlockContext("Y"): + i1, j1 = ir.AxisMap("SS", [i, j]) + sch.compute_at(A_block.block, i, False) + Y[i1, j1] = A[i1, j1] + 2.0 + + @to_cinn_llir + def elementwise_add_gt( + X: DataArray((128, 128)), + Y: DataArray((128, 128)), + A: DataArray((128, 128)), + ): + for i in range(128): + for j in range(128): + with ir.ScheduleBlockContext("A"): + i1, j1 = ir.AxisMap("SS", [i, 0 + j]) + A[i1, j1] = X[i1, j1] * 2.0 + for k in range(128): + with ir.ScheduleBlockContext("Y"): + i2, k1 = ir.AxisMap("SS", [i, k]) + Y[i2, k1] = A[i2, k1] + 2.0 + + assert_llir_equal(elementwise_add, elementwise_add_gt) + + +def test_reverse_compute_at(): + @to_cinn_llir + def reverse_compute_at_tiled( + A: DataArray((128, 128)), + B: DataArray((128, 128)), + C: DataArray((128, 128)), + ): + for i0 in range(8): + for j0 in range(8): + for i1 in range(16): + for j1 in range(16): + with ir.ScheduleBlockContext("B") as B_block: + vi, vj = ir.AxisMap( + "SS", [i0 * 16 + i1, j0 * 16 + j1] + ) + B[vi, vj] = A[vi, vj] * 2.0 + for i in range(128): + for j in range(128): + with ir.ScheduleBlockContext("C") as C_block: + vi, vj = ir.AxisMap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 + + sch.reverse_compute_at(C_block.block, B_block.i1) + + @to_cinn_llir + def reverse_compute_at_tiled_gt( + A: DataArray((128, 128)), + B: DataArray((128, 128)), + C: DataArray((128, 128)), + ): + for i0 in range(8): + for j0 in range(8): + for i1 in range(16): + for j1 in range(16): + with ir.ScheduleBlockContext("B") as B_block: + vi, vj = ir.AxisMap( + "SS", [i0 * 16 + i1, j0 * 16 + j1] + ) + B[vi, vj] = A[vi, vj] * 2.0 + for j2 in range(16): + with ir.ScheduleBlockContext("C") as C_block: + vi, vj = ir.AxisMap( + "SS", [16 * i0 + i1, 16 * j0 + j2] + ) + C[vi, vj] = B[vi, vj] + 1.0 + + assert_llir_equal(reverse_compute_at_tiled, reverse_compute_at_tiled_gt) + + +if __name__ == '__main__': + test_compute_at_elementwise() + test_reverse_compute_at() diff --git a/test/cinn/ir/test_llir_schedule_compute_inline.py b/test/cinn/ir/test_llir_schedule_compute_inline.py new file mode 100644 index 00000000000000..a95d1dd8174495 --- /dev/null +++ b/test/cinn/ir/test_llir_schedule_compute_inline.py @@ -0,0 +1,95 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
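compute_at and reverse_compute_at only re-nest loops; every element must still be computed exactly once. A standalone pure-Python check (not part of the patch) that the tiled index map in reverse_compute_at_tiled_gt is a bijection over the 128 x 128 domain:

import numpy as np

visited = np.zeros((128, 128), dtype=int)
for i0 in range(8):
    for i1 in range(16):
        for j0 in range(8):
            for j2 in range(16):
                visited[16 * i0 + i1, 16 * j0 + j2] += 1
assert (visited == 1).all()  # each C[vi, vj] is visited exactly once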
+ +from test.cinn.utils.testing import assert_llir_equal + +from cinn import ir, to_cinn_llir +from cinn.runtime.data_array import DataArray +from cinn.schedule import IRSchedule as sch + + +def test_compute_inline_elementwise(): + @to_cinn_llir + def elementwise_add_inline( + X: DataArray((128, 128)), + Y: DataArray((128, 128)), + A: DataArray((128, 128)), + ): + for i in range(128): + for j in range(128): + with ir.ScheduleBlockContext("A") as A_block: + i1, j1 = ir.AxisMap("SS", [i, j]) + A[i1, j1] = X[i1, j1] * 2.0 + for i3 in range(128): + for j3 in range(128): + with ir.ScheduleBlockContext("Y"): + i1, j1 = ir.AxisMap("SS", [i3, j3]) + Y[i1, j1] = -A[i1, j1] + 3.0 + + block_a = sch.get_block("A") + sch.compute_inline(block_a) + + @to_cinn_llir + def elementwise_add_inline_gt( + X: DataArray((128, 128)), + Y: DataArray((128, 128)), + A: DataArray((128, 128)), + ): + for i in range(128): + for j in range(128): + with ir.ScheduleBlockContext("Y"): + i1, j1 = ir.AxisMap("SS", [i, j]) + Y[i1, j1] = -(X[i1, j1] * 2.0) + 3.0 + + assert_llir_equal(elementwise_add_inline, elementwise_add_inline_gt) + + +def test_reverse_compute_inline_elementwise(): + @to_cinn_llir + def elementwise_add_inline( + X: DataArray((128, 128)), + Y: DataArray((128, 128)), + A: DataArray((128, 128)), + ): + for i in range(128): + for j in range(128): + with ir.ScheduleBlockContext("A") as A_block: + i1, j1 = ir.AxisMap("SS", [i, j]) + A[i1, j1] = X[i1, j1] * 2.0 + for i3 in range(128): + for j3 in range(128): + with ir.ScheduleBlockContext("Y") as Y_block: + i1, j1 = ir.AxisMap("SS", [i3, j3]) + Y[i1, j1] = -A[i1, j1] + 3.0 + + sch.reverse_compute_inline(Y_block.block) + + @to_cinn_llir + def elementwise_add_inline_gt( + X: DataArray((128, 128)), + Y: DataArray((128, 128)), + A: DataArray((128, 128)), + ): + for i in range(128): + for j in range(128): + with ir.ScheduleBlockContext("A"): + i1, j1 = ir.AxisMap("SS", [i, j]) + Y[i1, j1] = -(X[i1, j1] * 2.0) + 3.0 + + assert_llir_equal(elementwise_add_inline, elementwise_add_inline_gt) + + +if __name__ == "__main__": + test_compute_inline_elementwise() + test_reverse_compute_inline_elementwise() diff --git a/test/cinn/ir/test_llir_schedule_for_kind.py b/test/cinn/ir/test_llir_schedule_for_kind.py new file mode 100644 index 00000000000000..70dc96ea0715de --- /dev/null +++ b/test/cinn/ir/test_llir_schedule_for_kind.py @@ -0,0 +1,99 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
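The algebra compute_inline relies on, checked in NumPy: substituting the producer A into its only consumer is exact, because the identical float operations run in the same order.

import numpy as np

X = np.random.rand(128, 128).astype("float32")
A = X * 2.0
two_blocks = -A + 3.0         # original producer/consumer pair
inlined = -(X * 2.0) + 3.0    # the form compute_inline produces
assert np.array_equal(two_blocks, inlined)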
+ +from test.cinn.utils.testing import assert_llir_equal + +from cinn import ir, to_cinn_llir +from cinn.runtime.data_array import DataArray +from cinn.schedule import IRSchedule as sch + + +# Current Python DSL cannot express the parallel `for`, +# only checks that it can be converted correctly +def test_elementwise_parallel(): + @to_cinn_llir + def elementwise_add( + X: DataArray((128, 128)), + Y: DataArray((128, 128)), + A: DataArray((128, 128)), + ): + for i in range(128): + for j in range(128): + with ir.ScheduleBlockContext("A") as A_block: + i1, j1 = ir.AxisMap("SS", [i, j]) + A[i1, j1] = X[i1, j1] * 2.0 + for i in range(128): + for j in range(128): + with ir.ScheduleBlockContext("Y"): + i1, j1 = ir.AxisMap("SS", [i, j]) + Y[i1, j1] = A[i1, j1] + 2.0 + sch.parallel(A_block.i) + + assert_llir_equal(elementwise_add, elementwise_add) + + +# Current Python DSL cannot express the vectorize `for`, +# only checks that it can be converted correctly +def test_elementwise_vectorize(): + @to_cinn_llir + def elementwise_add( + X: DataArray((128, 128)), + Y: DataArray((128, 128)), + A: DataArray((128, 128)), + ): + for i in range(128): + for j in range(128): + with ir.ScheduleBlockContext("A") as A_block: + i1, j1 = ir.AxisMap("SS", [i, j]) + A[i1, j1] = X[i1, j1] * 2.0 + for i in range(128): + for j0 in range(32): + for j1 in range(4): + with ir.ScheduleBlockContext("Y") as Y_block: + i1, j1 = ir.AxisMap("SS", [i, j0 * 4 + j1]) + Y[i1, j1] = A[i1, j1] + 2.0 + sch.vectorize(Y_block.j1, 1) + + assert_llir_equal(elementwise_add, elementwise_add) + + +# Current Python DSL cannot express the unroll `for`, +# only checks that it can be converted correctly +def test_elementwise_unroll(): + @to_cinn_llir + def elementwise_add( + X: DataArray((128, 128)), + Y: DataArray((128, 128)), + A: DataArray((128, 128)), + ): + for i in range(128): + for j in range(128): + with ir.ScheduleBlockContext("A") as A_block: + i1, j1 = ir.AxisMap("SS", [i, j]) + A[i1, j1] = X[i1, j1] * 2.0 + for i in range(128): + for j0 in range(32): + for j1 in range(4): + with ir.ScheduleBlockContext("Y") as Y_block: + i1, j1 = ir.AxisMap("SS", [i, j0 * 4 + j1]) + Y[i1, j1] = A[i1, j1] + 2.0 + sch.unroll(Y_block.j1) + + assert_llir_equal(elementwise_add, elementwise_add) + + +if __name__ == "__main__": + test_elementwise_parallel() + test_elementwise_vectorize() + test_elementwise_unroll() diff --git a/test/cinn/ir/test_llir_schedule_fuse_split.py b/test/cinn/ir/test_llir_schedule_fuse_split.py new file mode 100644 index 00000000000000..f22b1a1f8d3a94 --- /dev/null +++ b/test/cinn/ir/test_llir_schedule_fuse_split.py @@ -0,0 +1,131 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
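For contrast, two of the loop annotations tested above combined on a single kernel (a sketch only: as the comments note, the Python front end cannot print such loops back, and whether two annotations compose on one nest is an assumption, since the tests apply one each):

from cinn import ir, to_cinn_llir
from cinn.runtime.data_array import DataArray
from cinn.schedule import IRSchedule as sch


@to_cinn_llir
def annotated(X: DataArray((128, 4)), Y: DataArray((128, 4))):
    for i in range(128):
        for j in range(4):
            with ir.ScheduleBlockContext("Y") as blk:
                vi, vj = ir.AxisMap("SS", [i, j])
                Y[vi, vj] = X[vi, vj] + 1.0
    sch.parallel(blk.i)  # outer loop marked parallel
    sch.unroll(blk.j)    # inner loop fully unrolled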
+ + +from test.cinn.utils.testing import assert_llir_equal + +from cinn import ir, to_cinn_llir +from cinn.runtime.data_array import DataArray +from cinn.schedule import IRSchedule as sch + + +def test_fuse(): + @to_cinn_llir + def elementwise_fuse_assign_loop( + X: DataArray((128, 128, 128)), Y: DataArray((128, 128, 128)) + ): + for i in range(128): + for j in range(128): + for k in range(128): + with ir.ScheduleBlockContext("Y") as block_y: + sch.fuse([i, j, k]) + i1, j1, k1 = ir.AxisMap("SSS", [i, j, k]) + Y[i1, j1, k1] = X[i1, j1, k1] * 2.0 + + @to_cinn_llir + def elementwise_fuse_assign_loop_gt( + X: DataArray((128, 128, 128)), Y: DataArray((128, 128, 128)) + ): + for i in range(2097152): + with ir.ScheduleBlockContext("Y") as block_y: + i1_1, j1_1, k1_1 = ir.AxisMap( + "SSS", [(i / 128) / 128, (i / 128) % 128, i % 128] + ) + Y[i1_1, j1_1, k1_1] = X[i1_1, j1_1, k1_1] * 2.0 + + assert_llir_equal( + elementwise_fuse_assign_loop, elementwise_fuse_assign_loop_gt + ) + + +def test_split(): + @to_cinn_llir + def elementwise_split( + X: DataArray((128, 128, 128)), Y: DataArray((128, 128, 128)) + ): + for i in range(128): + for j in range(128): + for k in range(128): + with ir.ScheduleBlockContext("Y") as Y_block: + i1, j1, k1 = ir.AxisMap("SSS", [i, j, k]) + sch.split(Y_block.i, factors=[2, 1, 64]) + sch.split(Y_block.j, factors=[4, 32]) + sch.split(Y_block.k, factors=[16, 8]) + Y[i1, j1, k1] = X[i1, j1, k1] * 2.0 + + @to_cinn_llir + def elementwise_split_inferred_factor( + X: DataArray((128, 128, 128)), Y: DataArray((128, 128, 128)) + ): + for i in range(128): + for j in range(128): + for k in range(128): + with ir.ScheduleBlockContext("Y") as Y_block: + i1, j1, k1 = ir.AxisMap("SSS", [i, j, k]) + sch.split(Y_block.i, factors=[-1, 1, 64]) + sch.split(Y_block.j, factors=[4, -1]) + sch.split(Y_block.k, factors=[-1, 8]) + Y[i1, j1, k1] = X[i1, j1, k1] * 2.0 + + assert_llir_equal(elementwise_split, elementwise_split_inferred_factor) + + +def test_split_predicate(): + @to_cinn_llir + def elementwise_split_predicate( + X: DataArray((128, 128, 128)), Y: DataArray((128, 128, 128)) + ): + for i in range(128): + for j in range(128): + for k in range(128): + with ir.ScheduleBlockContext("Y") as Y_block: + i1, j1, k1 = ir.AxisMap("SSS", [i, j, k]) + sch.split(Y_block.i, factors=[1000, 1, 64]) + sch.split(Y_block.j, factors=[4, 32]) + sch.split(Y_block.k, factors=[16, 8]) + Y[i1, j1, k1] = X[i1, j1, k1] * 2.0 + + @to_cinn_llir + def elementwise_split_predicate_gt( + X: DataArray((128, 128, 128)), Y: DataArray((128, 128, 128)) + ): + for i in range(1000): + for i_0 in range(1): + for i_1 in range(64): + if ((64 * i) + ((64 * i_0) + i_1)) < 128: + for j in range(4): + for j_0 in range(32): + for k in range(16): + for k_0 in range(8): + with ir.ScheduleBlockContext("Y"): + i1, j1, k1 = ir.AxisMap( + "SSS", + [ + (64 * i) + + ((64 * i_0) + i_1), + (32 * j) + j_0, + (8 * k) + k_0, + ], + ) + Y[i1, j1, k1] = X[i1, j1, k1] * 2.0 + + assert_llir_equal( + elementwise_split_predicate, elementwise_split_predicate_gt + ) + + +if __name__ == "__main__": + test_fuse() + test_split() + test_split_predicate() diff --git a/test/cinn/ir/test_llir_schedule_reorder.py b/test/cinn/ir/test_llir_schedule_reorder.py new file mode 100644 index 00000000000000..00ca99388ba941 --- /dev/null +++ b/test/cinn/ir/test_llir_schedule_reorder.py @@ -0,0 +1,80 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from test.cinn.utils.testing import assert_llir_equal + +from cinn import ir, to_cinn_llir +from cinn.runtime.data_array import DataArray +from cinn.schedule import IRSchedule as sch + + +def test_reorder_elementwise(): + @to_cinn_llir + def reorder_elementwise( + X: DataArray((64, 64, 64, 64)), Y: DataArray((64, 64, 64, 64)) + ): + for i in range(64): + for j in range(64): + for k in range(64): + for l in range(8): + with ir.ScheduleBlockContext("Y") as Y_block: + vi, vj, vk, vl = ir.AxisMap( + "SSSS", [i, j, k, 8 * l] + ) + Y[vi, vj, vk, vl] = X[vi, vj, vk, vl] * 2.0 + sch.reorder([Y_block.k, Y_block.l, Y_block.i]) + + @to_cinn_llir + def reorder_elementwise_gt( + X: DataArray((64, 64, 64, 64)), Y: DataArray((64, 64, 64, 64)) + ): + for k in range(64): + for j in range(64): + for l in range(8): + for i in range(64): + with ir.ScheduleBlockContext("Y"): + vi, vj, vk, vl = ir.AxisMap( + "SSSS", [i, j, k, 8 * l] + ) + Y[vi, vj, vk, vl] = X[vi, vj, vk, vl] * 2.0 + + assert_llir_equal(reorder_elementwise, reorder_elementwise_gt) + + +def test_reorder_overlapped(): + @to_cinn_llir + def reorder_overlapped(X: DataArray((28, 8)), Y: DataArray((28, 8))): + for i in range(12): + for j in range(4): + for k in range(4): + with ir.ScheduleBlockContext("Y"): + vi, vj = ir.AxisMap("SS", [i, j]) + sch.reorder([i, k, j]) + Y[vi, vj] = X[vi, vj] + 1.0 + + @to_cinn_llir + def reorder_overlapped_gt(X: DataArray((28, 8)), Y: DataArray((28, 8))): + for i in range(12): + for k in range(4): + for j in range(4): + with ir.ScheduleBlockContext("Y"): + vi, vj = ir.AxisMap("SS", [i, j]) + Y[vi, vj] = X[vi, vj] + 1.0 + + assert_llir_equal(reorder_overlapped, reorder_overlapped_gt) + + +if __name__ == '__main__': + test_reorder_elementwise() + test_reorder_overlapped() diff --git a/test/cinn/ir/test_llir_schedule_rfactor.py b/test/cinn/ir/test_llir_schedule_rfactor.py new file mode 100644 index 00000000000000..098435686c7915 --- /dev/null +++ b/test/cinn/ir/test_llir_schedule_rfactor.py @@ -0,0 +1,57 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
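The fused ground truth in test_fuse above collapses the (i, j, k) nest into a single loop of 128 * 128 * 128 = 2,097,152 iterations and recovers the original axes with div/mod. A scaled-down pure-Python check of that arithmetic (n = 8 standing in for 128):

n = 8
visited = {((i // n) // n, (i // n) % n, i % n) for i in range(n * n * n)}
assert visited == {(a, b, c) for a in range(n) for b in range(n) for c in range(n)}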
+
+
+from cinn import ir, to_cinn_llir
+from cinn.runtime.data_array import DataArray
+from cinn.schedule import IRSchedule as sch
+
+
+def test_matmul():
+    @to_cinn_llir
+    def matmul(
+        A: DataArray((128, 128)),
+        B: DataArray((128, 128)),
+        C: DataArray((128, 128)),
+    ):
+        for i0 in range(128):
+            for i1 in range(128):
+                with ir.ScheduleBlockContext("init"):
+                    vi, vj = ir.AxisMap("SS", [i0, i1])
+                    C[vi, vj] = 0.0
+                for i2_outer in range(4):
+                    for i2_inner_outer in range(8):
+                        for i2_inner_inner in range(4):
+                            with ir.ScheduleBlockContext(
+                                "compute"
+                            ) as Compute_block:
+                                vi, vj, vk = ir.AxisMap(
+                                    "SSR",
+                                    [
+                                        i0,
+                                        i1,
+                                        i2_outer * 32
+                                        + i2_inner_outer * 4
+                                        + i2_inner_inner,
+                                    ],
+                                )
+                                C[vi, vj] = C[vi, vj] + (A[vi, vk] * B[vj, vk])
+    sch.rfactor(Compute_block.i2_inner_inner, 0)
+
+    # TODO(6clc): rfactor schedule raises Error Message: iter_value not support complex reduce bindings
+    # assert_llir_equal(matmul, matmul)
+
+
+if __name__ == "__main__":
+    test_matmul()
diff --git a/test/cinn/ir/test_llir_schedule_sequence.py b/test/cinn/ir/test_llir_schedule_sequence.py
new file mode 100644
index 00000000000000..2cff0c650fd632
--- /dev/null
+++ b/test/cinn/ir/test_llir_schedule_sequence.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
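A NumPy picture of what the rfactor above asks for (an analogy only; the primitive itself still raises, per the TODO): the length-128 reduction axis k is viewed as 4 x 8 x 4, the innermost length-4 axis becomes the factored axis, and its partial results are merged at the end.

import numpy as np

A = np.random.rand(128, 128).astype("float32")
B = np.random.rand(128, 128).astype("float32")
full = np.einsum("ik,jk->ij", A, B)        # C[vi, vj] from the kernel
parts = (A[:, None, :] * B[None, :, :]).reshape(128, 128, 32, 4)
factored = parts.sum(axis=2).sum(axis=-1)  # partials per lane, then merge
np.testing.assert_allclose(full, factored, rtol=1e-4)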
+ +from test.cinn.utils.testing import assert_llir_equal + +from cinn import ir, to_cinn_llir +from cinn.runtime.data_array import DataArray +from cinn.schedule import IRSchedule as sch + + +def test_split_reorder_elementwise(): + @to_cinn_llir + def split_reorder_elementwise( + X: DataArray((1024, 1024)), + Y: DataArray((1024, 1024)), + Z: DataArray((1024, 1024)), + ): + for i in range(1024): + for j in range(1024): + for k in range(1024): + with ir.ScheduleBlockContext("Z"): + i_split_0, i_split_1, i_split_2, i_split_3 = sch.split( + i, factors=[2, 4, 64, 2] + ) + sch.reorder([i_split_2, i_split_0]) + i1, j1, k1 = ir.AxisMap("SSS", [i, j, k]) + Z[i1, j1] = Z[i1, j1] + X[i1, k] * Y[k, j1] + + @to_cinn_llir + def split_reorder_elementwise_gt( + X: DataArray((1024, 1024)), + Y: DataArray((1024, 1024)), + Z: DataArray((1024, 1024)), + ): + for i_1 in range(64): + for i_0 in range(4): + for i in range(2): + for i_2 in range(2): + for j in range(1024): + for k in range(1024): + with ir.ScheduleBlockContext("Z"): + i1, j1, k1 = ir.AxisMap( + "SSS", + [ + (512 * i) + + ((128 * i_0) + ((2 * i_1) + i_2)), + j, + k, + ], + ) + Z[i1, j1] = Z[i1, j1] + ( + X[i1, k] * Y[k, j1] + ) + + assert_llir_equal(split_reorder_elementwise, split_reorder_elementwise_gt) + + +if __name__ == "__main__": + test_split_reorder_elementwise() diff --git a/test/cinn/op_mappers/test_one_hot_op.py b/test/cinn/op_mappers/test_one_hot_op.py index 2735af7e699a73..439d69b67b7312 100644 --- a/test/cinn/op_mappers/test_one_hot_op.py +++ b/test/cinn/op_mappers/test_one_hot_op.py @@ -19,54 +19,6 @@ import paddle -class TestOneHotOp(OpMapperTest): - def init_input_data(self): - self.feed_data = {'x': self.random([1, 32], 'int32', low=0, high=9)} - self.depth = 10 - self.dtype = "float32" - self.allow_out_of_range = False - - def set_op_type(self): - return "one_hot" - - def set_op_inputs(self): - x = paddle.static.data( - name='x', - shape=self.feed_data['x'].shape, - dtype=self.feed_data['x'].dtype, - ) - return {'X': [x]} - - def set_op_attrs(self): - return { - "depth": self.depth, - "dtype": self.nptype2paddledtype(self.dtype), - "allow_out_of_range": self.allow_out_of_range, - } - - def set_op_outputs(self): - return {'Out': [str(self.feed_data['x'].dtype)]} - - def test_check_results(self): - self.check_outputs_and_grads(all_equal=True) - - -class TestOneHotOpCase1(TestOneHotOp): - def init_input_data(self): - self.feed_data = {'x': self.random([32, 64], 'int32')} - self.depth = 64 - self.dtype = "int32" - self.allow_out_of_range = False - - -class TestOneHotOpCase2(TestOneHotOp): - def init_input_data(self): - self.feed_data = {'x': self.random([32, 64, 1], 'int64')} - self.depth = 1 - self.dtype = "int64" - self.allow_out_of_range = True - - class TestOneHotV2Op(OpMapperTest): def init_input_data(self): self.feed_data = {'x': self.random([1, 32], 'int32')} diff --git a/test/cinn/runtime/test_launch.py b/test/cinn/runtime/test_launch.py new file mode 100644 index 00000000000000..bb8e3d45aeee5c --- /dev/null +++ b/test/cinn/runtime/test_launch.py @@ -0,0 +1,77 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import cinn +import numpy as np +from cinn import ir, to_cinn_llir +from cinn.runtime.data_array import DataArray + + +@to_cinn_llir +def bin_op_kernel(X, Y, Z, N): + for idx in range(N): + with ir.ScheduleBlockContext("Z"): + idx1 = ir.AxisMap("S", [idx]) + Z[idx1] = X[idx1] + Y[idx1] + + +def test_launch_fp32(): + N = 10 + X_np = np.random.random(N).astype(np.float32) + Y_np = np.random.random(N).astype(np.float32) + Z_np = np.zeros((N), dtype=np.float32) + target = cinn.common.DefaultNVGPUTarget() + X = DataArray.from_numpy(X_np, target) + Y = DataArray.from_numpy(Y_np, target) + Z = DataArray.from_numpy(Z_np, target) + + # compile and run + bin_op_kernel[target](X, Y, Z, N) + pred = Z.to_numpy() + gt = np.add(X_np, Y_np) + np.testing.assert_allclose(pred, gt) + + +def test_launch_dtype(): + for np_dtype in ( + np.uint16, # convert np.uint16 to bfloat16 in Paddle and CINN + np.float16, + np.float32, + np.float64, + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint32, + np.uint64, + ): + N = 10 + X_np = np.random.random(N).astype(np_dtype) + Y_np = np.random.random(N).astype(np_dtype) + Z_np = np.zeros((N), dtype=np_dtype) + target = cinn.common.DefaultNVGPUTarget() + X = DataArray.from_numpy(X_np, target) + Y = DataArray.from_numpy(Y_np, target) + Z = DataArray.from_numpy(Z_np, target) + + # compile and run + bin_op_kernel[target](X, Y, Z, N) + pred = Z.to_numpy() + + +if __name__ == "__main__": + test_launch_fp32() + test_launch_dtype() diff --git a/test/cinn/runtime/test_reduce_cuda.py b/test/cinn/runtime/test_reduce_cuda.py new file mode 100644 index 00000000000000..3eaf160763bd49 --- /dev/null +++ b/test/cinn/runtime/test_reduce_cuda.py @@ -0,0 +1,95 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
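Both runtime tests rely on the subscript-launch idiom: indexing the decorated kernel with a target JIT-compiles it, and the call runs it on device. Condensed below, reusing bin_op_kernel from test_launch.py above (assumes a CUDA build; DefaultNVGPUTarget fails on CPU-only builds):

import cinn
import numpy as np
from cinn.runtime.data_array import DataArray

x_np = np.random.random(10).astype(np.float32)
target = cinn.common.DefaultNVGPUTarget()
x = DataArray.from_numpy(x_np, target)                    # host -> device
out = DataArray.from_numpy(np.zeros(10, np.float32), target)
bin_op_kernel[target](x, x, out, 10)                      # compile + run
np.testing.assert_allclose(out.to_numpy(), x_np + x_np)   # device -> host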
+ + +import cinn +import numpy as np +from cinn import ir, to_cinn_llir +from cinn.runtime.data_array import DataArray +from cinn.schedule import IRSchedule as sch + + +@to_cinn_llir +def reduce_max(A, B): + for i1 in range(1): + for j1 in range(2): + for k1 in range(4): + with ir.ScheduleBlockContext("init") as init: + vi, vj, vk = ir.AxisMap("SSS", [i1, j1, k1]) + B[vi, vj, vk] = 0.0 + for l1 in range(8): + with ir.ScheduleBlockContext("B"): + sch.bind(i1, "blockIdx.x") + sch.bind(j1, "threadIdx.y") + sch.bind(k1, "threadIdx.x") + vi1, vj1, vk1, vl1 = ir.AxisMap( + "SSSR", [i1, j1, k1, l1] + ) + B[vi1, vj1, vk1] = ir.Max.make( + B[vi1, vj1, vk1], A[vi1, vj1, vk1, vl1] + ) + + +@to_cinn_llir +def reduce_sum(A, B): + for i1 in range(1): + for j1 in range(2): + for k1 in range(4): + with ir.ScheduleBlockContext("init") as init: + vi, vj, vk = ir.AxisMap("SSS", [i1, j1, k1]) + B[vi, vj, vk] = 0.0 + for l1 in range(8): + with ir.ScheduleBlockContext("B"): + sch.bind(i1, "blockIdx.x") + sch.bind(j1, "threadIdx.y") + sch.bind(k1, "threadIdx.x") + vi1, vj1, vk1, vl1 = ir.AxisMap( + "SSSR", [i1, j1, k1, l1] + ) + B[vi1, vj1, vk1] = ( + B[vi1, vj1, vk1] + A[vi1, vj1, vk1, vl1] + ) + + +def test_reduce_max_cuda(): + # prepare input and output array + d1 = 2 + d2 = 4 + d3 = 8 + a_np = np.random.rand(1, d1, d2, d3).astype("float32") + b_np = a_np.max(axis=-1).astype("float32") + target = cinn.common.DefaultNVGPUTarget() + a = DataArray.from_numpy(a_np, target) + b = DataArray.from_numpy(np.zeros_like(b_np), target) + reduce_max[target](a, b) + np.testing.assert_allclose(b.to_numpy(), b_np, rtol=1e-5, atol=1e-6) + + +def test_reduce_sum_cuda(): + # prepare input and output array + d1 = 2 + d2 = 4 + d3 = 8 + a_np = np.random.rand(1, d1, d2, d3).astype("float32") + b_np = a_np.sum(axis=-1).astype("float32") + target = cinn.common.DefaultNVGPUTarget() + a = DataArray.from_numpy(a_np, target) + b = DataArray.from_numpy(np.zeros_like(b_np), target) + reduce_sum[target](a, b) + np.testing.assert_allclose(b.to_numpy(), b_np, rtol=1e-5, atol=1e-6) + + +if __name__ == "__main__": + test_reduce_max_cuda() + test_reduce_sum_cuda() diff --git a/test/cinn/utils/testing.py b/test/cinn/utils/testing.py new file mode 100644 index 00000000000000..b67432a17c189a --- /dev/null +++ b/test/cinn/utils/testing.py @@ -0,0 +1,28 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
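Why the two CUDA reduction tests above compare with rtol/atol instead of exact equality: float addition is not associative, so the GPU's accumulation order can differ from NumPy's in the low bits. A small standalone demonstration:

import numpy as np

x = np.random.rand(10_000).astype(np.float32)
serial = np.float32(0.0)
for v in x:          # strict left-to-right accumulation
    serial += v
pairwise = x.sum()   # NumPy accumulates pairwise internally
assert np.isclose(serial, pairwise, rtol=1e-5)  # close, not necessarily bit-equal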
+from cinn.ir import IrCompare +from cinn.runtime import CinnLowerLevelIrJit + + +def assert_llir_equal( + llir1, llir2, allow_name_suffix_diff=True, only_compare_structure=True +): + comparer = IrCompare(allow_name_suffix_diff, only_compare_structure) + + if isinstance(llir1, CinnLowerLevelIrJit): + llir1_expr = llir1.convert_to_llir().body() + llir2_expr = llir2.convert_to_llir().body() + assert comparer.compare( + llir1_expr, llir2_expr + ), f'llir1: {llir1} \n llir2: {llir2}' diff --git a/test/collective/fleet/CMakeLists.txt b/test/collective/fleet/CMakeLists.txt index 4e1a2a970d3e95..b1b57cb6cf4f5e 100644 --- a/test/collective/fleet/CMakeLists.txt +++ b/test/collective/fleet/CMakeLists.txt @@ -134,6 +134,21 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) set_tests_properties(test_dygraph_sharding_stage3_for_eager PROPERTIES TIMEOUT "350") endif() +if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) + bash_test_modules( + test_dygraph_sharding_stage3_bf16 + START_BASH + ../../legacy_test/dist_test.sh + TIMEOUT + "200" + LABELS + "RUN_TYPE=DIST" + ENVS + "PADDLE_DIST_UT_PORT=22038;NVIDIA_TF32_OVERRIDE=0;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python" + ) + set_tests_properties(test_dygraph_sharding_stage3_bf16 PROPERTIES TIMEOUT + "200") +endif() if(WITH_NCCL) if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) py_test_modules( @@ -282,6 +297,20 @@ if(WITH_NCCL) set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT "300") endif() endif() +if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) + bash_test_modules( + test_dygraph_dataparallel_bf16 + START_BASH + ../../legacy_test/dist_test.sh + TIMEOUT + "200" + LABELS + "RUN_TYPE=DIST" + ENVS + "PADDLE_DIST_UT_PORT=22024;NVIDIA_TF32_OVERRIDE=0;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python" + ) + set_tests_properties(test_dygraph_dataparallel_bf16 PROPERTIES TIMEOUT "200") +endif() if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) bash_test_modules( test_dygraph_sharding_stage2 @@ -311,6 +340,21 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) set_tests_properties(test_dygraph_sharding_stage2_bf16 PROPERTIES TIMEOUT "200") endif() +if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) + bash_test_modules( + test_dygraph_sharding_stage1_fp16 + START_BASH + ../../legacy_test/dist_test.sh + TIMEOUT + "200" + LABELS + "RUN_TYPE=DIST" + ENVS + "PADDLE_DIST_UT_PORT=22024;NVIDIA_TF32_OVERRIDE=0;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python" + ) + set_tests_properties(test_dygraph_sharding_stage1_fp16 PROPERTIES TIMEOUT + "200") +endif() if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) bash_test_modules( test_parallel_dygraph_control_flow @@ -665,11 +709,6 @@ if((WITH_GPU OR WITH_XPU) AND (LINUX OR WIN32)) test_fleet_recompute_meta_optimizer ENVS "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python") endif() -if(LOCAL_ALL_ARCH AND (LINUX OR WIN32)) - py_test_modules( - test_fleet_private_function MODULES test_fleet_private_function ENVS - "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python") -endif() if((WITH_GPU OR WITH_XPU) AND LOCAL_ALL_PLAT) bash_test_modules( test_new_group diff --git a/test/collective/fleet/c_comm_init_op.py b/test/collective/fleet/c_comm_init_op.py index 988c0fcc27954b..15230b9b71f331 100644 --- a/test/collective/fleet/c_comm_init_op.py +++ b/test/collective/fleet/c_comm_init_op.py @@ -17,9 +17,6 @@ import paddle from paddle import base -from paddle.distributed.fleet.base.private_helper_function import ( - wait_server_ready, -) paddle.enable_static() @@ -35,8 +32,6 @@ def setUp(self): self.exe = 
base.Executor(self.place) self.endpoints.remove(self.current_endpoint) self.other_endpoints = self.endpoints - if self.rank == 0: - wait_server_ready(self.other_endpoints) def test_specifying_devices(self): program = base.Program() diff --git a/test/collective/fleet/dygraph_dataparallel_bf16.py b/test/collective/fleet/dygraph_dataparallel_bf16.py new file mode 100644 index 00000000000000..efc7b6f993d987 --- /dev/null +++ b/test/collective/fleet/dygraph_dataparallel_bf16.py @@ -0,0 +1,198 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +from paddle.distributed.fleet.utils import mix_precision_utils +from paddle.distributed.fleet.utils.hybrid_parallel_util import ( + fused_allreduce_gradients, +) +from paddle.nn import Linear, ReLU + +seed = 2022 +epoch = 2 +linear_size = 1000 + +np.random.seed(seed) +paddle.seed(seed) + + +class MLP(paddle.nn.Layer): + def __init__(self, linear_size=1000): + super().__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + self._relu = ReLU() + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + y = self._relu(y) + return y + + +class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples=200, linear_size=1000): + self.num_samples = num_samples + self.linear_size = linear_size + + def __getitem__(self, idx): + img = np.random.rand(self.linear_size).astype('float32') + return img + + def __len__(self): + return self.num_samples + + +def optimizer_setting(model, use_pure_bf16, use_main_grad): + if use_main_grad: + assert use_pure_bf16 + model = mix_precision_utils.MixPrecisionLayer(model, dtype="bfloat16") + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=0.00001, + weight_decay=0.00001, + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), + multi_precision=use_pure_bf16, + ) + if use_main_grad: + optimizer = mix_precision_utils.MixPrecisionOptimizer(optimizer) + + return optimizer + + +def train_mlp( + model, use_pure_bf16=False, use_main_grad=False, accumulate_grad=False +): + optimizer = optimizer_setting( + model=model, use_pure_bf16=use_pure_bf16, use_main_grad=use_main_grad + ) + if use_pure_bf16: + level = 'O2' + custom_white_list = None + model = paddle.amp.decorate( + models=model, + dtype="bfloat16", + level=level, + ) + else: + level = 'O1' + custom_white_list = [ + "matmul_v2", + "elementwise_add", + "relu", + "reduce_mean", + ] + model = paddle.DataParallel(model) + + paddle.seed(2023) + np.random.seed(2023) + train_loader = paddle.io.DataLoader( + RandomDataset(), + batch_size=100, + shuffle=False, + drop_last=True, + num_workers=0, + ) + if not use_pure_bf16: + for param in model.parameters(): + t = paddle.cast( + paddle.cast(param, dtype='bfloat16'), dtype='float32' + ) + param.set_value(t) + + losses = [] + 
for eop in range(epoch): + model.train() + + for batch_id, data in enumerate(train_loader()): + data.stop_gradient = True + + with model.no_sync(): + with paddle.amp.auto_cast( + True, + level=level, + dtype="bfloat16", + custom_white_list=custom_white_list, + ): + out = model(data) + loss = paddle.mean(out) + + losses.append(loss) + + loss.backward() + + if not accumulate_grad: + fused_allreduce_gradients(list(model.parameters()), None) + + optimizer.step() + optimizer.clear_grad() + + if accumulate_grad: + fused_allreduce_gradients(list(model.parameters()), None) + + optimizer.step() + optimizer.clear_grad() + + return losses + + +def test_dp_bf16(): + if not paddle.amp.is_bfloat16_supported(): + return + paddle.distributed.init_parallel_env() + mlp = MLP() + state_dict = mlp.state_dict() + + # dp bf16 O1 vs dp bf16 O2 main_grad + mlp1 = MLP() + mlp2 = MLP() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + losses_o1 = train_mlp(mlp1, use_pure_bf16=False) + losses_o2 = train_mlp(mlp2, use_pure_bf16=True, use_main_grad=True) + for i in range(len(losses_o2)): + loss_o2 = paddle.cast(losses_o2[i], dtype='float32').detach() + loss_o1 = paddle.cast(losses_o1[i], dtype='float32').detach() + np.testing.assert_array_equal(loss_o2, loss_o1) + + # grad accumulation test + mlp3 = MLP() + mlp4 = MLP() + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + losses_acc_grad_o1 = train_mlp( + mlp3, use_pure_bf16=False, accumulate_grad=True + ) + losses_acc_grad_o2 = train_mlp( + mlp4, use_pure_bf16=True, use_main_grad=True, accumulate_grad=True + ) + for i in range(len(losses_acc_grad_o2)): + loss_acc_grad_o2 = paddle.cast( + losses_acc_grad_o2[i], dtype='float32' + ).detach() + loss_acc_grad_o1 = paddle.cast( + losses_acc_grad_o1[i], dtype='float32' + ).detach() + np.testing.assert_array_equal(loss_acc_grad_o2, loss_acc_grad_o1) + + +if __name__ == '__main__': + test_dp_bf16() diff --git a/test/collective/fleet/dygraph_group_sharded_stage1_fp16.py b/test/collective/fleet/dygraph_group_sharded_stage1_fp16.py new file mode 100644 index 00000000000000..601659e0fb98b9 --- /dev/null +++ b/test/collective/fleet/dygraph_group_sharded_stage1_fp16.py @@ -0,0 +1,263 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
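The double cast in dygraph_dataparallel_bf16.py above (float32 -> bfloat16 -> float32) snaps the O1 model's weights onto the bfloat16 grid, so both runs start from identical values and the losses can be compared with assert_array_equal. A standalone sketch of the idempotence this trick relies on (assumes a Paddle build with bfloat16 support):

import paddle

w = paddle.rand([4, 4], dtype='float32')
snapped = paddle.cast(paddle.cast(w, 'bfloat16'), 'float32')
# a second round trip through bfloat16 changes nothing
again = paddle.cast(paddle.cast(snapped, 'bfloat16'), 'float32')
assert bool(paddle.all(snapped == again))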
+ +import numpy as np + +import paddle +from paddle.distributed import fleet +from paddle.distributed.fleet.utils import mix_precision_utils +from paddle.nn import Linear, ReLU + +seed = 2022 +epoch = 2 +linear_size = 1000 + +np.random.seed(seed) +paddle.seed(seed) + + +class MLP(paddle.nn.Layer): + def __init__(self, linear_size=1000): + super().__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + self._relu = ReLU() + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + y = self._relu(y) + return y + + +class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples=200, linear_size=1000): + self.num_samples = num_samples + self.linear_size = linear_size + + def __getitem__(self, idx): + img = np.random.rand(self.linear_size).astype('float32') + return img + + def __len__(self): + return self.num_samples + + +def optimizer_setting(model, use_pure_fp16, use_main_grad): + if use_main_grad: + assert use_pure_fp16 + model = mix_precision_utils.MixPrecisionLayer(model, dtype="float16") + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=0.00001, + weight_decay=0.00001, + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), + multi_precision=use_pure_fp16, + ) + if use_main_grad: + optimizer = mix_precision_utils.MixPrecisionOptimizer(optimizer) + + return optimizer + + +def train_mlp( + model, + sharding_stage, + use_pure_fp16=False, + accumulate_grad=False, + use_main_grad=False, + test_scaler=False, + scale_loss=1024, +): + scaler = None + if test_scaler: + assert sharding_stage == 1 + assert not accumulate_grad + scaler = paddle.amp.GradScaler(init_loss_scaling=scale_loss) + scaler = fleet.distributed_scaler(scaler) + optimizer = optimizer_setting( + model=model, use_pure_fp16=use_pure_fp16, use_main_grad=use_main_grad + ) + if use_pure_fp16: + level = 'O2' + custom_white_list = None + model = paddle.amp.decorate(models=model, dtype="float16", level=level) + else: + level = 'O1' + custom_white_list = [ + "matmul_v2", + "elementwise_add", + "relu", + "reduce_mean", + ] + + if sharding_stage == 1: + optimizer = fleet.distributed_optimizer(optimizer) + + model = fleet.distributed_model(model) + else: + model = paddle.DataParallel(model) + + paddle.seed(2023) + np.random.seed(2023) + train_loader = paddle.io.DataLoader( + RandomDataset(), + batch_size=100, + shuffle=False, + drop_last=True, + num_workers=0, + ) + + if sharding_stage == 1: + model.to(device="gpu") + + if not use_pure_fp16: + for param in model.parameters(): + t = paddle.cast( + paddle.cast(param, dtype='float16'), dtype='float32' + ) + param.set_value(t) + + losses = [] + for eop in range(epoch): + model.train() + + for batch_id, data in enumerate(train_loader()): + data.stop_gradient = True + + with paddle.amp.auto_cast( + True, + level=level, + dtype="float16", + custom_white_list=custom_white_list, + ): + out = model(data) + loss = paddle.mean(out) + + losses.append(loss) + + if test_scaler: + assert scaler is not None + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + else: + loss.backward() + if not accumulate_grad: + optimizer.step() + optimizer.clear_grad() + + if accumulate_grad: + optimizer.step() + optimizer.clear_grad() + + return losses + + +def test_stage1_fp16(): + if not paddle.amp.is_float16_supported(): + return + paddle.distributed.init_parallel_env() + + 
strategy = fleet.DistributedStrategy() + hybrid_configs = { + "dp_degree": 1, + "mp_degree": 1, + "pp_degree": 1, + "sharding_degree": 2, + } + scale_loss = 1024 + amp_configs = {"init_loss_scaling": scale_loss, "use_pure_fp16": True} + strategy.hybrid_configs = hybrid_configs + strategy.amp_configs = amp_configs + + fleet.init(is_collective=True, strategy=strategy) + mlp = MLP() + state_dict = mlp.state_dict() + + # stage1 fp16 O1 vs stage1 fp16 O2 main_grad + mlp1 = MLP() + mlp2 = MLP() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + o1_losses = train_mlp( + mlp1, + sharding_stage=1, + use_pure_fp16=False, + scale_loss=scale_loss, + ) + o2_losses = train_mlp( + mlp2, + sharding_stage=1, + use_pure_fp16=True, + use_main_grad=True, + scale_loss=scale_loss, + ) + for i in range(len(o1_losses)): + o1_32_loss = paddle.cast(o1_losses[i], dtype='float32').detach() + o2_32_loss = paddle.cast(o2_losses[i], dtype='float32').detach() + np.testing.assert_array_equal(o1_32_loss, o2_32_loss) + + # stage1 scaler test + mlp3 = MLP() + mlp3.set_state_dict(state_dict) + train_mlp( + mlp3, + sharding_stage=1, + use_pure_fp16=True, + use_main_grad=True, + test_scaler=True, + scale_loss=scale_loss, + ) + + # grad accumulation test + mlp5 = MLP() + mlp6 = MLP() + mlp5.set_state_dict(state_dict) + mlp6.set_state_dict(state_dict) + o1_losses_grad_acc = train_mlp( + mlp5, + sharding_stage=1, + use_pure_fp16=False, + accumulate_grad=True, + scale_loss=scale_loss, + ) + o2_losses_grad_acc = train_mlp( + mlp6, + sharding_stage=1, + use_pure_fp16=True, + use_main_grad=True, + accumulate_grad=True, + scale_loss=scale_loss, + ) + for i in range(len(o2_losses_grad_acc)): + o2_loss_grad_acc = paddle.cast( + o2_losses_grad_acc[i], dtype='float32' + ).detach() + o1_loss_grad_acc = paddle.cast( + o1_losses_grad_acc[i], dtype='float32' + ).detach() + np.testing.assert_array_equal(o2_loss_grad_acc, o1_loss_grad_acc) + + return + + +if __name__ == '__main__': + test_stage1_fp16() diff --git a/test/collective/fleet/dygraph_group_sharded_stage2.py b/test/collective/fleet/dygraph_group_sharded_stage2.py index 66795a0d2c9be7..81f6df163f1db5 100644 --- a/test/collective/fleet/dygraph_group_sharded_stage2.py +++ b/test/collective/fleet/dygraph_group_sharded_stage2.py @@ -94,6 +94,7 @@ def train_mlp( opt_group=False, save_model=False, test_minimize=False, + scale_fn_test=False, ): if sharding_stage != "dp": group = paddle.distributed.new_group([0, 1], backend="nccl") @@ -104,6 +105,9 @@ def train_mlp( else: optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + if scale_fn_test: + assert sharding_stage == 2 + if sharding_stage == 2: optimizer = GroupShardedOptimizerStage2( params=optimizer._parameter_list, optim=optimizer, group=group @@ -112,6 +116,13 @@ def train_mlp( model = GroupShardedStage2( model, optimizer, group=group, buffer_max_size=2**21 ) + if scale_fn_test: + param = model.parameters()[0] + grad = paddle.rand(param.shape, dtype=param.dtype) + model._get_scaled_grad_fn(param)(grad) + param.grad = grad + model._get_scaled_grad_fn(param)(None) + return else: model = paddle.DataParallel(model) @@ -178,6 +189,7 @@ def test_dp_stage2(): mlp5 = MLP() mlp6 = MLP() mlp7 = MLP() + mlp8 = MLP() mlp1.set_state_dict(state_dict) mlp2.set_state_dict(state_dict) mlp3.set_state_dict(state_dict) @@ -185,6 +197,7 @@ def test_dp_stage2(): mlp5.set_state_dict(state_dict) mlp6.set_state_dict(state_dict) mlp7.set_state_dict(state_dict) + mlp8.set_state_dict(state_dict) # DP VS stage2 dp_params = 
train_mlp( @@ -242,6 +255,8 @@ def test_dp_stage2(): # check optimizer.minimize() error train_mlp(mlp7, sharding_stage=2, test_minimize=True) + train_mlp(mlp8, sharding_stage=2, scale_fn_test=True) + if __name__ == '__main__': test_dp_stage2() diff --git a/test/collective/fleet/dygraph_group_sharded_stage3_bf16.py b/test/collective/fleet/dygraph_group_sharded_stage3_bf16.py new file mode 100644 index 00000000000000..002426e94b0d22 --- /dev/null +++ b/test/collective/fleet/dygraph_group_sharded_stage3_bf16.py @@ -0,0 +1,227 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import ( + GroupShardedStage3, +) +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import ( + GroupShardedScaler, +) +from paddle.distributed.fleet.utils import mix_precision_utils +from paddle.nn import Linear, ReLU + +seed = 2022 +epoch = 2 +linear_size = 1000 + +np.random.seed(seed) +paddle.seed(seed) + + +class MLP(paddle.nn.Layer): + def __init__(self, linear_size=1000): + super().__init__() + + self._linear1 = Linear(linear_size, 4 * linear_size) + self._linear2 = Linear(4 * linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + self._relu = ReLU() + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + y = self._relu(y) + return y + + +class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples=200, linear_size=1000): + self.num_samples = num_samples + self.linear_size = linear_size + + def __getitem__(self, idx): + img = np.random.rand(self.linear_size).astype('float32') + return img + + def __len__(self): + return self.num_samples + + +def optimizer_setting(model, use_pure_bf16, use_main_grad): + if use_main_grad: + assert use_pure_bf16 + model = mix_precision_utils.MixPrecisionLayer(model, dtype="bfloat16") + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=0.00001, + weight_decay=0.00001, + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), + multi_precision=use_pure_bf16, + ) + if use_main_grad: + optimizer = mix_precision_utils.MixPrecisionOptimizer(optimizer) + + return optimizer + + +def train_mlp( + model, + sharding_stage, + use_pure_bf16=False, + accumulate_grad=False, + use_main_grad=False, + test_scaler=False, +): + if sharding_stage != "dp": + group = paddle.distributed.new_group([0, 1], backend="nccl") + scaler = None + if test_scaler: + assert sharding_stage == 2 + assert not accumulate_grad + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + scaler = GroupShardedScaler(scaler) + optimizer = optimizer_setting( + model=model, use_pure_bf16=use_pure_bf16, use_main_grad=use_main_grad + ) + if use_pure_bf16: + level = 'O2' + custom_white_list = None + model = paddle.amp.decorate(models=model, dtype="bfloat16", level=level) + else: + level = 'O1' + 
custom_white_list = [ + "matmul_v2", + "elementwise_add", + "relu", + "reduce_mean", + ] + + paddle.seed(2023) + np.random.seed(2023) + train_loader = paddle.io.DataLoader( + RandomDataset(), + batch_size=100, + shuffle=False, + drop_last=True, + num_workers=0, + ) + + if sharding_stage == 3: + model.to(device="gpu") + + if not use_pure_bf16: + for param in model.parameters(): + t = paddle.cast( + paddle.cast(param, dtype='bfloat16'), dtype='float32' + ) + param.set_value(t) + + if sharding_stage == 3: + model = GroupShardedStage3(model, optimizer, group=group) + else: + model = paddle.DataParallel(model) + + losses = [] + for eop in range(epoch): + model.train() + + for batch_id, data in enumerate(train_loader()): + data.stop_gradient = True + + with paddle.amp.auto_cast( + True, + level=level, + dtype="bfloat16", + custom_white_list=custom_white_list, + ): + out = model(data) + loss = paddle.mean(out) + + losses.append(loss) + + if test_scaler: + assert scaler is not None + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + else: + loss.backward() + if not accumulate_grad: + optimizer.step() + optimizer.clear_grad() + + if accumulate_grad: + optimizer.step() + optimizer.clear_grad() + + return losses + + +def test_stage3_bf16(): + if not paddle.amp.is_bfloat16_supported(): + return + paddle.distributed.init_parallel_env() + mlp = MLP() + state_dict = mlp.state_dict() + + # stage3 bf16 O1 vs stage3 bf16 O2 main_grad + mlp1 = MLP() + mlp2 = MLP() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + o1_losses = train_mlp(mlp1, sharding_stage=3, use_pure_bf16=False) + o2_losses = train_mlp( + mlp2, sharding_stage=3, use_pure_bf16=True, use_main_grad=True + ) + for i in range(len(o1_losses)): + o1_32_loss = paddle.cast(o1_losses[i], dtype='float32').detach() + o2_32_loss = paddle.cast(o2_losses[i], dtype='float32').detach() + np.testing.assert_array_equal(o1_32_loss, o2_32_loss) + + # grad accumulation test + mlp3 = MLP() + mlp4 = MLP() + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + o1_losses_grad_acc = train_mlp( + mlp3, sharding_stage=3, use_pure_bf16=False, accumulate_grad=True + ) + o2_losses_grad_acc = train_mlp( + mlp4, + sharding_stage=3, + use_pure_bf16=True, + use_main_grad=True, + accumulate_grad=True, + ) + for i in range(len(o2_losses_grad_acc)): + o2_loss_grad_acc = paddle.cast( + o2_losses_grad_acc[i], dtype='float32' + ).detach() + o1_loss_grad_acc = paddle.cast( + o1_losses_grad_acc[i], dtype='float32' + ).detach() + np.testing.assert_array_equal(o2_loss_grad_acc, o1_loss_grad_acc) + + return + + +if __name__ == '__main__': + test_stage3_bf16() diff --git a/test/legacy_test/test_marker_op.py b/test/collective/fleet/test_dygraph_dataparallel_bf16.py similarity index 57% rename from test/legacy_test/test_marker_op.py rename to test/collective/fleet/test_dygraph_dataparallel_bf16.py index 21895d962318f4..1401399e8fc4cf 100644 --- a/test/legacy_test/test_marker_op.py +++ b/test/collective/fleet/test_dygraph_dataparallel_bf16.py @@ -11,27 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import unittest - -from op_test import OpTest -from paddle.distributed.fleet.meta_optimizers.common import OpRole +import unittest +from legacy_test.test_parallel_dygraph_dataparallel import TestMultipleGpus -class TestMarkerOp(OpTest): - def setUp(self): - self.op_type = "marker" - self.inputs = {} - self.attrs = { - 'marker_role': 'forward', - 'marker_pos': 'B', - 'op_role': OpRole.Forward, - } - self.outputs = {} - def test_check_output(self): - # NODE(yjjiang11): This op will be deprecated. - self.check_output(check_dygraph=False) +class TestDygraphDataParallel(TestMultipleGpus): + def test_dygraph_dataparallel_bf16(self): + self.run_mnist_2gpu('dygraph_dataparallel_bf16.py') if __name__ == "__main__": diff --git a/test/collective/fleet/test_dygraph_recompute_for_eager.py b/test/collective/fleet/test_dygraph_recompute_for_eager.py old mode 100755 new mode 100644 index f54208639072d1..288f69c03d9332 --- a/test/collective/fleet/test_dygraph_recompute_for_eager.py +++ b/test/collective/fleet/test_dygraph_recompute_for_eager.py @@ -12,12 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import random import unittest import numpy as np import paddle +from paddle.base.framework import EagerParamBase from paddle.distributed.fleet.utils import recompute @@ -54,6 +56,8 @@ def forward(self, x, pos=None): if pos is None: return self.block(x) else: + if isinstance(pos, tuple): + pos = pos[0] return self.block(x) + pos @@ -70,12 +74,14 @@ def __init__( segments=1, use_raw_recompute=False, recompute_kwargs={}, + raise_value_error=False, ): super().__init__() self.recompute_blocks = recompute_blocks self.recompute_kwargs = recompute_kwargs self.use_fleet_sq = use_fleet_sq self.use_raw_recompute = use_raw_recompute + self.raise_value_error = raise_value_error self.segments = segments self.runfunc0 = get_fc_block(0, input_size, is_last=False) @@ -120,13 +126,20 @@ def forward(self, inputs): inputs = recompute(self.layers[0], inputs) return self.layers[1](inputs) + recompute_kwargs = copy.deepcopy(self.recompute_kwargs) + + pos = ( + recompute_kwargs.pop("pos", None) + if not self.raise_value_error + else None + ) for i in range(len(self.layers)): if i in self.recompute_blocks: inputs = recompute( - self.layers[i], inputs, **self.recompute_kwargs + self.layers[i], inputs, pos, **recompute_kwargs ) else: - inputs = self.layers[i](inputs) + inputs = self.layers[i](inputs, pos) return inputs @@ -134,6 +147,7 @@ def forward(self, inputs): def run_model( recompute_block=[], recompute_kwargs={}, + raise_value_error=False, use_fleet_sq=False, use_raw_recompute=False, segments=1, @@ -153,6 +167,7 @@ def run_model( use_raw_recompute=use_raw_recompute, segments=segments, recompute_kwargs=recompute_kwargs, + raise_value_error=raise_value_error, ) if pure_fp16: @@ -302,7 +317,9 @@ def test_recompute_kwargs(self): kwargs = {"pos": pos, "use_reentrant": True} with self.assertRaises(ValueError): loss_ref, param_ref, grad_ref = run_model( - recompute_block=[2], recompute_kwargs=kwargs + recompute_block=[2], + recompute_kwargs=kwargs, + raise_value_error=True, ) kwargs = {"pos": pos, "use_reentrant": False} @@ -310,6 +327,48 @@ def test_recompute_kwargs(self): recompute_block=[2], recompute_kwargs=kwargs ) + def test_recompute_inputs_with_param(self): + pos = paddle.randn(shape=[10, 10], dtype="float32") + new_pos = EagerParamBase( + shape=pos.shape, dtype=pos.dtype, name=pos.name + ) + pos._share_buffer_to(new_pos) + new_pos.stop_gradient = 
False + + loss, param, grad = run_model( + recompute_block=[], recompute_kwargs={"pos": new_pos} + ) + + loss_ref, param_ref, grad_ref = run_model( + recompute_block=[1, 2, 3], recompute_kwargs={"pos": new_pos} + ) + + self.assertEqual(loss_ref, loss) + self.assertEqual(param_ref, param) + self.assertEqual(grad_ref, grad) + + def test_recompute_inputs_with_tuple(self): + pos = paddle.randn(shape=[10, 10], dtype="float32") + new_pos = EagerParamBase( + shape=pos.shape, dtype=pos.dtype, name=pos.name + ) + pos._share_buffer_to(new_pos) + pos.stop_gradient = False + new_pos.stop_gradient = False + + loss, param, grad = run_model( + recompute_block=[2, 4], recompute_kwargs={"pos": (pos,)} + ) + + loss_ref, param_ref, grad_ref = run_model( + recompute_block=[1, 2, 3], + recompute_kwargs={"pos": (new_pos,)}, + ) + + self.assertEqual(loss_ref, loss) + self.assertEqual(param_ref, param) + self.assertEqual(grad_ref, grad) + if __name__ == '__main__': unittest.main() diff --git a/test/collective/fleet/test_dygraph_sharding_stage1_fp16.py b/test/collective/fleet/test_dygraph_sharding_stage1_fp16.py new file mode 100644 index 00000000000000..580567d40e4f73 --- /dev/null +++ b/test/collective/fleet/test_dygraph_sharding_stage1_fp16.py @@ -0,0 +1,27 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from legacy_test.test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestDygraphShardingStage1(TestMultipleGpus): + # check sharding logic as well as the accuracy with single mode + def test_dygraph_sharding_stage1_fp16(self): + self.run_mnist_2gpu('dygraph_group_sharded_stage1_fp16.py') + + +if __name__ == "__main__": + unittest.main() diff --git a/test/collective/fleet/test_dygraph_sharding_stage3_bf16.py b/test/collective/fleet/test_dygraph_sharding_stage3_bf16.py new file mode 100644 index 00000000000000..f34191d848605b --- /dev/null +++ b/test/collective/fleet/test_dygraph_sharding_stage3_bf16.py @@ -0,0 +1,26 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
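+# NOTE: this file is just a launcher; it runs dygraph_group_sharded_stage3_bf16.py on two GPUs through the TestMultipleGpus helper below, and the numeric checks live in the launched script itself.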
+ +import unittest + +from legacy_test.test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestDygraphShardingStage3(TestMultipleGpus): + def test_dygraph_sharding_stage3_bf16(self): + self.run_mnist_2gpu('dygraph_group_sharded_stage3_bf16.py') + + +if __name__ == "__main__": + unittest.main() diff --git a/test/collective/fleet/test_fleet_private_function.py b/test/collective/fleet/test_fleet_private_function.py deleted file mode 100644 index c6a3a197c09ac4..00000000000000 --- a/test/collective/fleet/test_fleet_private_function.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import socket -import threading -import unittest - - -class TestFleetPrivateFunction(unittest.TestCase): - def test_wait_port(self): - def init_server(port): - import time - - time.sleep(5) - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.bind(("127.0.0.1", port)) - sock.listen(10) - while True: - c, addr = sock.accept() - c.send("0") - c.close() - break - - thr = threading.Thread(target=init_server, args=(9292,)) - thr.start() - - from paddle.distributed import fleet - - ep = ["127.0.0.1:9292"] - fleet.base.private_helper_function.wait_server_ready(ep) - - thr.join() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/collective/fleet/test_fused_attention_pass_with_mp.sh b/test/collective/fleet/test_fused_attention_pass_with_mp.sh index d00f2fdbac0e1d..4b2b48cdc08df8 100644 --- a/test/collective/fleet/test_fused_attention_pass_with_mp.sh +++ b/test/collective/fleet/test_fused_attention_pass_with_mp.sh @@ -17,4 +17,5 @@ set -e # use default values # FIXME: random fails on Unknown command lines -c (or -m). 
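+# NOTE: FLAGS_dynamic_static_unified_comm is assumed to gate the newer unified dynamic/static communication context; exporting 0 below presumably keeps this test on the legacy comm-initialization path.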
+export FLAGS_dynamic_static_unified_comm=0 CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch fused_attention_pass_with_mp.py diff --git a/test/collective/fleet/testslist.csv b/test/collective/fleet/testslist.csv index 43dd55c3754b34..b9df9ace687cf4 100644 --- a/test/collective/fleet/testslist.csv +++ b/test/collective/fleet/testslist.csv @@ -11,6 +11,7 @@ test_rnn_dp,,GPU;XPU,,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_p test_parallel_dygraph_mp_layers,,GPU,120,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL test_tcp_store,LINUX;APPLE,,,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_dygraph_sharding_stage3_for_eager,,,350,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dygraph_sharding_stage3_bf16,,,200,DIST,../../legacy_test/dist_test.sh,2,,NVIDIA_TF32_OVERRIDE=0;http_proxy=;https_proxy=;PYTHONPATH=../.., test_communicator_half_async,,,120,DIST,test_runner.py,2,,FLAGS_communicator_send_queue_size=1;FLAGS_communicator_max_merge_var_num=1;http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL test_parallel_dygraph_pipeline_parallel,,GPU,500,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_pipeline_parallel_sync_send,,GPU;XPU,300,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..;PADDLE_P2P_SYNC_SEND=1, @@ -22,8 +23,10 @@ test_pipeline,,,160,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_pro test_fleet_utils,LINUX;APPLE,,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_static_model_parallel,,,240,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_no_sync,,GPU,300,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL +test_dygraph_dataparallel_bf16,,,200,DIST,../../legacy_test/dist_test.sh,2,,NVIDIA_TF32_OVERRIDE=0;http_proxy=;https_proxy=;PYTHONPATH=../.., test_dygraph_sharding_stage2,,,200,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_dygraph_sharding_stage2_bf16,,,200,DIST,../../legacy_test/dist_test.sh,2,,NVIDIA_TF32_OVERRIDE=0;http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dygraph_sharding_stage1_fp16,,,200,DIST,../../legacy_test/dist_test.sh,2,,NVIDIA_TF32_OVERRIDE=0;http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_control_flow,,,350,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_lars_meta_optimizer,,GPU;XPU,,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_hybrid_parallel_inference_helper,,,120,DIST,../../legacy_test/dist_test.sh,2,,NVIDIA_TF32_OVERRIDE=0;http_proxy=;https_proxy=;PYTHONPATH=../.., @@ -58,7 +61,6 @@ test_parallel_dygraph_sparse_embedding_over_height,,ROCM,350,DIST,../../legacy_t test_distributed_strategy,LINUX;APPLE,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_auto_parallel_parallelizer,,,120,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_recompute_meta_optimizer,LINUX;WIN32,GPU;XPU,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_private_function,LINUX;WIN32,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_new_group,,GPU;XPU,,DIST,test_new_group.sh,2,,http_proxy=;https_proxy=, 
test_c_comm_init_op,LINUX,GPU;XPU,120,DIST,test_c_comm_init_op.sh,2,,http_proxy=;https_proxy=, test_fused_attention_pass_with_mp,LINUX,GPU,120,DIST,test_fused_attention_pass_with_mp.sh,2,,http_proxy=;https_proxy=, diff --git a/test/collective/test_communication_api_base.py b/test/collective/test_communication_api_base.py index 7f80730e1ccf14..abd56bfe3d3dfa 100644 --- a/test/collective/test_communication_api_base.py +++ b/test/collective/test_communication_api_base.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import contextlib import itertools import os import shutil +import socket import subprocess import sys import tempfile @@ -22,7 +24,7 @@ class CommunicationTestDistBase(unittest.TestCase): - def setUp(self, save_log_dir=None, num_of_devices=2, timeout=120): + def setUp(self, save_log_dir=None, num_of_devices=2, timeout=120, nnode=1): self._python_interp = sys.executable self._save_log_dir = save_log_dir self._log_dir = tempfile.TemporaryDirectory() @@ -31,15 +33,43 @@ def setUp(self, save_log_dir=None, num_of_devices=2, timeout=120): self._timeout = timeout self._seeds = [i + 10 for i in range(num_of_devices)] self._devices = ','.join(self._device_list) + self._nnode = nnode + self._port_set = set() + + def _find_free_port(self): + def __free_port(): + with contextlib.closing( + socket.socket(socket.AF_INET, socket.SOCK_STREAM) + ) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + while True: + port = __free_port() + if port not in self._port_set: + self._port_set.add(port) + return port def run_test_case(self, script_file, user_defined_envs=None): runtime_envs = os.environ if user_defined_envs is not None: runtime_envs.update(user_defined_envs) runtime_envs["CUDA_VISIBLE_DEVICES"] = self._devices - start_command = f"{self._python_interp} -u -m paddle.distributed.launch --log_dir {self._log_dir.name} --devices {self._devices} {script_file}" + if self._nnode > 1: + start_command = f"{self._python_interp} -u -m paddle.distributed.launch --nnode={self._nnode} --master=127.0.0.1:{self._find_free_port()} --log_dir {self._log_dir.name} --devices {self._devices} {script_file}" + else: + start_command = f"{self._python_interp} -u -m paddle.distributed.launch --log_dir {self._log_dir.name} --devices {self._devices} {script_file}" start_command_list = start_command.strip().split() + if self._nnode > 1: + for i in range(1, self._nnode): + p = subprocess.Popen( + start_command_list, + env=runtime_envs, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + try: self._launcher = subprocess.run( start_command_list, diff --git a/test/cpp/auto_parallel/dist_tensor_test.cc b/test/cpp/auto_parallel/dist_tensor_test.cc index 9882a4b831bb53..a94cfd37d6cc24 100644 --- a/test/cpp/auto_parallel/dist_tensor_test.cc +++ b/test/cpp/auto_parallel/dist_tensor_test.cc @@ -43,7 +43,7 @@ TEST(dist_tensor, constructor) { dist_attr.set_process_mesh(mesh); // copy construct - DenseTensor x1(alloc, meta); + std::shared_ptr<DenseTensor> x1 = std::make_shared<DenseTensor>(alloc, meta); DistTensor dist_x1(x1, dist_attr); EXPECT_TRUE(dist_x1.defined()); EXPECT_TRUE(dist_x1.initialized()); diff --git a/test/cpp/cinn/concrete_program_builder.h b/test/cpp/cinn/concrete_program_builder.h index 8da4bdab927c9a..920f725e2d78a4 100644 --- a/test/cpp/cinn/concrete_program_builder.h +++ b/test/cpp/cinn/concrete_program_builder.h @@ -112,5 +112,21 @@ class FillConstantAddBuilder : public ProgramBuilder { } }; +class ReduceBuilder : public 
ProgramBuilder { + public: + ReduceBuilder() : ProgramBuilder("reduce_builder") {} + frontend::Program Build(const std::vector<VariableInfo>& inputs_varinfo, + const utils::AttributeMap& attrs) { + CHECK_EQ(inputs_varinfo.size(), 1); + CHECK_EQ(attrs.count("reduce_dim"), 1); + std::vector<int> reduce_dim = + absl::get<std::vector<int>>(attrs.at("reduce_dim")); + auto X = builder_.CreateInput( + inputs_varinfo[0].type, inputs_varinfo[0].shape, inputs_varinfo[0].id); + auto Y = builder_.ReduceSum(X, reduce_dim); + return builder_.Build(); + } +}; + } // namespace tests } // namespace cinn diff --git a/test/cpp/eager/performance_tests/CMakeLists.txt b/test/cpp/eager/performance_tests/CMakeLists.txt index 1f5a15b3e1ea85..18821be4f630ae 100644 --- a/test/cpp/eager/performance_tests/CMakeLists.txt +++ b/test/cpp/eager/performance_tests/CMakeLists.txt @@ -20,10 +20,12 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER)) paddle_test(test_egr_performance_benchmark_fluid_cpu SRCS benchmark_fluid_cpu.cc DEPS performance_benchmark_utils) - paddle_test(test_egr_performance_benchmark_eager_cuda SRCS - benchmark_eager_cuda.cc DEPS performance_benchmark_utils) - paddle_test(test_egr_performance_benchmark_fluid_cuda SRCS - benchmark_fluid_cuda.cc DEPS performance_benchmark_utils) + if(WITH_GPU) + paddle_test(test_egr_performance_benchmark_eager_cuda SRCS + benchmark_eager_cuda.cc DEPS performance_benchmark_utils) + paddle_test(test_egr_performance_benchmark_fluid_cuda SRCS + benchmark_fluid_cuda.cc DEPS performance_benchmark_utils) + endif() if(WITH_ONNXRUNTIME AND WIN32) # Copy onnxruntime for some c++ test in Windows, since the test will diff --git a/test/cpp/fluid/cinn/CMakeLists.txt b/test/cpp/fluid/cinn/CMakeLists.txt index 0feb905a83902f..96c38feb32ba7a 100644 --- a/test/cpp/fluid/cinn/CMakeLists.txt +++ b/test/cpp/fluid/cinn/CMakeLists.txt @@ -46,7 +46,13 @@ if(WITH_TESTING) elementwise_add_op paddle_flags) target_link_libraries(cinn_instruction_run_op_test ${PYTHON_LIBRARIES}) - set_tests_properties( - cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT - "${CINN_RUN_ENVIRONMENT}") + + get_property( + env + TEST cinn_instruction_run_op_test + PROPERTY ENVIRONMENT) + set_property(TEST cinn_instruction_run_op_test + PROPERTY ENVIRONMENT "${CINN_RUN_ENVIRONMENT}" ${env}) + set_tests_properties(cinn_instruction_run_op_test PROPERTIES LABELS + "RUN_TYPE=CINN") endif() diff --git a/test/cpp/fluid/math/im2col_test.cc b/test/cpp/fluid/math/im2col_test.cc index fab3086a820f20..f3925bce958696 100644 --- a/test/cpp/fluid/math/im2col_test.cc +++ b/test/cpp/fluid/math/im2col_test.cc @@ -89,7 +89,7 @@ void testIm2col() { std::array<float, 8> out_cfo_data = {0, 1, 1, 2, 3, 4, 4, 5}; std::array<float, 8> out_ocf_data = {0, 1, 3, 4, 1, 2, 4, 5}; - float* out_cfo_ptr; + float* out_cfo_ptr = nullptr; if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output_cfo.data<float>(); } else { @@ -101,7 +101,7 @@ void testIm2col() { EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]); } - float* out_ocf_ptr; + float* out_ocf_ptr = nullptr; if (paddle::platform::is_cpu_place(*place)) { out_ocf_ptr = output_ocf.data<float>(); } else { @@ -130,7 +130,7 @@ void testIm2col() { col2im(*context, output_cfo, dilation, stride, padding, &input); - float* in_ptr; + float* in_ptr = nullptr; if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data<float>(); } else { diff --git a/test/cpp/fluid/math/vol2col_test.cc b/test/cpp/fluid/math/vol2col_test.cc index 27a873082a1191..9a6f14c3685cb2 100644 --- 
a/test/cpp/fluid/math/vol2col_test.cc +++ b/test/cpp/fluid/math/vol2col_test.cc @@ -91,7 +91,7 @@ void testVol2col() { std::array<float, 16> vol_2_col = { 0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; - float* out_cfo_ptr; + float* out_cfo_ptr = nullptr; if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output.data<float>(); } else { @@ -116,7 +116,7 @@ void testVol2col() { phi::funcs::Col2VolFunctor<DeviceContext, float> col2vol; col2vol(*context, output, dilations, strides, paddings, &input); - float* in_ptr; + float* in_ptr = nullptr; if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data<float>(); } else { diff --git a/test/cpp/fluid/pscore/CMakeLists.txt b/test/cpp/fluid/pscore/CMakeLists.txt index 07d3efaa311102..eb6d3b4385487a 100644 --- a/test/cpp/fluid/pscore/CMakeLists.txt +++ b/test/cpp/fluid/pscore/CMakeLists.txt @@ -51,69 +51,22 @@ endif() set_source_files_properties( heter_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old( - heter_server_test - SRCS - heter_server_test.cc - DEPS - ${RPC_DEPS} - ${DISTRIBUTE_DEPS} - executor - scope - proto_desc - generated_op - phi) +paddle_test(heter_server_test SRCS heter_server_test.cc) set_source_files_properties( send_and_recv_op_cpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old( - send_and_recv_cpu_test - SRCS - send_and_recv_op_cpu_test.cc - DEPS - executor - scope - proto_desc - generated_op - send_and_recv_op - ${RPC_DEPS} - ${DISTRIBUTE_DEPS} - phi) +paddle_test(send_and_recv_cpu_test SRCS send_and_recv_op_cpu_test.cc) set_source_files_properties( send_and_recv_op_gpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old( - send_and_recv_gpu_test - SRCS - send_and_recv_op_gpu_test.cc - DEPS - executor - scope - proto_desc - generated_op - send_and_recv_op - ${RPC_DEPS} - ${DISTRIBUTE_DEPS} - phi) +paddle_test(send_and_recv_gpu_test SRCS send_and_recv_op_gpu_test.cc) set_source_files_properties( heter_listen_and_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test_old( - heter_listen_and_server_test - SRCS - heter_listen_and_server_test.cc - DEPS - executor - scope - proto_desc - generated_op - heter_listen_and_serv_op - ${RPC_DEPS} - ${DISTRIBUTE_DEPS} - phi) +paddle_test(heter_listen_and_server_test SRCS heter_listen_and_server_test.cc) #set_source_files_properties(heter_cloud_comm_cpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #cc_test(heter_cloud_comm_cpu_test SRCS heter_cloud_comm_cpu_test.cc DEPS executor scope proto_desc generated_static_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} phi) diff --git a/test/cpp/fluid/reader/reader_blocking_queue_test.cc b/test/cpp/fluid/reader/reader_blocking_queue_test.cc index 7db47f0761853f..b02f21eb2eb499 100644 --- a/test/cpp/fluid/reader/reader_blocking_queue_test.cc +++ b/test/cpp/fluid/reader/reader_blocking_queue_test.cc @@ -40,7 +40,7 @@ void FirstInFirstOut(size_t queue_cap, size_t count = 0; while (true) { std::this_thread::sleep_for(std::chrono::milliseconds(receive_time_gap)); - size_t elem; + size_t elem = 0; if (!q.Receive(&elem)) { break; } @@ -76,7 +76,7 @@ TEST(BlockingQueue, SenderBlockingTest) { EXPECT_EQ(send_count, queue_cap); std::vector<size_t> res; while (true) { - size_t elem; + size_t elem = 0; if (!q.Receive(&elem)) { break; } @@ -93,7 +93,7 @@ TEST(BlockingQueue, ReceiverBlockingTest) { BlockingQueue<size_t> q(queue_cap); std::vector<size_t> receive_res; std::thread receiver([&]() { - size_t elem; + 
size_t elem = 0; while (true) { if (!q.Receive(&elem)) { break; @@ -162,7 +162,7 @@ void MultiSenderMultiReceiver(const size_t queue_cap, while (true) { std::this_thread::sleep_for( std::chrono::milliseconds(receive_time_gap)); - size_t elem; + size_t elem = 0; if (!q.Receive(&elem)) { break; } @@ -230,7 +230,7 @@ TEST(BlockingQueue, speed_test_mode) { for (size_t i = 0; i < queue_size; ++i) { q1.Send(i); } - size_t b; + size_t b = 0; for (size_t i = 0; i < queue_size; ++i) { q1.Receive(&b); EXPECT_EQ(b, i); diff --git a/test/cpp/imperative/test_gradient_accmulator.cc b/test/cpp/imperative/test_gradient_accmulator.cc index 982fd81a988358..bb264250ecf567 100644 --- a/test/cpp/imperative/test_gradient_accmulator.cc +++ b/test/cpp/imperative/test_gradient_accmulator.cc @@ -392,7 +392,7 @@ static void TestGradientAccumulatorTestUnchangeInput( int64_t maximum_row_number = 100; std::uniform_int_distribution<int64_t> dist(1, maximum_row_number); - int seed; + int seed = 0; { std::random_device rd; seed = static_cast<int>(rd()); diff --git a/test/cpp/inference/api/CMakeLists.txt b/test/cpp/inference/api/CMakeLists.txt index bbd76ca4344119..8f0b3e5c093335 100644 --- a/test/cpp/inference/api/CMakeLists.txt +++ b/test/cpp/inference/api/CMakeLists.txt @@ -969,6 +969,14 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + inference_analysis_test( + trt_disable_tensorrt_half_ops_test + SRCS + trt_disable_tensorrt_half_ops_test.cc + EXTRA_DEPS + paddle_inference_shared + ARGS + --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) inference_analysis_test( trt_fc_prelu_test SRCS @@ -1304,6 +1312,8 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) set_tests_properties(test_trt_dynamic_shape_ernie PROPERTIES TIMEOUT 480) set_tests_properties(trt_mark_trt_engine_outputs_test PROPERTIES TIMEOUT 300) + set_tests_properties(trt_disable_tensorrt_half_ops_test PROPERTIES TIMEOUT + 300) endif() if(WITH_MKLDNN) diff --git a/test/cpp/inference/api/trt_disable_tensorrt_half_ops_test.cc b/test/cpp/inference/api/trt_disable_tensorrt_half_ops_test.cc new file mode 100644 index 00000000000000..68dfd62d019026 --- /dev/null +++ b/test/cpp/inference/api/trt_disable_tensorrt_half_ops_test.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include <glog/logging.h> +#include <gtest/gtest.h> + +#include "test/cpp/inference/api/trt_test_helper.h" + +namespace paddle { +namespace inference { + +TEST(TensorRT, disable_tensorrt_half_ops) { + std::string model_dir = FLAGS_infer_model + "/resnet50"; + AnalysisConfig config; + config.SetModel(model_dir); + config.EnableUseGpu(100, 0); + config.EnableTensorRtEngine( + 1 << 30, 1, 5, AnalysisConfig::Precision::kHalf, false, false); + + paddle_infer::experimental::InternalUtils::DisableTensorRtHalfOps(&config, + {"conv2d"}); + + std::vector<std::vector<PaddleTensor>> inputs_all; + auto predictor = CreatePaddlePredictor(config); + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + + std::vector<PaddleTensor> outputs; + for (auto &input : inputs_all) { + ASSERT_TRUE(predictor->Run(input, &outputs)); + predictor->ClearIntermediateTensor(); + } +} + +} // namespace inference +} // namespace paddle diff --git a/test/cpp/new_executor/CMakeLists.txt b/test/cpp/new_executor/CMakeLists.txt index 9fbd4a82feb9a2..435124d87049ad 100644 --- a/test/cpp/new_executor/CMakeLists.txt +++ b/test/cpp/new_executor/CMakeLists.txt @@ -1,16 +1,8 @@ # skip win32 since wget is not installed by default on windows machine. if(NOT WIN32) - cc_test_old( - standalone_executor_new_ir_test - SRCS - standalone_executor_new_ir_test.cc - DEPS - pd_op_dialect - pd_kernel_dialect - pir - phi - standalone_executor) + paddle_test(standalone_executor_new_ir_test SRCS + standalone_executor_new_ir_test.cc) endif() set(OPS diff --git a/test/cpp/new_executor/standalone_executor_new_ir_test.cc b/test/cpp/new_executor/standalone_executor_new_ir_test.cc index 02ca49d180baaf..28a425dbd4ebe9 100644 --- a/test/cpp/new_executor/standalone_executor_new_ir_test.cc +++ b/test/cpp/new_executor/standalone_executor_new_ir_test.cc @@ -23,6 +23,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/fluid/framework/new_executor/new_ir_interpreter.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" @@ -43,6 +44,7 @@ PD_DECLARE_KERNEL(full_int_array, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(uniform, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sqrt, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(less_than, CPU, ALL_LAYOUT); bool simple_cmp(float a, float b) { return std::abs((a - b) / a) < 1e-5; } @@ -278,5 +280,74 @@ TEST(StandaloneExecutor, if_op) { EXPECT_EQ(res1, true); } +using namespace paddle::dialect; // NOLINT +TEST(StandaloneExecutor, while_op) { + pir::IrContext* ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<OperatorDialect>(); + ctx->GetOrRegisterDialect<pir::ControlFlowDialect>(); + + pir::Program program(ctx); + pir::Block* block = program.block(); + pir::Builder builder(ctx, block); + + auto i = builder + .Build<paddle::dialect::FullOp>( + std::vector<int64_t>{1}, 1, phi::DataType::INT32) + .out(); + + auto ten = builder + .Build<paddle::dialect::FullOp>( + std::vector<int64_t>{1}, 10, phi::DataType::INT32) + .out(); + + // compute condition value: i <= ten + auto cond_value = builder.Build<LessEqualOp>(i, ten).out(); + + auto while_op = + builder.Build<WhileOp>(cond_value, std::vector<pir::Value>{i, ten}); + + // { i = i + 1} + pir::Block* body_block = while_op.body_block(); + auto body_i_argument = body_block->AddArgument(i.type()); + auto body_ten_argument =
body_block->AddArgument(ten.type()); + builder.SetInsertionPointToStart(body_block); + auto one = + builder.Build<FullOp>(std::vector<int64_t>{1}, 1, phi::DataType::INT32) + .out(); + auto new_i = builder.Build<AddOp>(body_i_argument, one).out(); + + // compute new condition value: new_i <= new_ten + auto new_cond_value = + builder.Build<LessEqualOp>(new_i, body_ten_argument).out(); + + builder.Build<pir::YieldOp>( + std::vector<pir::Value>{new_cond_value, new_i, body_ten_argument}); + + builder.SetInsertionPointAfter(while_op); + + auto kernel_program = PdOpLowerToKernelPass(&program); + + auto place = platform::CPUPlace(); + Scope scope; + InterpreterCore test_core(place, {}, kernel_program->block(), &scope); + + std::stringstream os; + os << reinterpret_cast<NewIRInterpreter*>( + const_cast<InterpreterBaseImpl*>(test_core.Impl())); + std::string out_name = os.str() + "_inner_var_3"; + test_core.SetSkipGcVars({out_name}); + + test_core.Run({}); + + auto out_tensor = + test_core.local_scope() == nullptr + ? scope.FindVar(out_name)->Get<phi::DenseTensor>() + : test_core.local_scope()->FindVar(out_name)->Get<phi::DenseTensor>(); + + bool res0 = out_tensor.data<int>()[0] == 11; + + EXPECT_EQ(res0, true); +} + } // namespace framework } // namespace paddle diff --git a/test/cpp/phi/core/test_ddim.cc b/test/cpp/phi/core/test_ddim.cc old mode 100755 new mode 100644 index 3a8afe131eb4df..a58d86e62aa403 --- a/test/cpp/phi/core/test_ddim.cc +++ b/test/cpp/phi/core/test_ddim.cc @@ -126,7 +126,7 @@ TEST(DDim, Print) { TEST(DDim, Hash) { // hash a DDim - std::size_t h; + std::size_t h = 0; phi::DDim ddim = phi::make_ddim({2, 3, 4}); h = std::hash<phi::DDim>()(ddim); EXPECT_EQ(h, 0xa16fb2b2967ul); diff --git a/test/cpp/pir/cinn/CMakeLists.txt b/test/cpp/pir/cinn/CMakeLists.txt index 7952e53da32c08..10fbb7b7eb9e8d 100644 --- a/test/cpp/pir/cinn/CMakeLists.txt +++ b/test/cpp/pir/cinn/CMakeLists.txt @@ -1,49 +1,29 @@ if(WITH_TESTING AND WITH_CINN) - cc_test_old( - test_new_ir_compiler - SRCS - new_ir_compiler_test.cc - DEPS - new_ir_compiler - convert_to_dialect - cinn_runtime_dialect - pir - phi - gtest - glog) - set_tests_properties(test_new_ir_compiler PROPERTIES LABELS "RUN_TYPE=CINN") + paddle_test(test_pir_compiler SRCS pir_compiler_test.cc DEPS pir_compiler + cinn_runtime_dialect) + set_tests_properties(test_pir_compiler PROPERTIES LABELS "RUN_TYPE=CINN") - cc_test_old( - test_jit_instruction - SRCS - jit_instruction_test.cc - DEPS - interpreter - new_ir_compiler - convert_to_dialect) + paddle_test(test_jit_instruction SRCS jit_instruction_test.cc DEPS + cinn_runtime_dialect pir_compiler) set_tests_properties(test_jit_instruction PROPERTIES LABELS "RUN_TYPE=CINN") cc_test_old( - test_group_op + ir_op_fusion_test SRCS - group_op_test.cc + ir_op_fusion_test.cc DEPS + op_with_group_merge_pass + pd_op_dialect cinn_op_dialect pir - phi gtest glog) + + paddle_test(test_group_op SRCS group_op_test.cc DEPS cinn_op_dialect) set_tests_properties(test_group_op PROPERTIES LABELS "RUN_TYPE=CINN") - cc_test_old( - test_pir_build_cinn_pass - SRCS - build_cinn_pass_test.cc - DEPS - pd_build_cinn_pass - pir_pass - gtest - glog) + paddle_test(test_pir_build_cinn_pass SRCS build_cinn_pass_test.cc DEPS + pd_build_cinn_pass pir_pass) set_tests_properties(test_pir_build_cinn_pass PROPERTIES LABELS "RUN_TYPE=CINN") endif() diff --git a/test/cpp/pir/cinn/build_cinn_pass_test.cc b/test/cpp/pir/cinn/build_cinn_pass_test.cc index 2d6d7b09db3868..40fefeb3d21733 100644 --- a/test/cpp/pir/cinn/build_cinn_pass_test.cc +++
b/test/cpp/pir/cinn/build_cinn_pass_test.cc @@ -62,7 +62,7 @@ TEST(BuildCinnPassTest, AllOpSupportCinn) { CHECK_EQ(origin_program->block()->size(), 1u); pir::Operation* group_op = origin_program->block()->front(); pir::Block* group_block = - group_op->dyn_cast<cinn::dialect::GroupOp>().Block(); + group_op->dyn_cast<cinn::dialect::GroupOp>().block(); CHECK_EQ(group_block->size(), 6u); std::vector<std::string> op_names = { diff --git a/test/cpp/pir/cinn/group_op_test.cc b/test/cpp/pir/cinn/group_op_test.cc index a5bd90a54f0f01..c252c06a3cccdf 100644 --- a/test/cpp/pir/cinn/group_op_test.cc +++ b/test/cpp/pir/cinn/group_op_test.cc @@ -51,7 +51,7 @@ std::shared_ptr<::pir::Program> BuildGroupProgram() { const std::vector<int64_t> shape = {64, 128}; auto group_op1 = builder.Build<cinn::dialect::GroupOp>( CreateDenseTensorTypes(phi::make_ddim(shape))); - pir::Block* block1 = group_op1.Block(); + pir::Block* block1 = group_op1.block(); builder.SetInsertionPointToEnd(block1); auto full_op_x = builder.Build<paddle::dialect::FullOp>( shape, value_one, phi::DataType::FLOAT32, phi::GPUPlace()); @@ -60,7 +60,7 @@ std::shared_ptr<::pir::Program> BuildGroupProgram() { builder.SetInsertionPointToEnd(program->block()); auto group_op2 = builder.Build<cinn::dialect::GroupOp>( CreateDenseTensorTypes(phi::make_ddim(shape))); - pir::Block* block2 = group_op2.Block(); + pir::Block* block2 = group_op2.block(); builder.SetInsertionPointToEnd(block2); auto tan_op_x = builder.Build<paddle::dialect::TanOp>(group_op1->result(0)); @@ -84,7 +84,62 @@ TEST(GroupOp, TestBuild) { int i = 0; for (auto* sub_op : *(program->block())) { EXPECT_TRUE(sub_op->isa<cinn::dialect::GroupOp>()); - EXPECT_EQ(sub_op->dyn_cast<cinn::dialect::GroupOp>().Ops().size(), + EXPECT_EQ(sub_op->dyn_cast<cinn::dialect::GroupOp>().ops().size(), + op_num[i]); + ++i; + } +} + +std::shared_ptr<::pir::Program> BuildGroupProgramByBlock() { + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<::pir::ControlFlowDialect>(); + + auto program = std::make_shared<::pir::Program>(ctx); + ::pir::Builder builder = ::pir::Builder(ctx, program->block()); + + // ------- Group op 1 --------- + const float value_one = 1.0; + const std::vector<int64_t> shape = {64, 128}; + std::unique_ptr<::pir::Block> block1(new ::pir::Block()); + builder.SetInsertionPointToEnd(block1.get()); + auto full_op_x = builder.Build<paddle::dialect::FullOp>( + shape, value_one, phi::DataType::FLOAT32, phi::GPUPlace()); + builder.Build<::pir::YieldOp>(std::vector<::pir::Value>{full_op_x.out()}); + + builder.SetInsertionPointToEnd(program->block()); + auto group_op1 = builder.Build<cinn::dialect::GroupOp>(std::move(block1)); + + // ------- Group op 2 --------- + std::unique_ptr<::pir::Block> block2(new ::pir::Block()); + builder.SetInsertionPointToEnd(block2.get()); + auto tan_op_x = builder.Build<paddle::dialect::TanOp>(group_op1->result(0)); + auto relu_op_x = builder.Build<paddle::dialect::ReluOp>(tan_op_x->result(0)); + auto tan_op_y = builder.Build<paddle::dialect::TanOp>(relu_op_x->result(0)); + auto relu_op_y = builder.Build<paddle::dialect::ReluOp>(tan_op_y->result(0)); + builder.Build<::pir::YieldOp>(std::vector<::pir::Value>{relu_op_y.out()}); + + builder.SetInsertionPointToEnd(program->block()); + auto group_op2 = builder.Build<cinn::dialect::GroupOp>(std::move(block2)); + + return program; +} + +TEST(GroupOp, TestBuildByBlock) 
{ + // Step 1: Construct pir::Program + std::shared_ptr<::pir::Program> program = BuildGroupProgramByBlock(); + std::stringstream ss; + program->Print(ss); + LOG(INFO) << ss.str(); + + EXPECT_EQ(program->block()->size(), 2u); + LOG(INFO) << program->block()->size(); + std::vector<uint32_t> op_num = {2, 5}; + int i = 0; + for (auto* sub_op : *(program->block())) { + EXPECT_TRUE(sub_op->isa<cinn::dialect::GroupOp>()); + EXPECT_EQ(sub_op->dyn_cast<cinn::dialect::GroupOp>().ops().size(), op_num[i]); ++i; } diff --git a/test/cpp/pir/cinn/ir_op_fusion_test.cc b/test/cpp/pir/cinn/ir_op_fusion_test.cc new file mode 100644 index 00000000000000..a392373358b2af --- /dev/null +++ b/test/cpp/pir/cinn/ir_op_fusion_test.cc @@ -0,0 +1,444 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <glog/logging.h> +#include <gtest/gtest.h> +#include <sstream> + +#include "paddle/cinn/hlir/dialect/operator/ir/cinn_op.h" +#include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h" +#include "paddle/cinn/hlir/dialect/operator/transforms/op_with_group_merge_pass.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/program.h" + +std::vector<pir::OpResult> BuildInput( + ::pir::Builder* builder, + const std::vector<std::vector<int64_t>>& vec_shapes) { + std::vector<pir::OpResult> vec_res; + for (size_t i = 0; i < vec_shapes.size(); ++i) { + auto op = builder->Build<paddle::dialect::FullOp>( + vec_shapes[i], 1.0, phi::DataType::FLOAT32, phi::CPUPlace()); + + vec_res.push_back(op.result(0)); + } + + return vec_res; +} + +TEST(IROpFusionPass, demo) { + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ::pir::Program program_base(ctx); + ::pir::Builder builder_base = ::pir::Builder(ctx, program_base.block()); + + auto inputs = BuildInput(&builder_base, {{10, 10}, {10, 10}}); + + ::pir::Program program(ctx); + ::pir::Builder builder = ::pir::Builder(ctx, program.block()); + + auto add = builder.Build<paddle::dialect::AddOp>(inputs[0], inputs[1]); + builder.Build<paddle::dialect::ReluOp>(add.result(0)); + + auto res = cinn::dialect::ir::OpFusionPassInternal(program); + + ASSERT_EQ(res.size(), 1u); +} + +TEST(IROpFusionPass, ElementWise_Fusion_0) { + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>(); + ::pir::Program program_base(ctx); + ::pir::Builder builder_base = ::pir::Builder(ctx, program_base.block()); + + int h = 32, w = 32; + auto inputs = BuildInput(&builder_base, {{h, w}, {h, w}, {h, w}, {h, w}}); + + ::pir::Program program(ctx); + ::pir::Builder builder = ::pir::Builder(ctx, program.block()); + + auto e = + builder.Build<paddle::dialect::AddOp>(inputs[0], inputs[1]).result(0); + auto f = 
builder.Build<paddle::dialect::AddOp>(e, inputs[2]).result(0); + builder.Build<paddle::dialect::AddOp>(f, inputs[2]); + + auto res = cinn::dialect::ir::OpFusionPassInternal(program); + + auto new_group = + cinn::dialect::ir::GeneralFusionMergePassInternal(&program, res); + + ASSERT_EQ(res.size(), 1u); +} + +// Real test 0 +TEST(IROpFusionPass, Broadcast_Test_0) { + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>(); + ::pir::Program program_base(ctx); + ::pir::Builder builder_base = ::pir::Builder(ctx, program_base.block()); + + int h = 32, w = 32; + auto inputs = BuildInput(&builder_base, {{w}, {w}, {h, w}, {h, w}}); + + ::pir::Program program(ctx); + ::pir::Builder builder = ::pir::Builder(ctx, program.block()); + + auto e = + builder.Build<paddle::dialect::AddOp>(inputs[0], inputs[1]).result(0); + auto f = + builder.Build<paddle::dialect::AddOp>(inputs[2], inputs[3]).result(0); + std::vector<int64_t> axes{1}; + std::vector<int64_t> out_shape{h, w}; + auto e1 = + builder.Build<cinn::dialect::BroadcastOp>(e, axes, out_shape).result(0); + builder.Build<paddle::dialect::AddOp>(e1, f); + + auto res = cinn::dialect::ir::OpFusionPassInternal(program); + + auto new_group = + cinn::dialect::ir::GeneralFusionMergePassInternal(&program, res); + + // ASSERT_EQ(res.size(), 1u); +} + +// Real test 1 +TEST(IROpFusionPass, Broadcast_Test_1) { + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>(); + ::pir::Program program_base(ctx); + ::pir::Builder builder_base = ::pir::Builder(ctx, program_base.block()); + + int h = 32, w = 32; + auto inputs = BuildInput(&builder_base, {{w}, {w}, {w}, {h, w}}); + + ::pir::Program program(ctx); + ::pir::Builder builder = ::pir::Builder(ctx, program.block()); + + auto e = + builder.Build<paddle::dialect::AddOp>(inputs[0], inputs[1]).result(0); + builder.Build<paddle::dialect::AddOp>(inputs[2], e).result(0); + std::vector<int64_t> axes{1}; + std::vector<int64_t> out_shape{h, w}; + auto e1 = + builder.Build<cinn::dialect::BroadcastOp>(e, axes, out_shape).result(0); + builder.Build<paddle::dialect::AddOp>(inputs[3], e1); + + auto res = cinn::dialect::ir::OpFusionPassInternal(program); + + auto new_group = + cinn::dialect::ir::GeneralFusionMergePassInternal(&program, res); + + ASSERT_EQ(new_group.size(), 2u); +} + +// Real test 2 +TEST(IROpFusionPass, Broadcast_Test_2) { + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>(); + ::pir::Program program_base(ctx); + ::pir::Builder builder_base = ::pir::Builder(ctx, program_base.block()); + + int h = 32, w = 32; + auto inputs = BuildInput(&builder_base, {{w}, {w}, {w}, {h, w}, {h, w}}); + + ::pir::Program program(ctx); + ::pir::Builder builder = ::pir::Builder(ctx, program.block()); + + auto f = + builder.Build<paddle::dialect::AddOp>(inputs[0], inputs[1]).result(0); + builder.Build<paddle::dialect::AddOp>(inputs[2], f).result(0); + std::vector<int64_t> axes{1}; + std::vector<int64_t> out_shape{h, w}; + auto f1 = + builder.Build<cinn::dialect::BroadcastOp>(f, axes, out_shape).result(0); + builder.Build<paddle::dialect::AddOp>(inputs[3], f1); + builder.Build<paddle::dialect::AddOp>(inputs[4], f1); + + auto res = 
cinn::dialect::ir::OpFusionPassInternal(program); + + auto new_group = + cinn::dialect::ir::GeneralFusionMergePassInternal(&program, res); + + ASSERT_EQ(new_group.size(), 2u); +} + +// Real reduce 0 +TEST(IROpFusionPass, reduce_test_0) { + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>(); + ::pir::Program program_base(ctx); + ::pir::Builder builder_base = ::pir::Builder(ctx, program_base.block()); + + int h = 32, w = 32; + auto inputs = BuildInput(&builder_base, {{h, w}, {h, w}}); + + ::pir::Program program(ctx); + ::pir::Builder builder = ::pir::Builder(ctx, program.block()); + + std::vector<int64_t> axes{0}; + auto c = + builder.Build<paddle::dialect::AddOp>(inputs[0], inputs[1]).result(0); + builder.Build<cinn::dialect::ReduceSumOp>(c, axes, true).result(0); + builder.Build<cinn::dialect::ReduceSumOp>(c, axes, true).result(0); + builder.Build<cinn::dialect::ReduceSumOp>(c, axes, true).result(0); + + auto res = cinn::dialect::ir::OpFusionPassInternal(program); + + auto new_group = + cinn::dialect::ir::GeneralFusionMergePassInternal(&program, res); + + ASSERT_EQ(new_group.size(), 1u); +} + +// Real reduce 1 +TEST(IROpFusionPass, reduce_test_1) { + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>(); + ::pir::Program program_base(ctx); + ::pir::Builder builder_base = ::pir::Builder(ctx, program_base.block()); + + int h = 32, w = 32; + auto inputs = BuildInput(&builder_base, {{h, w}, {h, w}}); + + ::pir::Program program(ctx); + ::pir::Builder builder = ::pir::Builder(ctx, program.block()); + + std::vector<int64_t> axes{0}; + std::vector<int64_t> axes1{1}; + auto c = + builder.Build<paddle::dialect::AddOp>(inputs[0], inputs[1]).result(0); + builder.Build<cinn::dialect::ReduceSumOp>(c, axes, true).result(0); + builder.Build<cinn::dialect::ReduceSumOp>(c, axes1, true).result(0); + + auto res = cinn::dialect::ir::OpFusionPassInternal(program); + + auto new_group = + cinn::dialect::ir::GeneralFusionMergePassInternal(&program, res); + + ASSERT_EQ(new_group.size(), 2u); +} + +// Real reduce 2 +TEST(IROpFusionPass, reduce_test_2) { + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>(); + ::pir::Program program_base(ctx); + ::pir::Builder builder_base = ::pir::Builder(ctx, program_base.block()); + + int h = 32, w = 32; + auto inputs = BuildInput(&builder_base, {{h, w}, {h, w}, {w}}); + + ::pir::Program program(ctx); + ::pir::Builder builder = ::pir::Builder(ctx, program.block()); + + std::vector<int64_t> axes{0}; + std::vector<int64_t> axes1{1}; + auto d = + builder.Build<paddle::dialect::AddOp>(inputs[0], inputs[1]).result(0); + auto e = builder.Build<cinn::dialect::ReduceSumOp>(d, axes, false).result(0); + auto f = builder.Build<cinn::dialect::ReduceSumOp>(d, axes1, false).result(0); + builder.Build<paddle::dialect::AddOp>(inputs[2], e).result(0); + builder.Build<paddle::dialect::AddOp>(inputs[2], f).result(0); + + auto res = cinn::dialect::ir::OpFusionPassInternal(program); + + auto new_group = + cinn::dialect::ir::GeneralFusionMergePassInternal(&program, res); + + ASSERT_EQ(new_group.size(), 2u); +} + +// Real reduce 3 +TEST(IROpFusionPass, reduce_test_3) { + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); 
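+ // Register both the Paddle and CINN operator dialects up front, since this test builds pd_op and cinn_op operations in the same program.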
+ ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>(); + ::pir::Program program_base(ctx); + ::pir::Builder builder_base = ::pir::Builder(ctx, program_base.block()); + + int h = 32, w = 32; + auto inputs = BuildInput(&builder_base, {{h, w}, {h, w}, {w}, {h, w}}); + + ::pir::Program program(ctx); + ::pir::Builder builder = ::pir::Builder(ctx, program.block()); + + std::vector<int64_t> axes{0}; + std::vector<int64_t> axes1{1}; + auto e = + builder.Build<paddle::dialect::AddOp>(inputs[0], inputs[1]).result(0); + auto f = builder.Build<cinn::dialect::ReduceSumOp>(e, axes, false).result(0); + + builder.Build<paddle::dialect::AddOp>(inputs[2], f).result(0); + + std::vector<int64_t> out_shape{h, w}; + auto f1 = + builder.Build<cinn::dialect::BroadcastOp>(f, axes1, out_shape).result(0); + builder.Build<paddle::dialect::AddOp>(inputs[2], f1).result(0); + + auto res = cinn::dialect::ir::OpFusionPassInternal(program); + + auto new_group = + cinn::dialect::ir::GeneralFusionMergePassInternal(&program, res); + + ASSERT_EQ(new_group.size(), 1u); +} + +// Real reduce 4 +TEST(IROpFusionPass, reduce_test_4) { + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>(); + ::pir::Program program_base(ctx); + ::pir::Builder builder_base = ::pir::Builder(ctx, program_base.block()); + + int h = 32, w = 32; + auto inputs = BuildInput(&builder_base, {{h, w}, {h, w}, {w}, {h, w}}); + + ::pir::Program program(ctx); + ::pir::Builder builder = ::pir::Builder(ctx, program.block()); + + std::vector<int64_t> axes{0}; + std::vector<int64_t> axes1{1}; + auto e = + builder.Build<paddle::dialect::AddOp>(inputs[0], inputs[1]).result(0); + auto f = builder.Build<cinn::dialect::ReduceSumOp>(e, axes, false).result(0); + + builder.Build<paddle::dialect::AddOp>(inputs[2], f).result(0); + + std::vector<int64_t> out_shape{h, w}; + auto f1 = + builder.Build<cinn::dialect::BroadcastOp>(f, axes1, out_shape).result(0); + builder.Build<paddle::dialect::AddOp>(inputs[3], f1).result(0); + auto f2 = + builder.Build<cinn::dialect::BroadcastOp>(f, axes1, out_shape).result(0); + builder.Build<paddle::dialect::AddOp>(inputs[3], f2).result(0); + + auto res = cinn::dialect::ir::OpFusionPassInternal(program); + + auto new_group = + cinn::dialect::ir::GeneralFusionMergePassInternal(&program, res); + + ASSERT_EQ(new_group.size(), 1u); +} + +// Real reduce 5 +TEST(IROpFusionPass, reduce_test_5) { + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>(); + ::pir::Program program_base(ctx); + ::pir::Builder builder_base = ::pir::Builder(ctx, program_base.block()); + + int h = 32, w = 32; + auto inputs = BuildInput(&builder_base, {{h, w}, {h, w}}); + + ::pir::Program program(ctx); + ::pir::Builder builder = ::pir::Builder(ctx, program.block()); + + std::vector<int64_t> axes{1}; + + auto c = + builder.Build<paddle::dialect::AddOp>(inputs[0], inputs[1]).result(0); + builder.Build<cinn::dialect::ReduceSumOp>(inputs[0], axes, false).result(0); + builder.Build<cinn::dialect::ReduceSumOp>(inputs[1], axes, false).result(0); + builder.Build<cinn::dialect::ReduceSumOp>(c, axes, false).result(0); + + auto res = cinn::dialect::ir::OpFusionPassInternal(program); + + auto new_group = + cinn::dialect::ir::GeneralFusionMergePassInternal(&program, res); 
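+ // All three ReduceSum ops above share the same reduction axis, so the fusion/merge passes are expected to collapse the whole graph into a single group.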
+ + ASSERT_EQ(new_group.size(), 1u); +} + +TEST(IROpFusionPass, layer_norm) { + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>(); + ::pir::Program program_base(ctx); + ::pir::Builder builder_base = ::pir::Builder(ctx, program_base.block()); + + auto inputs = BuildInput(&builder_base, {{128, 128, 768}, {768}, {768}}); + + ::pir::Program program(ctx); + ::pir::Builder builder = ::pir::Builder(ctx, program.block()); + + std::vector<int64_t> axes{-1}; + + auto num = builder + .Build<paddle::dialect::FullOp>(std::vector<int64_t>{1}, + 768.0, + phi::DataType::FLOAT32, + phi::CPUPlace()) + .result(0); + auto eps = builder + .Build<paddle::dialect::FullOp>(std::vector<int64_t>{1}, + 1e-5, + phi::DataType::FLOAT32, + phi::CPUPlace()) + .result(0); + + auto sum = builder.Build<cinn::dialect::ReduceSumOp>(inputs[0], axes, true) + .result(0); + std::vector<int64_t> all_axes{0, 1, 2}; + std::vector<int64_t> out_shape1{128, 128, 1}; + auto num1 = + builder.Build<cinn::dialect::BroadcastOp>(num, all_axes, out_shape1) + .result(0); + auto mean = builder.Build<paddle::dialect::DivideOp>(sum, num1).result(0); + auto power = builder.Build<paddle::dialect::MultiplyOp>(inputs[0], inputs[0]) + .result(0); + auto power_sum = + builder.Build<cinn::dialect::ReduceSumOp>(power, axes, true).result(0); + auto mean2 = + builder.Build<paddle::dialect::DivideOp>(power_sum, num1).result(0); + auto power_mean = + builder.Build<paddle::dialect::MultiplyOp>(mean, mean).result(0); + + auto var = + builder.Build<paddle::dialect::SubtractOp>(mean2, power_mean).result(0); + + std::vector<int64_t> out_shape2{128, 128, 768}; + auto sub = + builder.Build<paddle::dialect::SubtractOp>(inputs[0], mean).result(0); + auto eps1 = + builder.Build<cinn::dialect::BroadcastOp>(eps, all_axes, out_shape2) + .result(0); + auto t1 = builder.Build<paddle::dialect::AddOp>(var, eps1).result(0); + auto t2 = builder.Build<paddle::dialect::SqrtOp>(t1).result(0); + auto t3 = builder.Build<paddle::dialect::DivideOp>(sub, t2).result(0); + auto scale = + builder.Build<cinn::dialect::BroadcastOp>(inputs[1], all_axes, out_shape2) + .result(0); + auto bias = + builder.Build<cinn::dialect::BroadcastOp>(inputs[2], all_axes, out_shape2) + .result(0); + auto t5 = builder.Build<paddle::dialect::MultiplyOp>(t3, scale).result(0); + builder.Build<paddle::dialect::MultiplyOp>(t5, bias).result(0); + + auto res = cinn::dialect::ir::OpFusionPassInternal(program); + + auto new_group = + cinn::dialect::ir::GeneralFusionMergePassInternal(&program, res); + + ASSERT_EQ(new_group.size(), 1u); +} diff --git a/test/cpp/pir/cinn/jit_instruction_test.cc b/test/cpp/pir/cinn/jit_instruction_test.cc index 2996bf17c962a7..8fdffa86de6677 100644 --- a/test/cpp/pir/cinn/jit_instruction_test.cc +++ b/test/cpp/pir/cinn/jit_instruction_test.cc @@ -27,11 +27,18 @@ #include "paddle/pir/core/ir_context.h" #include "paddle/pir/core/program.h" +#include "paddle/cinn/hlir/dialect/operator/ir/op_attribute.h" +#include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h" #include "paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h" #include "paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h" -#include "paddle/cinn/hlir/framework/convert_to_dialect.h" -#include "paddle/cinn/hlir/framework/new_ir_compiler.h" +#include "paddle/cinn/hlir/framework/pir_compiler.h" #include "paddle/cinn/utils/data_util.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h" 
+#include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" +#include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" +#include "paddle/phi/backends/gpu/gpu_context.h" + +bool simple_cmp(float a, float b) { return std::abs((a - b) / a) < 1e-5; } std::unique_ptr<::pir::Program> BuildProgram() { ::pir::IrContext* ctx = ::pir::IrContext::Instance(); @@ -39,18 +46,29 @@ std::unique_ptr<::pir::Program> BuildProgram() { auto program = std::make_unique<::pir::Program>(ctx); ::pir::Builder builder = ::pir::Builder(ctx, program->block()); - const float value = 2.0; + const float value = 0.5; auto full_op_x = - builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{64, 128}, + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{2, 2}, value, phi::DataType::FLOAT32, phi::GPUPlace()); auto full_op_y = - builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{128, 64}, + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{2, 2}, + value, + phi::DataType::FLOAT32, + phi::GPUPlace()); + auto full_op_z = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{2, 2}, value, phi::DataType::FLOAT32, phi::GPUPlace()); + + auto sin = builder.Build<paddle::dialect::SinOp>(full_op_x.result(0)); + auto cos = builder.Build<paddle::dialect::CosOp>(full_op_y.result(0)); + auto add = + builder.Build<paddle::dialect::AddOp>(sin.result(0), cos.result(0)); + builder.Build<paddle::dialect::FetchOp>(add.out(), "out", 0); return std::move(program); } @@ -60,43 +78,105 @@ namespace framework { TEST(CinnJitInstruction, Run) { // Step 1: Construct pir::Program std::unique_ptr<::pir::Program> program = BuildProgram(); - EXPECT_EQ(program->block()->size(), 2u); + EXPECT_EQ(program->block()->size(), 7u); // Step 2: Compiler New pir::Program into Runtime Program auto target = cinn::common::DefaultNVGPUTarget(); auto scope = cinn::hlir::framework::BuildScope(target, *program); - ASSERT_EQ(scope->var_names().size(), 2); - cinn::hlir::framework::NewIRCompiler ir_compiler(*program, target, scope); - auto runtime_program = ir_compiler.Build(); + std::vector<cinn::hlir::framework::PIRCompiler*> compiler_list; - // Step 3: Convert into cinn::dialect::RuntimeDialect - std::unique_ptr<::pir::Program> ir_runtime_program = - cinn::hlir::framework::ConvertToRuntimeDialect(*runtime_program); + std::set<std::string> checking_cinn_ops = {"pd_op.sin", "pd_op.cos"}; - std::set<std::string> out_names; - for (auto& var_name : scope->var_names()) { - std::string name = {var_name.begin(), var_name.end()}; - out_names.insert(name); + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<cinn::dialect::RuntimeDialect>(); + ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::KernelDialect>(); + auto ir_program = std::make_unique<::pir::Program>(ctx); + std::string jit_op_name = cinn::dialect::JitKernelOp::name(); + ::pir::OpInfo op_info = ctx->GetRegisteredOpInfo(jit_op_name); + + std::unordered_map<pir::Value, pir::Value> value_map; + for (auto it = program->block()->begin(); it != program->block()->end(); + ++it) { + if (checking_cinn_ops.count((*it)->name())) { + auto ir_compiler = + new cinn::hlir::framework::PIRCompiler(*program, target, scope); + + std::vector<::pir::Operation*> ops = {*it}; + auto group = std::make_shared<cinn::hlir::framework::pir::Group>(ops); + auto fn_ptr_res = ir_compiler->BuildCUDAJITInfo({group}); + compiler_list.push_back(ir_compiler); + std::unordered_map<std::string, ::pir::Attribute> op_attrs{ + 
{cinn::dialect::JitKernelOp::kAttrName, + cinn::dialect::CUDAJITInfoAttribute::get(ctx, fn_ptr_res[0])}, + }; + + auto out_type = (*it)->result(0).type(); + + std::vector<pir::Value> vec_ins; + + for (size_t i = 0; i < (*it)->num_operands(); ++i) { + vec_ins.push_back(value_map.at((*it)->operand_source(i))); + } + + ::pir::Operation* cinn_op = + ::pir::Operation::Create(vec_ins, op_attrs, {out_type}, op_info); + + value_map[(*it)->result(0)] = cinn_op->result(0); + + ir_program->block()->push_back(cinn_op); + } else { + std::vector<pir::Value> vec_ins; + + for (size_t i = 0; i < (*it)->num_operands(); ++i) { + vec_ins.push_back(value_map.at((*it)->operand_source(i))); + } + + auto type1 = (*it)->result(0).type(); + ::pir::OpInfo info1 = ctx->GetRegisteredOpInfo((*it)->name()); + ::pir::Operation* op = ::pir::Operation::Create( + vec_ins, (*it)->attributes(), {type1}, info1); + + ir_program->block()->push_back(op); + + value_map[(*it)->result(0)] = op->result(0); + } } platform::Place place = platform::CUDAPlace(0); + + auto kernel_program = + paddle::dialect::PdOpLowerToKernelPass(ir_program.get(), place); + Scope exe_scope; - InterpreterCore executor(place, {}, ir_runtime_program->block(), &exe_scope); - executor.SetSkipGcVars(out_names); - executor.Run({}); - - // TODO(Aurelius84): Need to replace check with framework::Scope. - const float value = 2.0; - for (auto& name : out_names) { - std::vector<float> data = - cinn::GetTensorData<float>(scope->GetTensor(name), target); - for (int i = 0; i < data.size(); ++i) { - LOG_FIRST_N(INFO, 3) << "data: " << data[i]; - ASSERT_NEAR(data[i], value, 1e-5); - } + paddle::framework::interpreter::ExecutionConfig exe_conf; + exe_conf.create_local_scope = false; + InterpreterCore executor( + place, {"out@fetch"}, kernel_program->block(), &exe_scope); + + std::set<std::string> out_names; + out_names.insert("out@fetch"); + auto local_names = exe_scope.LocalVarNames(); + for (size_t i = 0; i < local_names.size(); ++i) { + out_names.insert(local_names[i]); } + + executor.SetSkipGcVars(out_names); + executor.Run({}, true); + auto out_tensor = + executor.local_scope()->FindVar("out@fetch")->Get<phi::DenseTensor>(); + + bool res0 = simple_cmp(out_tensor.data<float>()[0], 1.35701); + bool res1 = simple_cmp(out_tensor.data<float>()[1], 1.35701); + bool res2 = simple_cmp(out_tensor.data<float>()[2], 1.35701); + bool res3 = simple_cmp(out_tensor.data<float>()[3], 1.35701); + + EXPECT_EQ(res0, true); + EXPECT_EQ(res1, true); + EXPECT_EQ(res2, true); + EXPECT_EQ(res3, true); } } // namespace framework diff --git a/test/cpp/pir/cinn/new_ir_compiler_test.cc b/test/cpp/pir/cinn/pir_compiler_test.cc similarity index 68% rename from test/cpp/pir/cinn/new_ir_compiler_test.cc rename to test/cpp/pir/cinn/pir_compiler_test.cc index 4b680b1ac89048..8f1c883bc37341 100644 --- a/test/cpp/pir/cinn/new_ir_compiler_test.cc +++ b/test/cpp/pir/cinn/pir_compiler_test.cc @@ -22,16 +22,17 @@ #include "paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h" #include "paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h" -#include "paddle/cinn/hlir/framework/convert_to_dialect.h" -#include "paddle/cinn/hlir/framework/new_ir_compiler.h" +#include "paddle/cinn/hlir/framework/pir_compiler.h" #include "paddle/cinn/utils/data_util.h" +#include "paddle/fluid/pir/dialect/operator/ir/api_builder.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_api.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include 
"paddle/pir/core/ir_context.h" #include "paddle/pir/core/program.h" -using cinn::hlir::framework::newir::Group; -using cinn::hlir::framework::newir::GroupPtr; +using cinn::hlir::framework::pir::Group; +using cinn::hlir::framework::pir::GroupPtr; using ProgramInfo = std::tuple<std::shared_ptr<::pir::Program>, std::vector<GroupPtr>>; @@ -75,11 +76,45 @@ ProgramInfo BuildProgram() { return {program, groups}; } -TEST(NewIRCompier, CompilerAndRun) { +ProgramInfo BuildSoftmax() { + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + auto program = std::make_shared<::pir::Program>(ctx); + paddle::dialect::APIBuilder::Instance().SetProgram(program.get()); + + auto x = paddle::dialect::full(std::vector<int64_t>{64, 128}, + 1.0, + phi::DataType::FLOAT32, + phi::GPUPlace()); + auto max_tmp = paddle::dialect::max(x, std::vector<int64_t>{1}, true); + auto sub_tmp = paddle::dialect::subtract(x, max_tmp); + auto exp_tmp = paddle::dialect::exp(sub_tmp); + // sum need to be decomposed in Program pass, but not implemented currently. + auto sum_tmp = paddle::dialect::sum( + exp_tmp, std::vector<int64_t>{1}, phi::DataType::FLOAT32, true); + auto out = paddle::dialect::divide(exp_tmp, sum_tmp); + + std::vector<GroupPtr> groups; + groups.emplace_back(std::make_shared<Group>( + std::initializer_list<::pir::Operation*>({x.owner()}))); + groups.emplace_back( + std::make_shared<Group>(std::initializer_list<::pir::Operation*>({ + max_tmp.owner(), + sub_tmp.owner(), + exp_tmp.owner(), + sum_tmp.owner(), + out.owner(), + }))); + + return {program, groups}; +} + +TEST(PIRCompier, CompileSoftmax) { // Step 1: Construct pir::Program - auto prog_info = BuildProgram(); + auto prog_info = BuildSoftmax(); std::shared_ptr<::pir::Program> program = std::get<0>(prog_info); - EXPECT_EQ(program->block()->size(), 6u); + std::vector<GroupPtr> groups = std::get<1>(prog_info); + EXPECT_EQ(program->block()->size(), 8u); LOG(INFO) << program->block()->size(); std::stringstream ss; @@ -89,10 +124,11 @@ TEST(NewIRCompier, CompilerAndRun) { // Step 2: Compiler New pir::Program into Runtime Program auto target = cinn::common::DefaultNVGPUTarget(); auto scope = cinn::hlir::framework::BuildScope(target, *program); - ASSERT_EQ(scope->var_names().size(), 6); + LOG(INFO) << scope->var_names().size(); + ASSERT_EQ(scope->var_names().size(), 8); - cinn::hlir::framework::NewIRCompiler ir_compiler(*program, target, scope); - auto runtime_program = ir_compiler.Build(); + cinn::hlir::framework::PIRCompiler ir_compiler(*program, target, scope); + auto runtime_program = ir_compiler.Build(groups); // Step 3: Execute Runtime Instruction and check Scope. 
ASSERT_NO_THROW(runtime_program->Execute()); @@ -106,11 +142,10 @@ TEST(NewIRCompier, CompilerAndRun) { } } -TEST(NewIRCompier, CompileGroupOps) { +TEST(PIRCompier, CompilerAndRun) { // Step 1: Construct pir::Program auto prog_info = BuildProgram(); std::shared_ptr<::pir::Program> program = std::get<0>(prog_info); - std::vector<GroupPtr> groups = std::get<1>(prog_info); EXPECT_EQ(program->block()->size(), 6u); LOG(INFO) << program->block()->size(); @@ -123,8 +158,8 @@ TEST(NewIRCompier, CompileGroupOps) { auto scope = cinn::hlir::framework::BuildScope(target, *program); ASSERT_EQ(scope->var_names().size(), 6); - cinn::hlir::framework::NewIRCompiler ir_compiler(*program, target, scope); - auto runtime_program = ir_compiler.Build(groups); + cinn::hlir::framework::PIRCompiler ir_compiler(*program, target, scope); + auto runtime_program = ir_compiler.Build(); // Step 3: Execute Runtime Instruction and check Scope. ASSERT_NO_THROW(runtime_program->Execute()); @@ -138,40 +173,28 @@ TEST(NewIRCompier, CompileGroupOps) { } } -TEST(RuntimeDialect, CompilerAndRun) { +TEST(PIRCompier, CompileGroupOps) { // Step 1: Construct pir::Program auto prog_info = BuildProgram(); std::shared_ptr<::pir::Program> program = std::get<0>(prog_info); + std::vector<GroupPtr> groups = std::get<1>(prog_info); EXPECT_EQ(program->block()->size(), 6u); + LOG(INFO) << program->block()->size(); + + std::stringstream ss; + program->Print(ss); + LOG(INFO) << ss.str(); // Step 2: Compiler New pir::Program into Runtime Program auto target = cinn::common::DefaultNVGPUTarget(); auto scope = cinn::hlir::framework::BuildScope(target, *program); - ASSERT_EQ(scope->var_names().size(), 6u); - - cinn::hlir::framework::NewIRCompiler ir_compiler(*program, target, scope); - auto runtime_program = ir_compiler.Build(); + ASSERT_EQ(scope->var_names().size(), 6); - // Step 3: Convert into cinn::dialect::RuntimeDialect - std::shared_ptr<::pir::Program> ir_runtime_program = - cinn::hlir::framework::ConvertToRuntimeDialect(*runtime_program); - - // Step 4: Run cinn::dialect::RuntimeDialect - for (auto iter = ir_runtime_program->block()->begin(); - iter != ir_runtime_program->block()->end(); - ++iter) { - auto op = (*iter)->dyn_cast<cinn::dialect::JitKernelOp>(); - auto* instr = op.instruction(); - instr->Run(/*name2podargs=*/nullptr, - false, - /*stream=*/nullptr, - /*use_cache=*/true); - } -#ifdef CINN_WITH_CUDA - CUDA_CALL(cudaDeviceSynchronize()); -#endif + cinn::hlir::framework::PIRCompiler ir_compiler(*program, target, scope); + auto runtime_program = ir_compiler.Build(groups); - // Step 5: Check Scope Tensor Value. + // Step 3: Execute Runtime Instruction and check Scope. 
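+  // Build(groups) compiles the hand-made partition from BuildProgram();
+  // executing it should fill the scope tensors just like the default
+  // Build() path exercised in the test above.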
+ ASSERT_NO_THROW(runtime_program->Execute()); for (auto& var_name : scope->var_names()) { std::string name = {var_name.begin(), var_name.end()}; std::vector<float> data = @@ -181,3 +204,18 @@ TEST(RuntimeDialect, CompilerAndRun) { } } } + +TEST(RuntimeDialect, CompilerAndRun) { + // Step 1: Construct pir::Program + auto prog_info = BuildProgram(); + std::shared_ptr<::pir::Program> program = std::get<0>(prog_info); + EXPECT_EQ(program->block()->size(), 6u); + + // Step 2: Compiler New pir::Program into Runtime Program + auto target = cinn::common::DefaultNVGPUTarget(); + auto scope = cinn::hlir::framework::BuildScope(target, *program); + ASSERT_EQ(scope->var_names().size(), 6u); + + cinn::hlir::framework::PIRCompiler ir_compiler(*program, target, scope); + auto runtime_program = ir_compiler.Build(); +} diff --git a/test/cpp/pir/control_flow_dialect/if_op_test.cc b/test/cpp/pir/control_flow_dialect/if_op_test.cc index f2e49b150b7bc7..02d4061a0d5f8b 100644 --- a/test/cpp/pir/control_flow_dialect/if_op_test.cc +++ b/test/cpp/pir/control_flow_dialect/if_op_test.cc @@ -14,7 +14,7 @@ #include <gtest/gtest.h> #include <iostream> -#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/core/builder.h" @@ -59,3 +59,43 @@ TEST(if_op_test, base) { LOG(INFO) << ss.str(); } + +TEST(if_op_test, build_by_block) { + pir::IrContext* ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<pir::ControlFlowDialect>(); + + pir::Program program(ctx); + pir::Block* block = program.block(); + pir::Builder builder(ctx, block); + auto full_op = builder.Build<paddle::dialect::FullOp>( + std::vector<int64_t>{1}, true, phi::DataType::BOOL); + + // construct true block + std::unique_ptr<pir::Block> true_block(new pir::Block()); + builder.SetInsertionPointToStart(true_block.get()); + auto full_op_1 = builder.Build<paddle::dialect::FullOp>( + std::vector<int64_t>{2}, true, phi::DataType::BOOL); + builder.Build<pir::YieldOp>(std::vector<pir::Value>{full_op_1.out()}); + + // construct false block + std::unique_ptr<pir::Block> false_block(new pir::Block()); + builder.SetInsertionPointToStart(false_block.get()); + auto full_op_2 = builder.Build<paddle::dialect::FullOp>( + std::vector<int64_t>{2}, true, phi::DataType::BOOL); + builder.Build<pir::YieldOp>(std::vector<pir::Value>{full_op_2.out()}); + + builder.SetInsertionPointToEnd(block); + + builder.Build<paddle::dialect::IfOp>( + full_op.out(), std::move(true_block), std::move(false_block)); + + EXPECT_FALSE(true_block); + EXPECT_FALSE(false_block); + EXPECT_EQ(full_op_2->GetParentProgram(), &program); + + std::stringstream ss; + program.Print(ss); + + LOG(INFO) << ss.str(); +} diff --git a/test/cpp/pir/control_flow_dialect/while_op_test.cc b/test/cpp/pir/control_flow_dialect/while_op_test.cc index 6c558cc9829267..7536ea2014fe0f 100644 --- a/test/cpp/pir/control_flow_dialect/while_op_test.cc +++ b/test/cpp/pir/control_flow_dialect/while_op_test.cc @@ -14,7 +14,7 @@ #include <gtest/gtest.h> #include <iostream> -#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/core/builder.h" @@ -24,6 
+24,9 @@
 #include "paddle/pir/dialect/control_flow/ir/cf_ops.h"
 
 using namespace paddle::dialect;  // NOLINT
+
+// Example of while_op usage:
+//   while (i < ten) { i = i + 1; }
 TEST(while_op_test, base) {
   pir::IrContext* ctx = pir::IrContext::Instance();
   ctx->GetOrRegisterDialect<pir::ControlFlowDialect>();
@@ -36,24 +39,15 @@ TEST(while_op_test, base) {
   auto i =
       builder.Build<FullOp>(std::vector<int64_t>{1}, 1, phi::DataType::INT32)
           .out();
-
   auto ten =
       builder.Build<FullOp>(std::vector<int64_t>{1}, 10, phi::DataType::INT32)
           .out();
-  auto while_op = builder.Build<WhileOp>(
-      std::vector<pir::Value>{i, ten},
-      std::vector<pir::Type>{builder.int32_type(), builder.int32_type()});
+  // compute condition value: i < ten
+  auto cond_value = builder.Build<LessThanOp>(i, ten).out();
 
-  // while(i < ten)
-  pir::Block* cond_block = while_op.cond_block();
-  auto cond_i_argument = cond_block->AddArgument(i.type());
-  auto cond_ten_argument = cond_block->AddArgument(ten.type());
-  builder.SetInsertionPointToStart(cond_block);
-  auto cond_value =
-      builder.Build<LessThanOp>(cond_i_argument, cond_ten_argument).out();
-  builder.Build<pir::CondYieldOp>(
-      cond_value, std::vector<pir::Value>{cond_i_argument, cond_ten_argument});
+  auto while_op =
+      builder.Build<WhileOp>(cond_value, std::vector<pir::Value>{i, ten});
 
   // { i = i + 1}
   pir::Block* body_block = while_op.body_block();
@@ -64,12 +58,19 @@
       builder.Build<FullOp>(std::vector<int64_t>{1}, 1, phi::DataType::INT32)
           .out();
   auto new_i = builder.Build<AddOp>(body_i_argument, one).out();
+
+  // compute new condition value: new_i < new_ten
+  auto new_cond_value =
+      builder.Build<LessThanOp>(new_i, body_ten_argument).out();
+
   builder.Build<pir::YieldOp>(
-      std::vector<pir::Value>{new_i, body_ten_argument});
+      std::vector<pir::Value>{new_cond_value, new_i, body_ten_argument});
 
   builder.SetInsertionPointAfter(while_op);
   std::stringstream ss;
   program.Print(ss);
 
   LOG(INFO) << ss.str();
+
+  EXPECT_EQ(while_op.cond(), cond_value);
 }
diff --git a/test/cpp/pir/core/CMakeLists.txt b/test/cpp/pir/core/CMakeLists.txt
index 0d65bc5b454c3d..ca71cb8fe9eef9 100644
--- a/test/cpp/pir/core/CMakeLists.txt
+++ b/test/cpp/pir/core/CMakeLists.txt
@@ -8,14 +8,15 @@
   pd_op_dialect)
 cc_test_old(ir_attribute_test SRCS ir_attribute_test.cc DEPS pir gtest)
 cc_test_old(ir_value_test SRCS ir_value_test.cc DEPS pir gtest)
-cc_test_old(
+paddle_test(
   ir_op_test
   SRCS
   ir_op_test.cc
   DEPS
   pir
   gtest
-  test_dialect)
+  test_dialect
+  pd_op_dialect)
 cc_test_old(ir_region_test SRCS ir_region_test.cc DEPS pir gtest)
 cc_test_old(ir_builder_test SRCS ir_builder_test.cc DEPS pir gtest)
 cc_test_old(
@@ -64,6 +65,11 @@ file(
   ${CMAKE_CURRENT_BINARY_DIR}/conditional_block_test.prog
   EXPECTED_MD5 cf9dc869ca7f69e2d57b38dbf8427134)
 
+file(
+  DOWNLOAD https://paddle-ci.gz.bcebos.com/ir_translator_test/while_op_test.prog
+  ${CMAKE_CURRENT_BINARY_DIR}/while_op_test.prog
+  EXPECTED_MD5 290164ae52a496332b0be5829fc93bcd)
+
 copy_if_different(${CMAKE_CURRENT_SOURCE_DIR}/TestParserText.txt
                   ${CMAKE_CURRENT_BINARY_DIR}/TestParserText.txt)
 
@@ -139,3 +145,9 @@ cc_test_old(
   test_dialect
   gtest
   pir)
+
+if(WITH_ONNXRUNTIME AND WIN32)
+  # Copy onnxruntime for some C++ tests on Windows; these tests are built
+  # only in CI, so the CMake generator on Windows is assumed to be Ninja.
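+  # (Ninja is a single-config generator, so the test binaries land directly
+  # in the build directory, with no per-config subfolder to account for.)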
+ copy_onnx(ir_op_test) +endif() diff --git a/test/cpp/pir/core/TestParserText.txt b/test/cpp/pir/core/TestParserText.txt index 71a6e0425f0c36..9f979c50cc7c32 100644 --- a/test/cpp/pir/core/TestParserText.txt +++ b/test/cpp/pir/core/TestParserText.txt @@ -76,3 +76,23 @@ f16 //CHECK attribute [] //END + +//CHECK type +vec[vec[],vec[]] +//END + +//CHECK attribute +[(Float)inf,(Float)-inf] +//END + +//CHECK attribute +[(Float)-1,(Float)-1.00001,(Double)-1.00001,(Float)-1.1e+30,(Double)1e+200,(Float)0.123456,(Double)0.123456] +//END + +//CHECK type +vec[vec[i8,bf16],vec[]] +//END + +//CHECK type +vec[vec[i8,bf16],vec[],vec[u8]] +//END diff --git a/test/cpp/pir/core/ir_infershape_test.cc b/test/cpp/pir/core/ir_infershape_test.cc index 720d4b238d5ebd..09d3a2fe9b6b17 100644 --- a/test/cpp/pir/core/ir_infershape_test.cc +++ b/test/cpp/pir/core/ir_infershape_test.cc @@ -45,7 +45,7 @@ class OperationTest static const char *name() { return "test.operation2"; } static constexpr uint32_t attributes_num = 2; static const char *attributes_name[attributes_num]; // NOLINT - static void Verify() {} + static void VerifySig() {} static void InferMeta(phi::InferMetaContext *infer_meta) { auto fn = PD_INFER_META(phi::CreateInferMeta); fn(infer_meta); diff --git a/test/cpp/pir/core/ir_op_test.cc b/test/cpp/pir/core/ir_op_test.cc index c512ea753e3c00..596519ba57d4cc 100644 --- a/test/cpp/pir/core/ir_op_test.cc +++ b/test/cpp/pir/core/ir_op_test.cc @@ -15,6 +15,8 @@ #include <gtest/gtest.h> #include <sstream> +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/phi/core/tensor_meta.h" #include "paddle/pir/core/block.h" #include "paddle/pir/core/builder.h" #include "paddle/pir/core/builtin_attribute.h" @@ -43,6 +45,27 @@ pir::AttributeMap CreateAttributeMap( return attr_map; } +pir::Operation *CreateDenseTensorOp( + pir::IrContext *ctx, + const phi::DDim &dims, + const std::vector<std::string> &attribute_names, + const std::vector<std::string> &attributes, + const pir::Type &dtype = + pir::Float32Type::get(pir::IrContext::Instance())) { + std::vector<pir::Value> op_inputs = {}; + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + std::vector<pir::Type> op_output_types = { + pir::DenseTensorType::get(ctx, dtype, dims, data_layout, lod, offset)}; + pir::Operation *op = + pir::Operation::Create(op_inputs, + CreateAttributeMap(attribute_names, attributes), + op_output_types, + pir::OpInfo()); + return op; +} + TEST(op_test, region_test) { // (1) Register Dialect, Operation1, Operation2 into IrContext. 
pir::IrContext *ctx = pir::IrContext::Instance(); @@ -126,3 +149,367 @@ TEST(op_test, trait_and_interface) { pir::OperationArgument argument(&ctx, "test.region"); EXPECT_THROW(builder.Build(std::move(argument)), pir::IrNotMetException); } + +TEST(op_test, op_traits_test) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<test::TestDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype = pir::Float32Type::get(ctx); + phi::DDim dims = {2, 2}; + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + + pir::DenseTensorType dense_tensor_dtype = + pir::DenseTensorType::get(ctx, dtype, dims, data_layout, lod, offset); + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims, {"op1_temp"}, {"op1_attr"}, dtype); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims, {"op2_temp"}, {"op2_attr"}, dtype); + + auto op3 = builder.Build<test::TraitExampleOp>( + op1->result(0), op2->result(0), dense_tensor_dtype); + + EXPECT_EQ(op3->HasTrait<pir::SameOperandsShapeTrait>(), true); + EXPECT_EQ(op3->HasTrait<pir::SameOperandsAndResultShapeTrait>(), true); + EXPECT_EQ(op3->HasTrait<pir::SameOperandsElementTypeTrait>(), true); + EXPECT_EQ(op3->HasTrait<pir::SameOperandsAndResultElementTypeTrait>(), true); + EXPECT_EQ(op3->HasTrait<pir::SameOperandsAndResultTypeTrait>(), true); + EXPECT_EQ(op3->HasTrait<pir::SameTypeOperandsTrait>(), true); +} + +TEST(op_test, same_operands_shape_trait_test1) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<test::TestDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + EXPECT_THROW(builder.Build<test::SameOperandsShapeTraitOp1>(), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_shape_trait_test2) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<test::TestDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype1 = pir::Float32Type::get(ctx); + phi::DDim dims1 = {2, 2}; + + pir::Type dtype2 = pir::Float64Type::get(ctx); + phi::DDim dims2 = {2, 2, 2}; + + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + + pir::DenseTensorType dense_tensor_dtype = + pir::DenseTensorType::get(ctx, dtype1, dims1, data_layout, lod, offset); + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims1, {"op1_temp"}, {"op1_attr"}, dtype1); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims2, {"op2_temp"}, {"op2_attr"}, dtype2); + + EXPECT_THROW(builder.Build<test::SameOperandsShapeTraitOp2>( + op1->result(0), op2->result(0), dense_tensor_dtype), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_shape_trait_test1) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<test::TestDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + EXPECT_THROW(builder.Build<test::SameOperandsAndResultShapeTraitOp1>(), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_shape_trait_test2) { + pir::IrContext *ctx 
= pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<test::TestDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype = pir::Float64Type::get(ctx); + phi::DDim dims = {2, 2, 2}; + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims, {"op1_temp"}, {"op1_attr"}, dtype); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims, {"op2_temp"}, {"op2_attr"}, dtype); + + EXPECT_THROW(builder.Build<test::SameOperandsAndResultShapeTraitOp2>( + op1->result(0), op2->result(0)), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_shape_trait_test3) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<test::TestDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype1 = pir::Float32Type::get(ctx); + phi::DDim dims1 = {2, 2}; + + pir::Type dtype2 = pir::Float64Type::get(ctx); + phi::DDim dims2 = {2, 2, 2}; + + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + + pir::DenseTensorType dense_tensor_dtype = + pir::DenseTensorType::get(ctx, dtype1, dims1, data_layout, lod, offset); + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims1, {"op1_temp"}, {"op1_attr"}, dtype1); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims2, {"op2_temp"}, {"op2_attr"}, dtype2); + + EXPECT_THROW(builder.Build<test::SameOperandsAndResultShapeTraitOp3>( + op1->result(0), op2->result(0), dense_tensor_dtype), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_element_type_trait_test1) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<test::TestDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + EXPECT_THROW(builder.Build<test::SameOperandsElementTypeTraitOp1>(), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_element_type_trait_test2) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<test::TestDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype1 = pir::Float32Type::get(ctx); + pir::Type dtype2 = pir::Float64Type::get(ctx); + + phi::DDim dims = {2, 2}; + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + + pir::DenseTensorType dense_tensor_dtype = + pir::DenseTensorType::get(ctx, dtype1, dims, data_layout, lod, offset); + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims, {"op1_temp"}, {"op1_attr"}, dtype1); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims, {"op2_temp"}, {"op2_attr"}, dtype2); + + EXPECT_THROW(builder.Build<test::SameOperandsElementTypeTraitOp2>( + op1->result(0), op2->result(0), dense_tensor_dtype), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_element_type_trait_test1) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<test::TestDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + 
EXPECT_THROW(builder.Build<test::SameOperandsAndResultElementTypeTraitOp1>(), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_element_type_trait_test2) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<test::TestDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype = pir::Float32Type::get(ctx); + phi::DDim dims = {2, 2}; + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims, {"op1_temp"}, {"op1_attr"}, dtype); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims, {"op2_temp"}, {"op2_attr"}, dtype); + + EXPECT_THROW(builder.Build<test::SameOperandsAndResultElementTypeTraitOp2>( + op1->result(0), op2->result(0)), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_element_type_trait_test3) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<test::TestDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype1 = pir::Float32Type::get(ctx); + phi::DDim dims1 = {2, 2}; + + pir::Type dtype2 = pir::Float64Type::get(ctx); + phi::DDim dims2 = {2, 2, 2}; + + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + + pir::DenseTensorType dense_tensor_dtype1 = + pir::DenseTensorType::get(ctx, dtype1, dims1, data_layout, lod, offset); + pir::DenseTensorType dense_tensor_dtype2 = + pir::DenseTensorType::get(ctx, dtype2, dims2, data_layout, lod, offset); + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims1, {"op1_temp"}, {"op1_attr"}, dtype1); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims2, {"op2_temp"}, {"op2_attr"}, dtype2); + + EXPECT_THROW(builder.Build<test::SameOperandsAndResultElementTypeTraitOp3>( + op1->result(0), + op2->result(0), + dense_tensor_dtype1, + dense_tensor_dtype1), + pir::IrNotMetException); + EXPECT_THROW(builder.Build<test::SameOperandsAndResultElementTypeTraitOp3>( + op1->result(0), + op1->result(0), + dense_tensor_dtype1, + dense_tensor_dtype2), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_type_trait_test1) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<test::TestDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + EXPECT_THROW(builder.Build<test::SameOperandsAndResultTypeTraitOp1>(), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_type_trait_test2) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<test::TestDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype = pir::Float32Type::get(ctx); + phi::DDim dims = {2, 2}; + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims, {"op1_temp"}, {"op1_attr"}, dtype); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims, {"op2_temp"}, {"op2_attr"}, dtype); + + EXPECT_THROW(builder.Build<test::SameOperandsAndResultTypeTraitOp2>( + op1->result(0), op2->result(0)), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_type_trait_test3) { + pir::IrContext *ctx = pir::IrContext::Instance(); + 
ctx->GetOrRegisterDialect<test::TestDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype1 = pir::Float32Type::get(ctx); + phi::DDim dims1 = {2, 2}; + + pir::Type dtype2 = pir::Float64Type::get(ctx); + phi::DDim dims2 = {2, 2, 2}; + + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + + pir::DenseTensorType dense_tensor_dtype1 = + pir::DenseTensorType::get(ctx, dtype1, dims1, data_layout, lod, offset); + + pir::DenseTensorType dense_tensor_dtype2 = + pir::DenseTensorType::get(ctx, dtype2, dims2, data_layout, lod, offset); + + pir::DenseTensorType dense_tensor_dtype3 = + pir::DenseTensorType::get(ctx, dtype1, dims2, data_layout, lod, offset); + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims1, {"op1_temp"}, {"op1_attr"}, dtype2); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims2, {"op2_temp"}, {"op2_attr"}, dtype1); + + EXPECT_THROW(builder.Build<test::SameOperandsAndResultTypeTraitOp3>( + op1->result(0), + op2->result(0), + dense_tensor_dtype1, + dense_tensor_dtype2), + pir::IrNotMetException); + + EXPECT_THROW(builder.Build<test::SameOperandsAndResultTypeTraitOp3>( + op1->result(0), + op2->result(0), + dense_tensor_dtype1, + dense_tensor_dtype3), + pir::IrNotMetException); + + EXPECT_THROW(builder.Build<test::SameOperandsAndResultTypeTraitOp3>( + op1->result(0), + op2->result(0), + dense_tensor_dtype1, + dense_tensor_dtype1), + pir::IrNotMetException); + + EXPECT_THROW(builder.Build<test::SameOperandsAndResultTypeTraitOp3>( + op2->result(0), + op1->result(0), + dense_tensor_dtype1, + dense_tensor_dtype1), + pir::IrNotMetException); +} diff --git a/test/cpp/pir/core/ir_parser_test.cc b/test/cpp/pir/core/ir_parser_test.cc index 7990d26e8afaf1..91a26c6e970fc1 100644 --- a/test/cpp/pir/core/ir_parser_test.cc +++ b/test/cpp/pir/core/ir_parser_test.cc @@ -86,7 +86,7 @@ TestTask* ParserTest::GetTestTask() { std::string test_type_info; while (test_text.peek() != '\n' && test_text.peek() != ' ' && test_text.peek() != EOF) { - test_type_info += test_text.get(); + test_type_info += test_text.get(); // NOLINT } while (test_text.peek() == '\n' || test_text.peek() == ' ') { @@ -95,10 +95,10 @@ TestTask* ParserTest::GetTestTask() { std::string test_info; while (Peek(5) != "//END" && test_text.peek() != EOF) { - test_info += test_text.get(); + test_info += test_text.get(); // NOLINT } - if (Peek(5) != "//END" || test_info.size() == 0) { + if (Peek(5) != "//END" || static_cast<int>(test_info.size()) == 0) { return nullptr; } @@ -175,7 +175,7 @@ std::string ParserTest::Get(const size_t len) { if (test_text.peek() == EOF) { break; } - str += test_text.get(); + str += test_text.get(); // NOLINT } return str; } diff --git a/test/cpp/pir/core/ir_program_test.cc b/test/cpp/pir/core/ir_program_test.cc index 85f608aa117a28..7ae348d004f53e 100644 --- a/test/cpp/pir/core/ir_program_test.cc +++ b/test/cpp/pir/core/ir_program_test.cc @@ -41,14 +41,14 @@ class AddOp : public pir::Op<AddOp> { static const char *name() { return "test.add"; } static constexpr const char **attributes_name = nullptr; static constexpr uint32_t attributes_num = 0; - void Verify(); + void VerifySig(); static void Build(pir::Builder &builder, // NOLINT pir::OperationArgument &argument, // NOLINT pir::Value l_operand, pir::Value r_operand, pir::Type sum_type); }; -void AddOp::Verify() { +void AddOp::VerifySig() { if (num_operands() != 
2) { throw("The size of inputs must be equal to 2."); } diff --git a/test/cpp/pir/core/op_info_test.cc b/test/cpp/pir/core/op_info_test.cc index fec5b71396095c..3a273575a06618 100644 --- a/test/cpp/pir/core/op_info_test.cc +++ b/test/cpp/pir/core/op_info_test.cc @@ -39,8 +39,8 @@ TEST(ir_op_info_test, op_op_info_test) { auto& info_map = context->registered_op_info_map(); EXPECT_FALSE(info_map.empty()); - void* info_1 = op->info().AsOpaquePointer(); - auto info_2 = pir::OpInfo::RecoverFromOpaquePointer(info_1); + void* info_1 = op->info(); + auto info_2 = pir::OpInfo::RecoverFromVoidPointer(info_1); EXPECT_EQ(op->info(), info_2); pir::Verify(program.module_op()); } diff --git a/test/cpp/pir/core/program_translator_test.cc b/test/cpp/pir/core/program_translator_test.cc index c95d5952577baf..ba85e396d41b7c 100644 --- a/test/cpp/pir/core/program_translator_test.cc +++ b/test/cpp/pir/core/program_translator_test.cc @@ -27,6 +27,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" #include "paddle/fluid/ir_adaptor/translator/utils.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" @@ -265,3 +266,75 @@ TEST(IrParserTest, StartupProgram) { EXPECT_TRUE(ssp.str() == ss.str()); } + +TEST(OperatorDialectTest, WhileOpProgram) { + auto p = load_from_file("while_op_test.prog"); + EXPECT_EQ(p.Size(), 3u); + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<OperatorDialect>(); + ctx->GetOrRegisterDialect<pir::BuiltinDialect>(); + auto program = paddle::TranslateLegacyProgramToProgram(p); + + std::stringstream ss; + program->Print(ss); + + LOG(INFO) << ss.str(); + + EXPECT_EQ(program->block()->size(), 4u); + size_t id = 0; + for (auto &op : *program->block()) { + if (id == 0 || id == 1) { + EXPECT_TRUE(op->isa<paddle::dialect::FullOp>()); + } + if (id == 2) { + EXPECT_TRUE(op->isa<paddle::dialect::LessThanOp>()); + } + if (id == 3) { + EXPECT_TRUE(op->isa<paddle::dialect::WhileOp>()); + EXPECT_EQ(op->num_regions(), 1u); + // body block + pir::Block *body_block = + op->dyn_cast<paddle::dialect::WhileOp>().body_block(); + size_t body_id = 0; + for (auto &op1 : *body_block) { + if (body_id == 0) { + EXPECT_TRUE(op1->isa<paddle::dialect::FullOp>()); + } + if (body_id == 1) { + EXPECT_TRUE(op1->isa<paddle::dialect::ScaleOp>()); + } + if (body_id == 2) { + EXPECT_TRUE(op1->isa<paddle::dialect::LessThanOp>()); + } + if (body_id == 3) { + pir::Block *body_body_block = + op1->dyn_cast<paddle::dialect::WhileOp>().body_block(); + size_t body_body_id = 0; + for (auto &op2 : *body_body_block) { + if (body_body_id == 0) { + EXPECT_TRUE(op2->isa<paddle::dialect::FullOp>()); + } + if (body_body_id == 1) { + EXPECT_TRUE(op2->isa<paddle::dialect::ScaleOp>()); + } + if (body_body_id == 2) { + EXPECT_TRUE(op2->isa<paddle::dialect::LessThanOp>()); + } + if (body_body_id == 3) { + EXPECT_TRUE(op2->isa<pir::YieldOp>()); + } + body_body_id++; + } + } + if (body_id == 4) { + EXPECT_TRUE(op1->isa<paddle::dialect::LessThanOp>()); + } + if (body_id == 5) { + EXPECT_TRUE(op1->isa<pir::YieldOp>()); + } + body_id++; + } + } + id++; + } +} diff --git a/test/cpp/pir/core/type_test.cc b/test/cpp/pir/core/type_test.cc index ada08b5f9bf1a4..0f3581732784fe 100644 --- a/test/cpp/pir/core/type_test.cc +++ b/test/cpp/pir/core/type_test.cc @@ -24,6 +24,7 @@ #include 
"paddle/pir/core/type.h" #include "paddle/pir/core/type_base.h" #include "paddle/pir/core/type_name.h" +#include "paddle/pir/core/type_util.h" #include "paddle/pir/core/utils.h" class TypeA {}; @@ -260,6 +261,36 @@ TEST(type_test, pd_op_dialect) { EXPECT_EQ(select_rows_dtype.offset(), offset); } +TEST(type_test, type_util) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Type fp32_dtype = pir::Float32Type::get(ctx); + phi::DDim dims1 = {2, 2}; + phi::DDim dims2 = {2, 2, 3}; + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + + paddle::dialect::SelectedRowsType select_rows_dtype1 = + paddle::dialect::SelectedRowsType::get( + ctx, fp32_dtype, dims1, data_layout, lod, offset); + + paddle::dialect::SelectedRowsType select_rows_dtype2 = + paddle::dialect::SelectedRowsType::get( + ctx, fp32_dtype, dims2, data_layout, lod, offset); + + std::vector<pir::Type> types1 = { + select_rows_dtype1, select_rows_dtype1, select_rows_dtype1}; + std::vector<pir::Type> types2 = { + select_rows_dtype1, select_rows_dtype1, select_rows_dtype1}; + std::vector<pir::Type> types3 = { + select_rows_dtype2, select_rows_dtype2, select_rows_dtype2}; + + EXPECT_TRUE(pir::VerifyCompatibleShapes(types1, types2)); + EXPECT_FALSE(pir::VerifyCompatibleShapes(types1, types3)); +} + namespace TestNamespace { class TestClass {}; } // namespace TestNamespace diff --git a/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc b/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc index bb99e86dfc21cd..6812e7a9ed1946 100644 --- a/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc +++ b/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc @@ -23,6 +23,7 @@ #include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h" #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" diff --git a/test/cpp/pir/pass/pass_manager_test.cc b/test/cpp/pir/pass/pass_manager_test.cc index e83764226ebd11..03e7d88d484bca 100644 --- a/test/cpp/pir/pass/pass_manager_test.cc +++ b/test/cpp/pir/pass/pass_manager_test.cc @@ -69,14 +69,14 @@ class AddOp : public pir::Op<AddOp> { static const char *name() { return "test.add"; } static constexpr const char **attributes_name = nullptr; static constexpr uint32_t attributes_num = 0; - void Verify(); + void VerifySig(); static void Build(pir::Builder &builder, // NOLINT pir::OperationArgument &argument, // NOLINT pir::OpResult l_operand, pir::OpResult r_operand, pir::Type sum_type); }; -void AddOp::Verify() { +void AddOp::VerifySig() { if (num_operands() != 2) { throw("The size of inputs must be equal to 2."); } diff --git a/test/cpp/pir/pattern_rewrite/CMakeLists.txt b/test/cpp/pir/pattern_rewrite/CMakeLists.txt index 6e1ff948e3e045..3282fe5893abba 100644 --- a/test/cpp/pir/pattern_rewrite/CMakeLists.txt +++ b/test/cpp/pir/pattern_rewrite/CMakeLists.txt @@ -1,5 +1,6 @@ set(PATTERN_REWRITE_TEST_DEPS - _constant_folding_pass transform_general_functions gtest pd_op_dialect pir) + pd_constant_folding_pass transform_general_functions gtest pd_op_dialect + pir) if(WITH_DISTRIBUTE) set(PATTERN_REWRITE_TEST_DEPS ${PATTERN_REWRITE_TEST_DEPS} 
fleet_executor @@ -9,6 +10,46 @@ endif() cc_test_old(pattern_rewrite_test SRCS pattern_rewrite_test.cc DEPS ${PATTERN_REWRITE_TEST_DEPS}) +cc_test_old( + drr_test + SRCS + drr_test.cc + DEPS + drr + gtest + pd_op_dialect + pir) +cc_test_old( + drr_same_type_binding_test + SRCS + drr_same_type_binding_test.cc + DEPS + drr + gtest + pd_op_dialect + pir) + +cc_test_old( + drr_fuse_linear_test + SRCS + drr_fuse_linear_test.cc + DEPS + fusion_passes + drr + gtest + pd_op_dialect + pir) +cc_test_old( + drr_attention_fuse_test + SRCS + drr_attention_fuse_test.cc + DEPS + fusion_passes + drr + gtest + pd_op_dialect + pir) + set_tests_properties( pattern_rewrite_test PROPERTIES ENVIRONMENT "FLAGS_enable_new_ir_in_executor=true") diff --git a/test/cpp/pir/pattern_rewrite/drr_attention_fuse_test.cc b/test/cpp/pir/pattern_rewrite/drr_attention_fuse_test.cc new file mode 100644 index 00000000000000..8ac00044146f5b --- /dev/null +++ b/test/cpp/pir/pattern_rewrite/drr_attention_fuse_test.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <gtest/gtest.h> + +#include <cstdint> +#include <vector> + +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/transforms/fusion/attention_fuse_pass.h" +#include "paddle/pir/core/builtin_dialect.h" +#include "paddle/pir/pass/pass_manager.h" + +void BuildProgram(pir::Builder &builder) { // NOLINT + paddle::dialect::FullOp matmul_1_in_1 = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{1, 300, 256}, + 0.9, + phi::DataType::FLOAT32, + phi::CPUPlace()); + // The first path to matmul with scale (q). + paddle::dialect::FullOp matmul_1_in_2 = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{256, 256}, + 1.1, + phi::DataType::FLOAT32, + phi::CPUPlace()); + + paddle::dialect::MatmulOp matmul_1 = builder.Build<paddle::dialect::MatmulOp>( + matmul_1_in_1.out(), matmul_1_in_2.out(), false, false); + + paddle::dialect::FullOp add_1_in_2 = builder.Build<paddle::dialect::FullOp>( + std::vector<int64_t>{256}, 1.5, phi::DataType::FLOAT32, phi::CPUPlace()); + + paddle::dialect::AddOp add_1 = + builder.Build<paddle::dialect::AddOp>(matmul_1.out(), add_1_in_2.out()); + + paddle::dialect::ReshapeOp reshape_1 = + builder.Build<paddle::dialect::ReshapeOp>( + add_1.out(), std::vector<int64_t>{0, 0, 8, 32}); + + paddle::dialect::TransposeOp transpose_1 = + builder.Build<paddle::dialect::TransposeOp>(reshape_1.out(), + std::vector<int>{0, 2, 1, 3}); + + paddle::dialect::ScaleOp scale_op = builder.Build<paddle::dialect::ScaleOp>( + transpose_1.out(), 0.1767766922712326, 0.0, true); + + // The second path to matmul (k). 
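+  // (This path reuses the shared input activation matmul_1_in_1, just as the
+  // q path above does; only the 256x256 projection weight is a new constant.)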
+ paddle::dialect::FullOp matmul_2_in_2 = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{256, 256}, + 1.1, + phi::DataType::FLOAT32, + phi::CPUPlace()); + + paddle::dialect::MatmulOp matmul_2 = builder.Build<paddle::dialect::MatmulOp>( + matmul_1_in_1.out(), matmul_2_in_2.out(), false, false); + + paddle::dialect::FullOp add_2_in_2 = builder.Build<paddle::dialect::FullOp>( + std::vector<int64_t>{256}, 1.5, phi::DataType::FLOAT32, phi::CPUPlace()); + paddle::dialect::AddOp add_op2 = + builder.Build<paddle::dialect::AddOp>(matmul_2.out(), add_2_in_2.out()); + + paddle::dialect::ReshapeOp reshape_2 = + builder.Build<paddle::dialect::ReshapeOp>( + add_op2.out(), std::vector<int64_t>{0, 0, 8, 32}); + + paddle::dialect::TransposeOp transpose_2 = + builder.Build<paddle::dialect::TransposeOp>(reshape_2.out(), + std::vector<int>{0, 2, 1, 3}); + + // The third path to matmul (v). + paddle::dialect::FullOp matmul_3_in_2 = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{256, 256}, + 1.1, + phi::DataType::FLOAT32, + phi::CPUPlace()); + paddle::dialect::MatmulOp matmul_3 = builder.Build<paddle::dialect::MatmulOp>( + matmul_1_in_1.out(), matmul_3_in_2.out(), false, false); + + paddle::dialect::FullOp add_3_in_2 = builder.Build<paddle::dialect::FullOp>( + std::vector<int64_t>{256}, 1.5, phi::DataType::FLOAT32, phi::CPUPlace()); + + paddle::dialect::AddOp add_3 = + builder.Build<paddle::dialect::AddOp>(matmul_3.out(), add_3_in_2.out()); + + paddle::dialect::ReshapeOp reshape_3 = + builder.Build<paddle::dialect::ReshapeOp>( + add_3.out(), std::vector<int64_t>{0, 0, 8, 32}); + + paddle::dialect::TransposeOp transpose_3 = + builder.Build<paddle::dialect::TransposeOp>(reshape_3.out(), + std::vector<int>{0, 2, 1, 3}); + + // softmax(qk)v + paddle::dialect::MatmulOp matmul_4 = builder.Build<paddle::dialect::MatmulOp>( + scale_op.out(), transpose_2.out(), false, true); + + paddle::dialect::FullOp add_4_in_2 = builder.Build<paddle::dialect::FullOp>( + std::vector<int64_t>{1, 8, 300, 300}, + 1.5, + phi::DataType::FLOAT32, + phi::CPUPlace()); + + paddle::dialect::AddOp add_4 = + builder.Build<paddle::dialect::AddOp>(matmul_4.out(), add_4_in_2.out()); + + paddle::dialect::SoftmaxOp softmax_op = + builder.Build<paddle::dialect::SoftmaxOp>(add_4.out(), -1); + paddle::dialect::MatmulOp matmul_5 = builder.Build<paddle::dialect::MatmulOp>( + softmax_op.out(), transpose_3.out(), false, false); + + paddle::dialect::TransposeOp transpose_4 = + builder.Build<paddle::dialect::TransposeOp>(matmul_5.out(), + std::vector<int>{0, 2, 1, 3}); + + paddle::dialect::ReshapeOp reshape_4 = + builder.Build<paddle::dialect::ReshapeOp>( + transpose_4.out(), std::vector<int64_t>{0, 0, 256}); + + builder.Build<paddle::dialect::FetchOp>(reshape_4.out(), "out", 0); +} + +TEST(DrrTest, AttentionFuse) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<pir::BuiltinDialect>(); + pir::Program program(ctx); + pir::Builder builder = pir::Builder(ctx, program.block()); + BuildProgram(builder); + EXPECT_EQ(program.block()->size(), 33u); + + pir::PassManager pm(ctx); + pm.AddPass(pir::CreateAttentionFusePass()); + pm.EnableIRPrinting(); + + CHECK_EQ(pm.Run(&program), true); + EXPECT_EQ(program.block()->size(), 20u); +} diff --git a/test/cpp/pir/pattern_rewrite/drr_fuse_linear_test.cc b/test/cpp/pir/pattern_rewrite/drr_fuse_linear_test.cc new file mode 100644 index 00000000000000..3ef77cd1f96652 --- /dev/null +++ 
b/test/cpp/pir/pattern_rewrite/drr_fuse_linear_test.cc @@ -0,0 +1,144 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <glog/logging.h> +#include <gtest/gtest.h> +#include <memory> + +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/transforms/fusion/fused_gemm_epilogue_pass.h" +#include "paddle/pir/core/builtin_dialect.h" +#include "paddle/pir/pass/pass_manager.h" +#include "paddle/pir/pattern_rewrite/pattern_rewrite_driver.h" + +void BuildProgram(pir::Builder &builder) { // NOLINT + paddle::dialect::FullOp full_input_op1 = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{1, 512, 64}, + 1.5); + // linear 1 + paddle::dialect::FullOp full_weight_op1 = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{64, 64}, 1.5); + paddle::dialect::FullOp full_bias_op1 = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{64}, 1.0); + paddle::dialect::MatmulOp matmul_op1 = + builder.Build<paddle::dialect::MatmulOp>(full_input_op1.out(), + full_weight_op1.out()); + paddle::dialect::AddOp add_op1 = builder.Build<paddle::dialect::AddOp>( + matmul_op1.out(), full_bias_op1.out()); + // linear 2 + paddle::dialect::FullOp full_weight_op2 = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{64, 128}, + 1.5); + paddle::dialect::FullOp full_bias_op2 = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{128}, 1.0); + paddle::dialect::MatmulOp matmul_op2 = + builder.Build<paddle::dialect::MatmulOp>(add_op1.out(), + full_weight_op2.out()); + paddle::dialect::AddOp add_op2 = builder.Build<paddle::dialect::AddOp>( + matmul_op2.out(), full_bias_op2.out()); + paddle::dialect::ReluOp relu_op = + builder.Build<paddle::dialect::ReluOp>(add_op2.out()); + // linear 3 + paddle::dialect::FullOp full_weight_op3 = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{128, 64}, + 1.5); + paddle::dialect::FullOp full_bias_op3 = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{64}, 1.0); + paddle::dialect::MatmulOp matmul_op3 = + builder.Build<paddle::dialect::MatmulOp>(relu_op.out(), + full_weight_op3.out()); + paddle::dialect::AddOp add_op3 = builder.Build<paddle::dialect::AddOp>( + matmul_op3.out(), full_bias_op3.out()); + paddle::dialect::GeluOp gelu_op1 = + builder.Build<paddle::dialect::GeluOp>(add_op3.out()); + // linear 4 + paddle::dialect::FullOp full_weight_op4 = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{64, 64}, 1.5); + paddle::dialect::FullOp full_bias_op4 = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{64}, 1.0); + paddle::dialect::MatmulOp matmul_op4 = + builder.Build<paddle::dialect::MatmulOp>(gelu_op1.out(), + full_weight_op4.out()); + paddle::dialect::AddOp add_op4 = builder.Build<paddle::dialect::AddOp>( + matmul_op4.out(), full_bias_op4.out()); + paddle::dialect::GeluOp gelu_op2 = + 
builder.Build<paddle::dialect::GeluOp>(add_op4.out()); + + // backward + paddle::dialect::FullOp full_grad_op = builder.Build<paddle::dialect::FullOp>( + std::vector<int64_t>{1, 512, 64}, 1.0); + + paddle::dialect::GeluGradOp gelu_op2_grad = + builder.Build<paddle::dialect::GeluGradOp>( + add_op4.out(), full_grad_op.out(), false); + // backward linear 4 + paddle::dialect::AddGradOp add_op4_grad = + builder.Build<paddle::dialect::AddGradOp>( + matmul_op4.out(), full_bias_op4.out(), gelu_op2_grad.x_grad()); + paddle::dialect::MatmulGradOp matmul_op4_grad = + builder.Build<paddle::dialect::MatmulGradOp>( + gelu_op1.out(), full_weight_op4.out(), add_op4_grad.x_grad()); + + paddle::dialect::GeluGradOp gelu_op1_grad = + builder.Build<paddle::dialect::GeluGradOp>( + add_op3.out(), matmul_op4_grad.x_grad(), false); + // backward linear 3 + paddle::dialect::AddGradOp add_op3_grad = + builder.Build<paddle::dialect::AddGradOp>( + matmul_op3.out(), full_bias_op3.out(), gelu_op1_grad.x_grad()); + paddle::dialect::MatmulGradOp matmul_op3_grad = + builder.Build<paddle::dialect::MatmulGradOp>( + relu_op.out(), full_weight_op3.out(), add_op3_grad.x_grad()); + + paddle::dialect::ReluGradOp relu_op_grad = + builder.Build<paddle::dialect::ReluGradOp>(relu_op.out(), + matmul_op3_grad.x_grad()); + // backward linear 2 + paddle::dialect::AddGradOp add_op2_grad = + builder.Build<paddle::dialect::AddGradOp>( + matmul_op2.out(), full_bias_op2.out(), relu_op_grad.x_grad()); + paddle::dialect::MatmulGradOp matmul_op2_grad = + builder.Build<paddle::dialect::MatmulGradOp>( + add_op1.out(), full_weight_op2.out(), add_op2_grad.x_grad()); + // backward linear 1 + paddle::dialect::AddGradOp add_op1_grad = + builder.Build<paddle::dialect::AddGradOp>( + matmul_op1.out(), full_bias_op1.out(), matmul_op2_grad.x_grad()); + paddle::dialect::MatmulGradOp matmul_op1_grad = + builder.Build<paddle::dialect::MatmulGradOp>( + full_input_op1.out(), full_weight_op1.out(), add_op1_grad.x_grad()); + + builder.Build<paddle::dialect::FetchOp>(gelu_op2.out(), "out", 0); + builder.Build<paddle::dialect::FetchOp>(matmul_op1_grad.x_grad(), "dx", 1); +} + +TEST(DrrTest, FusedLinear) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<pir::BuiltinDialect>(); + pir::Program program(ctx); + pir::Builder builder = pir::Builder(ctx, program.block()); + BuildProgram(builder); + + EXPECT_EQ(program.block()->size(), 34u); + + pir::PassManager pm(ctx); + pm.AddPass(pir::CreateFusedGemmEpiloguePass()); + // pm.EnablePassTiming(); + pm.EnableIRPrinting(); + + CHECK_EQ(pm.Run(&program), true); + EXPECT_EQ(program.block()->size(), 22u); +} diff --git a/test/cpp/pir/pattern_rewrite/drr_same_type_binding_test.cc b/test/cpp/pir/pattern_rewrite/drr_same_type_binding_test.cc new file mode 100644 index 00000000000000..cb4c6e4b0b92f6 --- /dev/null +++ b/test/cpp/pir/pattern_rewrite/drr_same_type_binding_test.cc @@ -0,0 +1,332 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include <glog/logging.h> +#include <gtest/gtest.h> +#include <memory> + +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/drr/api/drr_pattern_base.h" +#include "paddle/pir/core/builtin_dialect.h" +#include "paddle/pir/pass/pass.h" +#include "paddle/pir/pass/pass_manager.h" +#include "paddle/pir/pattern_rewrite/pattern_rewrite_driver.h" +#include "paddle/pir/transforms/dead_code_elimination_pass.h" + +/* Source pattern: + input1 + / | \ \ \ + / | \ \ \ + full / | | \ \ full_tmp + / | transpos1 | trans2 trans3 \ / | + / | / | | | | \ / | + softmax1 | / | | | | \ / | + \ | / softmax2 | | | add1 | + \ | / \ | \ / | | + layernorm matmul2 matmul1 \ | + / | \ | | \ | + / | \ \ / \ | + / | \ matmul3 add2 + | | | / | \ | + | | | / | \ | + | | | / | \ | + | | | trans4 trans5 trans6 | + | | | | | | | + | | | relu1 softmax3 softmax4 relu2 + | | | | | | | + output0 output1 output2 output3 output4 output5 output6 +*/ + +class SameTypeBindingTestPattern + // This class is for test cases of the same type of OP. + // (without considering the computational logic between OPs, + // only focusing on the process of matching and replacing) + : public pir::drr::DrrPatternBase<SameTypeBindingTestPattern> { + public: + void operator()(pir::drr::DrrPatternContext *ctx) const override { + pir::drr::SourcePattern src = ctx->SourcePattern(); + + // path 1 + const auto &transpose_1 = + src.Op("pd_op.transpose", {{"perm", src.Attr("perm_1")}}); + src.Tensor("transpose_1_out") = transpose_1(src.Tensor("input_1")); + const auto &softmax_2 = + src.Op("pd_op.softmax", {{"axis", src.Attr("softmax_2_axis")}}); + src.Tensor("softmax_2_out") = softmax_2(src.Tensor("transpose_1_out")); + const auto &matmul_2 = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("matmul_2_tradnspose_x")}, + {"transpose_y", src.Attr("matmul_2_transpose_y")}}); + src.Tensor("matmul_2_out") = + matmul_2(src.Tensor("softmax_2_out"), src.Tensor("input_1")); + + // path 2 + const auto &full_1 = src.Op("pd_op.full", + {{"shape", src.Attr("shape_1")}, + {"value", src.Attr("value_1")}, + {"dtype", src.Attr("dtype_1")}, + {"place", src.Attr("place_1")}}); + src.Tensor("full_1_out") = full_1(); + const auto &softmax_1 = + src.Op("pd_op.softmax", {{"axis", src.Attr("softmax_1_axis")}}); + src.Tensor("softmax_1_out") = softmax_1(src.Tensor("full_1_out")); + const auto &layernorm_1 = + src.Op("pd_op.layer_norm", + {{"epsilon", src.Attr("layernorm_epsilon")}, + {"begin_norm_axis", src.Attr("layernorm_begin_norm_axis")}}); + layernorm_1({&src.Tensor("transpose_1_out"), + &src.Tensor("full_1_out"), + &src.Tensor("softmax_1_out")}, + {&src.Tensor("output0"), + &src.Tensor("output1"), + &src.Tensor("output2")}); + + // path 3 + const auto &transpose_2 = + src.Op("pd_op.transpose", {{"perm", src.Attr("perm_2")}}); + const auto &transpose_3 = + src.Op("pd_op.transpose", {{"perm", src.Attr("perm_3")}}); + const auto &matmul_1 = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("matmul_1_transpose_x")}, + {"transpose_y", src.Attr("matmul_1_transpose_y")}}); + src.Tensor("matmul_1_out") = matmul_1(transpose_2(src.Tensor("input_1")), + transpose_3(src.Tensor("input_1"))); + const auto &matmul_3 = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("matmul_3_transpose_x")}, + {"transpose_y", src.Attr("matmul_3_transpose_y")}}); + src.Tensor("matmul_3_out") = + 
matmul_3(src.Tensor("matmul_2_out"), src.Tensor("matmul_1_out")); + const auto &transpose_4 = + src.Op("pd_op.transpose", {{"perm", src.Attr("perm_4")}}); + const auto &transpose_5 = + src.Op("pd_op.transpose", {{"perm", src.Attr("perm_5")}}); + const auto &transpose_6 = + src.Op("pd_op.transpose", {{"perm", src.Attr("perm_6")}}); + const auto &relu_1 = src.Op("pd_op.relu"); + const auto &softmax_3 = + src.Op("pd_op.softmax", {{"axis", src.Attr("softmax_3_axis")}}); + const auto &softmax_4 = + src.Op("pd_op.softmax", {{"axis", src.Attr("softmax_4_axis")}}); + src.Tensor("output3") = relu_1(transpose_4(src.Tensor("matmul_3_out"))); + src.Tensor("output4") = softmax_3(transpose_5(src.Tensor("matmul_3_out"))); + src.Tensor("output5") = softmax_4(transpose_6(src.Tensor("matmul_3_out"))); + + // path 4 + const auto &full_tmp = src.Op("pd_op.full", + {{"shape", src.Attr("shape_tmp")}, + {"value", src.Attr("value_tmp")}, + {"dtype", src.Attr("dtype_tmp")}, + {"place", src.Attr("place_tmp")}}); + src.Tensor("full_tmp_out") = full_tmp(); + const auto &add_1 = src.Op("pd_op.add"); + src.Tensor("add_1_out") = + add_1(src.Tensor("input_1"), src.Tensor("full_tmp_out")); + const auto &add_2 = src.Op("pd_op.add"); + src.Tensor("add_2_out") = + add_2(src.Tensor("add_1_out"), src.Tensor("full_tmp_out")); + const auto &relu_2 = src.Op("pd_op.relu"); + src.Tensor("output6") = relu_2(src.Tensor("add_2_out")); + + pir::drr::ResultPattern res = src.ResultPattern(); + const auto &transpose_7 = + res.Op("pd_op.transpose", {{"perm", src.Attr("perm_4")}}); + res.Tensor("output0") = transpose_7(res.Tensor("input_1")); + const auto &transpose_8 = + res.Op("pd_op.transpose", {{"perm", src.Attr("perm_5")}}); + res.Tensor("output1") = transpose_8(res.Tensor("input_1")); + const auto &full_2 = res.Op("pd_op.full", + {{"shape", src.Attr("shape_tmp")}, + {"value", src.Attr("value_tmp")}, + {"dtype", src.Attr("dtype_tmp")}, + {"place", src.Attr("place_tmp")}}); + const auto &full_3 = res.Op("pd_op.full", + {{"shape", src.Attr("shape_tmp")}, + {"value", src.Attr("value_tmp")}, + {"dtype", src.Attr("dtype_tmp")}, + {"place", src.Attr("place_tmp")}}); + const auto &full_4 = res.Op("pd_op.full", + {{"shape", src.Attr("shape_tmp")}, + {"value", src.Attr("value_tmp")}, + {"dtype", src.Attr("dtype_tmp")}, + {"place", src.Attr("place_tmp")}}); + const auto &full_5 = res.Op("pd_op.full", + {{"shape", src.Attr("shape_tmp")}, + {"value", src.Attr("value_tmp")}, + {"dtype", src.Attr("dtype_tmp")}, + {"place", src.Attr("place_tmp")}}); + const auto &full_6 = res.Op("pd_op.full", + {{"shape", src.Attr("shape_tmp")}, + {"value", src.Attr("value_tmp")}, + {"dtype", src.Attr("dtype_tmp")}, + {"place", src.Attr("place_tmp")}}); + res.Tensor("output2") = full_2(); + res.Tensor("output3") = full_3(); + res.Tensor("output4") = full_4(); + res.Tensor("output5") = full_5(); + res.Tensor("output6") = full_6(); + } +}; + +void BuildProgram(pir::Builder &builder) { // NOLINT + paddle::dialect::FullOp full_input_op1 = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{4, 3, 16}, + 1.5, + phi::DataType::FLOAT32, + phi::CPUPlace()); + + // path 1 + paddle::dialect::TransposeOp transpose_op1 = + builder.Build<paddle::dialect::TransposeOp>(full_input_op1.out(), + std::vector<int>{0, 1, 2}); + + paddle::dialect::SoftmaxOp softmax_op2 = + builder.Build<paddle::dialect::SoftmaxOp>(transpose_op1.out(), -1); + + paddle::dialect::MatmulOp matmul_op2 = + builder.Build<paddle::dialect::MatmulOp>(softmax_op2.out(), + full_input_op1.out()); + + // path 
2 + paddle::dialect::FullOp full_op_scale = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{48}, + 1.5, + phi::DataType::FLOAT32, + phi::CPUPlace()); + paddle::dialect::SoftmaxOp softmax_op_bias = + builder.Build<paddle::dialect::SoftmaxOp>(full_op_scale.out(), -1); + paddle::dialect::LayerNormOp layernorm_op1 = + builder.Build<paddle::dialect::LayerNormOp>( + transpose_op1.out(), full_op_scale.out(), softmax_op_bias.out()); + + // path 3 + paddle::dialect::TransposeOp transpose_op2 = + builder.Build<paddle::dialect::TransposeOp>(full_input_op1.out(), + std::vector<int>{0, 1, 2}); + + paddle::dialect::TransposeOp transpose_op3 = + builder.Build<paddle::dialect::TransposeOp>(full_input_op1.out(), + std::vector<int>{0, 1, 2}); + + paddle::dialect::MatmulOp matmul_op1 = + builder.Build<paddle::dialect::MatmulOp>(transpose_op2.out(), + transpose_op3.out()); + + paddle::dialect::MatmulOp matmul_op3 = + builder.Build<paddle::dialect::MatmulOp>(matmul_op2.out(), + matmul_op1.out()); + + paddle::dialect::TransposeOp transpose_op4 = + builder.Build<paddle::dialect::TransposeOp>(matmul_op3.out(), + std::vector<int>{0, 1, 2}); + + paddle::dialect::ReluOp relu_op1 = + builder.Build<paddle::dialect::ReluOp>(transpose_op4.out()); + + paddle::dialect::TransposeOp transpose_op5 = + builder.Build<paddle::dialect::TransposeOp>(matmul_op3.out(), + std::vector<int>{0, 1, 2}); + + paddle::dialect::SoftmaxOp softmax_op3 = + builder.Build<paddle::dialect::SoftmaxOp>(transpose_op5.out(), -1); + + paddle::dialect::TransposeOp transpose_op6 = + builder.Build<paddle::dialect::TransposeOp>(matmul_op3.out(), + std::vector<int>{0, 1, 2}); + + paddle::dialect::SoftmaxOp softmax_op4 = + builder.Build<paddle::dialect::SoftmaxOp>(transpose_op6.out(), -1); + + // path 4 + paddle::dialect::FullOp full_input_op2 = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{4, 3, 16}, + 1.5, + phi::DataType::FLOAT32, + phi::CPUPlace()); + + paddle::dialect::AddOp add_op1 = builder.Build<paddle::dialect::AddOp>( + full_input_op1.out(), full_input_op2.out()); + + paddle::dialect::AddOp add_op2 = builder.Build<paddle::dialect::AddOp>( + add_op1.out(), full_input_op2.out()); + + paddle::dialect::ReluOp relu_op2 = + builder.Build<paddle::dialect::ReluOp>(add_op2.out()); + + // tail + paddle::dialect::MatmulOp matmul_op4 = + builder.Build<paddle::dialect::MatmulOp>(layernorm_op1.variance(), + layernorm_op1.mean()); + + paddle::dialect::MatmulOp matmul_op5 = + builder.Build<paddle::dialect::MatmulOp>(relu_op1.out(), + softmax_op3.out()); + + paddle::dialect::MatmulOp matmul_op6 = + builder.Build<paddle::dialect::MatmulOp>(softmax_op4.out(), + relu_op2.out()); + + builder.Build<paddle::dialect::FetchOp>(matmul_op4.out(), "out1", 0); + builder.Build<paddle::dialect::FetchOp>(matmul_op5.out(), "out2", 1); + builder.Build<paddle::dialect::FetchOp>(matmul_op6.out(), "out3", 2); +} + +class DrrPatternRewritePass : public pir::Pass { + public: + DrrPatternRewritePass() : pir::Pass("DrrPatternRewritePass", 1) {} + + bool Initialize(pir::IrContext *context) override { + pir::RewritePatternSet ps(context); + ps.Add(SameTypeBindingTestPattern().Build(context)); + + patterns_ = pir::FrozenRewritePatternSet(std::move(ps)); + return true; + } + + void Run(pir::Operation *op) override { + pir::GreedyRewriteConfig cfg; + cfg.use_top_down_traversal = true; + cfg.max_iterations = 10; + pir::ApplyPatternsGreedily(op->region(0), patterns_, cfg); + } + + bool CanApplyOn(pir::Operation *op) const override { + return op->name() == 
"builtin.module" && op->num_regions() > 0; + } + + private: + pir::FrozenRewritePatternSet patterns_; +}; + +TEST(DrrTest, drr_demo) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<pir::BuiltinDialect>(); + pir::Program program(ctx); + pir::Builder builder = pir::Builder(ctx, program.block()); + BuildProgram(builder); + + EXPECT_EQ(program.block()->size(), 27u); + + pir::PassManager pm(ctx); + pm.AddPass(std::make_unique<DrrPatternRewritePass>()); + pm.AddPass(pir::CreateDeadCodeEliminationPass()); + // pm.EnablePassTiming(); + pm.EnableIRPrinting(); + + CHECK_EQ(pm.Run(&program), true); + EXPECT_EQ(program.block()->size(), 13u); +} diff --git a/test/cpp/pir/pattern_rewrite/drr_test.cc b/test/cpp/pir/pattern_rewrite/drr_test.cc new file mode 100644 index 00000000000000..f607fa5a083260 --- /dev/null +++ b/test/cpp/pir/pattern_rewrite/drr_test.cc @@ -0,0 +1,232 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <glog/logging.h> +#include <gtest/gtest.h> +#include <memory> + +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/drr/api/drr_pattern_base.h" +#include "paddle/pir/core/builtin_dialect.h" +#include "paddle/pir/pass/pass.h" +#include "paddle/pir/pass/pass_manager.h" +#include "paddle/pir/pattern_rewrite/pattern_rewrite_driver.h" +#include "paddle/pir/transforms/dead_code_elimination_pass.h" + +class RemoveRedundentReshapePattern + : public pir::drr::DrrPatternBase<RemoveRedundentReshapePattern> { + public: + void operator()(pir::drr::DrrPatternContext *ctx) const override { + // Source patterns + pir::drr::SourcePattern pat = ctx->SourcePattern(); + const auto &reshape1 = pat.Op("pd_op.reshape"); + const auto &reshape2 = pat.Op("pd_op.reshape"); + + reshape1({&pat.Tensor("arg0"), &pat.Tensor("shape0")}, + {&pat.Tensor("out1"), &pat.Tensor("xshape_0")}); + reshape2({&pat.Tensor("out1"), &pat.Tensor("shape1")}, + {&pat.Tensor("ret"), &pat.Tensor("xshape_1")}); + + // Result patterns + pir::drr::ResultPattern res = pat.ResultPattern(); + res.Op("pd_op.reshape")({&res.Tensor("arg0"), &res.Tensor("shape1")}, + {&res.Tensor("ret"), &res.Tensor("xshape_1")}); + } +}; + +class FoldExpandToConstantPattern + : public pir::drr::DrrPatternBase<FoldExpandToConstantPattern> { + public: + void operator()(pir::drr::DrrPatternContext *ctx) const override { + // Source Pattern + pir::drr::SourcePattern pat = ctx->SourcePattern(); + const auto &full1 = pat.Op("pd_op.full", + {{"shape", pat.Attr("shape_1")}, + {"value", pat.Attr("value_1")}, + {"dtype", pat.Attr("dtype_1")}, + {"place", pat.Attr("place_1")}}); + const auto &full_int_array1 = + pat.Op("pd_op.full_int_array", + {{"value", pat.Attr("expand_shape_value")}, + {"dtype", pat.Attr("dtype_2")}, + {"place", pat.Attr("place_2")}}); + const auto &expand = 
pat.Op("pd_op.expand"); + pat.Tensor("ret") = expand(full1(), full_int_array1()); + + // Result patterns + pir::drr::ResultPattern res = pat.ResultPattern(); + const auto &full2 = res.Op("pd_op.full", + {{"shape", pat.Attr("expand_shape_value")}, + {"value", pat.Attr("value_1")}, + {"dtype", pat.Attr("dtype_1")}, + {"place", pat.Attr("place_1")}}); + res.Tensor("ret") = full2(); + } +}; + +class RemoveRedundentTransposePattern + : public pir::drr::DrrPatternBase<RemoveRedundentTransposePattern> { + public: + void operator()(pir::drr::DrrPatternContext *ctx) const override { + pir::drr::SourcePattern pat = ctx->SourcePattern(); + const auto &transpose1 = + pat.Op("pd_op.transpose", {{"perm", pat.Attr("perm_1")}}); + const auto &transpose2 = + pat.Op("pd_op.transpose", {{"perm", pat.Attr("perm_2")}}); + + pat.Tensor("ret") = transpose2(transpose1(pat.Tensor("arg_transpose"))); + + pir::drr::ResultPattern res = pat.ResultPattern(); + const auto &new_perm_attr = res.Attr( + [](const pir::drr::MatchContext &match_ctx) -> std::vector<int> { + const auto &perm1 = match_ctx.Attr<std::vector<int>>("perm_1"); + const auto &perm2 = match_ctx.Attr<std::vector<int>>("perm_2"); + std::vector<int> new_perm; + for (int v : perm2) { + new_perm.emplace_back(perm1[v]); + } + return new_perm; + }); + const auto &tranpose_continuous = + res.Op("pd_op.transpose", {{"perm", new_perm_attr}}); + + res.Tensor("ret") = tranpose_continuous(res.Tensor("arg_transpose")); + } +}; + +class RemoveRedundentCastPattern + : public pir::drr::DrrPatternBase<RemoveRedundentCastPattern> { + void operator()(pir::drr::DrrPatternContext *ctx) const override { + auto pat = ctx->SourcePattern(); + pat.Tensor("tmp") = pat.Op( + "pd_op.cast", {{"dtype", pat.Attr("dtype1")}})(pat.Tensor("arg0")); + pat.Tensor("ret") = pat.Op( + "pd_op.cast", {{"dtype", pat.Attr("dtype2")}})(pat.Tensor("tmp")); + auto res = pat.ResultPattern(); + res.Tensor("ret") = res.Op( + "pd_op.cast", {{"dtype", pat.Attr("dtype2")}})(res.Tensor("arg0")); + } +}; + +class RemoveUselessCastPattern + : public pir::drr::DrrPatternBase<RemoveUselessCastPattern> { + public: + void operator()(pir::drr::DrrPatternContext *ctx) const override { + auto pat = ctx->SourcePattern(); + pat.Tensor("ret") = pat.Op("pd_op.cast")(pat.Tensor("arg0")); + pat.RequireEqual(pat.Tensor("ret").dtype(), pat.Tensor("arg0").dtype()); + auto res = pat.ResultPattern(); + res.Tensor("ret").Assign(res.Tensor("arg0")); + } +}; + +void BuildProgram(pir::Builder &builder) { // NOLINT + paddle::dialect::FullOp full_input_op = + builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{4, 3, 16}, + 1.5, + phi::DataType::FLOAT32, + phi::CPUPlace()); + + paddle::dialect::FullIntArrayOp full_int_array_op = + builder.Build<paddle::dialect::FullIntArrayOp>( + std::vector<int64_t>{4, 3, 16, 16}, + phi::DataType::FLOAT32, + phi::CPUPlace()); + + paddle::dialect::ExpandOp expand_op = + builder.Build<paddle::dialect::ExpandOp>(full_input_op.out(), + full_int_array_op.out()); + + paddle::dialect::ReshapeOp reshape_op1 = + builder.Build<paddle::dialect::ReshapeOp>( + expand_op.out(), std::vector<int64_t>{16, 3, 4, 16}); + + paddle::dialect::ReshapeOp reshape_op2 = + builder.Build<paddle::dialect::ReshapeOp>( + reshape_op1.out(), std::vector<int64_t>{16, 3, 4, 16}); + + paddle::dialect::ReluOp relu_op = + builder.Build<paddle::dialect::ReluOp>(reshape_op2.out()); + + paddle::dialect::CastOp cast_op1 = builder.Build<paddle::dialect::CastOp>( + relu_op.out(), phi::DataType::FLOAT64); + + paddle::dialect::CastOp 
cast_op2 = builder.Build<paddle::dialect::CastOp>( + cast_op1.out(), phi::DataType::FLOAT32); + + paddle::dialect::TransposeOp transpose_op1 = + builder.Build<paddle::dialect::TransposeOp>(cast_op2.out(), + std::vector<int>{0, 2, 1, 3}); + + paddle::dialect::TransposeOp transpose_op2 = + builder.Build<paddle::dialect::TransposeOp>(transpose_op1.out(), + std::vector<int>{1, 0, 2, 3}); + + paddle::dialect::ReluOp relu_op_second = + builder.Build<paddle::dialect::ReluOp>(transpose_op2.out()); + + builder.Build<paddle::dialect::FetchOp>(relu_op_second.out(), "out", 0); +} + +class DrrPatternRewritePass : public pir::Pass { + public: + DrrPatternRewritePass() : pir::Pass("DrrPatternRewritePass", 1) {} + + bool Initialize(pir::IrContext *context) override { + pir::RewritePatternSet ps(context); + ps.Add(RemoveRedundantReshapePattern().Build(context)); + ps.Add(RemoveRedundantTransposePattern().Build(context)); + ps.Add(RemoveRedundantCastPattern().Build(context)); + ps.Add(RemoveUselessCastPattern().Build(context)); + ps.Add(FoldExpandToConstantPattern().Build(context)); + + patterns_ = pir::FrozenRewritePatternSet(std::move(ps)); + return true; + } + + void Run(pir::Operation *op) override { + pir::GreedyRewriteConfig cfg; + cfg.use_top_down_traversal = true; + cfg.max_iterations = 10; + pir::ApplyPatternsGreedily(op->region(0), patterns_, cfg); + } + + bool CanApplyOn(pir::Operation *op) const override { + return op->name() == "builtin.module" && op->num_regions() > 0; + } + + private: + pir::FrozenRewritePatternSet patterns_; +}; + +TEST(DrrTest, drr_demo) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<pir::BuiltinDialect>(); + pir::Program program(ctx); + pir::Builder builder = pir::Builder(ctx, program.block()); + BuildProgram(builder); + + EXPECT_EQ(program.block()->size(), 14u); + + pir::PassManager pm(ctx); + pm.AddPass(std::make_unique<DrrPatternRewritePass>()); + pm.AddPass(pir::CreateDeadCodeEliminationPass()); + // pm.EnablePassTiming(); + pm.EnableIRPrinting(); + + CHECK_EQ(pm.Run(&program), true); + EXPECT_EQ(program.block()->size(), 7u); +} diff --git a/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc b/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc index adfe431a6be2ba..1499ba161bb09d 100644 --- a/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc +++ b/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc @@ -79,11 +79,11 @@ class Operation1 : public pir::Op<Operation1> { static const char *name() { return "test.Operation1"; } static constexpr uint32_t attributes_num = 2; static const char *attributes_name[attributes_num]; // NOLINT - void Verify(); + void VerifySig(); static void InferShape() { VLOG(2) << "This is op2's InferShape interface."; } }; -void Operation1::Verify() { +void Operation1::VerifySig() { auto &attributes = this->attributes(); if (attributes.count("op2_attr1") == 0 || (!attributes.at("op2_attr1").isa<pir::StrAttribute>())) { @@ -390,7 +390,7 @@ class Conv2dFusionOpTest : public pir::Op<Conv2dFusionOpTest, pir::OpResult bias_, pir::OpResult residual_, pir::AttributeMap attributes); - void Verify(); + void VerifySig(); pir::Value input() { return operand_source(0); } pir::Value filter() { return operand_source(1); } pir::Value bias() { return operand_source(2); } @@ -767,7 +767,7 @@ void Conv2dFusionOpTest::Build(pir::Builder &builder, argument.AddOutputs(argument_outputs.begin(), argument_outputs.end()); } -void Conv2dFusionOpTest::Verify() { +void
Conv2dFusionOpTest::VerifySig() { VLOG(4) << "Start Verifying inputs, outputs and attributes for: Conv2dFusionOp."; VLOG(4) << "Verifying inputs:"; @@ -1111,9 +1111,12 @@ void BuildProgram(pir::Builder &builder) { // NOLINT // TODO(wilber): Add a normal test. TEST(pattern_rewrite, Patterns) { pir::IrContext *ctx = pir::IrContext::Instance(); + + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ctx->GetOrRegisterDialect<pir::BuiltinDialect>(); auto *test_dialect = ctx->GetOrRegisterDialect<Conv2dFusionTestDialect>(); test_dialect->RegisterOp<paddle::dialect::Conv2dFusionOpTest>(); - ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + pir::Program program(ctx); pir::Builder builder = pir::Builder(ctx, program.block()); BuildProgram(builder); @@ -1122,7 +1125,7 @@ TEST(pattern_rewrite, Patterns) { pir::PassManager pm(ctx); pm.AddPass(std::make_unique<TestPass>()); - // pm.AddPass(ir::CreateConstantFoldingPass()); + // pm.AddPass(pir::CreateConstantFoldingPass()); pm.AddPass(pir::CreateDeadCodeEliminationPass()); pm.AddPass(pir::CreateReorderBlockOpsPass()); pm.EnablePassTiming(); diff --git a/test/cpp/pir/shape_dialect/CMakeLists.txt b/test/cpp/pir/shape_dialect/CMakeLists.txt index d5fe787de4a801..349d6a32dfa224 100644 --- a/test/cpp/pir/shape_dialect/CMakeLists.txt +++ b/test/cpp/pir/shape_dialect/CMakeLists.txt @@ -1,13 +1,22 @@ -cc_test_old( - symbolic_op_test +paddle_test( + shape_op_test SRCS - symbolic_op_test.cc + shape_op_test.cc DEPS pd_op_dialect pir gtest) -cc_test_old( +paddle_test( + shape_struct_test + SRCS + shape_struct_test.cc + DEPS + pd_op_dialect + pir + gtest) + +paddle_test( constraint_pass_test SRCS constraint_pass_test.cc @@ -19,3 +28,9 @@ cc_test_old( set_tests_properties( constraint_pass_test PROPERTIES ENVIRONMENT "FLAGS_enable_new_ir_in_executor=true") + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ tests on Windows. Since the tests will + # be built only in CI, we assume the generator on Windows is Ninja. + copy_onnx(shape_op_test) +endif() diff --git a/test/cpp/pir/shape_dialect/constraint_pass_test.cc b/test/cpp/pir/shape_dialect/constraint_pass_test.cc index 7c645044a09d08..860bf34a69ac4f 100644 --- a/test/cpp/pir/shape_dialect/constraint_pass_test.cc +++ b/test/cpp/pir/shape_dialect/constraint_pass_test.cc @@ -39,7 +39,7 @@ #include "paddle/pir/core/value.h" #include "paddle/pir/dialect/shape/ir/shape_dialect.h" #include "paddle/pir/dialect/shape/ir/shape_op.h" -#include "paddle/pir/dialect/shape/transforms/shape_optimization_pass.h" +#include "paddle/pir/dialect/shape/transforms/passes.h" #include "paddle/pir/dialect/shape/utils/shape_utils.h" #include "paddle/pir/pass/pass.h" #include "paddle/pir/pass/pass_manager.h" @@ -133,8 +133,5 @@ TEST(constraint_pass, shape_computation_run) { EXPECT_TRUE(pm.Run(&program)); pir::SymbolicDimMgr mgr(program.module_op()); EXPECT_TRUE(mgr.Load()); - pir::ShapeComputationIRAnalysis analysis(program.module_op(), mgr); - EXPECT_TRUE(analysis.Run()); - EXPECT_FALSE(analysis.Run()); EXPECT_TRUE(mgr.Save()); } diff --git a/test/cpp/pir/shape_dialect/shape_op_test.cc b/test/cpp/pir/shape_dialect/shape_op_test.cc new file mode 100644 index 00000000000000..9d71e721fe72df --- /dev/null +++ b/test/cpp/pir/shape_dialect/shape_op_test.cc @@ -0,0 +1,201 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pir/dialect/shape/ir/shape_op.h" +#include <gtest/gtest.h> +#include <map> +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/builder.h" +#include "paddle/pir/core/builtin_type.h" +#include "paddle/pir/core/builtin_type_interfaces.h" +#include "paddle/pir/core/dialect.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/dialect/shape/ir/shape_dialect.h" +#include "paddle/pir/dialect/shape/utils/shape_utils.h" +#include "paddle/pir/dialect/shape/utils/symbol_table.h" + +pir::AttributeMap CreateAttributeMap( + const std::vector<std::string> &attribute_names, + const std::vector<std::string> &attributes) { + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::AttributeMap attr_map; + for (size_t i = 0; i < attribute_names.size(); i++) { + pir::Attribute attr_value = pir::StrAttribute::get(ctx, attributes[i]); + attr_map.insert( + std::pair<std::string, pir::Attribute>(attribute_names[i], attr_value)); + } + return attr_map; +} + +pir::Operation *CreateDenseTensorOp( + pir::IrContext *ctx, + const phi::DDim &dims, + const std::vector<std::string> &attribute_names, + const std::vector<std::string> &attributes) { + std::vector<pir::Value> op_inputs = {}; + pir::Type fp32_dtype = pir::Float32Type::get(ctx); + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + std::vector<pir::Type> op_output_types = { + paddle::dialect::DenseTensorType::get( + ctx, fp32_dtype, dims, data_layout, lod, offset)}; + pir::Operation *op = + pir::Operation::Create(op_inputs, + CreateAttributeMap(attribute_names, attributes), + op_output_types, + pir::OpInfo()); + return op; +} + +TEST(shape_op, dim) { + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::Program program(ctx); + ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); + pir::Builder builder = pir::Builder(ctx, program.block()); + + pir::dialect::DimOp dim_op = builder.Build<pir::dialect::DimOp>("S0"); + pir::OpResult res = dim_op.out(); + EXPECT_EQ(dim_op.getName(), "S0"); + dim_op.setName("S1"); + EXPECT_EQ(dim_op.getName(), "S1"); + EXPECT_EQ(res.owner(), dim_op.operation()); + EXPECT_EQ(res.type(), pir::IndexType::get(ctx)); +} + +TEST(shape_op, tie_product_equal) { + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::Program program(ctx); + ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); + pir::Builder builder = pir::Builder(ctx, program.block()); + pir::SymbolTable symbol_table(program.module_op()); + + pir::OpResult dim_op0 = builder.Build<pir::dialect::DimOp>("S0").out(); + pir::OpResult dim_op1 = builder.Build<pir::dialect::DimOp>("S1").out(); + pir::OpResult dim_op2 = builder.Build<pir::dialect::DimOp>("S2").out(); + pir::OpResult dim_op3 = builder.Build<pir::dialect::DimOp>("S3").out(); + pir::OpResult dim_op4 = builder.Build<pir::dialect::DimOp>("S4").out(); + + pir::dialect::TieProductEqualOp tie_product_equal = +
builder.Build<pir::dialect::TieProductEqualOp>( + 2, + 3, + std::vector<pir::Value>{dim_op0, dim_op1, dim_op2, dim_op3, dim_op4}); + + std::vector<pir::Value> lhs = tie_product_equal.lhs(); + std::vector<pir::Value> rhs = tie_product_equal.rhs(); + + std::vector<pir::Value> lhs_ref{dim_op0, dim_op1}; + std::vector<pir::Value> rhs_ref{dim_op2, dim_op3, dim_op4}; + + EXPECT_EQ(symbol_table.insert(tie_product_equal), "tie_product_equal"); + EXPECT_EQ( + symbol_table.Lookup<pir::dialect::TieProductEqualOp>("tie_product_equal") + .size(), + static_cast<size_t>(1)); + EXPECT_EQ(symbol_table.Lookup<pir::dialect::TieProductEqualOp>( + "tie_product_equal")[0], + tie_product_equal); + EXPECT_EQ(lhs, lhs_ref); + EXPECT_EQ(rhs, rhs_ref); +} + +TEST(shape_op, tie_shape) { + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::Program program(ctx); + ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::Builder builder = pir::Builder(ctx, program.block()); + + auto op = CreateDenseTensorOp( + ctx, {pir::ShapedTypeInterface::kDynamic, 2}, {"op_attr"}, {"op_name"}); + pir::OpResult res = op->result(0); + + pir::dialect::TieShapeOp tie_shape_op = + builder.Build<pir::dialect::TieShapeOp>(res); + pir::Value tie_shape_op_value = tie_shape_op.value(); + + pir::Attribute attr_s0 = pir::StrAttribute::get(ctx, "S0"); + pir::Attribute attr_s1 = pir::StrAttribute::get(ctx, "S1"); + + std::vector<pir::Attribute> new_attrs = {attr_s0, attr_s1}; + + auto array_attr = pir::ArrayAttribute::get(ctx, new_attrs); + tie_shape_op->set_attribute( + pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), array_attr); + + std::vector<pir::Attribute> arr_attr_vec = + tie_shape_op + ->attribute<pir::ArrayAttribute>( + pir::dialect::SymbolicDim::GetSymbolicDimAttrName()) + .AsVector(); + + EXPECT_EQ(tie_shape_op_value, res); + EXPECT_EQ(arr_attr_vec.size(), static_cast<size_t>(2)); + EXPECT_EQ(arr_attr_vec[0].dyn_cast<pir::StrAttribute>(), attr_s0); + EXPECT_EQ(arr_attr_vec[1].dyn_cast<pir::StrAttribute>(), attr_s1); + EXPECT_TRUE(tie_shape_op->HasAttribute( + pir::dialect::SymbolicDim::GetSymbolicDimAttrName())); +} + +TEST(shape_op, func_op) { + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::Program program(ctx); + ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); + ::pir::Builder builder = ::pir::Builder(ctx, program.block()); + pir::dialect::FuncOp func_op = builder.Build<pir::dialect::FuncOp>(); + auto func_block = func_op.block(); + builder.SetInsertionPointToStart(func_block); + builder.Build<pir::ConstantOp>(pir::Int32Attribute::get(ctx, 2), + pir::Int32Type::get(ctx)); + EXPECT_EQ(func_block, func_op->region(0).front()); + EXPECT_EQ(func_op->region(0).size(), static_cast<size_t>(1)); + EXPECT_EQ(func_block->size(), static_cast<size_t>(1)); +} + +TEST(shape_op, tensor_dim) { + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::Program program(ctx); + ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); + pir::Builder builder = pir::Builder(ctx, program.block()); + + pir::Operation *op = CreateDenseTensorOp( + ctx, {pir::ShapedTypeInterface::kDynamic, 2}, {"op_attr"}, {"op_name"}); + pir::OpResult res_dense_tensor_value = op->result(0); + + pir::dialect::TensorDimOp tensor_dim_op0 = + builder.Build<pir::dialect::TensorDimOp>(res_dense_tensor_value, 0); + pir::OpResult res0 = tensor_dim_op0.out(); + + pir::OpResult index_value = + builder + .Build<pir::ConstantOp>(
pir::Int64Attribute::get(pir::IrContext::Instance(), 1), + pir::IndexType::get(pir::IrContext::Instance())) + ->result(0); + pir::dialect::TensorDimOp tensor_dim_op1 = + builder.Build<pir::dialect::TensorDimOp>(res_dense_tensor_value, + index_value); + pir::OpResult res1 = tensor_dim_op1.out(); + + EXPECT_EQ(res0.type(), pir::IndexType::get(ctx)); + EXPECT_EQ(res1.type(), pir::IndexType::get(ctx)); + EXPECT_EQ(tensor_dim_op0.source(), res_dense_tensor_value); + EXPECT_EQ(tensor_dim_op1.source(), res_dense_tensor_value); + EXPECT_EQ(tensor_dim_op1.index(), index_value); +} diff --git a/test/cpp/pir/shape_dialect/shape_struct_test.cc b/test/cpp/pir/shape_dialect/shape_struct_test.cc new file mode 100644 index 00000000000000..64b58a399a1508 --- /dev/null +++ b/test/cpp/pir/shape_dialect/shape_struct_test.cc @@ -0,0 +1,503 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <gtest/gtest.h> +#include <map> +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/builder.h" +#include "paddle/pir/core/builtin_type.h" +#include "paddle/pir/core/builtin_type_interfaces.h" +#include "paddle/pir/core/dialect.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/dialect/shape/ir/shape_dialect.h" +#include "paddle/pir/dialect/shape/ir/shape_op.h" +#include "paddle/pir/dialect/shape/utils/shape_utils.h" +#include "paddle/pir/dialect/shape/utils/symbol_table.h" + +pir::AttributeMap CreateAttributeMap( + const std::vector<std::string> &attribute_names, + const std::vector<std::string> &attributes) { + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::AttributeMap attr_map; + for (size_t i = 0; i < attribute_names.size(); i++) { + pir::Attribute attr_value = pir::StrAttribute::get(ctx, attributes[i]); + attr_map.insert( + std::pair<std::string, pir::Attribute>(attribute_names[i], attr_value)); + } + return attr_map; +} + +pir::Operation *CreateDenseTensorOp( + pir::IrContext *ctx, + const phi::DDim &dims, + const std::vector<std::string> &attribute_names, + const std::vector<std::string> &attributes) { + std::vector<pir::Value> op_inputs = {}; + pir::Type fp32_dtype = pir::Float32Type::get(ctx); + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + std::vector<pir::Type> op_output_types = { + paddle::dialect::DenseTensorType::get( + ctx, fp32_dtype, dims, data_layout, lod, offset)}; + pir::Operation *op = + pir::Operation::Create(op_inputs, + CreateAttributeMap(attribute_names, attributes), + op_output_types, + pir::OpInfo()); + return op; +} + +TEST(shape_struct_test, symbolic_dim) { + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::Program program(ctx); + ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); + pir::Builder builder = pir::Builder(ctx, program.block()); + + 
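+ // Note: judging from the getters and setters exercised below, the six + // arguments of the SymbolicDim builder are taken to be the symbol name, + // the dim size, and the four known-* flags (negative-one, non-size-one, + // non-size-zero, non-negative); the exact flag order is inferred from this + // test rather than from the op definition, so treat it as an assumption.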
pir::dialect::SymbolicDim sym_dim1 = builder.Build<pir::dialect::SymbolicDim>( + "S0", 10, false, false, false, false); + pir::dialect::SymbolicDim sym_dim2 = builder.Build<pir::dialect::SymbolicDim>( + "S1", 10, false, false, false, false); + + EXPECT_EQ(sym_dim1.GetDimSize(), 10); + EXPECT_EQ(sym_dim1.GetSymName(), "S0"); + EXPECT_FALSE(sym_dim1.GetKnownNegativeOne()); + EXPECT_FALSE(sym_dim1.GetKnownNonSizeOne()); + EXPECT_FALSE(sym_dim1.GetKnownNonSizeZero()); + EXPECT_FALSE(sym_dim1.GetKnownNonNegative()); + + EXPECT_FALSE(sym_dim1.IsDynamic()); + EXPECT_TRUE(sym_dim1.Merge(sym_dim2)); + + sym_dim1.SetDimSize(20); + sym_dim1.SetSymName("S2"); + sym_dim1.UpdateKnownNegativeOne(true); + sym_dim1.UpdateKnownNonSizeOne(true); + sym_dim1.UpdateKnownNonSizeZero(true); + sym_dim1.UpdateKnownNonNegative(true); + + EXPECT_FALSE(sym_dim1.Merge(sym_dim2)); + + EXPECT_EQ(sym_dim1.GetDimSize(), 20); + EXPECT_EQ(sym_dim1.GetSymName(), "S2"); + EXPECT_TRUE(sym_dim1.GetKnownNegativeOne()); + EXPECT_TRUE(sym_dim1.GetKnownNonSizeOne()); + EXPECT_TRUE(sym_dim1.GetKnownNonSizeZero()); + EXPECT_TRUE(sym_dim1.GetKnownNonNegative()); +} + +TEST(shape_struct_test, symbolic_dim_product) { + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::Program program(ctx); + ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); + pir::Builder builder = pir::Builder(ctx, program.block()); + pir::dialect::SymbolicDim sym_dim = builder.Build<pir::dialect::SymbolicDim>( + "S0", pir::ShapedTypeInterface::kDynamic, false, false, false, false); + pir::SymbolicDimProduct sym_dim_product1; + pir::SymbolicDimProduct sym_dim_product2; + sym_dim_product1.symbols.push_back(sym_dim); + sym_dim_product1.factor *= 10; + EXPECT_EQ(sym_dim_product1.factor, 10); + EXPECT_NE(sym_dim_product1, sym_dim_product2); + EXPECT_FALSE(sym_dim_product1.empty()); +} + +TEST(shape_struct_test, symbolic_dim_table) { + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::Program program(ctx); + ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); + pir::Builder builder = pir::Builder(ctx, program.block()); + pir::dialect::SymbolicDim sym_dim = builder.Build<pir::dialect::SymbolicDim>( + "S0", 10, false, false, false, false); + + pir::SymbolTable symbol_table(program.module_op()); + EXPECT_EQ(symbol_table.insert(sym_dim), "S0"); + EXPECT_EQ(symbol_table.Lookup<pir::dialect::SymbolicDim>("S0"), sym_dim); + EXPECT_EQ(symbol_table.getOp(), program.module_op()); + EXPECT_FALSE(symbol_table.Lookup<pir::dialect::SymbolicDim>("S1")); +} + +TEST(shape_struct_test, symbolic_dim_mgr_simple) { + /******************************************************/ + /* Mgr simple version, only SymbolicDim related func. 
*/ + /******************************************************/ + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::Program program(ctx); + ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::SymbolicDimMgr sym_dim_mgr(program.module_op()); + pir::dialect::SymbolicDim sym_dim_s0 = sym_dim_mgr.NewSymbolicDim(); + pir::dialect::SymbolicDim sym_dim_s1 = sym_dim_mgr.NewSymbolicDim(); + pir::dialect::SymbolicDim sym_dim_c10 = + sym_dim_mgr.NewConstantSymbolicDim(10); + sym_dim_mgr.MapSymbolicDimEqual(sym_dim_s0, sym_dim_s1); + + auto op = CreateDenseTensorOp( + ctx, {pir::ShapedTypeInterface::kDynamic, 2}, {"op_attr"}, {"op_name"}); + pir::Value res = op->result(0); + + std::vector<pir::dialect::SymbolicDim> sym_dim_vec = + sym_dim_mgr.CreateSymbolicDimsForRankedValue(res); + + EXPECT_EQ(sym_dim_s0.GetSymName(), "S0"); + EXPECT_EQ(sym_dim_s1.GetSymName(), "S1"); + EXPECT_EQ(sym_dim_s1.GetDimSize(), pir::ShapedTypeInterface::kDynamic); + EXPECT_EQ(sym_dim_c10.GetSymName(), "C10"); + EXPECT_EQ(sym_dim_c10.GetDimSize(), 10); + EXPECT_EQ(sym_dim_vec[0].GetSymName(), "S2"); + EXPECT_EQ(sym_dim_vec[1].GetSymName(), "C2"); + EXPECT_EQ(sym_dim_mgr.symbolTable().Lookup<pir::dialect::SymbolicDim>("S0"), + sym_dim_s0); + EXPECT_EQ(sym_dim_mgr.symbolTable().Lookup<pir::dialect::SymbolicDim>("C10"), + sym_dim_c10); + EXPECT_EQ(sym_dim_mgr.GetRootSymbolicDim(sym_dim_s1), sym_dim_s0); + EXPECT_TRUE(sym_dim_mgr.IsSymbolicDimEqual(sym_dim_s0, sym_dim_s1)); + EXPECT_FALSE(sym_dim_mgr.IsSymbolicDimEqual(sym_dim_s0, sym_dim_c10)); +} + +TEST(shape_struct_test, symbolic_dim_mgr_complex) { + /***************************************************************/ + /* Mgr with constraintOp, and SymbolicDimProduct related func. 
*/ + /***************************************************************/ + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::Program program(ctx); + ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + + pir::SymbolicDimMgr sym_dim_mgr(program.module_op()); + auto func_op = + sym_dim_mgr.symbolTable().getOp()->dyn_cast<pir::dialect::FuncOp>(); + + pir::Builder builder = pir::Builder(ctx, func_op.block()); + + pir::dialect::SymbolicDim sym_dim_s0 = sym_dim_mgr.NewSymbolicDim("S0"); + pir::dialect::SymbolicDim sym_dim_s1 = sym_dim_mgr.NewSymbolicDim("S1"); + pir::dialect::SymbolicDim sym_dim_s2 = sym_dim_mgr.NewSymbolicDim("S2"); + pir::dialect::SymbolicDim sym_dim_s3 = sym_dim_mgr.NewSymbolicDim("S3"); + pir::dialect::SymbolicDim sym_dim_s4 = sym_dim_mgr.NewSymbolicDim("S4"); + pir::dialect::SymbolicDim sym_dim_s5 = sym_dim_mgr.NewSymbolicDim("S5"); + pir::dialect::SymbolicDim sym_dim_s6 = sym_dim_mgr.NewSymbolicDim("S6"); + pir::dialect::SymbolicDim sym_dim_s7 = sym_dim_mgr.NewSymbolicDim("S7"); + pir::dialect::SymbolicDim sym_dim_s8 = sym_dim_mgr.NewSymbolicDim("S8"); + pir::dialect::SymbolicDim sym_dim_s9 = sym_dim_mgr.NewSymbolicDim("S9"); + pir::dialect::SymbolicDim sym_dim_s10 = sym_dim_mgr.NewSymbolicDim("S10"); + pir::dialect::SymbolicDim sym_dim_s11 = sym_dim_mgr.NewSymbolicDim("S11"); + pir::dialect::SymbolicDim sym_dim_s12 = sym_dim_mgr.NewSymbolicDim("S12"); + pir::dialect::SymbolicDim sym_dim_c10 = + sym_dim_mgr.NewConstantSymbolicDim(10); + pir::dialect::SymbolicDim sym_dim_c20 = + sym_dim_mgr.NewConstantSymbolicDim(20); + + pir::OpResult dim_op_s0 = builder.Build<pir::dialect::DimOp>("S0").out(); + pir::OpResult dim_op_s1 = builder.Build<pir::dialect::DimOp>("S1").out(); + pir::OpResult dim_op_s2 = builder.Build<pir::dialect::DimOp>("S2").out(); + pir::OpResult dim_op_s3 = builder.Build<pir::dialect::DimOp>("S3").out(); + pir::OpResult dim_op_s4 = builder.Build<pir::dialect::DimOp>("S4").out(); + pir::OpResult dim_op_s5 = builder.Build<pir::dialect::DimOp>("S5").out(); + pir::OpResult dim_op_s6 = builder.Build<pir::dialect::DimOp>("S6").out(); + pir::OpResult dim_op_s7 = builder.Build<pir::dialect::DimOp>("S7").out(); + pir::OpResult dim_op_s8 = builder.Build<pir::dialect::DimOp>("S8").out(); + pir::OpResult dim_op_s9 = builder.Build<pir::dialect::DimOp>("S9").out(); + pir::OpResult dim_op_s10 = builder.Build<pir::dialect::DimOp>("S10").out(); + pir::OpResult dim_op_s11 = builder.Build<pir::dialect::DimOp>("S11").out(); + pir::OpResult dim_op_c10 = builder.Build<pir::dialect::DimOp>("C10").out(); + pir::OpResult dim_op_c20 = builder.Build<pir::dialect::DimOp>("C20").out(); + pir::OpResult constant = + builder + .Build<pir::ConstantOp>(pir::Int32Attribute::get(ctx, 2), + pir::Int32Type::get(ctx)) + ->result(0); + + // Mark S1 == S2. + builder.Build<pir::dialect::TieProductEqualOp>( + 2, 2, std::vector<pir::Value>{constant, dim_op_s1, dim_op_s2, constant}); + // Mark S0 * S1 == S2 * S3, For check S0 == S3. + builder.Build<pir::dialect::TieProductEqualOp>( + 2, + 2, + std::vector<pir::Value>{dim_op_s0, dim_op_s1, dim_op_s2, dim_op_s3}); + // Mark S4 * S0 * S1 == S2 * S3 * S5, For check S4 == S5. + builder.Build<pir::dialect::TieProductEqualOp>( + 3, + 3, + std::vector<pir::Value>{ + dim_op_s4, dim_op_s0, dim_op_s1, dim_op_s2, dim_op_s3, dim_op_s5}); + // For check S6 == C10 * C20. 
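+ // Note (inferred from the usage in this test, not from the op's + // documentation): the two leading integers passed to TieProductEqualOp's + // builder give the lhs/rhs operand counts, so the first count values form + // the left product and the rest the right, e.g. (1, 2, {S6, C10, C20}) + // encodes S6 == C10 * C20.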
+ builder.Build<pir::dialect::TieProductEqualOp>( + 1, 2, std::vector<pir::Value>{dim_op_s6, dim_op_c10, dim_op_c20}); + // Mark C10 * S0 * S1 == S2 * S3 * S7, for check C10 == S7. + builder.Build<pir::dialect::TieProductEqualOp>( + 3, + 3, + std::vector<pir::Value>{ + dim_op_c10, dim_op_s0, dim_op_s1, dim_op_s2, dim_op_s3, dim_op_s7}); + + // For the unsimplified product case: S8 * S9 == S10 * S11 + builder.Build<pir::dialect::TieProductEqualOp>( + 2, + 2, + std::vector<pir::Value>{dim_op_s8, dim_op_s9, dim_op_s10, dim_op_s11}); + + auto op = CreateDenseTensorOp(ctx, + {pir::ShapedTypeInterface::kDynamic, + pir::ShapedTypeInterface::kDynamic, + pir::ShapedTypeInterface::kDynamic, + pir::ShapedTypeInterface::kDynamic, + pir::ShapedTypeInterface::kDynamic, + pir::ShapedTypeInterface::kDynamic}, + {"op0_attr"}, + {"op0_name"}); + auto op_ = CreateDenseTensorOp(ctx, + {pir::ShapedTypeInterface::kDynamic, + pir::ShapedTypeInterface::kDynamic, + pir::ShapedTypeInterface::kDynamic, + pir::ShapedTypeInterface::kDynamic, + pir::ShapedTypeInterface::kDynamic, + 10, + 20}, + {"op1_attr"}, + {"op1_name"}); + pir::OpResult res = op->result(0); + pir::OpResult res_ = op_->result(0); + + builder.SetInsertionPointToEnd(program.block()); + pir::dialect::TieShapeOp tie_shape_op1 = + builder.Build<pir::dialect::TieShapeOp>(res); + pir::dialect::TieShapeOp tie_shape_op2 = + builder.Build<pir::dialect::TieShapeOp>(res_); + + pir::Attribute attr_s0 = pir::StrAttribute::get(ctx, "S0"); + pir::Attribute attr_s1 = pir::StrAttribute::get(ctx, "S1"); + pir::Attribute attr_s2 = pir::StrAttribute::get(ctx, "S2"); + pir::Attribute attr_s3 = pir::StrAttribute::get(ctx, "S3"); + pir::Attribute attr_s4 = pir::StrAttribute::get(ctx, "S4"); + pir::Attribute attr_s5 = pir::StrAttribute::get(ctx, "S5"); + pir::Attribute attr_s6 = pir::StrAttribute::get(ctx, "S6"); + pir::Attribute attr_s7 = pir::StrAttribute::get(ctx, "S7"); + pir::Attribute attr_s8 = pir::StrAttribute::get(ctx, "S8"); + pir::Attribute attr_s9 = pir::StrAttribute::get(ctx, "S9"); + pir::Attribute attr_s10 = pir::StrAttribute::get(ctx, "S10"); + pir::Attribute attr_s11 = pir::StrAttribute::get(ctx, "S11"); + pir::Attribute attr_c10 = pir::StrAttribute::get(ctx, "C10"); + pir::Attribute attr_c20 = pir::StrAttribute::get(ctx, "C20"); + + std::vector<pir::Attribute> new_attrs1 = { + attr_s0, attr_s1, attr_s2, attr_s3, attr_s4, attr_s5}; + std::vector<pir::Attribute> new_attrs2 = {attr_s6, + attr_s7, + attr_s8, + attr_s9, + attr_s10, + attr_s11, + attr_c10, + attr_c20}; + std::vector<pir::Attribute> new_attrs_ref = { + attr_s0, attr_s1, attr_s1, attr_s0, attr_s2, attr_s2}; + + auto array_attr1 = pir::ArrayAttribute::get(ctx, new_attrs1); + auto array_attr2 = pir::ArrayAttribute::get(ctx, new_attrs2); + auto array_attr_ref = pir::ArrayAttribute::get(ctx, new_attrs_ref); + + tie_shape_op1->set_attribute( + pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), array_attr1); + tie_shape_op2->set_attribute( + pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), array_attr2); + + EXPECT_TRUE(sym_dim_mgr.Load()); + + // For check indirect equality: S1 * S4 == S2 * S5 + pir::SymbolicDimProduct sym_dim_product_lhs1; + pir::SymbolicDimProduct sym_dim_product_rhs1; + + sym_dim_product_lhs1.symbols.push_back(sym_dim_s1); + sym_dim_product_lhs1.symbols.push_back(sym_dim_s4); + + sym_dim_product_rhs1.symbols.push_back(sym_dim_s2); + sym_dim_product_rhs1.symbols.push_back(sym_dim_s5); + + // For the incompletely simplified product check: S8 * S9 * S12 == S10 * S11 * S12 +
pir::SymbolicDimProduct sym_dim_product_lhs2; + pir::SymbolicDimProduct sym_dim_product_rhs2; + + sym_dim_product_lhs2.symbols.push_back(sym_dim_s8); + sym_dim_product_lhs2.symbols.push_back(sym_dim_s9); + sym_dim_product_lhs2.symbols.push_back(sym_dim_s12); + + sym_dim_product_rhs2.symbols.push_back(sym_dim_s10); + sym_dim_product_rhs2.symbols.push_back(sym_dim_s11); + sym_dim_product_rhs2.symbols.push_back(sym_dim_s12); + + // For check SimplifySymbolicDimProduct, {factor = 1, Sym = {S7}} => {factor = + // 10} + pir::SymbolicDimProduct sym_dim_product_s7; + sym_dim_product_s7.symbols.push_back(sym_dim_s7); + pir::SymbolicDimProduct simplified_product_s7 = + sym_dim_mgr.SimplifySymbolicDimProduct(sym_dim_product_s7); + + // For check SimplifySymbolicDimProductPair, X * Y * Y, Y * Y * Z => X, Z + pir::SymbolicDimProduct sym_dim_product_pair_lhs; + pir::SymbolicDimProduct sym_dim_product_pair_rhs; + pir::SymbolicDimProduct new_lhs, new_rhs; + sym_dim_product_pair_lhs.symbols.push_back(sym_dim_s4); + sym_dim_product_pair_lhs.symbols.push_back(sym_dim_s1); + sym_dim_product_pair_lhs.symbols.push_back(sym_dim_s2); + sym_dim_product_pair_rhs.symbols.push_back(sym_dim_s1); + sym_dim_product_pair_rhs.symbols.push_back(sym_dim_s2); + sym_dim_product_pair_rhs.symbols.push_back(sym_dim_s3); + + std::tie(new_lhs, new_rhs) = sym_dim_mgr.SimplifySymbolicDimProductPair( + sym_dim_product_pair_lhs, sym_dim_product_pair_rhs); + + // For check SymbolicDimProductDivide, {S4 * S1 * C20} / {S1 * C10} => {factor + // = 2 Sym = {S4}} + pir::SymbolicDimProduct sym_dim_product_div_lhs; + pir::SymbolicDimProduct sym_dim_product_div_rhs; + sym_dim_product_div_lhs.symbols.push_back(sym_dim_s4); + sym_dim_product_div_lhs.symbols.push_back(sym_dim_s1); + sym_dim_product_div_lhs.symbols.push_back(sym_dim_c20); + sym_dim_product_div_rhs.symbols.push_back(sym_dim_s1); + sym_dim_product_div_rhs.symbols.push_back(sym_dim_c10); + + pir::SymbolicDimProduct *divRes = sym_dim_mgr.SymbolicDimProductDivide( + sym_dim_product_div_lhs, sym_dim_product_div_rhs); + + EXPECT_TRUE(sym_dim_mgr.IsSymbolicDimEqual(sym_dim_s1, sym_dim_s2)); + EXPECT_TRUE(sym_dim_mgr.IsSymbolicDimEqual(sym_dim_s0, sym_dim_s3)); + EXPECT_TRUE(sym_dim_mgr.IsSymbolicDimEqual(sym_dim_s4, sym_dim_s5)); + EXPECT_EQ(sym_dim_s6.GetDimSize(), 200); + EXPECT_EQ(sym_dim_mgr.symbolTable().Lookup<pir::dialect::SymbolicDim>("C20"), + sym_dim_c20); + EXPECT_EQ(sym_dim_s7.GetDimSize(), sym_dim_c10.GetDimSize()); + EXPECT_EQ(simplified_product_s7.factor, 10); + EXPECT_EQ(simplified_product_s7.symbols.size(), static_cast<size_t>(0)); + EXPECT_EQ(new_lhs.symbols.size(), static_cast<size_t>(1)); + EXPECT_EQ(new_rhs.symbols.size(), static_cast<size_t>(1)); + EXPECT_EQ(new_lhs.symbols[0], sym_dim_mgr.GetRootSymbolicDim(sym_dim_s4)); + EXPECT_EQ(new_rhs.symbols[0], sym_dim_mgr.GetRootSymbolicDim(sym_dim_s3)); + EXPECT_EQ(divRes->factor, 2); + EXPECT_EQ(divRes->symbols.size(), static_cast<size_t>(1)); + EXPECT_EQ(divRes->symbols[0], sym_dim_mgr.GetRootSymbolicDim(sym_dim_s4)); + EXPECT_TRUE(sym_dim_mgr.IsSymbolicDimProductEqual(sym_dim_product_lhs1, + sym_dim_product_rhs1)); + EXPECT_TRUE(sym_dim_mgr.IsSymbolicDimProductEqual(sym_dim_product_lhs2, + sym_dim_product_rhs2)); + EXPECT_TRUE(sym_dim_mgr.Save()); + + pir::SymbolicDimMgr sym_dim_mgr_new(program.module_op()); + EXPECT_TRUE(sym_dim_mgr_new.Load()); + + auto attrs = tie_shape_op1.attribute<pir::ArrayAttribute>( + pir::dialect::SymbolicDim::GetSymbolicDimAttrName()); + EXPECT_FALSE( + 
sym_dim_mgr_new.symbolTable().Lookup<pir::dialect::SymbolicDim>("S7")); + EXPECT_EQ(sym_dim_mgr_new.symbolTable() + .Lookup<pir::dialect::TieProductEqualOp>("tie_product_equal") + .size(), + static_cast<size_t>(1)); + + EXPECT_EQ(attrs.AsVector(), array_attr_ref.AsVector()); +} + +TEST(shape_struct_test, shape_analysis) { + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::Program program(ctx); + ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); + ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); + ::pir::Builder builder = ::pir::Builder(ctx, program.block()); + pir::dialect::FuncOp func_op = builder.Build<pir::dialect::FuncOp>(); + + phi::DDim dims_D_2 = {pir::ShapedTypeInterface::kDynamic, 2}; + phi::DDim dims_2_2 = {2, 2}; + phi::DDim dims_D = {pir::ShapedTypeInterface::kDynamic}; + + // same shape with dynamic: value1 == value2 + auto op1 = CreateDenseTensorOp(ctx, dims_D_2, {"op1_attr"}, {"op1_name"}); + auto op2 = CreateDenseTensorOp(ctx, dims_D_2, {"op2_attr"}, {"op2_name"}); + pir::OpResult value1 = op1->result(0); + pir::OpResult value2 = op2->result(0); + + // same shape with static: value3 == value4 + auto op3 = CreateDenseTensorOp(ctx, dims_2_2, {"op3_attr"}, {"op3_name"}); + auto op4 = CreateDenseTensorOp(ctx, dims_2_2, {"op4_attr"}, {"op4_name"}); + pir::OpResult value3 = op3->result(0); + pir::OpResult value4 = op4->result(0); + + // one dimension with dynamic: value5 != value1 != value3 + auto op5 = CreateDenseTensorOp(ctx, dims_D, {"op5_attr"}, {"op5_name"}); + pir::OpResult value5 = op5->result(0); + + pir::dialect::TieShapeOp tie_shape_op1 = + builder.Build<pir::dialect::TieShapeOp>(value1); + pir::dialect::TieShapeOp tie_shape_op2 = + builder.Build<pir::dialect::TieShapeOp>(value2); + pir::dialect::TieShapeOp tie_shape_op3 = + builder.Build<pir::dialect::TieShapeOp>(value3); + pir::dialect::TieShapeOp tie_shape_op4 = + builder.Build<pir::dialect::TieShapeOp>(value4); + pir::dialect::TieShapeOp tie_shape_op5 = + builder.Build<pir::dialect::TieShapeOp>(value5); + + builder.SetInsertionPointToEnd(func_op.block()); + builder.Build<pir::dialect::SymbolicDim>("C2", 2, true, false, true, true); + pir::dialect::SymbolicDim sym_dim_s0 = + builder.Build<pir::dialect::SymbolicDim>( + "S0", pir::ShapedTypeInterface::kDynamic, false, false, true, true); + pir::dialect::SymbolicDim sym_dim_s1 = + builder.Build<pir::dialect::SymbolicDim>( + "S1", pir::ShapedTypeInterface::kDynamic, false, false, true, true); + pir::dialect::SymbolicDim sym_dim_s2 = + builder.Build<pir::dialect::SymbolicDim>( + "S2", pir::ShapedTypeInterface::kDynamic, false, false, true, true); + + pir::Attribute attr_s0 = pir::StrAttribute::get(ctx, "S0"); + pir::Attribute attr_s1 = pir::StrAttribute::get(ctx, "S1"); + pir::Attribute attr_s2 = pir::StrAttribute::get(ctx, "S2"); + pir::Attribute attr_c2 = pir::StrAttribute::get(ctx, "C2"); + + auto attr_op1 = pir::ArrayAttribute::get(ctx, {attr_s0, attr_c2}); + auto attr_op2 = pir::ArrayAttribute::get(ctx, {attr_s1, attr_c2}); + auto attr_op3 = pir::ArrayAttribute::get(ctx, {attr_c2, attr_c2}); + auto attr_op4 = pir::ArrayAttribute::get(ctx, {attr_c2, attr_c2}); + auto attr_op5 = pir::ArrayAttribute::get(ctx, {attr_s2}); + + tie_shape_op1->set_attribute( + pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), attr_op1); + tie_shape_op2->set_attribute( + pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), attr_op2); + tie_shape_op3->set_attribute( + pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), attr_op3); + 
tie_shape_op4->set_attribute( + pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), attr_op4); + tie_shape_op5->set_attribute( + pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), attr_op5); + + pir::ShapeConstraintIRAnalysis shape_analysis(program.module_op()); + EXPECT_TRUE(shape_analysis.IsShapeEqual(value3, value4)); + EXPECT_FALSE(shape_analysis.IsShapeEqual(value1, value2)); + EXPECT_FALSE(shape_analysis.IsShapeEqual(value1, value3)); + EXPECT_FALSE(shape_analysis.IsShapeEqual(value1, value5)); + EXPECT_FALSE(shape_analysis.IsShapeEqual(value3, value5)); + EXPECT_TRUE(shape_analysis.IsProductEqual(value1, {1}, value3, {0})); + EXPECT_TRUE(shape_analysis.IsSameNumElements(value4, value3)); + + shape_analysis.symbolicDimMgr().MapSymbolicDimEqual(sym_dim_s0, sym_dim_s1); + shape_analysis.symbolicDimMgr().MapSymbolicDimEqual(sym_dim_s0, sym_dim_s2); + + EXPECT_TRUE(shape_analysis.IsShapeEqual(value1, value2)); + EXPECT_FALSE(shape_analysis.IsShapeEqual(value1, value5)); +} diff --git a/test/cpp/pir/shape_dialect/symbolic_op_test.cc b/test/cpp/pir/shape_dialect/symbolic_op_test.cc deleted file mode 100644 index b2b62c7b46aa90..00000000000000 --- a/test/cpp/pir/shape_dialect/symbolic_op_test.cc +++ /dev/null @@ -1,619 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include <gtest/gtest.h> -#include <map> -#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" -#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" -#include "paddle/pir/core/block.h" -#include "paddle/pir/core/builder.h" -#include "paddle/pir/core/builtin_type.h" -#include "paddle/pir/core/builtin_type_interfaces.h" -#include "paddle/pir/core/dialect.h" -#include "paddle/pir/core/ir_context.h" -#include "paddle/pir/core/program.h" -#include "paddle/pir/dialect/shape/ir/shape_dialect.h" -#include "paddle/pir/dialect/shape/ir/shape_op.h" -#include "paddle/pir/dialect/shape/utils/shape_utils.h" - -pir::AttributeMap CreateAttributeMap( - const std::vector<std::string> &attribute_names, - const std::vector<std::string> &attributes) { - pir::IrContext *ctx = pir::IrContext::Instance(); - pir::AttributeMap attr_map; - for (size_t i = 0; i < attribute_names.size(); i++) { - pir::Attribute attr_value = pir::StrAttribute::get(ctx, attributes[i]); - attr_map.insert( - std::pair<std::string, pir::Attribute>(attribute_names[i], attr_value)); - } - return attr_map; -} - -pir::Operation *CreateDenseTensorOp( - pir::IrContext *ctx, - const phi::DDim &dims, - const std::vector<std::string> &attribute_names, - const std::vector<std::string> &attributes) { - std::vector<pir::Value> op_inputs = {}; - pir::Type fp32_dtype = pir::Float32Type::get(ctx); - phi::DataLayout data_layout = phi::DataLayout::NCHW; - phi::LoD lod = {{0, 1, 2}}; - size_t offset = 0; - std::vector<pir::Type> op_output_types = { - paddle::dialect::DenseTensorType::get( - ctx, fp32_dtype, dims, data_layout, lod, offset)}; - pir::Operation *op = - pir::Operation::Create(op_inputs, - CreateAttributeMap(attribute_names, attributes), - op_output_types, - pir::OpInfo()); - return op; -} - -TEST(assist_struct_test, symbolic_dim) { - pir::IrContext *ctx = pir::IrContext::Instance(); - pir::Program program(ctx); - ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); - pir::Builder builder = pir::Builder(ctx, program.block()); - pir::dialect::SymbolicDim symDim = builder.Build<pir::dialect::SymbolicDim>( - "S0", 10, false, false, false, false); - pir::dialect::SymbolicDim symDim_ = builder.Build<pir::dialect::SymbolicDim>( - "S1", 10, false, false, false, false); - EXPECT_EQ(symDim.getValue(), 10); - EXPECT_EQ(symDim.getSymName(), "S0"); - EXPECT_FALSE(symDim.getKnownNegativeOne()); - EXPECT_FALSE(symDim.getKnownNonSizeOne()); - EXPECT_FALSE(symDim.getKnownNonSizeZero()); - EXPECT_FALSE(symDim.getKnownNonNegative()); - - EXPECT_FALSE(symDim.IsDynamic()); - EXPECT_TRUE(symDim.Merge(symDim_)); - - symDim.updateValue(20); - symDim.updateSymName("S2"); - symDim.updateKnownNegativeOne(true); - symDim.updateKnownNonSizeOne(true); - symDim.updateKnownNonSizeZero(true); - symDim.updateKnownNonNegative(true); - - EXPECT_FALSE(symDim.Merge(symDim_)); - - EXPECT_EQ(symDim.getValue(), 20); - EXPECT_EQ(symDim.getSymName(), "S2"); - EXPECT_TRUE(symDim.getKnownNegativeOne()); - EXPECT_TRUE(symDim.getKnownNonSizeOne()); - EXPECT_TRUE(symDim.getKnownNonSizeZero()); - EXPECT_TRUE(symDim.getKnownNonNegative()); -} - -TEST(assist_struct_test, symbolic_dim_product) { - pir::IrContext *ctx = pir::IrContext::Instance(); - pir::Program program(ctx); - ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); - pir::Builder builder = pir::Builder(ctx, program.block()); - pir::dialect::SymbolicDim symDim = builder.Build<pir::dialect::SymbolicDim>( - "S0", pir::ShapedTypeInterface::kDynamic, false, false, false, false); - pir::SymbolicDimProduct 
symDimProduct; - pir::SymbolicDimProduct symDimProduct_; - symDimProduct.symbols.push_back(symDim); - symDimProduct.factor *= 10; - EXPECT_EQ(symDimProduct.factor, 10); - EXPECT_NE(symDimProduct, symDimProduct_); - EXPECT_FALSE(symDimProduct.empty()); -} - -TEST(assist_struct_test, symbolic_dim_table) { - pir::IrContext *ctx = pir::IrContext::Instance(); - pir::Program program(ctx); - ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); - pir::Builder builder = pir::Builder(ctx, program.block()); - pir::dialect::SymbolicDim symDim = builder.Build<pir::dialect::SymbolicDim>( - "S0", 10, false, false, false, false); - - pir::SymbolTable symbolTable(program.module_op()); - EXPECT_EQ(symbolTable.insert(symDim), "S0"); - EXPECT_EQ(symbolTable.Lookup<pir::dialect::SymbolicDim>("S0"), symDim); - EXPECT_EQ(symbolTable.getOp(), program.module_op()); - EXPECT_FALSE(symbolTable.Lookup<pir::dialect::SymbolicDim>("S1")); -} - -TEST(assist_struct_test, symbolic_dim_mgr_simple) { - /******************************************************/ - /* Mgr simple version, only SymbolicDim related func. */ - /******************************************************/ - pir::IrContext *ctx = pir::IrContext::Instance(); - pir::Program program(ctx); - ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); - ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); - - pir::SymbolicDimMgr symDimMgr(program.module_op()); - pir::dialect::SymbolicDim symDimS0 = symDimMgr.NewSymbolicDim(); - pir::dialect::SymbolicDim symDimS1 = symDimMgr.NewSymbolicDim(); - pir::dialect::SymbolicDim symDimC10 = symDimMgr.NewConstantSymbolicDim(10); - symDimMgr.MapSymbolicDimEqual(symDimS0, symDimS1); - - auto op = CreateDenseTensorOp( - ctx, {pir::ShapedTypeInterface::kDynamic, 2}, {"op_attr"}, {"op_name"}); - pir::Value res = op->result(0); - - std::vector<pir::dialect::SymbolicDim> symDimVec = - symDimMgr.CreateSymbolicDimsForRankedValue(res); - - EXPECT_EQ(symDimS0.getSymName(), "S0"); - EXPECT_EQ(symDimS1.getSymName(), "S1"); - EXPECT_EQ(symDimS1.getValue(), pir::ShapedTypeInterface::kDynamic); - EXPECT_EQ(symDimC10.getSymName(), "C10"); - EXPECT_EQ(symDimC10.getValue(), 10); - EXPECT_EQ(symDimVec[0].getSymName(), "S2"); - EXPECT_EQ(symDimVec[1].getSymName(), "C2"); - EXPECT_EQ(symDimMgr.symbolTable().Lookup<pir::dialect::SymbolicDim>("S0"), - symDimS0); - EXPECT_EQ(symDimMgr.symbolTable().Lookup<pir::dialect::SymbolicDim>("C10"), - symDimC10); - EXPECT_EQ(symDimMgr.GetRootSymbolicDim(symDimS1), symDimS0); - EXPECT_TRUE(symDimMgr.IsSymbolicDimEqual(symDimS0, symDimS1)); - EXPECT_FALSE(symDimMgr.IsSymbolicDimEqual(symDimS0, symDimC10)); -} - -TEST(assist_struct_test, symbolic_dim_mgr_complex) { - /***************************************************************/ - /* Mgr with constraintOp, and SymbolicDimProduct related func. 
*/ - /***************************************************************/ - pir::IrContext *ctx = pir::IrContext::Instance(); - pir::Program program(ctx); - ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); - ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); - - pir::SymbolicDimMgr symDimMgr(program.module_op()); - auto funcOp = - symDimMgr.symbolTable().getOp()->dyn_cast<pir::dialect::FuncOp>(); - - pir::Builder builder = pir::Builder(ctx, funcOp.block()); - - pir::dialect::SymbolicDim symDimS0 = symDimMgr.NewSymbolicDim("S0"); - pir::dialect::SymbolicDim symDimS1 = symDimMgr.NewSymbolicDim("S1"); - pir::dialect::SymbolicDim symDimS2 = symDimMgr.NewSymbolicDim("S2"); - pir::dialect::SymbolicDim symDimS3 = symDimMgr.NewSymbolicDim("S3"); - pir::dialect::SymbolicDim symDimS4 = symDimMgr.NewSymbolicDim("S4"); - pir::dialect::SymbolicDim symDimS5 = symDimMgr.NewSymbolicDim("S5"); - pir::dialect::SymbolicDim symDimS6 = symDimMgr.NewSymbolicDim("S6"); - pir::dialect::SymbolicDim symDimS7 = symDimMgr.NewSymbolicDim("S7"); - pir::dialect::SymbolicDim symDimS8 = symDimMgr.NewSymbolicDim("S8"); - pir::dialect::SymbolicDim symDimS9 = symDimMgr.NewSymbolicDim("S9"); - pir::dialect::SymbolicDim symDimS10 = symDimMgr.NewSymbolicDim("S10"); - pir::dialect::SymbolicDim symDimS11 = symDimMgr.NewSymbolicDim("S11"); - pir::dialect::SymbolicDim symDimS12 = symDimMgr.NewSymbolicDim("S12"); - pir::dialect::SymbolicDim symDimC10 = symDimMgr.NewConstantSymbolicDim(10); - pir::dialect::SymbolicDim symDimC20 = symDimMgr.NewConstantSymbolicDim(20); - - pir::OpResult dimOpS0 = builder.Build<pir::dialect::DimOp>("S0").out(); - pir::OpResult dimOpS1 = builder.Build<pir::dialect::DimOp>("S1").out(); - pir::OpResult dimOpS2 = builder.Build<pir::dialect::DimOp>("S2").out(); - pir::OpResult dimOpS3 = builder.Build<pir::dialect::DimOp>("S3").out(); - pir::OpResult dimOpS4 = builder.Build<pir::dialect::DimOp>("S4").out(); - pir::OpResult dimOpS5 = builder.Build<pir::dialect::DimOp>("S5").out(); - pir::OpResult dimOpS6 = builder.Build<pir::dialect::DimOp>("S6").out(); - pir::OpResult dimOpS7 = builder.Build<pir::dialect::DimOp>("S7").out(); - pir::OpResult dimOpS8 = builder.Build<pir::dialect::DimOp>("S8").out(); - pir::OpResult dimOpS9 = builder.Build<pir::dialect::DimOp>("S9").out(); - pir::OpResult dimOpS10 = builder.Build<pir::dialect::DimOp>("S10").out(); - pir::OpResult dimOpS11 = builder.Build<pir::dialect::DimOp>("S11").out(); - pir::OpResult dimOpC10 = builder.Build<pir::dialect::DimOp>("C10").out(); - pir::OpResult dimOpC20 = builder.Build<pir::dialect::DimOp>("C20").out(); - pir::OpResult constant = - builder - .Build<pir::ConstantOp>(pir::Int32Attribute::get(ctx, 2), - pir::Int32Type::get(ctx)) - ->result(0); - - // Mark S1 == S2. - builder.Build<pir::dialect::TieProductEqualOp>( - 2, 2, std::vector<pir::Value>{constant, dimOpS1, dimOpS2, constant}); - // Mark S0 * S1 == S2 * S3, For check S0 == S3. - builder.Build<pir::dialect::TieProductEqualOp>( - 2, 2, std::vector<pir::Value>{dimOpS0, dimOpS1, dimOpS2, dimOpS3}); - // Mark S4 * S0 * S1 == S2 * S3 * S5, For check S4 == S5. - builder.Build<pir::dialect::TieProductEqualOp>( - 3, - 3, - std::vector<pir::Value>{ - dimOpS4, dimOpS0, dimOpS1, dimOpS2, dimOpS3, dimOpS5}); - // For check S6 == C10 * C20. - builder.Build<pir::dialect::TieProductEqualOp>( - 1, 2, std::vector<pir::Value>{dimOpS6, dimOpC10, dimOpC20}); - // Mark C10 * S0 * S1 == S2 * S3 * S7, for check C10 == S7. 
- builder.Build<pir::dialect::TieProductEqualOp>( - 3, - 3, - std::vector<pir::Value>{ - dimOpC10, dimOpS0, dimOpS1, dimOpS2, dimOpS3, dimOpS7}); - - // For unsimplify product case: S8 * S9 == S10 * S11 - builder.Build<pir::dialect::TieProductEqualOp>( - 2, 2, std::vector<pir::Value>{dimOpS8, dimOpS9, dimOpS10, dimOpS11}); - - auto op = CreateDenseTensorOp(ctx, - {pir::ShapedTypeInterface::kDynamic, - pir::ShapedTypeInterface::kDynamic, - pir::ShapedTypeInterface::kDynamic, - pir::ShapedTypeInterface::kDynamic, - pir::ShapedTypeInterface::kDynamic, - pir::ShapedTypeInterface::kDynamic}, - {"op0_attr"}, - {"op0_name"}); - auto op_ = CreateDenseTensorOp(ctx, - {pir::ShapedTypeInterface::kDynamic, - pir::ShapedTypeInterface::kDynamic, - pir::ShapedTypeInterface::kDynamic, - pir::ShapedTypeInterface::kDynamic, - pir::ShapedTypeInterface::kDynamic, - 10, - 20}, - {"op1_attr"}, - {"op1_name"}); - pir::OpResult res = op->result(0); - pir::OpResult res_ = op_->result(0); - - builder.SetInsertionPointToEnd(program.block()); - pir::dialect::TieShapeOp tieShapeOp = - builder.Build<pir::dialect::TieShapeOp>(res); - pir::dialect::TieShapeOp tieShapeOp_ = - builder.Build<pir::dialect::TieShapeOp>(res_); - - pir::Attribute attrS0 = pir::StrAttribute::get(ctx, "S0"); - pir::Attribute attrS1 = pir::StrAttribute::get(ctx, "S1"); - pir::Attribute attrS2 = pir::StrAttribute::get(ctx, "S2"); - pir::Attribute attrS3 = pir::StrAttribute::get(ctx, "S3"); - pir::Attribute attrS4 = pir::StrAttribute::get(ctx, "S4"); - pir::Attribute attrS5 = pir::StrAttribute::get(ctx, "S5"); - pir::Attribute attrS6 = pir::StrAttribute::get(ctx, "S6"); - pir::Attribute attrS7 = pir::StrAttribute::get(ctx, "S7"); - pir::Attribute attrS8 = pir::StrAttribute::get(ctx, "S8"); - pir::Attribute attrS9 = pir::StrAttribute::get(ctx, "S9"); - pir::Attribute attrS10 = pir::StrAttribute::get(ctx, "S10"); - pir::Attribute attrS11 = pir::StrAttribute::get(ctx, "S11"); - pir::Attribute attrC10 = pir::StrAttribute::get(ctx, "C10"); - pir::Attribute attrC20 = pir::StrAttribute::get(ctx, "C20"); - - std::vector<pir::Attribute> newAttrs = { - attrS0, attrS1, attrS2, attrS3, attrS4, attrS5}; - std::vector<pir::Attribute> newAttrsRef = { - attrS0, attrS1, attrS1, attrS0, attrS2, attrS2}; - std::vector<pir::Attribute> newAttrs_ = { - attrS6, attrS7, attrS8, attrS9, attrS10, attrS11, attrC10, attrC20}; - - auto arrayAttr = pir::ArrayAttribute::get(ctx, newAttrs); - auto arrayAttrRef = pir::ArrayAttribute::get(ctx, newAttrsRef); - auto arrayAttr_ = pir::ArrayAttribute::get(ctx, newAttrs_); - tieShapeOp->set_attribute(pir::dialect::SymbolicDim::getSymbolicDimAttrName(), - arrayAttr); - tieShapeOp_->set_attribute( - pir::dialect::SymbolicDim::getSymbolicDimAttrName(), arrayAttr_); - - EXPECT_TRUE(symDimMgr.Load()); - - // For check indirect equality: S1 * S4 == S2 * S5 - pir::SymbolicDimProduct symDimProductLhs; - pir::SymbolicDimProduct symDimProductRhs; - - symDimProductLhs.symbols.push_back(symDimS1); - symDimProductLhs.symbols.push_back(symDimS4); - - symDimProductRhs.symbols.push_back(symDimS2); - symDimProductRhs.symbols.push_back(symDimS5); - - // For uncompletely simplied product check: S8 * S9 * S12 == S10 * S11 * S12 - pir::SymbolicDimProduct symDimProductLhs_; - pir::SymbolicDimProduct symDimProductRhs_; - - symDimProductLhs_.symbols.push_back(symDimS8); - symDimProductLhs_.symbols.push_back(symDimS9); - symDimProductLhs_.symbols.push_back(symDimS12); - - symDimProductRhs_.symbols.push_back(symDimS10); - 
symDimProductRhs_.symbols.push_back(symDimS11); - symDimProductRhs_.symbols.push_back(symDimS12); - - // For check SimplifySymbolicDimProduct, {factor = 1, Sym = {S7}} => {factor = - // 10} - pir::SymbolicDimProduct symDimProductS7; - symDimProductS7.symbols.push_back(symDimS7); - pir::SymbolicDimProduct simplifiedProductS7 = - symDimMgr.SimplifySymbolicDimProduct(symDimProductS7); - - // For check SimplifySymbolicDimProductPair, X * Y * Y, Y * Y * Z => X, Z - pir::SymbolicDimProduct symDimProductPairLhs; - pir::SymbolicDimProduct symDimProductPairRhs; - pir::SymbolicDimProduct newLhs, newRhs; - symDimProductPairLhs.symbols.push_back(symDimS4); - symDimProductPairLhs.symbols.push_back(symDimS1); - symDimProductPairLhs.symbols.push_back(symDimS2); - symDimProductPairRhs.symbols.push_back(symDimS1); - symDimProductPairRhs.symbols.push_back(symDimS2); - symDimProductPairRhs.symbols.push_back(symDimS3); - - std::tie(newLhs, newRhs) = symDimMgr.SimplifySymbolicDimProductPair( - symDimProductPairLhs, symDimProductPairRhs); - - // For check SymbolicDimProductDivide, {S4 * S1 * C20} / {S1 * C10} => {factor - // = 2 Sym = {S4}} - pir::SymbolicDimProduct symDimProductDivLhs; - pir::SymbolicDimProduct symDimProductDivRhs; - symDimProductDivLhs.symbols.push_back(symDimS4); - symDimProductDivLhs.symbols.push_back(symDimS1); - symDimProductDivLhs.symbols.push_back(symDimC20); - symDimProductDivRhs.symbols.push_back(symDimS1); - symDimProductDivRhs.symbols.push_back(symDimC10); - - pir::SymbolicDimProduct *divRes = symDimMgr.SymbolicDimProductDivide( - symDimProductDivLhs, symDimProductDivRhs); - - EXPECT_TRUE(symDimMgr.IsSymbolicDimEqual(symDimS1, symDimS2)); - EXPECT_TRUE(symDimMgr.IsSymbolicDimEqual(symDimS0, symDimS3)); - EXPECT_TRUE(symDimMgr.IsSymbolicDimEqual(symDimS4, symDimS5)); - EXPECT_EQ(symDimS6.getValue(), 200); - EXPECT_EQ(symDimMgr.symbolTable().Lookup<pir::dialect::SymbolicDim>("C20"), - symDimC20); - EXPECT_EQ(symDimS7.getValue(), symDimC10.getValue()); - EXPECT_EQ(simplifiedProductS7.factor, 10); - EXPECT_EQ(simplifiedProductS7.symbols.size(), static_cast<size_t>(0)); - EXPECT_EQ(newLhs.symbols.size(), static_cast<size_t>(1)); - EXPECT_EQ(newRhs.symbols.size(), static_cast<size_t>(1)); - EXPECT_EQ(newLhs.symbols[0], symDimMgr.GetRootSymbolicDim(symDimS4)); - EXPECT_EQ(newRhs.symbols[0], symDimMgr.GetRootSymbolicDim(symDimS3)); - EXPECT_EQ(divRes->factor, 2); - EXPECT_EQ(divRes->symbols.size(), static_cast<size_t>(1)); - EXPECT_EQ(divRes->symbols[0], symDimMgr.GetRootSymbolicDim(symDimS4)); - EXPECT_TRUE( - symDimMgr.IsSymbolicDimProductEqual(symDimProductLhs, symDimProductRhs)); - EXPECT_TRUE(symDimMgr.IsSymbolicDimProductEqual(symDimProductLhs_, - symDimProductRhs_)); - EXPECT_TRUE(symDimMgr.Save()); - - pir::SymbolicDimMgr symDimMgr_(program.module_op()); - EXPECT_TRUE(symDimMgr_.Load()); - auto attrs = tieShapeOp.attribute<pir::ArrayAttribute>( - pir::dialect::SymbolicDim::getSymbolicDimAttrName()); - EXPECT_FALSE( - symDimMgr_.symbolTable().Lookup<pir::dialect::SymbolicDim>("S7")); - EXPECT_EQ(symDimMgr_.symbolTable() - .Lookup<pir::dialect::TieProductEqualOp>("tie_product_equal") - .size(), - static_cast<size_t>(1)); - - EXPECT_EQ(attrs.AsVector(), arrayAttrRef.AsVector()); -} - -TEST(shape_op, dim) { - pir::IrContext *ctx = pir::IrContext::Instance(); - pir::Program program(ctx); - ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); - pir::Builder builder = pir::Builder(ctx, program.block()); - - pir::dialect::DimOp dimOp = builder.Build<pir::dialect::DimOp>("S0"); - 
pir::OpResult res = dimOp.out(); - EXPECT_EQ(dimOp.getName(), "S0"); - dimOp.setName("S1"); - EXPECT_EQ(dimOp.getName(), "S1"); - EXPECT_EQ(res.owner(), dimOp.operation()); - EXPECT_EQ(res.type(), pir::IndexType::get(ctx)); -} - -TEST(shape_op, tie_product_equal) { - pir::IrContext *ctx = pir::IrContext::Instance(); - pir::Program program(ctx); - ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); - pir::Builder builder = pir::Builder(ctx, program.block()); - pir::SymbolTable symbolTable(program.module_op()); - - pir::OpResult dimOp0 = builder.Build<pir::dialect::DimOp>("S0").out(); - pir::OpResult dimOp1 = builder.Build<pir::dialect::DimOp>("S1").out(); - pir::OpResult dimOp2 = builder.Build<pir::dialect::DimOp>("S2").out(); - pir::OpResult dimOp3 = builder.Build<pir::dialect::DimOp>("S3").out(); - pir::OpResult dimOp4 = builder.Build<pir::dialect::DimOp>("S4").out(); - - pir::dialect::TieProductEqualOp tie_product_equal = - builder.Build<pir::dialect::TieProductEqualOp>( - 2, - 3, - std::vector<pir::Value>{dimOp0, dimOp1, dimOp2, dimOp3, dimOp4}); - - std::vector<pir::Value> lhs = tie_product_equal.lhs(); - std::vector<pir::Value> rhs = tie_product_equal.rhs(); - - std::vector<pir::Value> lhs_ref{dimOp0, dimOp1}; - std::vector<pir::Value> rhs_ref{dimOp2, dimOp3, dimOp4}; - - EXPECT_EQ(symbolTable.insert(tie_product_equal), "tie_product_equal"); - EXPECT_EQ( - symbolTable.Lookup<pir::dialect::TieProductEqualOp>("tie_product_equal") - .size(), - static_cast<size_t>(1)); - EXPECT_EQ(symbolTable.Lookup<pir::dialect::TieProductEqualOp>( - "tie_product_equal")[0], - tie_product_equal); - EXPECT_EQ(lhs, lhs_ref); - EXPECT_EQ(rhs, rhs_ref); -} - -TEST(shape_op, tie_shape) { - pir::IrContext *ctx = pir::IrContext::Instance(); - pir::Program program(ctx); - ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); - ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); - - pir::Builder builder = pir::Builder(ctx, program.block()); - - auto op = CreateDenseTensorOp( - ctx, {pir::ShapedTypeInterface::kDynamic, 2}, {"op_attr"}, {"op_name"}); - pir::OpResult res = op->result(0); - - pir::dialect::TieShapeOp tieShapeOp = - builder.Build<pir::dialect::TieShapeOp>(res); - pir::Value tieShapeOpValue = tieShapeOp.value(); - - pir::Attribute attrS0 = pir::StrAttribute::get(ctx, "S0"); - pir::Attribute attrS1 = pir::StrAttribute::get(ctx, "S1"); - - std::vector<pir::Attribute> newAttrs = {attrS0, attrS1}; - - auto arrayAttr = pir::ArrayAttribute::get(ctx, newAttrs); - tieShapeOp->set_attribute(pir::dialect::SymbolicDim::getSymbolicDimAttrName(), - arrayAttr); - - std::vector<pir::Attribute> arrAttrVec = - tieShapeOp - ->attribute<pir::ArrayAttribute>( - pir::dialect::SymbolicDim::getSymbolicDimAttrName()) - .AsVector(); - - EXPECT_EQ(tieShapeOpValue, res); - EXPECT_EQ(arrAttrVec.size(), static_cast<size_t>(2)); - EXPECT_EQ(arrAttrVec[0].dyn_cast<pir::StrAttribute>(), attrS0); - EXPECT_EQ(arrAttrVec[1].dyn_cast<pir::StrAttribute>(), attrS1); - EXPECT_TRUE(tieShapeOp->HasAttribute( - pir::dialect::SymbolicDim::getSymbolicDimAttrName())); -} - -TEST(shape_op, func_op) { - pir::IrContext *ctx = pir::IrContext::Instance(); - pir::Program program(ctx); - ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); - ::pir::Builder builder = ::pir::Builder(ctx, program.block()); - pir::dialect::FuncOp funcOp = builder.Build<pir::dialect::FuncOp>(); - auto funcBlock = funcOp.block(); - builder.SetInsertionPointToStart(funcBlock); - builder.Build<pir::ConstantOp>(pir::Int32Attribute::get(ctx, 2), - 
pir::Int32Type::get(ctx)); - EXPECT_EQ(funcBlock, funcOp->region(0).front()); - EXPECT_EQ(funcOp->region(0).size(), static_cast<size_t>(1)); - EXPECT_EQ(funcBlock->size(), static_cast<size_t>(1)); -} - -TEST(assist_struct_test, shape_analysis) { - pir::IrContext *ctx = pir::IrContext::Instance(); - pir::Program program(ctx); - ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); - ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>(); - ::pir::Builder builder = ::pir::Builder(ctx, program.block()); - pir::dialect::FuncOp funcOp = builder.Build<pir::dialect::FuncOp>(); - - phi::DDim dims_D_2 = {pir::ShapedTypeInterface::kDynamic, 2}; - phi::DDim dims_2_2 = {2, 2}; - phi::DDim dims_D = {pir::ShapedTypeInterface::kDynamic}; - - // same shape with dynamic: value1 == value2 - auto op1 = CreateDenseTensorOp(ctx, dims_D_2, {"op1_attr"}, {"op1_name"}); - auto op2 = CreateDenseTensorOp(ctx, dims_D_2, {"op2_attr"}, {"op2_name"}); - pir::OpResult value1 = op1->result(0); - pir::OpResult value2 = op2->result(0); - - // same shape with static: value3 == value4 - auto op3 = CreateDenseTensorOp(ctx, dims_2_2, {"op3_attr"}, {"op3_name"}); - auto op4 = CreateDenseTensorOp(ctx, dims_2_2, {"op4_attr"}, {"op4_name"}); - pir::OpResult value3 = op3->result(0); - pir::OpResult value4 = op4->result(0); - - // one dimension with dynamic: value5 != value1 != value3 - auto op5 = CreateDenseTensorOp(ctx, dims_D, {"op5_attr"}, {"op5_name"}); - pir::OpResult value5 = op5->result(0); - - pir::dialect::TieShapeOp tieShapeOp1 = - builder.Build<pir::dialect::TieShapeOp>(value1); - pir::dialect::TieShapeOp tieShapeOp2 = - builder.Build<pir::dialect::TieShapeOp>(value2); - pir::dialect::TieShapeOp tieShapeOp3 = - builder.Build<pir::dialect::TieShapeOp>(value3); - pir::dialect::TieShapeOp tieShapeOp4 = - builder.Build<pir::dialect::TieShapeOp>(value4); - pir::dialect::TieShapeOp tieShapeOp5 = - builder.Build<pir::dialect::TieShapeOp>(value5); - - builder.SetInsertionPointToEnd(funcOp.block()); - builder.Build<pir::dialect::SymbolicDim>("C2", 2, true, false, true, true); - pir::dialect::SymbolicDim symDimS0 = builder.Build<pir::dialect::SymbolicDim>( - "S0", pir::ShapedTypeInterface::kDynamic, false, false, true, true); - pir::dialect::SymbolicDim symDimS1 = builder.Build<pir::dialect::SymbolicDim>( - "S1", pir::ShapedTypeInterface::kDynamic, false, false, true, true); - pir::dialect::SymbolicDim symDimS2 = builder.Build<pir::dialect::SymbolicDim>( - "S2", pir::ShapedTypeInterface::kDynamic, false, false, true, true); - - pir::Attribute attrS0 = pir::StrAttribute::get(ctx, "S0"); - pir::Attribute attrS1 = pir::StrAttribute::get(ctx, "S1"); - pir::Attribute attrS2 = pir::StrAttribute::get(ctx, "S2"); - pir::Attribute attrC2 = pir::StrAttribute::get(ctx, "C2"); - - auto attrOp1 = pir::ArrayAttribute::get(ctx, {attrS0, attrC2}); - auto attrOp2 = pir::ArrayAttribute::get(ctx, {attrS1, attrC2}); - auto attrOp3 = pir::ArrayAttribute::get(ctx, {attrC2, attrC2}); - auto attrOp4 = pir::ArrayAttribute::get(ctx, {attrC2, attrC2}); - auto attrOp5 = pir::ArrayAttribute::get(ctx, {attrS2}); - - tieShapeOp1->set_attribute( - pir::dialect::SymbolicDim::getSymbolicDimAttrName(), attrOp1); - tieShapeOp2->set_attribute( - pir::dialect::SymbolicDim::getSymbolicDimAttrName(), attrOp2); - tieShapeOp3->set_attribute( - pir::dialect::SymbolicDim::getSymbolicDimAttrName(), attrOp3); - tieShapeOp4->set_attribute( - pir::dialect::SymbolicDim::getSymbolicDimAttrName(), attrOp4); - tieShapeOp5->set_attribute( - 
pir::dialect::SymbolicDim::getSymbolicDimAttrName(), attrOp5); - - pir::SymbolicDimShapeAnalysis shapeAnalysis(program.module_op()); - EXPECT_TRUE(shapeAnalysis.IsShapeEqual(value3, value4)); - EXPECT_FALSE(shapeAnalysis.IsShapeEqual(value1, value2)); - EXPECT_FALSE(shapeAnalysis.IsShapeEqual(value1, value3)); - EXPECT_FALSE(shapeAnalysis.IsShapeEqual(value1, value5)); - EXPECT_FALSE(shapeAnalysis.IsShapeEqual(value3, value5)); - EXPECT_TRUE(shapeAnalysis.IsProductEqual(value1, {1}, value3, {0})); - EXPECT_TRUE(shapeAnalysis.IsSameNumElements(value4, value3)); - - shapeAnalysis.symbolicDimMgr().MapSymbolicDimEqual(symDimS0, symDimS1); - shapeAnalysis.symbolicDimMgr().MapSymbolicDimEqual(symDimS0, symDimS2); - - EXPECT_TRUE(shapeAnalysis.IsShapeEqual(value1, value2)); - EXPECT_FALSE(shapeAnalysis.IsShapeEqual(value1, value5)); -} - -TEST(shape_op, tensor_dim) { - pir::IrContext *ctx = pir::IrContext::Instance(); - pir::Program program(ctx); - ctx->GetOrRegisterDialect<pir::dialect::ShapeDialect>(); - pir::Builder builder = pir::Builder(ctx, program.block()); - - pir::Operation *op = CreateDenseTensorOp( - ctx, {pir::ShapedTypeInterface::kDynamic, 2}, {"op_attr"}, {"op_name"}); - pir::OpResult resDenseTensorValue = op->result(0); - - pir::dialect::TensorDimOp tensorDimOp0 = - builder.Build<pir::dialect::TensorDimOp>(resDenseTensorValue, 0); - pir::OpResult res0 = tensorDimOp0.out(); - - pir::OpResult indexValue = - builder - .Build<pir::ConstantOp>( - pir::Int64Attribute::get(pir::IrContext::Instance(), 1), - pir::IndexType::get(pir::IrContext::Instance())) - ->result(0); - pir::dialect::TensorDimOp tensorDimOp1 = - builder.Build<pir::dialect::TensorDimOp>(resDenseTensorValue, indexValue); - pir::OpResult res1 = tensorDimOp1.out(); - - EXPECT_EQ(res0.type(), pir::IndexType::get(ctx)); - EXPECT_EQ(res1.type(), pir::IndexType::get(ctx)); - EXPECT_EQ(tensorDimOp0.source(), resDenseTensorValue); - EXPECT_EQ(tensorDimOp1.source(), resDenseTensorValue); - EXPECT_EQ(tensorDimOp1.index(), indexValue); -} diff --git a/test/cpp/pir/tools/test_dialect.cc b/test/cpp/pir/tools/test_dialect.cc index 49fb4a6951dd79..e3000a418119be 100644 --- a/test/cpp/pir/tools/test_dialect.cc +++ b/test/cpp/pir/tools/test_dialect.cc @@ -21,7 +21,24 @@ TestDialect::TestDialect(pir::IrContext *context) initialize(); } void TestDialect::initialize() { - RegisterOps<RegionOp, BranchOp, Operation1, Operation2>(); + RegisterOps<RegionOp, + BranchOp, + Operation1, + Operation2, + TraitExampleOp, + SameOperandsShapeTraitOp1, + SameOperandsShapeTraitOp2, + SameOperandsAndResultShapeTraitOp1, + SameOperandsAndResultShapeTraitOp2, + SameOperandsAndResultShapeTraitOp3, + SameOperandsElementTypeTraitOp1, + SameOperandsElementTypeTraitOp2, + SameOperandsAndResultElementTypeTraitOp1, + SameOperandsAndResultElementTypeTraitOp2, + SameOperandsAndResultElementTypeTraitOp3, + SameOperandsAndResultTypeTraitOp1, + SameOperandsAndResultTypeTraitOp2, + SameOperandsAndResultTypeTraitOp3>(); } void TestDialect::PrintOperation(pir::Operation *op, diff --git a/test/cpp/pir/tools/test_op.cc b/test/cpp/pir/tools/test_op.cc index b67dd24c5dc042..d8ecbb3a2af385 100644 --- a/test/cpp/pir/tools/test_op.cc +++ b/test/cpp/pir/tools/test_op.cc @@ -21,23 +21,21 @@ void RegionOp::Build(pir::Builder &builder, pir::OperationArgument &argument) { argument.AddRegion(nullptr); } -void BranchOp::Build(pir::Builder &builder, // NOLINT - pir::OperationArgument &argument, +void BranchOp::Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // 
NOLINT const std::vector<pir::OpResult> &target_operands, pir::Block *target) { argument.AddInputs(target_operands.begin(), target_operands.end()); argument.AddSuccessor(target); } -void BranchOp::Verify() const { +void BranchOp::VerifySig() const { IR_ENFORCE((*this)->num_successors() == 1u, "successors number must equal to 1."); IR_ENFORCE((*this)->successor(0), "successor[0] can't be nullptr"); } -const char *Operation1::attributes_name[2] = { // NOLINT - "op1_attr1", - "op1_attr2"}; +const char *Operation1::attributes_name[2] = {"op1_attr1", "op1_attr2"}; void Operation1::Build(pir::Builder &builder, // NOLINT pir::OperationArgument &argument) { // NOLINT @@ -47,7 +45,7 @@ void Operation1::Build(pir::Builder &builder, // NOLINT argument.AddOutput(builder.float32_type()); argument.AddAttributes(attributes); } -void Operation1::Verify() const { +void Operation1::VerifySig() const { auto &attributes = this->attributes(); if (attributes.count("op1_attr1") == 0 || !attributes.at("op1_attr1").isa<pir::StrAttribute>()) { @@ -58,9 +56,120 @@ void Operation1::Verify() const { throw("Type of attribute: parameter_name is not right."); } } + +void TraitExampleOp::Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); + argument.AddOutput(out_type); +} + +void SameOperandsShapeTraitOp2::Build( + pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); + argument.AddOutput(out_type); +} + +void SameOperandsAndResultShapeTraitOp2::Build( + pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); +} + +void SameOperandsAndResultShapeTraitOp3::Build( + pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); + argument.AddOutput(out_type); +} + +void SameOperandsElementTypeTraitOp2::Build( + pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); + argument.AddOutput(out_type); +} + +void SameOperandsAndResultElementTypeTraitOp2::Build( + pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); +} + +void SameOperandsAndResultElementTypeTraitOp3::Build( + pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type1, + pir::Type out_type2) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); + argument.AddOutput(out_type1); + argument.AddOutput(out_type2); +} + +void SameOperandsAndResultTypeTraitOp2::Build( + pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); +} + +void SameOperandsAndResultTypeTraitOp3::Build( + pir::Builder &builder, // NOLINT + 
pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type1, + pir::Type out_type2) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); + argument.AddOutput(out_type1); + argument.AddOutput(out_type2); +} + } // namespace test IR_DEFINE_EXPLICIT_TYPE_ID(test::RegionOp) IR_DEFINE_EXPLICIT_TYPE_ID(test::BranchOp) IR_DEFINE_EXPLICIT_TYPE_ID(test::Operation1) IR_DEFINE_EXPLICIT_TYPE_ID(test::Operation2) +IR_DEFINE_EXPLICIT_TYPE_ID(test::TraitExampleOp) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsShapeTraitOp1) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsShapeTraitOp2) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultShapeTraitOp1) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultShapeTraitOp2) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultShapeTraitOp3) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsElementTypeTraitOp1) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsElementTypeTraitOp2) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultElementTypeTraitOp1) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultElementTypeTraitOp2) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultElementTypeTraitOp3) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultTypeTraitOp1) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultTypeTraitOp2) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultTypeTraitOp3) diff --git a/test/cpp/pir/tools/test_op.h b/test/cpp/pir/tools/test_op.h index 8d4ccd49a38edb..175a9268390e94 100644 --- a/test/cpp/pir/tools/test_op.h +++ b/test/cpp/pir/tools/test_op.h @@ -17,6 +17,7 @@ #include "paddle/pir/core/builder.h" #include "paddle/pir/core/builtin_type.h" #include "paddle/pir/core/op_base.h" +#include "paddle/pir/core/op_trait.h" #include "paddle/pir/core/operation_utils.h" #include "test/cpp/pir/tools/test_interface.h" #include "test/cpp/pir/tools/test_trait.h" @@ -33,7 +34,7 @@ class RegionOp : public pir::Op<RegionOp, OneRegionTrait> { static constexpr const char **attributes_name = nullptr; static void Build(pir::Builder &builder, // NOLINT pir::OperationArgument &argument); // NOLINT - void Verify() const {} + void VerifySig() const {} }; /// @@ -49,7 +50,7 @@ class BranchOp : public pir::Op<BranchOp> { pir::OperationArgument &argument, // NOLINT const std::vector<pir::OpResult> &target_operands, pir::Block *target); - void Verify() const; + void VerifySig() const; }; // Define case op1. @@ -58,10 +59,10 @@ class Operation1 : public pir::Op<Operation1> { using Op::Op; static const char *name() { return "test.operation1"; } static constexpr uint32_t attributes_num = 2; - static const char *attributes_name[attributes_num]; // NOLINT + static const char *attributes_name[attributes_num]; static void Build(pir::Builder &builder, // NOLINT pir::OperationArgument &argument); // NOLINT - void Verify() const; + void VerifySig() const; }; // Define op2. 
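[Editor's sketch, not part of the patch] The ops added in the next hunk all share one pattern: zero attributes, a trivial Build, an empty VerifySig() (the new name for the old Verify() hook), and one or more pir traits, so operand/result consistency checking comes from the attached traits rather than from op-specific verification code. A minimal sketch in that style, assuming only the pir::Op CRTP base and the OperationArgument API already used in this file; MinimalTraitOp and "test.minimal_trait_op" are hypothetical names:

// Hypothetical illustration, mirroring the trait ops defined below.
class MinimalTraitOp
    : public pir::Op<MinimalTraitOp, pir::SameOperandsShapeTrait> {
 public:
  using Op::Op;
  static const char *name() { return "test.minimal_trait_op"; }
  static constexpr uint32_t attributes_num = 0;
  static constexpr const char **attributes_name = nullptr;
  static void Build(pir::Builder &builder,             // NOLINT
                    pir::OperationArgument &argument,  // NOLINT
                    pir::Value l_operand,
                    pir::Value r_operand) {
    // Two inputs, no results; pir::SameOperandsShapeTrait is what rejects
    // operands whose shapes differ when the operation is verified.
    argument.AddInput(l_operand);
    argument.AddInput(r_operand);
  }
  void VerifySig() const {}  // op-specific checks only; the trait does the rest
};

Like the ops below, such a type would still need an entry in TestDialect's RegisterOps<...>() call and an IR_DECLARE_EXPLICIT_TYPE_ID / IR_DEFINE_EXPLICIT_TYPE_ID pair.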
@@ -71,16 +72,269 @@ class Operation2 using Op::Op; static const char *name() { return "test.operation2"; } static constexpr uint32_t attributes_num = 0; - static constexpr const char **attributes_name = nullptr; // NOLINT - static void Build(pir::Builder &builder, // NOLINT - pir::OperationArgument &argument) {} // NOLINT - void Verify() const {} + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument) {} // NOLINT + void VerifySig() const {} static void InferShape() { VLOG(2) << "This is op2's InferShape interface."; } }; +// Define TraitExampleOp. +class TraitExampleOp + : public pir::Op<TraitExampleOp, + pir::SameOperandsShapeTrait, + pir::SameOperandsAndResultShapeTrait, + pir::SameOperandsElementTypeTrait, + pir::SameOperandsAndResultElementTypeTrait, + pir::SameOperandsAndResultTypeTrait, + pir::SameTypeOperandsTrait> { + public: + using Op::Op; + static const char *name() { return "test.trait_example_op"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type); + void VerifySig() const {} +}; + +// Define SameOperandsShapeTraitOp1. +class SameOperandsShapeTraitOp1 + : public pir::Op<SameOperandsShapeTraitOp1, pir::SameOperandsShapeTrait> { + public: + using Op::Op; + static const char *name() { return "test.same_operands_shape_op1"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument) {} // NOLINT + void VerifySig() const {} +}; + +// Define SameOperandsShapeTraitOp2. +class SameOperandsShapeTraitOp2 + : public pir::Op<SameOperandsShapeTraitOp2, pir::SameOperandsShapeTrait> { + public: + using Op::Op; + static const char *name() { return "test.same_operands_shape_op2"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type); + void VerifySig() const {} +}; + +// Define SameOperandsAndResultShapeTraitOp1. +class SameOperandsAndResultShapeTraitOp1 + : public pir::Op<SameOperandsAndResultShapeTraitOp1, + pir::SameOperandsAndResultShapeTrait> { + public: + using Op::Op; + static const char *name() { + return "test.same_operands_and_result_shape_op1"; + } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument) {} // NOLINT + void VerifySig() const {} +}; + +// Define SameOperandsAndResultShapeTraitOp2. +class SameOperandsAndResultShapeTraitOp2 + : public pir::Op<SameOperandsAndResultShapeTraitOp2, + pir::SameOperandsAndResultShapeTrait> { + public: + using Op::Op; + static const char *name() { + return "test.same_operands_and_result_shape_op2"; + } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand); + void VerifySig() const {} +}; + +// Define SameOperandsAndResultShapeTraitOp3. 
+class SameOperandsAndResultShapeTraitOp3 + : public pir::Op<SameOperandsAndResultShapeTraitOp3, + pir::SameOperandsAndResultShapeTrait> { + public: + using Op::Op; + static const char *name() { + return "test.same_operands_and_result_shape_op3"; + } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type); + void VerifySig() const {} +}; + +// Define SameOperandsElementTypeTraitOp1. +class SameOperandsElementTypeTraitOp1 + : public pir::Op<SameOperandsElementTypeTraitOp1, + pir::SameOperandsElementTypeTrait> { + public: + using Op::Op; + static const char *name() { return "test.same_operands_element_type_op1"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument) {} // NOLINT + void VerifySig() const {} +}; + +// Define SameOperandsElementTypeTraitOp2. +class SameOperandsElementTypeTraitOp2 + : public pir::Op<SameOperandsElementTypeTraitOp2, + pir::SameOperandsElementTypeTrait> { + public: + using Op::Op; + static const char *name() { return "test.same_operands_element_type_op2"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type); + void VerifySig() const {} +}; + +// Define SameOperandsAndResultElementTypeTraitOp1. +class SameOperandsAndResultElementTypeTraitOp1 + : public pir::Op<SameOperandsAndResultElementTypeTraitOp1, + pir::SameOperandsAndResultElementTypeTrait> { + public: + using Op::Op; + static const char *name() { + return "test.same_operands_and_result_element_type_op1"; + } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument) {} // NOLINT + void VerifySig() const {} +}; + +// Define SameOperandsAndResultElementTypeTraitOp2. +class SameOperandsAndResultElementTypeTraitOp2 + : public pir::Op<SameOperandsAndResultElementTypeTraitOp2, + pir::SameOperandsAndResultElementTypeTrait> { + public: + using Op::Op; + static const char *name() { + return "test.same_operands_and_result_element_type_op2"; + } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand); + void VerifySig() const {} +}; + +// Define SameOperandsAndResultElementTypeTraitOp3. 
+class SameOperandsAndResultElementTypeTraitOp3 + : public pir::Op<SameOperandsAndResultElementTypeTraitOp3, + pir::SameOperandsAndResultElementTypeTrait> { + public: + using Op::Op; + static const char *name() { + return "test.same_operands_and_result_element_type_op3"; + } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type1, + pir::Type out_type2); + void VerifySig() const {} +}; + +// Define SameOperandsAndResultTypeTraitOp1. +class SameOperandsAndResultTypeTraitOp1 + : public pir::Op<SameOperandsAndResultTypeTraitOp1, + pir::SameOperandsAndResultTypeTrait> { + public: + using Op::Op; + static const char *name() { return "test.same_operands_and_result_type_op1"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument) {} // NOLINT + void VerifySig() const {} +}; + +// Define SameOperandsAndResultTypeTraitOp2. +class SameOperandsAndResultTypeTraitOp2 + : public pir::Op<SameOperandsAndResultTypeTraitOp2, + pir::SameOperandsAndResultTypeTrait> { + public: + using Op::Op; + static const char *name() { return "test.same_operands_and_result_type_op2"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand); + void VerifySig() const {} +}; + +// Define SameOperandsAndResultTypeTraitOp3. +class SameOperandsAndResultTypeTraitOp3 + : public pir::Op<SameOperandsAndResultTypeTraitOp3, + pir::SameOperandsAndResultTypeTrait> { + public: + using Op::Op; + static const char *name() { return "test.same_operands_and_result_type_op3"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type1, + pir::Type out_type2); + + void VerifySig() const {} +}; + } // namespace test IR_DECLARE_EXPLICIT_TYPE_ID(test::RegionOp) IR_DECLARE_EXPLICIT_TYPE_ID(test::BranchOp) IR_DECLARE_EXPLICIT_TYPE_ID(test::Operation1) IR_DECLARE_EXPLICIT_TYPE_ID(test::Operation2) +IR_DECLARE_EXPLICIT_TYPE_ID(test::TraitExampleOp) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsShapeTraitOp1) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsShapeTraitOp2) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultShapeTraitOp1) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultShapeTraitOp2) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultShapeTraitOp3) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsElementTypeTraitOp1) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsElementTypeTraitOp2) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultElementTypeTraitOp1) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultElementTypeTraitOp2) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultElementTypeTraitOp3) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultTypeTraitOp1) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultTypeTraitOp2) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultTypeTraitOp3) diff --git 
a/test/cpp/prim/CMakeLists.txt b/test/cpp/prim/CMakeLists.txt index 6499c2fae6c6e9..f4f3c1fe778f60 100644 --- a/test/cpp/prim/CMakeLists.txt +++ b/test/cpp/prim/CMakeLists.txt @@ -17,27 +17,7 @@ set(prim_generated_deps final_dygraph_function final_dygraph_node if(WITH_CINN) set(CINN_DEPS cinn_compiler) endif() -cc_test_old( - test_comp_static - SRCS - test_static_prim.cc - DEPS - fleet_executor - static_utils - static_prim_api - generated_op - prim_utils - operator - elementwise_mul_op - elementwise_sub_op - fill_constant_op - activation_op - phi - static_global_utils - static_tensor_operants - generated_static_op - ${CINN_DEPS} - python) +paddle_test(test_comp_static SRCS test_static_prim.cc) if(NOT (NOT WITH_PYTHON AND ON_INFER)) if(WITH_CINN) diff --git a/test/cpp/prim/test_static_prim.cc b/test/cpp/prim/test_static_prim.cc index d4f5dcb8998ae7..8fd7d79bacbc37 100644 --- a/test/cpp/prim/test_static_prim.cc +++ b/test/cpp/prim/test_static_prim.cc @@ -31,46 +31,6 @@ PD_DECLARE_bool(prim_enabled); PHI_DECLARE_string(tensor_operants_mode); -PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(tanh, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(tanh_grad, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(pow, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(subtract, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(multiply, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(concat, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(less_equal, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(less_than, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(less_than_raw, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(equal, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(not_equal, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(greater_equal, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(greater_than, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(bitwise_and, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(bitwise_or, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(bitwise_xor, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(bitwise_not, CPU, ALL_LAYOUT); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(tanh, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(tanh_grad, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(pow, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(scale, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(subtract, KPS, ALL_LAYOUT); -PD_DECLARE_KERNEL(multiply, KPS, ALL_LAYOUT); -PD_DECLARE_KERNEL(concat, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(less_equal, KPS, ALL_LAYOUT); -PD_DECLARE_KERNEL(less_than, KPS, ALL_LAYOUT); -PD_DECLARE_KERNEL(less_than_raw, KPS, ALL_LAYOUT); -PD_DECLARE_KERNEL(equal, KPS, ALL_LAYOUT); -PD_DECLARE_KERNEL(not_equal, KPS, ALL_LAYOUT); -PD_DECLARE_KERNEL(greater_equal, KPS, ALL_LAYOUT); -PD_DECLARE_KERNEL(greater_than, KPS, ALL_LAYOUT); -PD_DECLARE_KERNEL(bitwise_and, KPS, ALL_LAYOUT); -PD_DECLARE_KERNEL(bitwise_or, KPS, ALL_LAYOUT); -PD_DECLARE_KERNEL(bitwise_xor, KPS, ALL_LAYOUT); -PD_DECLARE_KERNEL(bitwise_not, KPS, ALL_LAYOUT); -#endif namespace paddle { namespace prim { @@ -569,20 +529,3 @@ TEST(StaticPrim, TestFlags) { } // namespace prim } // namespace paddle -USE_OP_ITSELF(fill_constant); -USE_OP_ITSELF(tanh); -USE_OP_ITSELF(tanh_grad); -USE_OP_ITSELF(elementwise_mul); -USE_OP_ITSELF(elementwise_sub); -USE_OP_ITSELF(elementwise_pow); -USE_OP_ITSELF(scale); -USE_OP_ITSELF(less_equal); -USE_OP_ITSELF(less_than); -USE_OP_ITSELF(equal); -USE_OP_ITSELF(not_equal); -USE_OP_ITSELF(greater_equal); -USE_OP_ITSELF(greater_than); -USE_OP_ITSELF(bitwise_xor); -USE_OP_ITSELF(bitwise_and); -USE_OP_ITSELF(bitwise_not); -USE_OP_ITSELF(bitwise_or); diff --git 
a/test/distributed_passes/auto_parallel_pass_test_base.py b/test/distributed_passes/auto_parallel_pass_test_base.py index 69c2d051c7db37..90173e43de5722 100644 --- a/test/distributed_passes/auto_parallel_pass_test_base.py +++ b/test/distributed_passes/auto_parallel_pass_test_base.py @@ -37,6 +37,7 @@ class AutoPallelPassTestBase(DistPassTestBase): def setUp(self): paddle.enable_static() seed = int(os.environ.get('SEED', -1)) + os.environ["FLAGS_dynamic_static_unified_comm"] = "0" if seed <= 0: seed = np.random.randint(low=1, high=1000000, size=[1])[0] os.environ['SEED'] = str(seed) diff --git a/test/distributed_passes/dist_pass_test_base.py b/test/distributed_passes/dist_pass_test_base.py index 72bc7ca78d9de2..945f6f29eeb434 100644 --- a/test/distributed_passes/dist_pass_test_base.py +++ b/test/distributed_passes/dist_pass_test_base.py @@ -64,6 +64,7 @@ def setUp(self): if paddle.is_compiled_with_cuda(): paddle.set_flags({'FLAGS_cudnn_deterministic': 1}) + os.environ["FLAGS_dynamic_static_unified_comm"] = "0" seed = int(os.environ.get('SEED', -1)) if seed <= 0: seed = np.random.randint(low=1, high=1000000, size=[1])[0] diff --git a/test/distribution/test_distribution_geometric.py b/test/distribution/test_distribution_geometric.py index dd2ef33242c065..ab8333b7f9997c 100644 --- a/test/distribution/test_distribution_geometric.py +++ b/test/distribution/test_distribution_geometric.py @@ -62,7 +62,7 @@ def test_mean(self): with paddle.base.dygraph.guard(self.place): np.testing.assert_allclose( self._paddle_geom.mean, - scipy.stats.geom.mean(self.probs), + scipy.stats.geom.mean(self.probs, loc=-1), rtol=RTOL.get(str(self._paddle_geom.probs.numpy().dtype)), atol=ATOL.get(str(self._paddle_geom.probs.numpy().dtype)), ) @@ -71,7 +71,7 @@ def test_variance(self): with paddle.base.dygraph.guard(self.place): np.testing.assert_allclose( self._paddle_geom.variance, - scipy.stats.geom.var(self.probs), + scipy.stats.geom.var(self.probs, loc=-1), rtol=RTOL.get(str(self._paddle_geom.probs.numpy().dtype)), atol=ATOL.get(str(self._paddle_geom.probs.numpy().dtype)), ) @@ -80,7 +80,7 @@ def test_stddev(self): with paddle.base.dygraph.guard(self.place): np.testing.assert_allclose( self._paddle_geom.stddev, - scipy.stats.geom.std(self.probs), + scipy.stats.geom.std(self.probs, loc=-1), rtol=RTOL.get(str(self._paddle_geom.probs.numpy().dtype)), atol=ATOL.get(str(self._paddle_geom.probs.numpy().dtype)), ) @@ -89,7 +89,7 @@ def test_entropy(self): with paddle.base.dygraph.guard(self.place): np.testing.assert_allclose( self._paddle_geom.entropy(), - scipy.stats.geom.entropy(self.probs), + scipy.stats.geom.entropy(self.probs, loc=-1), rtol=RTOL.get(str(self._paddle_geom.probs.numpy().dtype)), atol=ATOL.get(str(self._paddle_geom.probs.numpy().dtype)), ) @@ -122,21 +122,21 @@ def test_sample_shape(self): ) def test_sample(self): - sample_shape = (80000,) + sample_shape = (100000,) samples = self._paddle_geom.sample(sample_shape) sample_values = samples.numpy() self.assertEqual(sample_values.dtype, self.probs.dtype) np.testing.assert_allclose( sample_values.mean(axis=0), - scipy.stats.geom.mean(self.probs), - rtol=0.7, + scipy.stats.geom.mean(self.probs, loc=-1), + rtol=0.1, atol=ATOL.get(str(self._paddle_geom.probs.numpy().dtype)), ) np.testing.assert_allclose( sample_values.var(axis=0), - scipy.stats.geom.var(self.probs), - rtol=0.7, + scipy.stats.geom.var(self.probs, loc=-1), + rtol=0.1, atol=ATOL.get(str(self._paddle_geom.probs.numpy().dtype)), ) @@ -167,14 +167,14 @@ def test_rsample(self): 
np.testing.assert_allclose( sample_values.mean(axis=0), - scipy.stats.geom.mean(self.probs), - rtol=0.7, + scipy.stats.geom.mean(self.probs, loc=-1), + rtol=0.1, atol=ATOL.get(str(self._paddle_geom.probs.numpy().dtype)), ) np.testing.assert_allclose( sample_values.var(axis=0), - scipy.stats.geom.var(self.probs), - rtol=0.7, + scipy.stats.geom.var(self.probs, loc=-1), + rtol=0.1, atol=ATOL.get(str(self._paddle_geom.probs.numpy().dtype)), ) @@ -239,7 +239,7 @@ def test_pmf(self): with paddle.base.dygraph.guard(self.place): np.testing.assert_allclose( self._paddle_geom.pmf(self.value), - scipy.stats.geom.pmf(self.value, self.probs), + scipy.stats.geom.pmf(self.value, self.probs, loc=-1), rtol=RTOL.get(str(self.probs.dtype)), atol=ATOL.get(str(self.probs.dtype)), ) @@ -248,7 +248,7 @@ def test_log_pmf(self): with paddle.base.dygraph.guard(self.place): np.testing.assert_allclose( self._paddle_geom.log_pmf(self.value), - scipy.stats.geom.logpmf(self.value, self.probs), + scipy.stats.geom.logpmf(self.value, self.probs, loc=-1), rtol=RTOL.get(str(self.probs.dtype)), atol=ATOL.get(str(self.probs.dtype)), ) @@ -257,7 +257,7 @@ def test_cdf(self): with paddle.base.dygraph.guard(self.place): np.testing.assert_allclose( self._paddle_geom.cdf(self.value), - scipy.stats.geom.cdf(self.value, self.probs), + scipy.stats.geom.cdf(self.value, self.probs, loc=-1), rtol=RTOL.get(str(self._paddle_geom.probs.numpy().dtype)), atol=ATOL.get(str(self._paddle_geom.probs.numpy().dtype)), ) diff --git a/test/distribution/test_distribution_geometric_static.py b/test/distribution/test_distribution_geometric_static.py index 3a4bf73dd83222..c56d9029d617bc 100644 --- a/test/distribution/test_distribution_geometric_static.py +++ b/test/distribution/test_distribution_geometric_static.py @@ -72,7 +72,7 @@ def test_mean(self): ) np.testing.assert_allclose( mean, - scipy.stats.geom.mean(self.probs), + scipy.stats.geom.mean(self.probs, loc=-1), rtol=RTOL.get(str(self.probs.dtype)), atol=ATOL.get(str(self.probs.dtype)), ) @@ -86,7 +86,7 @@ def test_variance(self): ) np.testing.assert_allclose( variance, - scipy.stats.geom.var(self.probs), + scipy.stats.geom.var(self.probs, loc=-1), rtol=RTOL.get(str(self.probs.dtype)), atol=ATOL.get(str(self.probs.dtype)), ) @@ -100,7 +100,7 @@ def test_stddev(self): ) np.testing.assert_allclose( stddev, - scipy.stats.geom.std(self.probs), + scipy.stats.geom.std(self.probs, loc=-1), rtol=RTOL.get(str(self.probs.dtype)), atol=ATOL.get(str(self.probs.dtype)), ) @@ -136,7 +136,7 @@ def test_entropy(self): ) np.testing.assert_allclose( entropy, - scipy.stats.geom.entropy(self.probs), + scipy.stats.geom.entropy(self.probs, loc=-1), rtol=RTOL.get(str(self.probs.dtype)), atol=ATOL.get(str(self.probs.dtype)), ) @@ -203,7 +203,7 @@ def test_pmf(self): ) np.testing.assert_allclose( pmf, - scipy.stats.geom.pmf(self.value, self.probs), + scipy.stats.geom.pmf(self.value, self.probs, loc=-1), rtol=RTOL.get(str(self.probs.dtype)), atol=ATOL.get(str(self.probs.dtype)), ) @@ -217,7 +217,7 @@ def test_log_pmf(self): ) np.testing.assert_allclose( log_pmf, - scipy.stats.geom.logpmf(self.value, self.probs), + scipy.stats.geom.logpmf(self.value, self.probs, loc=-1), rtol=RTOL.get(str(self.probs.dtype)), atol=ATOL.get(str(self.probs.dtype)), ) @@ -231,7 +231,7 @@ def test_cdf(self): ) np.testing.assert_allclose( cdf, - scipy.stats.geom.cdf(self.value, self.probs), + scipy.stats.geom.cdf(self.value, self.probs, loc=-1), rtol=RTOL.get(str(self.probs.dtype)), atol=ATOL.get(str(self.probs.dtype)), ) @@ -339,3 +339,7 @@ def 
_kl(self): return self.probs1 * np.log(self.probs1 / self.probs2) + ( 1.0 - self.probs1 ) * np.log((1.0 - self.probs1) / (1.0 - self.probs2)) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/dygraph_to_static/CMakeLists.txt b/test/dygraph_to_static/CMakeLists.txt index d8aca1e3f5671b..1beadd642a66e0 100644 --- a/test/dygraph_to_static/CMakeLists.txt +++ b/test/dygraph_to_static/CMakeLists.txt @@ -3,34 +3,9 @@ file( RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +set(SOT_ENVS SOT_LOG_LEVEL=0 COST_MODEL=False MIN_GRAPH_SIZE=0 STRICT_MODE=0) set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0) -set(DY2ST_EAGER_TEST_ENVS ${GC_ENVS}) -set(TEST_EAGER_OPS - test_bmn - test_break_continue - test_ifelse - test_loop - test_mnist_amp - test_mnist_pure_fp16 - test_mobile_net - test_program_translator - test_ptb_lm - test_reinforcement_learning - test_resnet - test_resnet_amp - test_resnet_pure_fp16 - test_se_resnet - test_sentiment - test_seq2seq - test_tsm - test_word2vec - test_yolov3 - test_bert - test_cycle_gan - test_lstm - test_simnet - test_transformer) list(REMOVE_ITEM TEST_OPS test_lac) # NOTE(Aurelius84): In case of Windows CI, if open ON_INFER, RWLOCK of Scope # will be removed and will cause some random failed in multi-thread. @@ -52,20 +27,7 @@ if(NOT WITH_GPU) endif() foreach(TEST_OP ${TEST_OPS}) - list(FIND TEST_EAGER_OPS ${TEST_OP} WAS_FOUND) - if(NOT WAS_FOUND EQUAL -1) - py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${DY2ST_EAGER_TEST_ENVS}) - else() - py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) - endif() -endforeach() - -set(STRIED_TESTS test_bert test_lstm test_ptb_lm_v2 test_slice) - -foreach(STRIED_TEST ${STRIED_TESTS}) - py_test_modules(${STRIED_TEST}_with_stride MODULES ${STRIED_TEST} ENVS - ${GC_ENVS} FLAGS_use_stride_kernel=true) - set_tests_properties(${STRIED_TEST}_with_stride PROPERTIES TIMEOUT 120) + py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS} ${SOT_ENVS}) endforeach() set_tests_properties(test_se_resnet PROPERTIES TIMEOUT 900) @@ -75,11 +37,11 @@ set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 120) set_tests_properties(test_seq2seq PROPERTIES TIMEOUT 150) set_tests_properties(test_cycle_gan PROPERTIES TIMEOUT 150) set_tests_properties(test_bert PROPERTIES TIMEOUT 180) -set_tests_properties(test_bert_with_stride PROPERTIES TIMEOUT 120) -set_tests_properties(test_basic_api_transformation PROPERTIES TIMEOUT 120) +set_tests_properties(test_basic_api_transformation PROPERTIES TIMEOUT 240) set_tests_properties(test_reinforcement_learning PROPERTIES TIMEOUT 120) set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) -set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) +set_tests_properties(test_bmn PROPERTIES TIMEOUT 300) +set_tests_properties(test_bert PROPERTIES TIMEOUT 240) #set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) diff --git a/test/dygraph_to_static/dygraph_to_static_util.py b/test/dygraph_to_static/dygraph_to_static_util.py index 3202621228710c..9a5b9bf22d92a4 100644 --- a/test/dygraph_to_static/dygraph_to_static_util.py +++ b/test/dygraph_to_static/dygraph_to_static_util.py @@ -49,7 +49,8 @@ def to_sot(func): """ convert run fall_back to ast """ - enable_sot = os.environ.get("ENABLE_SOT", "False") == "True" + # TODO(SigureMo): ENABLE_SOT should always be True, remove this + enable_sot = os.environ.get("ENABLE_SOT", "True") == "True" def impl(*args, **kwargs): if 
enable_sot: diff --git a/test/dygraph_to_static/dygraph_to_static_utils_new.py b/test/dygraph_to_static/dygraph_to_static_utils_new.py new file mode 100644 index 00000000000000..de74552e3248d1 --- /dev/null +++ b/test/dygraph_to_static/dygraph_to_static_utils_new.py @@ -0,0 +1,317 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import inspect +import logging +import os +import unittest +from enum import Flag, auto +from functools import wraps + +import numpy as np + +from paddle import set_flags, static +from paddle.base import core + +""" +# Usage: +class MyTest(Dy2StTestBase): + @set_to_static_mode( + ToStaticMode.LEGACY_AST | ToStaticMode.SOT | ToStaticMode.PIR_AST + ) + @set_ir_mode(IrMode.LEGACY_PROGRAM | IrMode.PIR) + def test_case1(self): + raise ValueError("MyTest 1") + + def test_case2(self): + raise ValueError("MyTest 2") + + +class MyTest2(MyTest): + def test_case1(self): + raise ValueError("MyTest2 1") +""" + +logger = logging.getLogger("Dygraph to static utils") +logger.setLevel(logging.WARNING) + + +class ToStaticMode(Flag): + LEGACY_AST = auto() + PIR_AST = auto() + SOT = auto() + + def lower_case_name(self): + return self.name.lower() + + +class IrMode(Flag): + LEGACY_PROGRAM = auto() + PIR = auto() + + def lower_case_name(self): + return self.name.lower() + + +DEFAULT_TO_STATIC_MODE = ToStaticMode.LEGACY_AST | ToStaticMode.SOT +DEFAULT_IR_MODE = IrMode.LEGACY_PROGRAM + + +def in_sot_mode(): + return os.getenv("ENABLE_FALL_BACK", "False") == "True" + + +@contextlib.contextmanager +def enable_fallback_guard(enable): + flag = os.environ.get("ENABLE_FALL_BACK", None) + os.environ["ENABLE_FALL_BACK"] = enable + yield + if flag is not None: + os.environ["ENABLE_FALL_BACK"] = flag + else: + del os.environ["ENABLE_FALL_BACK"] + + +def to_legacy_ast_test(fn): + """ + convert run fall_back to ast + """ + + @wraps(fn) + def impl(*args, **kwargs): + logger.info("[AST] running AST") + with enable_fallback_guard("False"): + fn(*args, **kwargs) + + return impl + + +def to_sot_test(fn): + """ + convert run fall_back to ast + """ + + @wraps(fn) + def impl(*args, **kwargs): + logger.info("[SOT] running SOT") + with enable_fallback_guard("True"): + fn(*args, **kwargs) + + return impl + + +def to_pir_ast_test(fn): + raise TypeError("Don't enable PIR AST mode now!") + + +def to_legacy_program_test(fn): + def impl(*args, **kwargs): + logger.info("[Program] running legacy program") + return fn(*args, **kwargs) + + return impl + + +def to_pir_test(fn): + @wraps(fn) + def impl(*args, **kwargs): + logger.info("[PIR] running pir") + ir_outs = None + if os.environ.get('FLAGS_use_stride_kernel', False): + return + with static.scope_guard(static.Scope()): + with static.program_guard(static.Program()): + try: + new_ir_flag = 'FLAGS_enable_new_ir_in_executor' + os.environ[new_ir_flag] = 'True' + set_flags({new_ir_flag: True}) + ir_outs = fn(*args, **kwargs) + finally: + del os.environ[new_ir_flag] + 
set_flags({new_ir_flag: False}) + return ir_outs + + return impl + + +# Metaclass and BaseClass +class Dy2StTestMeta(type): + TO_STATIC_HANDLER_MAP = { + ToStaticMode.SOT: to_sot_test, + ToStaticMode.LEGACY_AST: to_legacy_ast_test, + ToStaticMode.PIR_AST: to_pir_ast_test, + } + + IR_HANDLER_MAP = { + IrMode.LEGACY_PROGRAM: to_legacy_program_test, + IrMode.PIR: to_pir_test, + } + + def __new__(cls, name, bases, attrs): + new_attrs = {} + original_test_cases = { + key: value + for key, value in attrs.items() + if key.startswith("test") and inspect.isfunction(value) + } + logger.info(f"[creating {name}]") + new_attrs.update( + { + key: value + for key, value in attrs.items() + if key not in original_test_cases + } + ) + for fn_name, fn in original_test_cases.items(): + logger.info(f"Generating {fn_name}") + # Disable inherited test cases + for base in bases: + for attr in dir(base): + if attr.startswith(fn_name): + new_attrs[attr] = None + fn_to_static_modes = getattr( + fn, "to_static_mode", DEFAULT_TO_STATIC_MODE + ) + fn_ir_modes = getattr(fn, "ir_mode", DEFAULT_IR_MODE) + fn_disabled_test_cases = getattr(fn, "disabled_test_cases", []) + logger.info(f"fn_to_static_modes: {fn_to_static_modes}") + logger.info(f"fn_ir_modes: {fn_ir_modes}") + logger.info(f"fn_disabled_test_cases: {fn_disabled_test_cases}") + # Get all valid test cases with to_static_mode and ir_mode + to_static_with_ir_modes = [ + (to_static_mode, ir_mode) + for to_static_mode in ToStaticMode + for ir_mode in IrMode + if to_static_mode & fn_to_static_modes and ir_mode & fn_ir_modes + ] + # Filter out disabled test cases and test cases already in compare groups + to_static_with_ir_modes = list( + filter( + lambda flags: (flags not in fn_disabled_test_cases), + to_static_with_ir_modes, + ) + ) + # Generate all test cases + for to_static_mode, ir_mode in to_static_with_ir_modes: + if ( + to_static_mode == ToStaticMode.PIR_AST + and ir_mode == IrMode.LEGACY_PROGRAM + ): + # PIR with LEGACY_PROGRAM is not a valid combination + continue + new_attrs[ + Dy2StTestMeta.test_case_name( + fn_name, to_static_mode, ir_mode + ) + ] = Dy2StTestMeta.convert_test_case(fn, to_static_mode, ir_mode) + return type.__new__(cls, name, bases, new_attrs) + + @staticmethod + def test_case_name(original_name: str, to_static_mode, ir_mode): + return f"{original_name}__{to_static_mode.lower_case_name()}_{ir_mode.lower_case_name()}" + + @staticmethod + def convert_test_case(fn, to_static_mode, ir_mode): + fn = Dy2StTestMeta.IR_HANDLER_MAP[ir_mode](fn) + fn = Dy2StTestMeta.TO_STATIC_HANDLER_MAP[to_static_mode](fn) + return fn + + +class Dy2StTestBase(unittest.TestCase, metaclass=Dy2StTestMeta): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + +# Base decorators +def set_to_static_mode(mode: ToStaticMode): + def decorator(fn): + fn.to_static_mode = mode + return fn + + return decorator + + +def set_ir_mode(mode: IrMode): + def decorator(fn): + fn.ir_mode = mode + return fn + + return decorator + + +def disable_test_case(flags): + def decorator(fn): + disabled_test_cases = getattr(fn, "disabled_test_cases", []) + disabled_test_cases.append(flags) + fn.disabled_test_cases = disabled_test_cases + return fn + + return decorator + + +# Suger decorators +# These decorators can be simply composed by base decorators +def ast_only_test(fn): + fn = set_to_static_mode(ToStaticMode.LEGACY_AST)(fn) + return fn + + +def sot_only_test(fn): + fn = set_to_static_mode(ToStaticMode.SOT)(fn) + return fn + + +def test_with_new_ir(fn): + fn = 
set_ir_mode(IrMode.PIR)(fn) + return fn + + +def _test_and_compare_with_new_ir(fn): + @wraps(fn) + def impl(*args, **kwargs): + outs = fn(*args, **kwargs) + if core._is_bwd_prim_enabled() or core._is_fwd_prim_enabled(): + return outs + ir_outs = to_pir_test(fn)(*args, **kwargs) + np.testing.assert_equal( + outs, + ir_outs, + err_msg=f'Dy2St Unittest Check ({fn.__name__}) has diff \n' + + f'Expect {outs}\n' + + f'But Got {ir_outs}', + ) + return outs + + return impl + + +def test_and_compare_with_new_ir(need_check_output: bool = True): + def decorator(fn): + fn = set_ir_mode(IrMode.LEGACY_PROGRAM | IrMode.PIR)(fn) + if need_check_output: + logger.info(f"[need_check_output] {fn.__name__}") + fn = _test_and_compare_with_new_ir(fn) + return fn + + return decorator + + +# For debug +def show_all_test_cases(test_class): + logger.info(f"[showing {test_class.__name__}]") + for attr in dir(test_class): + if attr.startswith("test"): + fn = getattr(test_class, attr) + logger.info(f"{attr}: {fn}") diff --git a/test/dygraph_to_static/test_assert.py b/test/dygraph_to_static/test_assert.py index dc01413d0c8bec..210e904454fd93 100644 --- a/test/dygraph_to_static/test_assert.py +++ b/test/dygraph_to_static/test_assert.py @@ -15,7 +15,11 @@ import unittest import numpy -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_utils_new import ( + Dy2StTestBase, + ast_only_test, + test_and_compare_with_new_ir, +) import paddle from paddle import base @@ -33,7 +37,8 @@ def dyfunc_assert_non_variable(x=True): assert x -class TestAssertVariable(unittest.TestCase): +class TestAssertVariable(Dy2StTestBase): def _run(self, func, x, with_exception, to_static): paddle.jit.enable_to_static(to_static) if with_exception: @@ -49,6 +54,7 @@ def _run_dy_static(self, func, x, with_exception): self._run(func, x, with_exception, False) @test_and_compare_with_new_ir(False) + @ast_only_test def test_non_variable(self): self._run_dy_static( dyfunc_assert_non_variable, x=False, with_exception=True ) @@ -58,6 +64,7 @@ def test_bool_variable(self): self._run_dy_static( dyfunc_assert_variable, x=numpy.array([False]), with_exception=True ) @@ -67,6 +74,7 @@ def test_int_variable(self): self._run_dy_static( dyfunc_assert_variable, x=numpy.array([0]), with_exception=True diff --git a/test/dygraph_to_static/test_ast_util.py b/test/dygraph_to_static/test_ast_util.py index 52920d81433c69..c2468765e34387 100644 --- a/test/dygraph_to_static/test_ast_util.py +++ b/test/dygraph_to_static/test_ast_util.py @@ -17,7 +17,11 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_utils_new import ( + Dy2StTestBase, + ast_only_test, + test_and_compare_with_new_ir, +) from ifelse_simple_func import ( dyfunc_with_if_else, dyfunc_with_if_else2, @@ -31,7 +35,8 @@ from paddle.utils import gast -class TestAST2Func(unittest.TestCase): +class TestAST2Func(Dy2StTestBase): """ TestCase for the transformation from ast.AST into python callable function. 
""" @@ -43,6 +48,7 @@ def _ast2func(self, func): transformed_func, _ = ast_to_func(ast_root, func) return transformed_func + @ast_only_test def test_ast2func(self): def func(x, y): return x + y @@ -50,6 +56,7 @@ def func(x, y): x, y = 10, 20 self.assertEqual(func(x, y), self._ast2func(func)(x, y)) + @ast_only_test def test_ast2func_dygraph(self): paddle.disable_static() funcs = [dyfunc_with_if_else, dyfunc_with_if_else2, nested_if_else] @@ -62,6 +69,7 @@ def test_ast2func_dygraph(self): self.assertTrue((true_ret == test_ret).all()) @test_and_compare_with_new_ir(False) + @ast_only_test def test_ast2func_static(self): paddle.enable_static() @@ -80,6 +88,7 @@ def func(x): ret = exe.run(main_program, fetch_list=[true_ret, test_ret]) self.assertTrue((ret[0] == ret[1]).all()) + @ast_only_test def test_ast2func_error(self): with self.assertRaises(Exception) as e: self.assertRaises(TypeError, ast_to_func("x = a + b", 'foo')) diff --git a/test/dygraph_to_static/test_backward_without_params.py b/test/dygraph_to_static/test_backward_without_params.py index af70b9e7a2f95f..336d96f2399b53 100644 --- a/test/dygraph_to_static/test_backward_without_params.py +++ b/test/dygraph_to_static/test_backward_without_params.py @@ -15,7 +15,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_utils_new import ( + Dy2StTestBase, + test_and_compare_with_new_ir, +) import paddle @@ -24,16 +27,16 @@ class Net(paddle.nn.Layer): def __init__(self): super().__init__() - @paddle.jit.to_static def forward(self, x): out = x + 1 return out -class TestBackwardWithoutParams(unittest.TestCase): +# @dy2static_unittest +class TestBackwardWithoutParams(Dy2StTestBase): @test_and_compare_with_new_ir(False) def test_run(self): - net = Net() + net = paddle.jit.to_static(Net()) x = paddle.ones([2, 2]) x.stop_gradient = False @@ -47,7 +50,6 @@ class ZeroSizeNet(paddle.nn.Layer): def __init__(self): super().__init__() - @paddle.jit.to_static def forward(self, x): y = paddle.randn((0,)) out = paddle.nn.functional.relu(x) @@ -55,10 +57,11 @@ def forward(self, x): return y, out -class TestZeroSizeNet(unittest.TestCase): +# @dy2static_unittest +class TestZeroSizeNet(Dy2StTestBase): @test_and_compare_with_new_ir(False) def test_run(self): - net = ZeroSizeNet() + net = paddle.jit.to_static(ZeroSizeNet()) x = paddle.ones([2, 2]) x.stop_gradient = False _, out = net(x) diff --git a/test/dygraph_to_static/test_basic_api_transformation.py b/test/dygraph_to_static/test_basic_api_transformation.py index efa9caa17dd515..e0998b8fe1e67f 100644 --- a/test/dygraph_to_static/test_basic_api_transformation.py +++ b/test/dygraph_to_static/test_basic_api_transformation.py @@ -16,7 +16,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle from paddle import base, to_tensor @@ -69,6 +72,7 @@ def dyfunc_bool_to_tensor(x): return paddle.to_tensor(True) +@dy2static_unittest class TestDygraphBasicApi_ToVariable(unittest.TestCase): def setUp(self): self.input = np.ones(5).astype("int32") @@ -230,6 +234,7 @@ def dyfunc_Prelu(input): return res +@dy2static_unittest class TestDygraphBasicApi(unittest.TestCase): # Compare results of dynamic graph and transformed static graph function which only # includes basic Api. 
@@ -396,6 +401,7 @@ def dyfunc_PolynomialDecay(): return paddle.to_tensor(lr) +@dy2static_unittest class TestDygraphBasicApi_CosineDecay(unittest.TestCase): def setUp(self): self.dygraph_func = dyfunc_CosineDecay @@ -539,6 +545,7 @@ def _dygraph_fn(): np.random.random(1) +@dy2static_unittest class TestDygraphApiRecognition(unittest.TestCase): def setUp(self): self.src = inspect.getsource(_dygraph_fn) diff --git a/test/dygraph_to_static/test_bert.py b/test/dygraph_to_static/test_bert.py index c7b5272ff47659..ba8e2350794aad 100644 --- a/test/dygraph_to_static/test_bert.py +++ b/test/dygraph_to_static/test_bert.py @@ -20,7 +20,11 @@ import numpy as np from bert_dygraph_model import PretrainModelLayer from bert_utils import get_bert_config, get_feed_data_reader -from dygraph_to_static_util import ast_only_test, test_with_new_ir +from dygraph_to_static_util import ( + ast_only_test, + dy2static_unittest, + test_with_new_ir, +) from predictor_utils import PredictorTools import paddle @@ -74,6 +78,7 @@ def __len__(self): return len(self.src_ids) +@dy2static_unittest class TestBert(unittest.TestCase): def setUp(self): self.bert_config = get_bert_config() diff --git a/test/dygraph_to_static/test_break_continue.py b/test/dygraph_to_static/test_break_continue.py index d3a2162dc787e1..a803c1d4bf49ed 100644 --- a/test/dygraph_to_static/test_break_continue.py +++ b/test/dygraph_to_static/test_break_continue.py @@ -205,6 +205,7 @@ def test_optim_break_in_while(x): return x +@dy2static_unittest class TestContinueInFor(unittest.TestCase): def setUp(self): self.input = np.zeros(1).astype('int64') diff --git a/test/dygraph_to_static/test_build_strategy.py b/test/dygraph_to_static/test_build_strategy.py index 83ed8d56751dd9..85e934afb020bb 100644 --- a/test/dygraph_to_static/test_build_strategy.py +++ b/test/dygraph_to_static/test_build_strategy.py @@ -84,6 +84,7 @@ def test_in_static_mode_mkldnn(self): paddle.base.set_flags({'FLAGS_use_mkldnn': False}) +@dy2static_unittest class TestError(unittest.TestCase): def test_type_error(self): def foo(x): diff --git a/test/dygraph_to_static/test_cache_program.py b/test/dygraph_to_static/test_cache_program.py index 0602b15b3054be..199c3e980e20c9 100644 --- a/test/dygraph_to_static/test_cache_program.py +++ b/test/dygraph_to_static/test_cache_program.py @@ -76,6 +76,7 @@ def setUp(self): self.data = np.random.random((4, 10)).astype('float32') +@dy2static_unittest class TestCacheProgramWithOptimizer(unittest.TestCase): def setUp(self): self.dygraph_class = Linear @@ -125,6 +126,7 @@ def simple_func(x): return mean +@dy2static_unittest class TestConvertWithCache(unittest.TestCase): def test_cache(self): static_func = convert_to_static(simple_func) @@ -155,6 +157,7 @@ def sum_under_while(limit): return ret_sum +@dy2static_unittest class TestToOutputWithCache(unittest.TestCase): def test_output(self): with base.dygraph.guard(): diff --git a/test/dygraph_to_static/test_cast.py b/test/dygraph_to_static/test_cast.py index 7e2b0914a5fff5..8c0a4bf0a1318a 100644 --- a/test/dygraph_to_static/test_cast.py +++ b/test/dygraph_to_static/test_cast.py @@ -15,7 +15,11 @@ import unittest import numpy as np -from dygraph_to_static_util import ast_only_test, test_and_compare_with_new_ir +from dygraph_to_static_utils_new import ( + Dy2StTestBase, + ast_only_test, + test_and_compare_with_new_ir, +) from paddle import base from paddle.jit.api import to_static @@ -24,14 +28,12 @@ np.random.seed(SEED) -@to_static def test_bool_cast(x): x = base.dygraph.to_variable(x) x = bool(x) return x 
-@to_static def test_int_cast(x): x = base.dygraph.to_variable(x) x = int(x) @@ -44,13 +46,11 @@ def test_float_cast(x): return x -@to_static def test_not_var_cast(x): x = int(x) return x -@to_static def test_mix_cast(x): x = base.dygraph.to_variable(x) x = int(x) @@ -60,7 +60,8 @@ def test_mix_cast(x): return x -class TestCastBase(unittest.TestCase): +class TestCastBase(Dy2StTestBase): def setUp(self): self.place = ( base.CUDAPlace(0) @@ -81,7 +82,7 @@ def prepare(self): self.cast_dtype = 'bool' def set_func(self): - self.func = test_bool_cast + self.func = to_static(full_graph=True)(test_bool_cast) def do_test(self): with base.dygraph.guard(): @@ -90,6 +91,7 @@ def do_test(self): @ast_only_test # TODO: add new symbolic only test. @test_and_compare_with_new_ir(False) def test_cast_result(self): res = self.do_test().numpy() self.assertTrue( @@ -119,7 +121,7 @@ def prepare(self): self.cast_dtype = 'int32' def set_func(self): - self.func = test_int_cast + self.func = to_static(full_graph=True)(test_int_cast) class TestFloatCast(TestCastBase): @@ -134,7 +136,7 @@ def prepare(self): self.cast_dtype = 'float32' def set_func(self): - self.func = to_static(test_float_cast) + self.func = to_static(full_graph=True)(test_float_cast) class TestMixCast(TestCastBase): @@ -152,7 +154,7 @@ def prepare(self): self.cast_dtype = 'float32' def set_func(self): - self.func = test_mix_cast + self.func = to_static(full_graph=True)(test_mix_cast) @ast_only_test # TODO: add new symbolic only test. @test_and_compare_with_new_ir(False) @@ -184,11 +186,13 @@ def prepare(self): self.cast_dtype = 'int' def set_func(self): - self.func = test_not_var_cast + self.func = to_static(full_graph=True)(test_not_var_cast) - @ast_only_test # TODO: add new symbolic only test. 
+ @ast_only_test @test_and_compare_with_new_ir(False) def test_cast_result(self): res = self.do_test() self.assertTrue(type(res) == int, msg='The casted dtype is not int.') ref_val = int(self.input) diff --git a/test/dygraph_to_static/test_cinn.py b/test/dygraph_to_static/test_cinn.py index 59a114d0aae586..84e619149c8009 100644 --- a/test/dygraph_to_static/test_cinn.py +++ b/test/dygraph_to_static/test_cinn.py @@ -15,7 +15,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle @@ -42,6 +45,7 @@ def apply_to_static(net, use_cinn): return paddle.jit.to_static(net, build_strategy=build_strategy) +@dy2static_unittest class TestCINN(unittest.TestCase): def setUp(self): self.x = paddle.randn([2, 4]) diff --git a/test/dygraph_to_static/test_cinn_prim.py b/test/dygraph_to_static/test_cinn_prim.py index 0bf905ec846f9f..2ed5326f7b9d00 100644 --- a/test/dygraph_to_static/test_cinn_prim.py +++ b/test/dygraph_to_static/test_cinn_prim.py @@ -172,6 +172,7 @@ def test_cinn_prim(self): ) +@dy2static_unittest class TestBackend(unittest.TestCase): @test_and_compare_with_new_ir(False) def test_backend(self): diff --git a/test/dygraph_to_static/test_cinn_prim_layer_norm.py b/test/dygraph_to_static/test_cinn_prim_layer_norm.py index 18c48883d75a68..42bf36d731eca6 100644 --- a/test/dygraph_to_static/test_cinn_prim_layer_norm.py +++ b/test/dygraph_to_static/test_cinn_prim_layer_norm.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from dygraph_to_static_util import ast_only_test +from dygraph_to_static_util import ast_only_test, dy2static_unittest import paddle import paddle.nn.functional as F @@ -52,6 +52,7 @@ def forward(self, x, w, b): return out[0] +@dy2static_unittest class TestPrimForward(unittest.TestCase): """ This case only tests prim_forward + to_static + cinn. Thus we need to @@ -124,6 +125,7 @@ def test_cinn_prim_forward(self): ) +@dy2static_unittest class TestPrimForwardAndBackward(unittest.TestCase): """ Test PrimeNet with @to_static + prim forward + prim backward + cinn v.s Dygraph diff --git a/test/dygraph_to_static/test_closure_analysis.py b/test/dygraph_to_static/test_closure_analysis.py index 95234565a6922f..de1d1e12d6502a 100644 --- a/test/dygraph_to_static/test_closure_analysis.py +++ b/test/dygraph_to_static/test_closure_analysis.py @@ -13,10 +13,12 @@ # limitations under the License. 
import inspect -import os import unittest -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_utils_new import ( + Dy2StTestBase, + test_and_compare_with_new_ir, +) from numpy import append import paddle @@ -161,7 +163,7 @@ def test_push_pop_4(x, *args, **kargs): return l, k -class TestClosureAnalysis(unittest.TestCase): +class TestClosureAnalysis(Dy2StTestBase): def setUp(self): self.judge_type = "var and w_vars" self.init_dygraph_func() @@ -260,7 +262,7 @@ def init_dygraph_func(self): ] -class TestPushPopTrans(unittest.TestCase): +class TestPushPopTrans(Dy2StTestBase): @test_and_compare_with_new_ir(False) def test(self): def vlist_of_dict(x): @@ -270,7 +272,6 @@ def vlist_of_dict(x): return ma x = paddle.to_tensor([3]) - print(paddle.jit.to_static(vlist_of_dict).code) print(paddle.jit.to_static(vlist_of_dict)(x)) @test_and_compare_with_new_ir(False) @@ -284,7 +285,6 @@ def vlist_of_dict(x): return a x = paddle.to_tensor([3]) - print(paddle.jit.to_static(vlist_of_dict).code) print(paddle.jit.to_static(vlist_of_dict)(x)) @test_and_compare_with_new_ir(False) @@ -298,7 +298,6 @@ def vlist_of_dict(x): return a x = paddle.to_tensor([3]) - print(paddle.jit.to_static(vlist_of_dict).code) print(paddle.jit.to_static(vlist_of_dict)(x)) @test_and_compare_with_new_ir(False) @@ -312,7 +311,6 @@ def vlist_of_dict(x): return a x = paddle.to_tensor([3]) - print(paddle.jit.to_static(vlist_of_dict).code) print(paddle.jit.to_static(vlist_of_dict)(x)) @test_and_compare_with_new_ir(False) @@ -326,10 +324,8 @@ def vlist_of_dict(x): return a x = paddle.to_tensor([3]) - print(paddle.jit.to_static(vlist_of_dict).code) print(paddle.jit.to_static(vlist_of_dict)(x)) if __name__ == '__main__': - os.environ['ENABLE_FALL_BACK'] = "False" unittest.main() diff --git a/test/dygraph_to_static/test_convert_call.py b/test/dygraph_to_static/test_convert_call.py index fb6c69fc899fae..723d3f910debdd 100644 --- a/test/dygraph_to_static/test_convert_call.py +++ b/test/dygraph_to_static/test_convert_call.py @@ -77,6 +77,7 @@ def dyfunc_with_staticmethod(x_v): return a.add(x_v, x_v) +@dy2static_unittest class TestRecursiveCall1(unittest.TestCase): def setUp(self): self.input = np.random.random([10, 16]).astype('float32') @@ -168,6 +169,7 @@ def forward(self, inputs): return self.act(out) +@dy2static_unittest class TestRecursiveCall2(unittest.TestCase): def setUp(self): self.input = np.random.random((1, 3, 3, 5)).astype('float32') @@ -286,7 +288,7 @@ def test_functional_api(self): func = paddle.nn.functional.relu func = paddle.jit.to_static(func) self.assertNotIn("_jst.IfElse", func.code) - self.assertIn("if in_dynamic_mode()", func.code) + self.assertIn("if in_dynamic_or_pir_mode()", func.code) @ast_only_test def test_class_api(self): diff --git a/test/dygraph_to_static/test_convert_call_generator.py b/test/dygraph_to_static/test_convert_call_generator.py index b33a41576498db..dd9d93c907c552 100644 --- a/test/dygraph_to_static/test_convert_call_generator.py +++ b/test/dygraph_to_static/test_convert_call_generator.py @@ -14,7 +14,11 @@ import unittest -from dygraph_to_static_util import ast_only_test, test_and_compare_with_new_ir +from dygraph_to_static_util import ( + ast_only_test, + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle from paddle.jit import to_static @@ -32,6 +36,7 @@ def main_func(): print(i) +@dy2static_unittest class TestConvertGenerator(unittest.TestCase): # fallback will ok. 
@ast_only_test diff --git a/test/dygraph_to_static/test_convert_operators.py b/test/dygraph_to_static/test_convert_operators.py index 420e7d8b1e8871..02d0c09a70857c 100644 --- a/test/dygraph_to_static/test_convert_operators.py +++ b/test/dygraph_to_static/test_convert_operators.py @@ -15,7 +15,11 @@ import unittest import numpy as np -from dygraph_to_static_util import ast_only_test, test_and_compare_with_new_ir +from dygraph_to_static_util import ( + ast_only_test, + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle @@ -40,6 +44,7 @@ def forward(self): net.forward = "A string so that convert forward will fail" +@dy2static_unittest class TestConvertCall(unittest.TestCase): # fallback mode will raise a InnerError, it's ok. @ast_only_test @@ -68,6 +73,7 @@ def callable_list(x, y): self.assertEqual(callable_list(1, 2), 3) +@dy2static_unittest class TestConvertShapeCompare(unittest.TestCase): def test_non_variable(self): self.assertEqual( @@ -204,6 +210,7 @@ def forward(self, x): return out +@dy2static_unittest class TestChooseShapeAttrOrApiWithLayer(unittest.TestCase): @test_and_compare_with_new_ir(False) def test_tensor_shape(self): @@ -214,6 +221,7 @@ def test_tensor_shape(self): np.testing.assert_array_equal(out.numpy(), x.numpy()) +@dy2static_unittest class TestIfElseNoValue(unittest.TestCase): @test_and_compare_with_new_ir(False) def test_else_ret_none(self): diff --git a/test/dygraph_to_static/test_cpu_cuda_to_tensor.py b/test/dygraph_to_static/test_cpu_cuda_to_tensor.py index f5d6c833d16c1c..b6e55b8900c1e8 100644 --- a/test/dygraph_to_static/test_cpu_cuda_to_tensor.py +++ b/test/dygraph_to_static/test_cpu_cuda_to_tensor.py @@ -25,6 +25,7 @@ import paddle +@dy2static_unittest class TestCpuCuda(unittest.TestCase): def test_cpu_cuda(self): def func(x): @@ -38,6 +39,7 @@ def func(x): # print(paddle.jit.to_static(func)(x)) +@dy2static_unittest class TestToTensor(unittest.TestCase): @test_and_compare_with_new_ir(False) def test_to_tensor_with_variable_list(self): diff --git a/test/dygraph_to_static/test_cycle_gan.py b/test/dygraph_to_static/test_cycle_gan.py index 3484b27d5fac5e..fb06a52407ec61 100644 --- a/test/dygraph_to_static/test_cycle_gan.py +++ b/test/dygraph_to_static/test_cycle_gan.py @@ -12,16 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License import os import random @@ -36,7 +26,10 @@ # Use GPU:0 to elimate the influence of other tasks. 
os.environ["CUDA_VISIBLE_DEVICES"] = "1" -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle from paddle.base.dygraph import to_variable @@ -686,6 +679,7 @@ def train(args, to_static): return np.array(loss_data) +@dy2static_unittest class TestCycleGANModel(unittest.TestCase): def setUp(self): self.args = Args() diff --git a/test/dygraph_to_static/test_declarative.py b/test/dygraph_to_static/test_declarative.py index 9d3e1e54b0ebb5..f1599a8b907c30 100644 --- a/test/dygraph_to_static/test_declarative.py +++ b/test/dygraph_to_static/test_declarative.py @@ -17,7 +17,11 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_utils_new import ( + Dy2StTestBase, + ast_only_test, + test_and_compare_with_new_ir, +) from test_basic_api_transformation import dyfunc_to_variable import paddle @@ -31,20 +35,21 @@ from paddle.nn import Layer from paddle.static import InputSpec -os.environ['ENABLE_FALL_BACK'] = "False" # NOTE: ast only - class SimpleNet(Layer): def __init__(self): super().__init__() self.linear = paddle.nn.Linear(10, 3) - @to_static(input_spec=[InputSpec(shape=[None, 10], dtype='float32')]) + @to_static( + input_spec=[InputSpec(shape=[None, 10], dtype='float32')], + full_graph=True, + ) def forward(self, x, a=1, b=2): y = self.inner_function(x) return y - @to_static + @to_static(full_graph=True) def inner_function(self, x): y = self.linear(x) return y @@ -53,7 +58,10 @@ def add_func(self, x, y): z = x + y return z - @to_static(input_spec=[[InputSpec([None, 10]), InputSpec([None, 10])]]) + @to_static( + input_spec=[[InputSpec([None, 10]), InputSpec([None, 10])]], + full_graph=True, + ) def func_with_list(self, l, int_val=1): x, y = l z = x + y @@ -61,7 +69,8 @@ def func_with_list(self, l, int_val=1): return z @to_static( - input_spec=[{'x': InputSpec([None, 10]), 'y': InputSpec([None, 10])}] + input_spec=[{'x': InputSpec([None, 10]), 'y': InputSpec([None, 10])}], + full_graph=True, ) def func_with_dict(self, d): x = d['x'] @@ -76,7 +85,8 @@ def func_with_dict(self, d): InputSpec([None]), {'x': InputSpec([None, 10]), 'y': InputSpec([None, 10])}, ] - ] + ], + full_graph=True, ) def func_with_list_dict(self, dl): bias = dl[0] @@ -89,7 +99,7 @@ def func_with_list_dict(self, dl): return z -class TestStaticFunctionInstance(unittest.TestCase): +class TestStaticFunctionInstance(Dy2StTestBase): def test_instance_same_class(self): with base.dygraph.guard(base.CPUPlace()): net_1 = SimpleNet() @@ -106,7 +116,7 @@ def test_instance_same_class(self): self.assertTrue(len(net_2.forward.program_cache) == 0) -class TestInputSpec(unittest.TestCase): +class TestInputSpec(Dy2StTestBase): def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() self.model_path = os.path.join(self.temp_dir.name, 'simple_net') @@ -115,6 +125,7 @@ def tearDown(self): self.temp_dir.cleanup() @test_and_compare_with_new_ir(False) + @ast_only_test def test_with_input_spec(self): with base.dygraph.guard(base.CPUPlace()): x = to_variable(np.ones([4, 10]).astype('float32')) @@ -175,6 +186,7 @@ def test_with_error(self): ) net.add_func(x, y) + @ast_only_test def test_concrete_program(self): with base.dygraph.guard(base.CPUPlace()): x = to_variable(np.ones([4, 10]).astype('float32')) @@ -210,11 +222,12 @@ def foo_func(a, b, c=1, d=2): return z -class TestDifferentInputSpecCacheProgram(unittest.TestCase): +class 
TestDifferentInputSpecCacheProgram(Dy2StTestBase): def setUp(self): paddle.jit.enable_to_static(True) @test_and_compare_with_new_ir(False) + @ast_only_test def test_with_different_input(self): with base.dygraph.guard(base.CPUPlace()): x_data = np.ones([16, 10]).astype('float32') @@ -260,6 +273,7 @@ def test_with_different_input(self): recent_program = foo.program_cache.last() self.assertTrue(first_program == recent_program) + @ast_only_test def test_get_concrete_program(self): foo = to_static(foo_func) @@ -301,6 +315,7 @@ def test_get_concrete_program(self): ) @test_and_compare_with_new_ir(False) + @ast_only_test def test_concrete_program(self): with base.dygraph.guard(base.CPUPlace()): # usage 1 @@ -324,7 +339,7 @@ def test_concrete_program(self): foo_3.concrete_program # noqa: B018 -class TestInputDefaultName(unittest.TestCase): +class TestInputDefaultName(Dy2StTestBase): def setUp(self): paddle.disable_static() self.net = SimpleNet() @@ -348,7 +363,8 @@ def test_nest_input(self): self.assert_default_name('func_with_list_dict', ['dl_0', 'x', 'y']) -class TestDeclarativeAPI(unittest.TestCase): +class TestDeclarativeAPI(Dy2StTestBase): + @ast_only_test def test_error(self): func = to_static(dyfunc_to_variable) @@ -366,19 +382,21 @@ def test_error(self): func(np.ones(5).astype("int32")) -class TestDecorateModelDirectly(unittest.TestCase): +class TestDecorateModelDirectly(Dy2StTestBase): def setUp(self): paddle.disable_static() paddle.jit.enable_to_static(True) self.x = to_variable(np.ones([4, 10]).astype('float32')) @test_and_compare_with_new_ir(False) + @ast_only_test def test_fake_input(self): net = SimpleNet() net = to_static(net) y = net(self.x) self.assertTrue(len(net.forward.program_cache) == 1) + @ast_only_test def test_input_spec(self): net = SimpleNet() net = to_static(net, input_spec=[InputSpec([None, 8, 10])]) @@ -393,7 +411,7 @@ def test_input_spec(self): self.assertListEqual(list(input_shape), [-1, 16, 10]) -class TestErrorWithInitFromStaticMode(unittest.TestCase): +class TestErrorWithInitFromStaticMode(Dy2StTestBase): def test_raise_error(self): # disable imperative paddle.enable_static() @@ -420,7 +438,7 @@ def __init__(self): super().__init__() self.sub = CallNonForwardFuncSubNet() - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def forward(self): return self.sub.func() @@ -435,7 +453,7 @@ def func(self): return x -class TestCallNonForwardFunc(unittest.TestCase): +class TestCallNonForwardFunc(Dy2StTestBase): @test_and_compare_with_new_ir(False) def test_call_non_forward(self): paddle.disable_static() @@ -450,7 +468,7 @@ def __init__(self): super().__init__() self.a = paddle.to_tensor([1]) - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def forward(self): self.a = self.a + 1 return self.a @@ -461,14 +479,14 @@ def __init__(self): super().__init__() self.b = paddle.to_tensor([2]) - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def forward(self): self.b = None self.b = paddle.to_tensor([3]) return self.b -class TestSetBuffers(unittest.TestCase): +class TestSetBuffers(Dy2StTestBase): def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() self.model_path = os.path.join(self.temp_dir.name, 'SetBuffersNet1') @@ -485,6 +503,7 @@ def test_set_buffers1(self): paddle.jit.save(net, self.model_path) paddle.enable_static() + @ast_only_test def test_set_buffers2(self): paddle.disable_static() net = SetBuffersNet2() @@ -498,7 +517,7 @@ def func(self, x): return x + 1 -class TestClassNoInheritLayer(unittest.TestCase): +class 
TestClassNoInheritLayer(Dy2StTestBase): def test_to_static(self): paddle.disable_static() net = ClassNoInheritLayer() diff --git a/test/dygraph_to_static/test_decorator_transform.py b/test/dygraph_to_static/test_decorator_transform.py index d0ddffdd40cbe7..4f4096d607dc8a 100644 --- a/test/dygraph_to_static/test_decorator_transform.py +++ b/test/dygraph_to_static/test_decorator_transform.py @@ -19,9 +19,9 @@ import decos import numpy as np -from dygraph_to_static_util import ( +from dygraph_to_static_utils_new import ( + Dy2StTestBase, ast_only_test, - dy2static_unittest, test_and_compare_with_new_ir, ) @@ -185,8 +185,7 @@ def deco_with_paddle_api(): return fun10() -@dy2static_unittest -class TestDecoratorTransform(unittest.TestCase): +class TestDecoratorTransform(Dy2StTestBase): @test_and_compare_with_new_ir(False) def test_deco_transform(self): outs = paddle.jit.to_static(forward)() diff --git a/test/dygraph_to_static/test_deepcopy.py b/test/dygraph_to_static/test_deepcopy.py index 0959d74dbc1fbf..82ffeaf9f2290c 100644 --- a/test/dygraph_to_static/test_deepcopy.py +++ b/test/dygraph_to_static/test_deepcopy.py @@ -16,14 +16,18 @@ from copy import deepcopy import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_utils_new import ( + Dy2StTestBase, + test_and_compare_with_new_ir, +) from test_rollback import Net, foo import paddle from paddle.jit.dy2static.program_translator import StaticFunction -class TestDeepCopy(unittest.TestCase): +class TestDeepCopy(Dy2StTestBase): @test_and_compare_with_new_ir(False) def test_net(self): net = Net() diff --git a/test/dygraph_to_static/test_dict.py b/test/dygraph_to_static/test_dict.py index 80180b522cf540..99364c1343a7d6 100644 --- a/test/dygraph_to_static/test_dict.py +++ b/test/dygraph_to_static/test_dict.py @@ -15,7 +15,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle from paddle import base @@ -116,6 +119,7 @@ def update_cache(cache): return cache +@dy2static_unittest class TestNetWithDict(unittest.TestCase): """ TestCase for the transformation from control flow `if/else` @@ -169,6 +173,7 @@ def test_dic_pop_2(x): return out +@dy2static_unittest class TestDictPop(unittest.TestCase): def setUp(self): self.input = np.random.random(3).astype('int32') @@ -249,6 +254,7 @@ def test_ast_to_func(self): ) +@dy2static_unittest class TestDictCmpInFor(unittest.TestCase): def test_with_for(self): def func(): diff --git a/test/dygraph_to_static/test_drop_path.py b/test/dygraph_to_static/test_drop_path.py index a9ea20be04c383..aad752007ceb0c 100644 --- a/test/dygraph_to_static/test_drop_path.py +++ b/test/dygraph_to_static/test_drop_path.py @@ -15,7 +15,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle @@ -36,6 +39,7 @@ def forward(self, x): return drop_path(x, self.training) +@dy2static_unittest class TestTrainEval(unittest.TestCase): def setUp(self): self.model = DropPath() diff --git a/test/dygraph_to_static/test_duplicate_output.py b/test/dygraph_to_static/test_duplicate_output.py index 7e4220899d5eff..c7f1e21b3552ab 100644 --- a/test/dygraph_to_static/test_duplicate_output.py +++ b/test/dygraph_to_static/test_duplicate_output.py @@ -15,7 +15,10 @@ import 
unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle @@ -38,6 +41,7 @@ def forward(self, x): return x, x +@dy2static_unittest class TestDuplicateOutput(unittest.TestCase): """ TestCase for the transformation from control flow `if/else` @@ -50,9 +54,12 @@ def setUp(self): @test_and_compare_with_new_ir(False) def _run_static(self): + param = self.net.parameters() + param[0].clear_grad() + loss0, loss1 = self.net(self.x) loss0.backward() - param = self.net.parameters() + self.assertEqual(param[0].grad.numpy(), 1.0) def test_ast_to_func(self): diff --git a/test/dygraph_to_static/test_error.py b/test/dygraph_to_static/test_error.py index 8c6f74d75c4e0b..9bb23945970c6b 100644 --- a/test/dygraph_to_static/test_error.py +++ b/test/dygraph_to_static/test_error.py @@ -23,15 +23,13 @@ from paddle.jit.dy2static import error from paddle.jit.dy2static.origin_info import unwrap -os.environ['ENABLE_FALL_BACK'] = "False" # NOTE: ast only - def inner_func(): paddle.tensor.fill_constant(shape=[1, 2], value=9, dtype="int") return # noqa: PLR1711 -@paddle.jit.to_static +@paddle.jit.to_static(full_graph=True) def func_error_in_compile_time(x): x = base.dygraph.to_variable(x) inner_func() @@ -42,14 +40,14 @@ def func_error_in_compile_time(x): return x_v -@paddle.jit.to_static +@paddle.jit.to_static(full_graph=True) def func_error_in_compile_time_2(x): x = base.dygraph.to_variable(x) x = paddle.reshape(x, shape=[1, 2]) return x -@paddle.jit.to_static +@paddle.jit.to_static(full_graph=True) def func_error_in_runtime(x): x = base.dygraph.to_variable(x) two = paddle.tensor.fill_constant(shape=[1], value=2, dtype="int32") @@ -58,12 +56,12 @@ def func_error_in_runtime(x): @unwrap -@paddle.jit.to_static() +@paddle.jit.to_static(full_graph=True) def func_decorated_by_other_1(): return 1 -@paddle.jit.to_static() +@paddle.jit.to_static(full_graph=True) @unwrap def func_decorated_by_other_2(): return 1 @@ -75,7 +73,8 @@ def __init__(self, fc_size=20): self._linear = paddle.nn.Linear(fc_size, fc_size) @paddle.jit.to_static( - input_spec=[paddle.static.InputSpec(shape=[20, 20], dtype='float32')] + input_spec=[paddle.static.InputSpec(shape=[20, 20], dtype='float32')], + full_graph=True, ) def forward(self, x): y = self._linear(x) @@ -88,7 +87,7 @@ class LayerErrorInCompiletime2(paddle.nn.Layer): def __init__(self): super().__init__() - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def forward(self): self.test_func() @@ -100,7 +99,7 @@ def test_func(self): return # noqa: PLR1711 -@paddle.jit.to_static +@paddle.jit.to_static(full_graph=True) def func_error_in_runtime_with_empty_line(x): x = base.dygraph.to_variable(x) two = paddle.tensor.fill_constant(shape=[1], value=2, dtype="int32") @@ -115,7 +114,7 @@ def __init__(self): super().__init__() self.inner_net = SuggestionErrorTestNet2() - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def forward(self, x): return self.inner_net.forward(x) @@ -257,9 +256,7 @@ def set_exception_type(self): def set_message(self): self.expected_message = [ - f'File "{self.filepath}", line 37, in func_error_in_compile_time', 'inner_func()', - f'File "{self.filepath}", line 30, in inner_func', 'def inner_func():', 'paddle.tensor.fill_constant(shape=[1, 2], value=9, dtype="int")', '<--- HERE', @@ -286,7 +283,6 @@ def set_exception_type(self): def set_message(self): self.expected_message = [ - f'File "{self.filepath}", line 
48, in func_error_in_compile_time_2', 'def func_error_in_compile_time_2(x):', 'x = base.dygraph.to_variable(x)', 'x = paddle.reshape(x, shape=[1, 2])', @@ -310,7 +306,6 @@ def set_exception_type(self): def set_message(self): self.expected_message = [ - f'File "{self.filepath}", line 93, in forward', '@paddle.jit.to_static', 'def forward(self):', 'self.test_func()', @@ -334,7 +329,6 @@ def set_exception_type(self): def set_message(self): self.expected_message = [ - f'File "{self.filepath}", line 56, in func_error_in_runtime', 'x = base.dygraph.to_variable(x)', 'two = paddle.tensor.fill_constant(shape=[1], value=2, dtype="int32")', 'x = paddle.reshape(x, shape=[1, two])', @@ -349,9 +343,6 @@ def set_func(self): def set_message(self): self.expected_message = [ - 'File "{}", line 108, in func_error_in_runtime_with_empty_line'.format( - self.filepath - ), 'two = paddle.tensor.fill_constant(shape=[1], value=2, dtype="int32")', 'x = paddle.reshape(x, shape=[1, two])', '<--- HERE', @@ -372,7 +363,6 @@ def set_exception_type(self): def set_message(self): self.expected_message = [ - f'File "{self.filepath}", line 82, in forward', 'def forward(self, x):', 'y = self._linear(x)', 'z = paddle.tensor.fill_constant(shape=[1, 2], value=9, dtype="int")', @@ -391,7 +381,7 @@ def test_error(self): self._test_raise_new_exception() -@paddle.jit.to_static +@paddle.jit.to_static(full_graph=True) def func_ker_error(x): d = {'x': x} y = d['y'] + x @@ -406,7 +396,7 @@ def test_key_error(self): func_ker_error(x) -@paddle.jit.to_static +@paddle.jit.to_static(full_graph=True) def NpApiErr(): a = paddle.to_tensor([1, 2]) b = np.sum(a.numpy()) @@ -436,7 +426,7 @@ def __init__(self): super().__init__() self.linear = paddle.nn.Linear(5, 2) - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def forward(self, x): old_dict = self.state_dict() wgt = old_dict['linear.weight'] diff --git a/test/dygraph_to_static/test_fallback.py b/test/dygraph_to_static/test_fallback.py index b641f8b22233ad..58394feda2a680 100644 --- a/test/dygraph_to_static/test_fallback.py +++ b/test/dygraph_to_static/test_fallback.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from dygraph_to_static_util import ast_only_test +from dygraph_to_static_util import ast_only_test, dy2static_unittest import paddle @@ -51,6 +51,7 @@ def forward(self, x): return unsupport_func(x - 1) +@dy2static_unittest class TestFallback(unittest.TestCase): def setUp(self): self.x = paddle.to_tensor([2]).astype('int') diff --git a/test/dygraph_to_static/test_fetch_feed.py b/test/dygraph_to_static/test_fetch_feed.py index 0834f2ec4a315e..b44578fad2c9e3 100644 --- a/test/dygraph_to_static/test_fetch_feed.py +++ b/test/dygraph_to_static/test_fetch_feed.py @@ -15,7 +15,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle from paddle import base @@ -62,6 +65,7 @@ def forward(self, x): return pre, loss +@dy2static_unittest class TestPool2D(unittest.TestCase): def setUp(self): self.dygraph_class = Pool2D diff --git a/test/dygraph_to_static/test_for_enumerate.py b/test/dygraph_to_static/test_for_enumerate.py index bbb64e8756ea33..dc9505a5cf6fcc 100644 --- a/test/dygraph_to_static/test_for_enumerate.py +++ b/test/dygraph_to_static/test_for_enumerate.py @@ -17,6 +17,7 @@ import unittest import numpy as np +from dygraph_to_static_util import dy2static_unittest import paddle from paddle import base @@ -353,6 
+354,7 @@ def tensor_array_slice_in_enumerate(): return feat_n2 +@dy2static_unittest class TestTransformBase(unittest.TestCase): def setUp(self): self.place = ( @@ -556,6 +558,7 @@ def test_transformed_result_compare(self): self.transformed_result_compare() +@dy2static_unittest class TestForZip(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() diff --git a/test/dygraph_to_static/test_full_name_usage.py b/test/dygraph_to_static/test_full_name_usage.py index 0332480891e166..09087af0feaa61 100644 --- a/test/dygraph_to_static/test_full_name_usage.py +++ b/test/dygraph_to_static/test_full_name_usage.py @@ -15,13 +15,13 @@ import unittest import numpy as np -from dygraph_to_static_util import ast_only_test +from dygraph_to_static_util import ast_only_test, dy2static_unittest import paddle from paddle import base -@paddle.jit.to_static +@paddle.jit.to_static(full_graph=True) def dygraph_decorated_func(x): x = base.dygraph.to_variable(x) if paddle.mean(x) > 0: @@ -31,7 +31,7 @@ def dygraph_decorated_func(x): return x_v -@paddle.jit.to_static +@paddle.jit.to_static(full_graph=True) def jit_decorated_func(x): x = base.dygraph.to_variable(x) if paddle.mean(x) > 0: @@ -41,23 +41,24 @@ def jit_decorated_func(x): return x_v -@paddle.jit.to_static +@paddle.jit.to_static(full_graph=True) def decorated_call_decorated(x): return jit_decorated_func(x) class DoubleDecorated: @classmethod - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def double_decorated_func1(self, x): return dygraph_decorated_func(x) @classmethod - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def double_decorated_func2(self, x): return jit_decorated_func(x) +@dy2static_unittest class TestFullNameDecorator(unittest.TestCase): @ast_only_test def test_run_success(self): diff --git a/test/dygraph_to_static/test_grad.py b/test/dygraph_to_static/test_grad.py index e542d87efc90ce..ceca09e7895486 100644 --- a/test/dygraph_to_static/test_grad.py +++ b/test/dygraph_to_static/test_grad.py @@ -65,6 +65,7 @@ def forward(self, x): return out +@dy2static_unittest class TestGrad(unittest.TestCase): def setUp(self): self.func = paddle.jit.to_static(GradLayer()) diff --git a/test/dygraph_to_static/test_gradient_aggregation.py b/test/dygraph_to_static/test_gradient_aggregation.py index ab7effba5b16c6..4172fb87197df7 100644 --- a/test/dygraph_to_static/test_gradient_aggregation.py +++ b/test/dygraph_to_static/test_gradient_aggregation.py @@ -15,7 +15,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle @@ -37,6 +40,7 @@ def forward(self, x): # return [out2, out1] # gradients are correct +@dy2static_unittest class TestGradientAggregationInDy2Static(unittest.TestCase): @test_and_compare_with_new_ir(False) def test_to_static(self): diff --git a/test/dygraph_to_static/test_grid_generator.py b/test/dygraph_to_static/test_grid_generator.py index ea1eafb5c1fa9f..7c1a9189366e0e 100644 --- a/test/dygraph_to_static/test_grid_generator.py +++ b/test/dygraph_to_static/test_grid_generator.py @@ -15,7 +15,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_utils_new import ( + Dy2StTestBase, + test_and_compare_with_new_ir, +) import paddle from paddle import ParamAttr, nn @@ -126,7 +129,7 @@ def get_expand_tensor(self, batch_C_prime): return batch_C_ex_part_tensor -class 
TestGridGenerator(unittest.TestCase): +class TestGridGenerator(Dy2StTestBase): def setUp(self): self.x = paddle.uniform(shape=[1, 20, 2], dtype='float32') diff --git a/test/dygraph_to_static/test_ifelse.py b/test/dygraph_to_static/test_ifelse.py index 6e2dc6f8ffe6df..67a48499510295 100644 --- a/test/dygraph_to_static/test_ifelse.py +++ b/test/dygraph_to_static/test_ifelse.py @@ -15,7 +15,11 @@ import unittest import numpy as np -from dygraph_to_static_util import ast_only_test, dy2static_unittest +from dygraph_to_static_util import ( + ast_only_test, + dy2static_unittest, + test_and_compare_with_new_ir, +) from ifelse_simple_func import ( NetWithControlFlowIf, add_fn, @@ -63,6 +67,7 @@ def setUp(self): self.error = "Your if/else have different number of return value." @ast_only_test + @test_and_compare_with_new_ir() def test_error(self): if self.dyfunc: with self.assertRaisesRegex(Dygraph2StaticException, self.error): @@ -72,6 +77,7 @@ def test_error(self): paddle.jit.enable_to_static(False) +@dy2static_unittest class TestDygraphIfElse(unittest.TestCase): """ TestCase for the transformation from control flow `if/else` @@ -94,6 +100,7 @@ def _run_dygraph(self, to_static=False): ret = self.dyfunc(x_v) return ret.numpy() + @test_and_compare_with_new_ir() def test_ast_to_func(self): self.assertTrue((self._run_dygraph() == self._run_static()).all()) @@ -122,11 +129,28 @@ def setUp(self): self.dyfunc = dyfunc_with_if_else_with_list_generator -class TestDygraphNestedIfElse(TestDygraphIfElse): +@dy2static_unittest +class TestDygraphNestedIfElse(unittest.TestCase): def setUp(self): self.x = np.random.random([10, 16]).astype('float32') self.dyfunc = nested_if_else + def _run_static(self): + return self._run_dygraph(to_static=True) + + def _run_dygraph(self, to_static=False): + with base.dygraph.guard(place): + x_v = paddle.to_tensor(self.x) + if to_static: + ret = paddle.jit.to_static(self.dyfunc)(x_v) + else: + ret = self.dyfunc(x_v) + return ret.numpy() + + # TODO(zhangbo): open pir test (sub block cannot find var in parent block) + def test_ast_to_func(self): + self.assertTrue((self._run_dygraph() == self._run_static()).all()) + class TestDygraphNestedIfElse2(TestDygraphIfElse): def setUp(self): @@ -232,12 +256,30 @@ def setUp(self): self.dyfunc = if_with_class_var -class TestDygraphIfTensor(TestDygraphIfElse): +@dy2static_unittest +class TestDygraphIfTensor(unittest.TestCase): def setUp(self): self.x = np.random.random([10, 16]).astype('float32') self.dyfunc = if_tensor_case + def _run_static(self): + return self._run_dygraph(to_static=True) + def _run_dygraph(self, to_static=False): + with base.dygraph.guard(place): + x_v = paddle.to_tensor(self.x) + if to_static: + ret = paddle.jit.to_static(self.dyfunc)(x_v) + else: + ret = self.dyfunc(x_v) + return ret.numpy() + + @test_and_compare_with_new_ir() + def test_ast_to_func(self): + self.assertTrue((self._run_dygraph() == self._run_static()).all()) + + +@dy2static_unittest class TestDygraphIfElseNet(unittest.TestCase): """ TestCase for the transformation from control flow `if/else` @@ -263,6 +305,7 @@ def _run(self, to_static=False): ret = net(x_v) return ret.numpy() + # TODO(zhangbo): open pir test (sub block cannot find var in parent block) def test_ast_to_func(self): self.assertTrue((self._run_dygraph() == self._run_static()).all()) @@ -316,6 +359,10 @@ def setUp(self): self.x = np.random.random([10, 16]).astype('float32') self.Net = NetWithExternalFunc + @test_and_compare_with_new_ir() + def test_ast_to_func(self): + 
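# dygraph and to_static outputs must match elementwise +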
self.assertTrue((self._run_dygraph() == self._run_static()).all()) + class DiffModeNet1(paddle.nn.Layer): def __init__(self, mode): @@ -350,6 +397,7 @@ def forward(self, x, y): raise ValueError('Illegal mode') +@dy2static_unittest class TestDiffModeNet(unittest.TestCase): """ TestCase for the net with different modes @@ -370,6 +418,7 @@ def _run(self, mode, to_static): ret = net(self.x, self.y) return ret.numpy() + @test_and_compare_with_new_ir() def test_train_mode(self): self.assertTrue( ( @@ -378,6 +427,7 @@ def test_train_mode(self): ).all() ) + @test_and_compare_with_new_ir() def test_infer_mode(self): self.assertTrue( ( @@ -392,7 +442,9 @@ def init_net(self): self.Net = DiffModeNet2 +@dy2static_unittest class TestNewVarCreateInOneBranch(unittest.TestCase): + @test_and_compare_with_new_ir() def test_var_used_in_another_for(self): def case_func(training): # targets and targets_list is dynamically defined by training @@ -430,6 +482,7 @@ def get_dy2stat_out(self): return out @ast_only_test + @test_and_compare_with_new_ir() def test_ast_to_func(self): self.setUp() self.assertIsInstance(self.out[0], (paddle.Tensor, core.eager.Tensor)) @@ -451,6 +504,7 @@ def setUp(self): self.out = self.get_dy2stat_out() @ast_only_test + @test_and_compare_with_new_ir() def test_ast_to_func(self): self.setUp() self.assertIsInstance(self.out, (paddle.Tensor, core.eager.Tensor)) @@ -463,6 +517,7 @@ def setUp(self): self.dyfunc = paddle.jit.to_static(dyfunc_ifelse_ret_int4) @ast_only_test + @test_and_compare_with_new_ir() def test_ast_to_func(self): paddle.jit.enable_to_static(True) with self.assertRaises(Dygraph2StaticException): @@ -497,7 +552,9 @@ def forward(self, a, b, c): return b +@dy2static_unittest class TestDy2StIfElseBackward(unittest.TestCase): + # TODO(zhangbo): open pir test (IfOp grad execution not yet supported) def test_run_backward(self): a = paddle.randn((4, 3), dtype='float32') a.stop_gradient = False diff --git a/test/dygraph_to_static/test_isinstance.py b/test/dygraph_to_static/test_isinstance.py index e3557dc32658f9..7dfd05989dabe8 100644 --- a/test/dygraph_to_static/test_isinstance.py +++ b/test/dygraph_to_static/test_isinstance.py @@ -26,7 +26,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle from paddle import nn @@ -85,6 +88,7 @@ def train(model, to_static): return out.numpy() +@dy2static_unittest class TestIsinstance(unittest.TestCase): def test_isinstance_simple_return_layer(self): model = IsInstanceLayer(SimpleReturnLayer()) diff --git a/test/dygraph_to_static/test_jit_property_save.py b/test/dygraph_to_static/test_jit_property_save.py index f25c128e265d7a..965168dedc6ea0 100644 --- a/test/dygraph_to_static/test_jit_property_save.py +++ b/test/dygraph_to_static/test_jit_property_save.py @@ -14,9 +14,12 @@ import unittest +from dygraph_to_static_util import dy2static_unittest + import paddle +@dy2static_unittest class TestPropertySave(unittest.TestCase): """test jit property save""" diff --git a/test/dygraph_to_static/test_jit_setitem.py b/test/dygraph_to_static/test_jit_setitem.py index 59841ed431f086..219e6a6c9de749 100644 --- a/test/dygraph_to_static/test_jit_setitem.py +++ b/test/dygraph_to_static/test_jit_setitem.py @@ -16,11 +16,13 @@ import unittest import numpy as np +from dygraph_to_static_util import dy2static_unittest import paddle import paddle.nn.functional as F +@dy2static_unittest class 
TestSetItemBase(unittest.TestCase): def setUp(self) -> None: pass diff --git a/test/dygraph_to_static/test_lac.py b/test/dygraph_to_static/test_lac.py index 522eb81cf5a7ae..461b03fe7a5edc 100644 --- a/test/dygraph_to_static/test_lac.py +++ b/test/dygraph_to_static/test_lac.py @@ -22,6 +22,8 @@ os.environ["CUDA_VISIBLE_DEVICES"] = "2" +from dygraph_to_static_util import dy2static_unittest + import paddle from paddle import _legacy_C_ops, base from paddle.base.dygraph import to_variable @@ -513,6 +515,7 @@ def create_dataloader(reader, place): return data_loader +@dy2static_unittest class TestLACModel(unittest.TestCase): def setUp(self): self.args = Args() diff --git a/test/dygraph_to_static/test_lambda.py b/test/dygraph_to_static/test_lambda.py index c1ff57147564c5..add572cb6dfcff 100644 --- a/test/dygraph_to_static/test_lambda.py +++ b/test/dygraph_to_static/test_lambda.py @@ -15,6 +15,7 @@ import unittest import numpy as np +from dygraph_to_static_util import dy2static_unittest import paddle import paddle.nn.functional as F @@ -79,6 +80,7 @@ def call_lambda_with_ifExpr2(x): return out +@dy2static_unittest class TestLambda(unittest.TestCase): def setUp(self): self.x = np.random.random([10, 16]).astype('float32') diff --git a/test/dygraph_to_static/test_layer_hook.py b/test/dygraph_to_static/test_layer_hook.py index bf679cf8dcc2e4..d19b9ea9abfc94 100644 --- a/test/dygraph_to_static/test_layer_hook.py +++ b/test/dygraph_to_static/test_layer_hook.py @@ -17,7 +17,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle @@ -56,6 +59,7 @@ def forward(self, x): return out +@dy2static_unittest class TestNestLayerHook(unittest.TestCase): def setUp(self): paddle.seed(2022) diff --git a/test/dygraph_to_static/test_len.py b/test/dygraph_to_static/test_len.py index e2cee7c4dc8b44..340ba86ff50c2f 100644 --- a/test/dygraph_to_static/test_len.py +++ b/test/dygraph_to_static/test_len.py @@ -15,6 +15,7 @@ import unittest import numpy as np +from dygraph_to_static_util import dy2static_unittest import paddle from paddle import base @@ -42,6 +43,7 @@ def len_with_lod_tensor_array(x): return arr_len +@dy2static_unittest class TestLen(unittest.TestCase): def setUp(self): self.place = ( @@ -113,6 +115,7 @@ def len_with_selected_rows(place): return result +@dy2static_unittest class TestLenWithSelectedRows(unittest.TestCase): def setUp(self): self.place = ( diff --git a/test/dygraph_to_static/test_list.py b/test/dygraph_to_static/test_list.py index 9ad646de8818c9..51b28ce3fe38a7 100644 --- a/test/dygraph_to_static/test_list.py +++ b/test/dygraph_to_static/test_list.py @@ -16,6 +16,7 @@ import unittest import numpy as np +from dygraph_to_static_util import dy2static_unittest import paddle from paddle import base @@ -207,6 +208,7 @@ def test_list_pop_in_while_loop(x, iter_num): return a[0], b[2] +@dy2static_unittest class TestListWithoutControlFlow(unittest.TestCase): def setUp(self): self.place = ( @@ -354,6 +356,7 @@ def forward(self, x, index, *args): return z +@dy2static_unittest class TestListWithCondGradInferVarType(unittest.TestCase): def test_to_static(self): net = ListWithCondNet() diff --git a/test/dygraph_to_static/test_load_transformer.py b/test/dygraph_to_static/test_load_transformer.py index 95e06a51f3c692..81a45fb91cc4ef 100644 --- a/test/dygraph_to_static/test_load_transformer.py +++ b/test/dygraph_to_static/test_load_transformer.py @@ -16,7 
+16,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_utils_new import ( + Dy2StTestBase, + test_and_compare_with_new_ir, +) import paddle @@ -41,7 +44,7 @@ def forward(self, x): return t -class TestFallback(unittest.TestCase): +class TestFallback(Dy2StTestBase): def setUp(self): self.x = paddle.to_tensor(1.0).astype('int') @@ -54,7 +57,7 @@ def test_name_load(self): np.testing.assert_allclose(output_dy.numpy(), output_st.numpy()) -class TestLoad2(unittest.TestCase): +class TestLoad2(Dy2StTestBase): @test_and_compare_with_new_ir(False) def test_name_load_nograd(self): @paddle.no_grad() diff --git a/test/dygraph_to_static/test_logical.py b/test/dygraph_to_static/test_logical.py index 9e0f1d12bd9b48..a05f91b7c04932 100644 --- a/test/dygraph_to_static/test_logical.py +++ b/test/dygraph_to_static/test_logical.py @@ -18,6 +18,7 @@ import unittest import numpy as np +from dygraph_to_static_util import dy2static_unittest import paddle from paddle import base @@ -167,6 +168,7 @@ def test_shape_not_equal(x): return paddle.ones([1, 2, 3]) +@dy2static_unittest class TestLogicalBase(unittest.TestCase): def setUp(self): self.input = np.array([3]).astype('int32') @@ -262,6 +264,7 @@ def _set_test_func(self): self.dygraph_func = test_shape_not_equal +@dy2static_unittest class TestCmpopNodeToStr(unittest.TestCase): def test_exception(self): with self.assertRaises(KeyError): diff --git a/test/dygraph_to_static/test_loop.py b/test/dygraph_to_static/test_loop.py index 77f568e2c5eec9..422508d6cd97e8 100644 --- a/test/dygraph_to_static/test_loop.py +++ b/test/dygraph_to_static/test_loop.py @@ -16,6 +16,7 @@ import unittest import numpy as np +from dygraph_to_static_util import dy2static_unittest import paddle import paddle.nn.functional as F @@ -229,6 +230,7 @@ def for_loop_dufunc_with_listcomp(array): return res +@dy2static_unittest class TestNameVisitor(unittest.TestCase): def setUp(self): self.loop_funcs = [ @@ -299,6 +301,7 @@ def test_nested_loop_vars(self): i += 1 +@dy2static_unittest class TestTransformWhileLoop(unittest.TestCase): def setUp(self): self.place = ( @@ -378,6 +381,7 @@ def _init_dyfunc(self): self.dyfunc = loop_var_contains_property +@dy2static_unittest class TestTransformForLoop(unittest.TestCase): def setUp(self): self.place = ( @@ -460,6 +464,7 @@ def forward(self, x): return out +@dy2static_unittest class TestForLoopMeetDict(unittest.TestCase): def test_start(self): net = Net() diff --git a/test/dygraph_to_static/test_mnist.py b/test/dygraph_to_static/test_mnist.py index 9641a9225cee7b..86d3d136b31473 100644 --- a/test/dygraph_to_static/test_mnist.py +++ b/test/dygraph_to_static/test_mnist.py @@ -18,7 +18,11 @@ from time import time import numpy as np -from dygraph_to_static_util import ast_only_test, test_and_compare_with_new_ir +from dygraph_to_static_util import ( + ast_only_test, + dy2static_unittest, + test_and_compare_with_new_ir, +) from predictor_utils import PredictorTools import paddle @@ -126,6 +130,7 @@ def inference(self, inputs): return x +@dy2static_unittest class TestMNIST(unittest.TestCase): def setUp(self): self.epoch_num = 1 @@ -194,7 +199,7 @@ def train(self, to_static=False): base.default_startup_program().random_seed = SEED mnist = MNIST() if to_static: - mnist = paddle.jit.to_static(mnist) + mnist = paddle.jit.to_static(mnist, full_graph=True) adam = Adam(learning_rate=0.001, parameters=mnist.parameters()) for epoch in range(self.epoch_num): diff --git 
a/test/dygraph_to_static/test_mobile_net.py b/test/dygraph_to_static/test_mobile_net.py index 5536a14e695c48..cca77999d5e7d9 100644 --- a/test/dygraph_to_static/test_mobile_net.py +++ b/test/dygraph_to_static/test_mobile_net.py @@ -19,7 +19,7 @@ import unittest import numpy as np -from dygraph_to_static_util import test_with_new_ir +from dygraph_to_static_util import dy2static_unittest, test_with_new_ir from predictor_utils import PredictorTools import paddle @@ -656,6 +656,7 @@ def predict_analysis_inference(args, data): return out +@dy2static_unittest class TestMobileNet(unittest.TestCase): def setUp(self): self.args = Args() diff --git a/test/dygraph_to_static/test_multi_forward.py b/test/dygraph_to_static/test_multi_forward.py index 039db089b5c86b..2cf8e592f3fa0f 100644 --- a/test/dygraph_to_static/test_multi_forward.py +++ b/test/dygraph_to_static/test_multi_forward.py @@ -14,7 +14,10 @@ import unittest -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle @@ -33,6 +36,7 @@ def forward(self, x): return self.linear(x) +@dy2static_unittest class TestBackward(unittest.TestCase): @test_and_compare_with_new_ir(False) def test_order_0(self): diff --git a/test/dygraph_to_static/test_new_ir_selectedrows.py b/test/dygraph_to_static/test_new_ir_selectedrows.py index 7d87a48fe78585..fe45adf2e56408 100644 --- a/test/dygraph_to_static/test_new_ir_selectedrows.py +++ b/test/dygraph_to_static/test_new_ir_selectedrows.py @@ -15,10 +15,7 @@ import random import unittest -from dygraph_to_static_util import ( - enable_fallback_guard, - test_and_compare_with_new_ir, -) +from dygraph_to_static_util import test_and_compare_with_new_ir import paddle from paddle.jit.api import to_static @@ -90,7 +87,7 @@ def train_static(): parameters=net.parameters(), learning_rate=0.01, grad_clip=clip ) - return to_static(train)(net, adam, x) + return to_static(train, full_graph=True)(net, adam, x) class TestSimnet(unittest.TestCase): @@ -104,5 +101,4 @@ def test_dygraph_static_same_loss(self): if __name__ == '__main__': - with enable_fallback_guard("False"): - unittest.main() + unittest.main() diff --git a/test/dygraph_to_static/test_op_attr.py b/test/dygraph_to_static/test_op_attr.py index 17394df88dd071..69e03cea13103d 100644 --- a/test/dygraph_to_static/test_op_attr.py +++ b/test/dygraph_to_static/test_op_attr.py @@ -14,7 +14,7 @@ import unittest -from dygraph_to_static_util import ast_only_test +from dygraph_to_static_util import ast_only_test, dy2static_unittest import paddle from paddle.static import InputSpec @@ -42,7 +42,7 @@ def forward(self, x): out = self.bn(out) return out - @paddle.jit.to_static(input_spec=[InputSpec([10, 16])]) + @paddle.jit.to_static(input_spec=[InputSpec([10, 16])], full_graph=True) def with_cond(self, x): if paddle.mean(x) > 0.0: out = self.linear(x) @@ -52,6 +52,7 @@ def with_cond(self, x): return out +@dy2static_unittest class CheckOpAttr(unittest.TestCase): def setUp(self): self.in_num = 16 diff --git a/test/dygraph_to_static/test_origin_info.py b/test/dygraph_to_static/test_origin_info.py index c6415dff1ba1c8..be38650b750c21 100644 --- a/test/dygraph_to_static/test_origin_info.py +++ b/test/dygraph_to_static/test_origin_info.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import inspect import sys import unittest +from dygraph_to_static_util import dy2static_unittest + from paddle.jit.api import to_static from paddle.jit.dy2static import DygraphToStaticAst from paddle.jit.dy2static.origin_info import ( @@ -23,11 +26,10 @@ OriginInfo, attach_origin_info, create_and_update_origin_info_map, - gast, - inspect, unwrap, ) from paddle.jit.dy2static.utils import ast_to_func +from paddle.utils import gast def simple_func(x): @@ -54,6 +56,7 @@ def decorated_func2(x): return x +@dy2static_unittest class TestOriginInfo(unittest.TestCase): def setUp(self): self.set_test_func() diff --git a/test/dygraph_to_static/test_param_guard.py b/test/dygraph_to_static/test_param_guard.py index b8edaf50dfceda..c6787db58fc890 100644 --- a/test/dygraph_to_static/test_param_guard.py +++ b/test/dygraph_to_static/test_param_guard.py @@ -15,7 +15,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle from paddle.jit import to_static @@ -50,6 +53,7 @@ def forward(self, x): return out +@dy2static_unittest class TestParameterList(unittest.TestCase): def setUp(self): self.seed = 2021 @@ -102,6 +106,7 @@ def forward(self, x): return out +@dy2static_unittest class TestRawParameterList(unittest.TestCase): def setUp(self): self.seed = 2021 diff --git a/test/dygraph_to_static/test_params_no_grad.py b/test/dygraph_to_static/test_params_no_grad.py index f7bf87888f49cd..3b3f3949fad57c 100644 --- a/test/dygraph_to_static/test_params_no_grad.py +++ b/test/dygraph_to_static/test_params_no_grad.py @@ -14,6 +14,8 @@ import unittest +from dygraph_to_static_util import dy2static_unittest + import paddle import paddle.distributed as dist from paddle import nn @@ -52,6 +54,7 @@ def train(): print(loss) +@dy2static_unittest class TestParamsNoGrad(unittest.TestCase): def test_two_card(self): if ( diff --git a/test/dygraph_to_static/test_partial_program.py b/test/dygraph_to_static/test_partial_program.py index db4a7c21e40100..a521b113373454 100644 --- a/test/dygraph_to_static/test_partial_program.py +++ b/test/dygraph_to_static/test_partial_program.py @@ -15,9 +15,9 @@ import unittest import numpy as np -from dygraph_to_static_util import ( +from dygraph_to_static_utils_new import ( + Dy2StTestBase, ast_only_test, - dy2static_unittest, test_and_compare_with_new_ir, ) from test_fetch_feed import Linear @@ -57,8 +57,7 @@ def fake_data(shape): return base.dygraph.to_variable(x_data) -@dy2static_unittest -class TestWithNestedInput(unittest.TestCase): +class TestWithNestedInput(Dy2StTestBase): def setUp(self): self.x = None self.y = None @@ -82,7 +81,9 @@ def _run(self, to_static): self.fake_input() if to_static: - out = paddle.jit.to_static(nested_input)(self.x, self.y) + out = paddle.jit.to_static(nested_input, full_graph=True)( + self.x, self.y + ) else: out = nested_input(self.x, self.y) @@ -95,8 +96,7 @@ def test_nest(self): np.testing.assert_allclose(dygraph_res, static_res, rtol=1e-05) -@dy2static_unittest -class TestWithNestedOutput(unittest.TestCase): +class TestWithNestedOutput(Dy2StTestBase): def setUp(self): self.x = None self.y = None @@ -108,7 +108,9 @@ def _run(self, to_static): self.y = fake_data([10, 16]) if to_static: - out = paddle.jit.to_static(nested_output)(self.x, self.y) + out = paddle.jit.to_static(nested_output, full_graph=True)( + self.x, self.y + ) else: out = nested_output(self.x, self.y) @@ -133,14 +135,13 @@ def test_nest(self): 
self.assertTrue(dy_var, st_var) -@dy2static_unittest -class TestWithTrainAndEval(unittest.TestCase): +class TestWithTrainAndEval(Dy2StTestBase): @ast_only_test @test_and_compare_with_new_ir(False) def test_switch_eval_and_train(self): with base.dygraph.guard(): linear_net = Linear() - linear_net = paddle.jit.to_static(linear_net) + linear_net = paddle.jit.to_static(linear_net, full_graph=True) x_data = np.random.random((4, 10)).astype('float32') x = base.dygraph.to_variable(x_data) linear_net(x) @@ -167,14 +168,13 @@ def test_switch_eval_and_train(self): ) -@dy2static_unittest -class TestWithNoGrad(unittest.TestCase): +class TestWithNoGrad(Dy2StTestBase): @ast_only_test @test_and_compare_with_new_ir(False) def test_with_no_grad(self): with base.dygraph.guard(): linear_net = Linear() - linear_net = paddle.jit.to_static(linear_net) + linear_net = paddle.jit.to_static(linear_net, full_graph=True) x_data = np.random.random((5, 10)).astype('float32') x = base.dygraph.to_variable(x_data) @@ -197,15 +197,14 @@ def __init__(self): np.random.rand(2, 3).astype('float32') ) - @to_static + @to_static(full_graph=True) def forward(self, x): x = paddle.reshape(x, shape=[-1, 6]) x1, x2, x3 = paddle.split(x=x, axis=1, num_or_sections=3) return x1 -@dy2static_unittest -class TestPruneUnusedParamInProgram(unittest.TestCase): +class TestPruneUnusedParamInProgram(Dy2StTestBase): @test_and_compare_with_new_ir(False) def test_prune(self): input_ids = np.array([[15, 11, 6, 3, 18, 13]]).astype("float32") diff --git a/test/dygraph_to_static/test_partial_program_hook.py b/test/dygraph_to_static/test_partial_program_hook.py index cb177862692d30..c10194f6187adf 100644 --- a/test/dygraph_to_static/test_partial_program_hook.py +++ b/test/dygraph_to_static/test_partial_program_hook.py @@ -15,11 +15,14 @@ import os import unittest +from dygraph_to_static_util import dy2static_unittest + import paddle from paddle.base import core from paddle.jit.dy2static import partial_program, program_translator +@dy2static_unittest class TestPartiaProgramLayerHook(unittest.TestCase): def setUp(self): os.environ["ENABLE_FALL_BACK"] = "False" @@ -35,6 +38,7 @@ def test_after_infer(self): self.assertIsNone(self._hook.after_infer(None)) +@dy2static_unittest class TestPrimHook(unittest.TestCase): def setUp(self): os.environ["ENABLE_FALL_BACK"] = "False" diff --git a/test/dygraph_to_static/test_place.py b/test/dygraph_to_static/test_place.py index 2ed904a0b54902..f1cb7e80589a31 100644 --- a/test/dygraph_to_static/test_place.py +++ b/test/dygraph_to_static/test_place.py @@ -14,9 +14,12 @@ import unittest +from dygraph_to_static_util import dy2static_unittest + import paddle +@dy2static_unittest class TestPlace(unittest.TestCase): def test_place(self): paddle.enable_static() diff --git a/test/dygraph_to_static/test_print.py b/test/dygraph_to_static/test_print.py index d7fe1f5a882c07..251bca776e700b 100644 --- a/test/dygraph_to_static/test_print.py +++ b/test/dygraph_to_static/test_print.py @@ -15,7 +15,10 @@ import unittest import numpy -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle from paddle import base @@ -84,6 +87,7 @@ def dyfunc_print_with_kwargs(x): print("Tensor", x_t, end='\n\n', sep=': ') +@dy2static_unittest class TestPrintBase(unittest.TestCase): def setUp(self): self.input = numpy.ones(5).astype("int32") diff --git a/test/dygraph_to_static/test_program_translator.py 
b/test/dygraph_to_static/test_program_translator.py index 25cf316dd7e91c..9447bbbf4f6087 100644 --- a/test/dygraph_to_static/test_program_translator.py +++ b/test/dygraph_to_static/test_program_translator.py @@ -18,7 +18,7 @@ import astor import numpy as np -from dygraph_to_static_util import ast_only_test +from dygraph_to_static_util import ast_only_test, dy2static_unittest from ifelse_simple_func import ( dyfunc_with_if_else_early_return1, dyfunc_with_if_else_early_return2, @@ -205,13 +205,14 @@ def false_fn_3(): class NetWithError(paddle.nn.Layer): - @to_static + @to_static(full_graph=True) def forward(self, x): linear = paddle.nn.Linear(32, 64) y = linear(x) return y +@dy2static_unittest class TestEnableDeclarative(unittest.TestCase): def setUp(self): self.x = np.random.randn(30, 10, 32).astype('float32') @@ -262,11 +263,12 @@ def foo(self): return True -@paddle.jit.to_static +@paddle.jit.to_static(full_graph=True) def switch_mode_function(): return True +@dy2static_unittest class TestFunctionTrainEvalMode(unittest.TestCase): @ast_only_test def test_switch_mode(self): @@ -297,6 +299,7 @@ def test_raise_error(self): net.foo.train() +@dy2static_unittest class TestIfElseEarlyReturn(unittest.TestCase): def test_ifelse_early_return1(self): answer = np.zeros([2, 2]) + 1 @@ -311,6 +314,7 @@ def test_ifelse_early_return2(self): np.testing.assert_allclose(answer, out[0].numpy(), rtol=1e-05) +@dy2static_unittest class TestRemoveCommentInDy2St(unittest.TestCase): def func_with_comment(self): # Comment1 @@ -352,6 +356,7 @@ def func1(x): return func1(data) +@dy2static_unittest class TestParameterRecorder(unittest.TestCase): def test_recorder(self): """function calls nn.Layer case.""" diff --git a/test/dygraph_to_static/test_ptb_lm.py b/test/dygraph_to_static/test_ptb_lm.py index 2c94d6b343d3a8..76a35d57ac9baf 100644 --- a/test/dygraph_to_static/test_ptb_lm.py +++ b/test/dygraph_to_static/test_ptb_lm.py @@ -17,7 +17,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle from paddle import base @@ -321,6 +324,7 @@ def train_static(place): return train(place) +@dy2static_unittest class TestPtb(unittest.TestCase): def setUp(self): self.place = ( diff --git a/test/dygraph_to_static/test_ptb_lm_v2.py b/test/dygraph_to_static/test_ptb_lm_v2.py index 3694d503965361..92d4d43d9d4ea2 100644 --- a/test/dygraph_to_static/test_ptb_lm_v2.py +++ b/test/dygraph_to_static/test_ptb_lm_v2.py @@ -17,6 +17,7 @@ import unittest import numpy as np +from dygraph_to_static_util import dy2static_unittest import paddle @@ -322,6 +323,7 @@ def train_static(place): return train(place) +@dy2static_unittest class TestPtb(unittest.TestCase): def setUp(self): self.place = ( diff --git a/test/dygraph_to_static/test_pylayer.py b/test/dygraph_to_static/test_pylayer.py index c36bc1a14d5d14..d047b6d5cd1cb6 100644 --- a/test/dygraph_to_static/test_pylayer.py +++ b/test/dygraph_to_static/test_pylayer.py @@ -26,6 +26,7 @@ import unittest import numpy as np +from dygraph_to_static_util import dy2static_unittest from test_jit_save_load import train import paddle @@ -177,7 +178,7 @@ def __init__(self, in_size, out_size): super().__init__() self.linear = paddle.nn.Linear(in_size, out_size) - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def forward(self, data): hidden = self.linear(data) z = cus_tanh_1.apply(hidden) @@ -212,7 +213,7 @@ class 
SimpleNetInplace(paddle.nn.Layer): def __init__(self): super().__init__() - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def forward(self, data): data = data**2 z = paddle.tanh(data) @@ -225,7 +226,7 @@ def __init__(self, in_size, out_size): super().__init__() self.linear = paddle.nn.Linear(in_size, out_size) - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def forward(self, x): y = self.linear(x) out = cus_tanh_2.apply(y, func1=paddle.tanh) @@ -239,7 +240,7 @@ def __init__(self, in_size, out_size): self.linear1 = paddle.nn.Linear(in_size, out_size) self.linear2 = paddle.nn.Linear(in_size, out_size) - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def forward(self, x1, x2): y1 = self.linear1(x1) y2 = self.linear1(x2) @@ -254,7 +255,7 @@ def __init__(self, in_size, out_size): super().__init__() self.linear = paddle.nn.Linear(in_size, out_size) - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def forward(self, x): y = self.linear(x) y.stop_gradient = True @@ -262,6 +263,7 @@ def forward(self, x): return out +@dy2static_unittest class TestPyLayerBase(unittest.TestCase): def setUp(self): self.place = "gpu" if paddle.is_compiled_with_cuda() else "cpu" @@ -359,7 +361,7 @@ def _run_and_compare(self, *args, **kwargs): class TestPyLayerWithoutContext(TestPyLayerBase): def test_single_in_single_out(self): - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def test_func(x): y = scaled_layer_1.apply(x) return y @@ -372,7 +374,7 @@ def test_func(x): self._run_and_compare(input1) def test_multi_in_single_out(self): - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def test_func(x1, x2): y = scaled_layer_2.apply(x1, x2) return y @@ -389,7 +391,7 @@ def test_func(x1, x2): class TestPyLayerWithContext(TestPyLayerBase): def test_single_in_single_out(self): - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def test_func(x): y = cus_tanh_1.apply(x) return y @@ -402,7 +404,7 @@ def test_func(x): self._run_and_compare(input1) def test_nested_pylayer(self): - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def test_func(x1, x2): y = nested_layer.apply(x1, x2) return y @@ -417,7 +419,7 @@ def test_func(x1, x2): self._run_and_compare(input1, input2) def test_apply_kwargs_pylayer(self): - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def test_func(x1, x2): y = scaled_layer_2.apply(x1=x2, x2=x1) return y @@ -432,7 +434,7 @@ def test_func(x1, x2): self._run_and_compare(input1, input2) def test_non_variable_inputs(self): - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def test_func(x): y = cus_tanh_2.apply(x, func1=paddle.tanh) return y @@ -445,7 +447,7 @@ def test_func(x): self._run_and_compare(input1) def test_simple_pylayer_return_none_with_no_grad(self): - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def test_func(input1, input2): z = cus_tanh_3.apply(input1, input2, paddle.tanh, paddle.square) z = z[2] + z[3] @@ -461,7 +463,7 @@ def test_func(input1, input2): self._run_and_compare(input1, input2) def test_non_variable_inputs_and_userdefined_call(self): - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def test_func(input1): y = cus_tanh_4.apply( input1, func=user_defined_square, name="cus_tanh_test" @@ -512,6 +514,7 @@ def test_pylayer_net_with_no_grad(self): self._run_and_compare(input1, input2) +@dy2static_unittest class PyLayerTrainHelper(unittest.TestCase): def setUp(self): self.place = "gpu" if 
paddle.is_compiled_with_cuda() else "cpu" @@ -530,7 +533,9 @@ def _run_train(self, to_static, layer_builder, build_strategy=None): # net = self.build_layer() net = layer_builder() if to_static: - net = paddle.jit.to_static(net, build_strategy=build_strategy) + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) _, _, avg_loss = train(net) return avg_loss.numpy() @@ -583,6 +588,7 @@ def test_pylayer_net_no_grad(self): ) +@dy2static_unittest class TestPyLayerJitSaveLoad(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() diff --git a/test/dygraph_to_static/test_reinforcement_learning.py b/test/dygraph_to_static/test_reinforcement_learning.py index e8980165073d5a..ffbd0e315229d7 100644 --- a/test/dygraph_to_static/test_reinforcement_learning.py +++ b/test/dygraph_to_static/test_reinforcement_learning.py @@ -18,7 +18,10 @@ import gym import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle import paddle.nn.functional as F @@ -64,7 +67,7 @@ def train(args, place, to_static): paddle.jit.enable_to_static(to_static) env = gym.make('CartPole-v0') - env.seed(SEED) + env.reset(seed=SEED) with base.dygraph.guard(place): paddle.seed(SEED) @@ -169,12 +172,13 @@ def finish_episode(): loss_data = [] running_reward = 10 for i_episode in itertools.count(1): - state, ep_reward = env.reset(), 0 + state, _ = env.reset() + ep_reward = 0 # The default loop number is 10000 in models; we changed it to 1000 for a smaller test for t in range(1, 1000): state = np.array(state).astype("float32") action, loss = select_action(state) - state, reward, done, _ = env.step(action) + state, reward, done, _, _ = env.step(action) # log loss_probs loss_data.append(float(loss)) @@ -202,6 +206,7 @@ def finish_episode(): return np.array(loss_data) +@dy2static_unittest class TestDeclarative(unittest.TestCase): def setUp(self): self.place = ( diff --git a/test/dygraph_to_static/test_resnet.py b/test/dygraph_to_static/test_resnet.py index a99999c4e74475..cb57ce234b2639 100644 --- a/test/dygraph_to_static/test_resnet.py +++ b/test/dygraph_to_static/test_resnet.py @@ -19,7 +19,7 @@ import unittest import numpy as np -from dygraph_to_static_util import test_with_new_ir +from dygraph_to_static_util import dy2static_unittest, test_with_new_ir from predictor_utils import PredictorTools import paddle @@ -386,6 +386,7 @@ def predict_analysis_inference(self, data): return out +@dy2static_unittest class TestResnet(unittest.TestCase): def setUp(self): self.resnet_helper = ResNetHelper() diff --git a/test/dygraph_to_static/test_resnet_amp.py b/test/dygraph_to_static/test_resnet_amp.py index 60a30db707be47..0255c0c00db3b5 100644 --- a/test/dygraph_to_static/test_resnet_amp.py +++ b/test/dygraph_to_static/test_resnet_amp.py @@ -16,7 +16,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) from test_resnet import SEED, ResNet, optimizer_setting import paddle @@ -111,6 +114,7 @@ def train(to_static, build_strategy=None): return total_loss.numpy() +@dy2static_unittest class TestResnet(unittest.TestCase): def train(self, to_static): paddle.jit.enable_to_static(to_static) diff --git a/test/dygraph_to_static/test_resnet_pure_fp16.py b/test/dygraph_to_static/test_resnet_pure_fp16.py index 
1eb6a8ac9b3a5a..771f9033f99d73 100644 --- a/test/dygraph_to_static/test_resnet_pure_fp16.py +++ b/test/dygraph_to_static/test_resnet_pure_fp16.py @@ -16,7 +16,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) from test_resnet import SEED, ResNet, optimizer_setting import paddle @@ -112,6 +115,7 @@ def train(to_static, build_strategy=None): return loss_data +@dy2static_unittest class TestResnet(unittest.TestCase): def train(self, to_static): paddle.jit.enable_to_static(to_static) diff --git a/test/dygraph_to_static/test_resnet_v2.py b/test/dygraph_to_static/test_resnet_v2.py index cf941effd2c288..0f5d804427ca67 100644 --- a/test/dygraph_to_static/test_resnet_v2.py +++ b/test/dygraph_to_static/test_resnet_v2.py @@ -19,7 +19,7 @@ import unittest import numpy as np -from dygraph_to_static_util import test_with_new_ir +from dygraph_to_static_util import dy2static_unittest, test_with_new_ir from predictor_utils import PredictorTools import paddle @@ -242,6 +242,7 @@ def __len__(self): return len(self.img) +@dy2static_unittest class TestResnet(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() diff --git a/test/dygraph_to_static/test_return.py b/test/dygraph_to_static/test_return.py index 41c622e9ed03ab..dc79b8456ed3bd 100644 --- a/test/dygraph_to_static/test_return.py +++ b/test/dygraph_to_static/test_return.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from dygraph_to_static_util import ast_only_test +from dygraph_to_static_util import ast_only_test, dy2static_unittest from ifelse_simple_func import dyfunc_with_if_else import paddle @@ -28,13 +28,13 @@ np.random.seed(SEED) -@to_static +@to_static(full_graph=True) def test_return_base(x): x = base.dygraph.to_variable(x) return x -@to_static +@to_static(full_graph=True) def test_inside_func_base(x): x = base.dygraph.to_variable(x) @@ -44,7 +44,7 @@ def inner_func(x): return inner_func(x) -@to_static +@to_static(full_graph=True) def test_return_if(x): x = base.dygraph.to_variable(x) if x < 0: @@ -54,7 +54,7 @@ def test_return_if(x): return x -@to_static +@to_static(full_graph=True) def test_return_if_else(x): x = base.dygraph.to_variable(x) if x > 0: @@ -67,7 +67,7 @@ def test_return_if_else(x): x -= 8888 # useless statement to test our code can handle it. 
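The substitution repeated through this file pins every function to full-graph (AST) transcription instead of the default behaviour. A sketch of the two spellings used across this diff, with toy functions `double` and `triple` standing in for the real tests:

import paddle
from paddle.jit.api import to_static


@to_static(full_graph=True)  # decorator form, as in the hunks here
def double(x):
    return 2 * x


def triple(x):
    return 3 * x


# call-site form, as used for layers and plain functions elsewhere
static_triple = paddle.jit.to_static(triple, full_graph=True)
out = static_triple(paddle.ones([2]))  # runs through the transcribed program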
-@to_static +@to_static(full_graph=True) def test_return_in_while(x): x = base.dygraph.to_variable(x) i = paddle.tensor.fill_constant(shape=[1], dtype='int32', value=0) @@ -80,7 +80,7 @@ def test_return_in_while(x): return x -@to_static +@to_static(full_graph=True) def test_return_in_for(x): x = base.dygraph.to_variable(x) for i in range(10): @@ -92,13 +92,13 @@ def test_return_in_for(x): return x - 1 -@to_static +@to_static(full_graph=True) def test_recursive_return(x): x = base.dygraph.to_variable(x) return dyfunc_with_if_else(x) -@to_static +@to_static(full_graph=True) def test_return_different_length_if_body(x): x = base.dygraph.to_variable(x) y = x + 1 @@ -109,7 +109,7 @@ def test_return_different_length_if_body(x): return x -@to_static +@to_static(full_graph=True) def test_return_different_length_else(x): x = base.dygraph.to_variable(x) y = x + 1 @@ -120,13 +120,13 @@ def test_return_different_length_else(x): return x -@to_static +@to_static(full_graph=True) def test_no_return(x): x = base.dygraph.to_variable(x) y = x + 1 -@to_static +@to_static(full_graph=True) def test_return_none(x): x = base.dygraph.to_variable(x) y = x + 1 @@ -137,7 +137,7 @@ def test_return_none(x): return x, y -@to_static +@to_static(full_graph=True) def test_return_no_variable(x): x = base.dygraph.to_variable(x) y = x + 1 @@ -148,14 +148,14 @@ def test_return_no_variable(x): return -@to_static +@to_static(full_graph=True) def test_return_list_one_value(x): x = base.dygraph.to_variable(x) x += 1 return [x] -@to_static +@to_static(full_graph=True) def test_return_list_many_values(x): x = base.dygraph.to_variable(x) x += 1 @@ -164,14 +164,14 @@ def test_return_list_many_values(x): return [x, y, z] -@to_static +@to_static(full_graph=True) def test_return_tuple_one_value(x): x = base.dygraph.to_variable(x) x += 1 return (x,) -@to_static +@to_static(full_graph=True) def test_return_tuple_many_values(x): x = base.dygraph.to_variable(x) x += 1 @@ -189,7 +189,7 @@ def inner_func(x): return y -@to_static +@to_static(full_graph=True) def test_return_without_paddle_cond(x): # y shape is [10] y = paddle.ones([10]) @@ -213,7 +213,7 @@ def diff_return_hepler(x): return two_value(x) -@to_static +@to_static(full_graph=True) def test_diff_return(x): x = paddle.to_tensor(x) y, z = diff_return_hepler(x) @@ -222,7 +222,7 @@ def test_diff_return(x): return y, z -@to_static +@to_static(full_graph=True) def test_return_if_else_2(x): rr = 0 if True: @@ -232,7 +232,7 @@ def test_return_if_else_2(x): a = 0 -@to_static +@to_static(full_graph=True) def test_return_in_while_2(x): while True: a = 12 @@ -240,7 +240,7 @@ def test_return_in_while_2(x): return 10 -@to_static +@to_static(full_graph=True) def test_return_in_for_2(x): a = 12 for i in range(10): @@ -248,7 +248,7 @@ def test_return_in_for_2(x): return 10 -@to_static +@to_static(full_graph=True) def test_return_nested(x): def func(): rr = 0 @@ -264,6 +264,7 @@ def func(): return func() +@dy2static_unittest class TestReturnBase(unittest.TestCase): def setUp(self): self.input = np.ones(1).astype('int32') diff --git a/test/dygraph_to_static/test_rollback.py b/test/dygraph_to_static/test_rollback.py index 0efb2147f20761..7ee3456747b513 100644 --- a/test/dygraph_to_static/test_rollback.py +++ b/test/dygraph_to_static/test_rollback.py @@ -71,6 +71,7 @@ def foo(x, flag=False): return out +@dy2static_unittest class TestRollBackPlainFunction(unittest.TestCase): def setUp(self): paddle.set_device("cpu") diff --git a/test/dygraph_to_static/test_save_inference_model.py 
b/test/dygraph_to_static/test_save_inference_model.py index c6a01d38e7d869..e765aec9670e15 100644 --- a/test/dygraph_to_static/test_save_inference_model.py +++ b/test/dygraph_to_static/test_save_inference_model.py @@ -17,7 +17,11 @@ import unittest import numpy as np -from dygraph_to_static_util import ast_only_test, test_and_compare_with_new_ir +from dygraph_to_static_util import ( + ast_only_test, + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle from paddle import base @@ -38,7 +42,7 @@ def __init__(self, fc_size): super().__init__() self._linear = paddle.nn.Linear(fc_size, fc_size) - @to_static + @to_static(full_graph=True) def forward(self, x): y = self._linear(x) z = self._linear(y) @@ -65,7 +69,7 @@ def __init__(self, fc_size): super().__init__() self._linear = paddle.nn.Linear(fc_size, fc_size) - @to_static + @to_static(full_graph=True) def forward(self, x): y = self._linear(x) out = cus_tanh.apply(y) @@ -73,6 +77,7 @@ def forward(self, x): return loss, out +@dy2static_unittest class TestDyToStaticSaveInferenceModel(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() @@ -223,6 +228,7 @@ def load_and_run_inference( return np.array(results[0]) +@dy2static_unittest class TestPartialProgramRaiseError(unittest.TestCase): @ast_only_test @test_and_compare_with_new_ir(False) diff --git a/test/dygraph_to_static/test_save_load.py b/test/dygraph_to_static/test_save_load.py index 1c7b34435d7ac5..92965aea2ccc2d 100644 --- a/test/dygraph_to_static/test_save_load.py +++ b/test/dygraph_to_static/test_save_load.py @@ -17,7 +17,11 @@ import unittest import numpy as np -from dygraph_to_static_util import ast_only_test, test_and_compare_with_new_ir +from dygraph_to_static_util import ( + ast_only_test, + dy2static_unittest, + test_and_compare_with_new_ir, +) from test_fetch_feed import Linear import paddle @@ -55,6 +59,7 @@ def forward_post_hook_for_prim_net(layer, input, output): return output * 2 +@dy2static_unittest class TestDyToStaticSaveLoad(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() diff --git a/test/dygraph_to_static/test_se_resnet.py b/test/dygraph_to_static/test_se_resnet.py index c12990b53659d8..21a2aa9702d9dd 100644 --- a/test/dygraph_to_static/test_se_resnet.py +++ b/test/dygraph_to_static/test_se_resnet.py @@ -20,7 +20,11 @@ import unittest import numpy as np -from dygraph_to_static_util import ast_only_test +from dygraph_to_static_util import ( + ast_only_test, + dy2static_unittest, + test_and_compare_with_new_ir, +) from predictor_utils import PredictorTools import paddle @@ -29,6 +33,7 @@ from paddle.jit.api import to_static from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.nn import BatchNorm, Linear +from paddle.static import InputSpec SEED = 2020 np.random.seed(SEED) @@ -316,7 +321,7 @@ def __init__(self, layers=50, class_dim=102): ), ) - @to_static + @to_static(full_graph=True) def forward(self, inputs, label): if self.layers == 50 or self.layers == 101: y = self.conv0(inputs) @@ -346,6 +351,7 @@ def forward(self, inputs, label): return out, avg_loss, acc_top1, acc_top5 +@dy2static_unittest class TestSeResnet(unittest.TestCase): def setUp(self): self.train_reader = paddle.batch( @@ -368,6 +374,7 @@ def setUp(self): def tearDown(self): self.temp_dir.cleanup() + @test_and_compare_with_new_ir(True) def train(self, train_reader, to_static): paddle.jit.enable_to_static(to_static) @@ -450,9 +457,15 @@ def train(self, train_reader, to_static): 
paddle.jit.save( se_resnext, self.model_save_prefix, - [img, label], output_spec=[pred], - input_names_after_prune=[img.name], + input_names_after_prune=['x'], + input_spec=[ + InputSpec( + shape=[None, 3, 224, 224], name='x' + ), + InputSpec(shape=[None, 1], name='y'), + ], + clip_extra=False, ) else: paddle.save( @@ -483,6 +496,7 @@ def predict_dygraph(self, data): return pred_res.numpy() + @test_and_compare_with_new_ir(True) def predict_static(self, data): paddle.enable_static() exe = base.Executor(place) diff --git a/test/dygraph_to_static/test_sentiment.py b/test/dygraph_to_static/test_sentiment.py index 22bb980cd437f9..60d3678a5a72b0 100644 --- a/test/dygraph_to_static/test_sentiment.py +++ b/test/dygraph_to_static/test_sentiment.py @@ -15,7 +15,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) from test_lac import DynamicGRU import paddle @@ -369,6 +372,7 @@ def train(args, to_static): return loss_data +@dy2static_unittest class TestSentiment(unittest.TestCase): def setUp(self): self.args = Args() diff --git a/test/dygraph_to_static/test_seq2seq.py b/test/dygraph_to_static/test_seq2seq.py index 85de170c3f06c6..b97752d4c57cbf 100644 --- a/test/dygraph_to_static/test_seq2seq.py +++ b/test/dygraph_to_static/test_seq2seq.py @@ -18,6 +18,7 @@ import unittest import numpy as np +from dygraph_to_static_util import dy2static_unittest from seq2seq_dygraph_model import AttentionModel, BaseModel from seq2seq_utils import Seq2SeqModelHyperParams, get_data_iter @@ -174,6 +175,7 @@ def infer(args, attn_model=False): return outputs.numpy() +@dy2static_unittest class TestSeq2seq(unittest.TestCase): def setUp(self): self.args = Seq2SeqModelHyperParams diff --git a/test/dygraph_to_static/test_set_dynamic_shape.py b/test/dygraph_to_static/test_set_dynamic_shape.py new file mode 100644 index 00000000000000..0f6859f49e92e0 --- /dev/null +++ b/test/dygraph_to_static/test_set_dynamic_shape.py @@ -0,0 +1,42 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
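Stepping back to the `paddle.jit.save` hunk above: the call now names pruned inputs via `InputSpec` rather than passing example tensors. A reduced sketch of that calling pattern, with a toy linear layer standing in for SEResNeXt and an illustrative save path:

import paddle
from paddle.static import InputSpec

net = paddle.jit.to_static(
    paddle.nn.Linear(224, 10),
    input_spec=[InputSpec(shape=[None, 224], dtype='float32', name='x')],
    full_graph=True,
)
net(paddle.randn([4, 224]))  # optional eager call; the spec already fixes the traced signature
paddle.jit.save(net, './inference/linear', clip_extra=False)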
+ +import unittest + +from dygraph_to_static_utils_new import Dy2StTestBase, ast_only_test + +import paddle + + +class TestSetDynamicShape(Dy2StTestBase): + @ast_only_test + def test_start(self): + def dygraph_func(loop_number): + mask = paddle.randn([2, 2]) + paddle.jit.dy2static.utils_helper.set_dynamic_shape(mask, [-1, 2]) + n = paddle.randn([1, 2]) + for i in range(loop_number): + mask = paddle.concat([mask, n], axis=0) + if mask.shape[0] == 5: + break + return mask + + loop_num = paddle.to_tensor(10) + expected_shape = dygraph_func(loop_num).shape + actual_shape = paddle.jit.to_static(dygraph_func)(loop_num).shape + self.assertEqual(expected_shape, actual_shape) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/dygraph_to_static/test_simnet.py b/test/dygraph_to_static/test_simnet.py index 7d6cad6d033819..90dce27f87eef2 100644 --- a/test/dygraph_to_static/test_simnet.py +++ b/test/dygraph_to_static/test_simnet.py @@ -17,7 +17,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) from simnet_dygraph_model import BOW, HingeLoss import paddle @@ -176,8 +179,9 @@ def train(conf_dict, to_static): return losses +@dy2static_unittest class TestSimnet(unittest.TestCase): - @test_and_compare_with_new_ir(True) + @test_and_compare_with_new_ir(False) def test_dygraph_static_same_loss(self): if base.is_compiled_with_cuda(): base.set_flags({"FLAGS_cudnn_deterministic": True}) diff --git a/test/dygraph_to_static/test_simnet_v2.py b/test/dygraph_to_static/test_simnet_v2.py index a54cfe14dcbf83..16fccfd731be0b 100644 --- a/test/dygraph_to_static/test_simnet_v2.py +++ b/test/dygraph_to_static/test_simnet_v2.py @@ -17,7 +17,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) from simnet_dygraph_model_v2 import BOW, HingeLoss import paddle @@ -176,8 +179,9 @@ def train(conf_dict, to_static): return losses +@dy2static_unittest class TestSimnet(unittest.TestCase): - @test_and_compare_with_new_ir(True) + @test_and_compare_with_new_ir(False) def test_dygraph_static_same_loss(self): if paddle.is_compiled_with_cuda(): paddle.base.set_flags({"FLAGS_cudnn_deterministic": True}) diff --git a/test/dygraph_to_static/test_slice.py b/test/dygraph_to_static/test_slice.py index e66080a2c687fa..3bd4c5f8a2c837 100644 --- a/test/dygraph_to_static/test_slice.py +++ b/test/dygraph_to_static/test_slice.py @@ -17,7 +17,7 @@ import unittest import numpy as np -from dygraph_to_static_util import ast_only_test +from dygraph_to_static_util import ast_only_test, dy2static_unittest import paddle from paddle.static import InputSpec @@ -108,6 +108,7 @@ def forward(self, x): return x +@dy2static_unittest class TestSliceWithoutControlFlow(unittest.TestCase): def setUp(self): self.init_input() @@ -169,6 +170,7 @@ def init_dygraph_func(self): self.dygraph_func = test_set_value +@dy2static_unittest class TestSetValueWithLayerAndSave(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() @@ -189,6 +191,7 @@ def test_set_value_with_save(self): ) +@dy2static_unittest class TestSliceSupplementSpecialCase(unittest.TestCase): # unittest for slice index which abs(step)>0. 
eg: x[::2] def test_static_slice_step(self): @@ -232,6 +235,7 @@ def func(inps): ) +@dy2static_unittest class TestPaddleStridedSlice(unittest.TestCase): def test_compare_paddle_strided_slice_with_numpy(self): paddle.disable_static() @@ -293,6 +297,7 @@ def slice_zero_shape_tensor(x): return y +@dy2static_unittest class TestSliceZeroShapeTensor(unittest.TestCase): def test_slice(self): paddle.disable_static() diff --git a/test/dygraph_to_static/test_spec_names.py b/test/dygraph_to_static/test_spec_names.py index 86fe69c507631c..72ffdc845134a8 100644 --- a/test/dygraph_to_static/test_spec_names.py +++ b/test/dygraph_to_static/test_spec_names.py @@ -14,8 +14,9 @@ import unittest -from dygraph_to_static_util import ( - enable_fallback_guard, +from dygraph_to_static_utils_new import ( + Dy2StTestBase, + ast_only_test, test_and_compare_with_new_ir, ) @@ -40,7 +41,7 @@ def forward(self, x, y, m, n): return paddle.sum(out) -class TestArgsSpecName(unittest.TestCase): +class TestArgsSpecName(Dy2StTestBase): def read_from_dataset(self): self.x = paddle.randn([4, 2, 8]) self.y = paddle.randn([4, 2, 8]) @@ -48,6 +49,7 @@ def read_from_dataset(self): self.n = paddle.randn([4, 2, 8]) @test_and_compare_with_new_ir(False) + @ast_only_test def test_spec_name_hash(self): net = Net() net = paddle.jit.to_static(net) @@ -90,5 +92,4 @@ def run_test(self, net, inputs, trace_count, mode): if __name__ == '__main__': - with enable_fallback_guard("False"): - unittest.main() + unittest.main() diff --git a/test/dygraph_to_static/test_tensor_hook.py b/test/dygraph_to_static/test_tensor_hook.py index fc53fefc95ae64..06b1b288ad8993 100644 --- a/test/dygraph_to_static/test_tensor_hook.py +++ b/test/dygraph_to_static/test_tensor_hook.py @@ -15,12 +15,14 @@ import unittest import numpy as np +from dygraph_to_static_util import dy2static_unittest import paddle from paddle import nn from paddle.jit import to_static +@dy2static_unittest class TestStaticAnalysis(unittest.TestCase): def test_hook_for_different_parameter(self): def f(x): diff --git a/test/dygraph_to_static/test_tensor_methods.py b/test/dygraph_to_static/test_tensor_methods.py index 6e1ae1a3ffc0e2..a22a8a4bb5d0ca 100644 --- a/test/dygraph_to_static/test_tensor_methods.py +++ b/test/dygraph_to_static/test_tensor_methods.py @@ -15,7 +15,11 @@ import unittest import numpy as np -from dygraph_to_static_util import ast_only_test, test_and_compare_with_new_ir +from dygraph_to_static_util import ( + ast_only_test, + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle @@ -27,6 +31,7 @@ def tensor_clone(x): return y +@dy2static_unittest class TestTensorClone(unittest.TestCase): def _run(self, to_static): paddle.jit.enable_to_static(to_static) @@ -41,13 +46,14 @@ def test_tensor_clone(self): np.testing.assert_allclose(dygraph_res, static_res, rtol=1e-05) -@paddle.jit.to_static +@paddle.jit.to_static(full_graph=True) def tensor_numpy(x): x = paddle.to_tensor(x) x.clear_gradient() return x +@dy2static_unittest class TestTensorDygraphOnlyMethodError(unittest.TestCase): def _run(self, to_static): paddle.jit.enable_to_static(to_static) @@ -64,13 +70,14 @@ def test_to_static_numpy_report_error(self): static_res = self._run(to_static=True) -@paddle.jit.to_static +@paddle.jit.to_static(full_graph=True) def tensor_item(x): x = paddle.to_tensor(x) y = x.clone() return y.item() +@dy2static_unittest class TestTensorItem(unittest.TestCase): def _run(self, to_static): paddle.jit.enable_to_static(to_static) @@ -95,6 +102,7 @@ def tensor_size(x): return y 
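All of these tensor-method suites share the `_run(to_static)` toggle seen above: flip dynamic-to-static globally, run the decorated function, and compare both modes numerically. A self-contained sketch of that comparison loop, using a stand-in function rather than any specific test body:

import numpy as np

import paddle


@paddle.jit.to_static(full_graph=True)
def square(x):
    return paddle.to_tensor(x) ** 2


def _run(to_static):
    paddle.jit.enable_to_static(to_static)  # global dy2st switch
    return square(np.ones([3]).astype('float32')).numpy()


np.testing.assert_allclose(_run(False), _run(True), rtol=1e-05)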
+@dy2static_unittest class TestTensorSize(unittest.TestCase): def _run(self, to_static): paddle.jit.enable_to_static(to_static) @@ -120,6 +128,7 @@ def true_div(x, y): return z +@dy2static_unittest class TestTrueDiv(unittest.TestCase): def _run(self, to_static): paddle.jit.enable_to_static(to_static) diff --git a/test/dygraph_to_static/test_tensor_shape.py b/test/dygraph_to_static/test_tensor_shape.py index ad85daf7b0f78b..d8c13cff351931 100644 --- a/test/dygraph_to_static/test_tensor_shape.py +++ b/test/dygraph_to_static/test_tensor_shape.py @@ -15,9 +15,9 @@ import unittest import numpy as np -from dygraph_to_static_util import ( +from dygraph_to_static_utils_new import ( + Dy2StTestBase, ast_only_test, - dy2static_unittest, test_and_compare_with_new_ir, ) @@ -235,8 +235,7 @@ def dyfunc_dict_assign_shape(): # 1. Basic tests without control flow -@dy2static_unittest -class TestTensorShapeBasic(unittest.TestCase): +class TestTensorShapeBasic(Dy2StTestBase): def setUp(self): self.input = np.ones(5).astype("int32") self.place = ( @@ -495,7 +494,7 @@ def _set_expected_op_num(self): # 5. Test op num for negative dim -class TestOpNumBasicWithTensorShape(unittest.TestCase): +class TestOpNumBasicWithTensorShape(Dy2StTestBase): def setUp(self): self._set_input_spec() self._set_test_func() @@ -617,7 +616,7 @@ def dyfunc_with_static_convert_var_shape(x): return res -class TestFindStatiConvertVarShapeSuffixVar(unittest.TestCase): +class TestFindStatiConvertVarShapeSuffixVar(Dy2StTestBase): @ast_only_test def test(self): x_spec = paddle.static.InputSpec(shape=[None, 10]) diff --git a/test/dygraph_to_static/test_to_tensor.py b/test/dygraph_to_static/test_to_tensor.py index ee33d56187efa6..e47e3bc78980da 100644 --- a/test/dygraph_to_static/test_to_tensor.py +++ b/test/dygraph_to_static/test_to_tensor.py @@ -96,6 +96,10 @@ def case8(x): return a +def case_to_tensor_default_dtype(): + return paddle.to_tensor(1) + + @dy2static_unittest class TestToTensorReturnVal(unittest.TestCase): def test_to_tensor_badreturn(self): @@ -150,6 +154,13 @@ def test_to_tensor_badreturn(self): self.assertTrue(a.stop_gradient == b.stop_gradient) self.assertTrue(a.place._equals(b.place)) + def test_to_tensor_default_dtype(self): + a = paddle.jit.to_static(case_to_tensor_default_dtype)() + b = case_to_tensor_default_dtype() + self.assertTrue(a.dtype == b.dtype) + self.assertTrue(a.stop_gradient == b.stop_gradient) + self.assertTrue(a.place._equals(b.place)) + def test_to_tensor_err_log(self): paddle.disable_static() x = paddle.to_tensor([3]) @@ -162,6 +173,7 @@ def test_to_tensor_err_log(self): ) +@dy2static_unittest class TestStatic(unittest.TestCase): def test_static(self): paddle.enable_static() @@ -190,5 +202,18 @@ def test_static(self): res = exe.run(fetch_list=[x, out]) +class TestInt16(unittest.TestCase): + def test_static(self): + import numpy as np + + paddle.enable_static() + data = np.array([1, 2], dtype="int16") + x = paddle.to_tensor(data) + self.assertTrue(x.dtype == paddle.framework.core.VarDesc.VarType.INT16) + + y = paddle.to_tensor([1, 2], dtype="int16") + self.assertTrue(y.dtype == paddle.framework.core.VarDesc.VarType.INT16) + + if __name__ == '__main__': unittest.main() diff --git a/test/dygraph_to_static/test_train_step.py b/test/dygraph_to_static/test_train_step.py index 3c003f0725909b..cc5618aa4897a1 100644 --- a/test/dygraph_to_static/test_train_step.py +++ b/test/dygraph_to_static/test_train_step.py @@ -17,10 +17,7 @@ from functools import partial import numpy as np -from dygraph_to_static_util import ( 
- enable_fallback_guard, - test_and_compare_with_new_ir, -) +from dygraph_to_static_util import test_and_compare_with_new_ir import paddle @@ -87,7 +84,9 @@ def test_train_step(self): self.train_step_func, self.steps ) reset_seed() - static_func = paddle.jit.to_static(self.train_step_func) + static_func = paddle.jit.to_static( + self.train_step_func, full_graph=True + ) static_losses = self.get_train_step_losses(static_func, self.steps) self.assertEqual(len(dygraph_losses), len(static_losses)) for dygraph_loss, static_loss in zip(dygraph_losses, static_losses): @@ -438,5 +437,4 @@ def setUp(self): if __name__ == "__main__": - with enable_fallback_guard("False"): - unittest.main() + unittest.main() diff --git a/test/dygraph_to_static/test_train_step_resnet18_adam.py b/test/dygraph_to_static/test_train_step_resnet18_adam.py index 95fd040282b92a..c8b34fe84f1133 100644 --- a/test/dygraph_to_static/test_train_step_resnet18_adam.py +++ b/test/dygraph_to_static/test_train_step_resnet18_adam.py @@ -15,7 +15,6 @@ import platform import unittest -from dygraph_to_static_util import enable_fallback_guard from test_train_step import ( TestTrainStepTinyModel, loss_fn_tiny_model, @@ -41,5 +40,4 @@ def setUp(self): if __name__ == "__main__": - with enable_fallback_guard("False"): - unittest.main() + unittest.main() diff --git a/test/dygraph_to_static/test_train_step_resnet18_sgd.py b/test/dygraph_to_static/test_train_step_resnet18_sgd.py index f6139e62dc216a..a73d945aa95243 100644 --- a/test/dygraph_to_static/test_train_step_resnet18_sgd.py +++ b/test/dygraph_to_static/test_train_step_resnet18_sgd.py @@ -15,7 +15,6 @@ import platform import unittest -from dygraph_to_static_util import enable_fallback_guard from test_train_step import ( TestTrainStepTinyModel, loss_fn_tiny_model, @@ -41,5 +40,4 @@ def setUp(self): if __name__ == "__main__": - with enable_fallback_guard("False"): - unittest.main() + unittest.main() diff --git a/test/dygraph_to_static/test_transformer.py b/test/dygraph_to_static/test_transformer.py index 073535371ccde3..29dda3916f3ab9 100644 --- a/test/dygraph_to_static/test_transformer.py +++ b/test/dygraph_to_static/test_transformer.py @@ -20,7 +20,10 @@ import numpy as np import transformer_util as util -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) from transformer_dygraph_model import ( CrossEntropyCriterion, Transformer, @@ -527,6 +530,7 @@ def predict_static(args, batch_generator): return seq_ids, seq_scores +@dy2static_unittest class TestTransformer(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() diff --git a/test/dygraph_to_static/test_tsm.py b/test/dygraph_to_static/test_tsm.py index e68406bd4c9ab2..2cef9e7df4dedd 100644 --- a/test/dygraph_to_static/test_tsm.py +++ b/test/dygraph_to_static/test_tsm.py @@ -19,7 +19,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) from tsm_config_utils import merge_configs, parse_config, print_configs import paddle @@ -384,6 +387,7 @@ def train(args, fake_data_reader, to_static): return ret +@dy2static_unittest class TestTsm(unittest.TestCase): @test_and_compare_with_new_ir(False) def test_dygraph_static_same_loss(self): diff --git a/test/dygraph_to_static/test_typehint.py b/test/dygraph_to_static/test_typehint.py index b37a3539e22543..563db1d7a1df04 
100644 --- a/test/dygraph_to_static/test_typehint.py +++ b/test/dygraph_to_static/test_typehint.py @@ -15,7 +15,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle from paddle import base @@ -33,6 +36,7 @@ def function(x: A) -> A: return 2 * x +@dy2static_unittest class TestTransformWhileLoop(unittest.TestCase): def setUp(self): self.place = ( diff --git a/test/dygraph_to_static/test_unuseful_inputs.py b/test/dygraph_to_static/test_unuseful_inputs.py index 603ffe9eba12dc..8f83f015db4315 100644 --- a/test/dygraph_to_static/test_unuseful_inputs.py +++ b/test/dygraph_to_static/test_unuseful_inputs.py @@ -15,7 +15,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle from paddle import nn @@ -62,6 +65,7 @@ def forward(self, x): return val +@dy2static_unittest class TestDuplicateOutput(unittest.TestCase): """ TestCase for the transformation from control flow `if/else` diff --git a/test/dygraph_to_static/test_utils.py b/test/dygraph_to_static/test_utils.py index 3361a866feb540..180078c1448295 100644 --- a/test/dygraph_to_static/test_utils.py +++ b/test/dygraph_to_static/test_utils.py @@ -15,9 +15,12 @@ import types import unittest +from dygraph_to_static_util import dy2static_unittest + from paddle.jit.dy2static.utils import index_in_list, is_paddle_func +@dy2static_unittest class TestIndexInList(unittest.TestCase): def test_index_in_list(self): list_to_test = [1, 2, 3, 4, 5] @@ -49,6 +52,7 @@ def dyfunc_assign(input): y = n +@dy2static_unittest class TestIsPaddle(unittest.TestCase): def fake_module(self): return types.ModuleType('paddlenlp') diff --git a/test/dygraph_to_static/test_variable_trans_func.py b/test/dygraph_to_static/test_variable_trans_func.py index f2395fa517793d..0ca73fbf9dd755 100644 --- a/test/dygraph_to_static/test_variable_trans_func.py +++ b/test/dygraph_to_static/test_variable_trans_func.py @@ -14,10 +14,13 @@ import unittest +from dygraph_to_static_util import dy2static_unittest + from paddle.jit.dy2static.utils import ast_to_source_code from paddle.jit.dy2static.variable_trans_func import create_fill_constant_node +@dy2static_unittest class TestVariableTransFunc(unittest.TestCase): def test_create_fill_constant_node(self): node = create_fill_constant_node("a", 1.0) diff --git a/test/dygraph_to_static/test_word2vec.py b/test/dygraph_to_static/test_word2vec.py index 85edea2093d82f..0f16f5b2a9d23f 100644 --- a/test/dygraph_to_static/test_word2vec.py +++ b/test/dygraph_to_static/test_word2vec.py @@ -17,7 +17,10 @@ import unittest import numpy as np -from dygraph_to_static_util import test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) import paddle from paddle import base @@ -318,6 +321,7 @@ def train(to_static): return np.array(ret) +@dy2static_unittest class TestWord2Vec(unittest.TestCase): @test_and_compare_with_new_ir(False) def test_dygraph_static_same_loss(self): diff --git a/test/dygraph_to_static/test_yolov3.py b/test/dygraph_to_static/test_yolov3.py index 3f31b666c7f31d..12830ca7bce557 100644 --- a/test/dygraph_to_static/test_yolov3.py +++ b/test/dygraph_to_static/test_yolov3.py @@ -17,7 +17,10 @@ import unittest import numpy as np -from dygraph_to_static_util import 
test_and_compare_with_new_ir +from dygraph_to_static_util import ( + dy2static_unittest, + test_and_compare_with_new_ir, +) from yolov3 import YOLOv3, cfg import paddle @@ -165,6 +168,7 @@ def train(to_static): return np.array(ret) +@dy2static_unittest class TestYolov3(unittest.TestCase): @test_and_compare_with_new_ir(False) def test_dygraph_static_same_loss(self): diff --git a/test/ir/inference/CMakeLists.txt b/test/ir/inference/CMakeLists.txt index 5c6714e698444d..fa01fe99a9e3ed 100755 --- a/test/ir/inference/CMakeLists.txt +++ b/test/ir/inference/CMakeLists.txt @@ -181,10 +181,13 @@ if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_trt_inference_predictor PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_inference_fp16_io PROPERTIES TIMEOUT 300) set_tests_properties(test_trt_optimization_level PROPERTIES TIMEOUT 300) - set_tests_properties(test_trt_explicit_quantization_resnet PROPERTIES TIMEOUT - 300) - set_tests_properties(test_trt_explicit_quantization_mobilenet - PROPERTIES TIMEOUT 300) + set_tests_properties(test_trt_ops_fp32_mix_precision PROPERTIES TIMEOUT 300) + if(NOT WIN32) + set_tests_properties(test_trt_explicit_quantization_resnet + PROPERTIES TIMEOUT 300) + set_tests_properties(test_trt_explicit_quantization_mobilenet + PROPERTIES TIMEOUT 300) + endif() if(WITH_MKLDNN) set_tests_properties(test_save_optimized_model_pass PROPERTIES TIMEOUT 300) endif() diff --git a/test/ir/inference/program_config.py b/test/ir/inference/program_config.py index 4516c2cb4ad0c7..250b547efca31b 100644 --- a/test/ir/inference/program_config.py +++ b/test/ir/inference/program_config.py @@ -113,7 +113,6 @@ def __repr__(self): 'fetch', 'recurrent', 'go', - 'rnn_memory_helper_grad', 'conditional_block', 'static_pylayer', 'while', diff --git a/test/ir/inference/test_mkldnn_pad2d_op.py b/test/ir/inference/test_mkldnn_pad2d_op.py deleted file mode 100644 index cb3b14ab3355db..00000000000000 --- a/test/ir/inference/test_mkldnn_pad2d_op.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
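The file removed below followed the standard auto-scan structure: build an `OpConfig`, wrap it in a `ProgramConfig` with generated inputs, then yield predictor configs. A compressed sketch of that first half, using an illustrative `relu` op in place of `pad2d` (the helper classes come from the test-local `program_config` module imported below):

from functools import partial

import numpy as np
from program_config import OpConfig, ProgramConfig, TensorConfig


def generate_input():
    # random NCHW input, float32 as the scan framework expects
    return np.random.random([1, 3, 4, 5]).astype(np.float32)


op = OpConfig(
    type="relu",
    inputs={"X": ["input_data"]},
    outputs={"Out": ["output_data"]},
    attrs={},
)
program = ProgramConfig(
    ops=[op],
    weights={},
    inputs={"input_data": TensorConfig(data_gen=partial(generate_input))},
    outputs=["output_data"],
)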
- -import unittest -from functools import partial - -import hypothesis.strategies as st -import numpy as np -from auto_scan_test import MkldnnAutoScanTest -from hypothesis import given -from program_config import OpConfig, ProgramConfig, TensorConfig - - -class TestOneDNNPad2DOp(MkldnnAutoScanTest): - def sample_program_configs(self, *args, **kwargs): - def generate_input(*args, **kwargs): - return np.random.random(kwargs['in_shape']).astype(np.float32) - - pad3d_op = OpConfig( - type="pad2d", - inputs={"X": ["input_data"]}, - outputs={"Out": ["output_data"]}, - attrs={ - "mode": "constant", - "data_format": kwargs['data_format'], - "paddings": kwargs['paddings'], - }, - ) - - program_config = ProgramConfig( - ops=[pad3d_op], - weights={}, - inputs={ - "input_data": TensorConfig( - data_gen=partial(generate_input, *args, **kwargs) - ), - }, - outputs=["output_data"], - ) - - yield program_config - - def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) - yield config, (1e-5, 1e-5) - - @given( - data_format=st.sampled_from(['NCHW', 'NHWC']), - in_shape=st.sampled_from( - [[2, 3, 4, 5], [1, 4, 1, 3], [4, 3, 2, 1], [1, 1, 1, 1]] - ), - paddings=st.sampled_from( - [[0, 0, 0, 0], [1, 2, 0, 1], [2, 5, 11, 3], [0, 5, 0, 1]] - ), - ) - def test(self, *args, **kwargs): - self.run_test(quant=False, *args, **kwargs) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/ir/inference/test_trt_convert_flatten.py b/test/ir/inference/test_trt_convert_flatten.py deleted file mode 100644 index 636502aa231138..00000000000000 --- a/test/ir/inference/test_trt_convert_flatten.py +++ /dev/null @@ -1,447 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
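One detail worth flagging in the flatten converter tests deleted below: they collapse the TensorRT version into a single integer as `ver[0] * 1000 + ver[1] * 100 + ver[0] * 10`, reusing the major component where the patch component (`ver[2]`) looks intended; the new share_data test later in this diff uses `ver[2]` in that slot. A sketch of the corrected gate, with the 8400 threshold borrowed from that test:

import paddle.inference as paddle_infer

ver = paddle_infer.get_trt_compile_version()  # (major, minor, patch)
trt_version = ver[0] * 1000 + ver[1] * 100 + ver[2] * 10

# for example, require at least TensorRT 8.4, encoded as 8400
if trt_version < 8400:
    print("skip: TensorRT build too old for this converter")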
- -import unittest -from functools import partial -from typing import List - -import numpy as np -from program_config import ProgramConfig, TensorConfig -from trt_layer_auto_scan_test import TrtLayerAutoScanTest - -import paddle.inference as paddle_infer - - -class TrtConvertFlattenTest_dim_2(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - return True - - def sample_program_configs(self): - def generate_input(batch): - return np.random.random([batch, 32]).astype(np.float32) - - for batch in [1, 4]: - for axis in [0, 1]: - for type in ["flatten", "flatten2"]: - if type == "flatten": - op_outputs = {"Out": ["output_data"]} - else: - op_outputs = { - "Out": ["output_data"], - "XShape": ["xshape_data"], - } - dics = [{"axis": axis}] - ops_config = [ - { - "op_type": "flatten", - "op_inputs": {"X": ["input_data"]}, - "op_outputs": op_outputs, - "op_attrs": dics[0], - } - ] - ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data": TensorConfig( - data_gen=partial(generate_input, batch) - ) - }, - outputs=["output_data"], - ) - - yield program_config - - def sample_predictor_configs( - self, program_config - ) -> (paddle_infer.Config, List[int], float): - def generate_dynamic_shape(attrs): - self.dynamic_shape.min_input_shape = {"input_data": [1, 8]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [2, 32]} - - def clear_dynamic_shape(): - self.dynamic_shape.max_input_shape = {} - self.dynamic_shape.min_input_shape = {} - self.dynamic_shape.opt_input_shape = {} - - def generate_trt_nodes_num(attrs, dynamic_shape): - ver = paddle_infer.get_trt_compile_version() - if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - else: - if dynamic_shape: - return 0, 3 - - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - - attrs = [ - program_config.ops[i].attrs for i in range(len(program_config.ops)) - ] - - # for static_shape - clear_dynamic_shape() - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), (1e-3, 1e-3) - - # for dynamic_shape - generate_dynamic_shape(attrs) - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), (1e-3, 1e-3) - - def test(self): - self.run_test() - - -class TrtConvertFlattenTest_dim_3(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - return True - - def sample_program_configs(self): - def generate_input(batch): - return np.random.random([batch, 32, 64]).astype(np.float32) - - for batch in [1, 4]: - for axis in [0, 1, 2]: - for type in ["flatten", "flatten2"]: - if type == "flatten": - op_outputs = {"Out": ["output_data"]} - else: - op_outputs = { - "Out": ["output_data"], - "XShape": ["xshape_data"], - } - dics = [{"axis": 
axis}] - ops_config = [ - { - "op_type": "flatten", - "op_inputs": {"X": ["input_data"]}, - "op_outputs": op_outputs, - "op_attrs": dics[0], - } - ] - ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data": TensorConfig( - data_gen=partial(generate_input, batch) - ) - }, - outputs=["output_data"], - ) - - yield program_config - - def sample_predictor_configs( - self, program_config - ) -> (paddle_infer.Config, List[int], float): - def generate_dynamic_shape(attrs): - self.dynamic_shape.min_input_shape = {"input_data": [1, 8, 8]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [2, 32, 64]} - - def clear_dynamic_shape(): - self.dynamic_shape.max_input_shape = {} - self.dynamic_shape.min_input_shape = {} - self.dynamic_shape.opt_input_shape = {} - - def generate_trt_nodes_num(attrs, dynamic_shape): - ver = paddle_infer.get_trt_compile_version() - if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - else: - if dynamic_shape: - return 0, 3 - - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - - attrs = [ - program_config.ops[i].attrs for i in range(len(program_config.ops)) - ] - - # for static_shape - clear_dynamic_shape() - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), (1e-3, 1e-3) - - # for dynamic_shape - generate_dynamic_shape(attrs) - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), (1e-3, 1e-3) - - def test(self): - self.run_test() - - -class TrtConvertFlattenTest_dim_4(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - return True - - def sample_program_configs(self): - def generate_input(batch): - return np.random.random([batch, 8, 8, 8]).astype(np.float32) - - for batch in [1, 4]: - for axis in [0, 1, 2, 3]: - for type in ["flatten", "flatten2"]: - if type == "flatten": - op_outputs = {"Out": ["output_data"]} - else: - op_outputs = { - "Out": ["output_data"], - "XShape": ["xshape_data"], - } - dics = [{"axis": axis}] - ops_config = [ - { - "op_type": "flatten", - "op_inputs": {"X": ["input_data"]}, - "op_outputs": op_outputs, - "op_attrs": dics[0], - } - ] - ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data": TensorConfig( - data_gen=partial(generate_input, batch) - ) - }, - outputs=["output_data"], - ) - - yield program_config - - def sample_predictor_configs( - self, program_config - ) -> (paddle_infer.Config, List[int], float): - def generate_dynamic_shape(attrs): - self.dynamic_shape.min_input_shape = {"input_data": [1, 4, 4, 4]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32, 32]} - self.dynamic_shape.opt_input_shape = {"input_data": [2, 16, 16, 8]} - - def 
clear_dynamic_shape(): - self.dynamic_shape.max_input_shape = {} - self.dynamic_shape.min_input_shape = {} - self.dynamic_shape.opt_input_shape = {} - - def generate_trt_nodes_num(attrs, dynamic_shape): - ver = paddle_infer.get_trt_compile_version() - if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - else: - if dynamic_shape: - return 0, 3 - - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - - attrs = [ - program_config.ops[i].attrs for i in range(len(program_config.ops)) - ] - - # for static_shape - clear_dynamic_shape() - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), (1e-3, 1e-3) - - # for dynamic_shape - generate_dynamic_shape(attrs) - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), (1e-3, 1e-3) - - def test(self): - self.run_test() - - -class TrtConvertFlattenTest_dim_5(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - return True - - def sample_program_configs(self): - def generate_input(batch): - return np.random.random([batch, 8, 8, 8]).astype(np.float32) - - for batch in [1, 4]: - for axis in [0, 1, 2, 3, 4]: - for type in ["flatten", "flatten2"]: - if type == "flatten": - op_outputs = {"Out": ["output_data"]} - else: - op_outputs = { - "Out": ["output_data"], - "XShape": ["xshape_data"], - } - dics = [{"axis": axis}] - ops_config = [ - { - "op_type": "flatten", - "op_inputs": {"X": ["input_data"]}, - "op_outputs": op_outputs, - "op_attrs": dics[0], - } - ] - ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data": TensorConfig( - data_gen=partial(generate_input, batch) - ) - }, - outputs=["output_data"], - ) - - yield program_config - - def sample_predictor_configs( - self, program_config - ) -> (paddle_infer.Config, List[int], float): - def generate_dynamic_shape(attrs): - self.dynamic_shape.min_input_shape = {"input_data": [1, 4, 4, 4]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 16, 16, 8]} - self.dynamic_shape.opt_input_shape = {"input_data": [2, 16, 16, 8]} - - def clear_dynamic_shape(): - self.dynamic_shape.max_input_shape = {} - self.dynamic_shape.min_input_shape = {} - self.dynamic_shape.opt_input_shape = {} - - def generate_trt_nodes_num(attrs, dynamic_shape): - ver = paddle_infer.get_trt_compile_version() - if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - else: - if dynamic_shape: - return 0, 3 - - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - - attrs = [ - program_config.ops[i].attrs for i in range(len(program_config.ops)) - ] - - # for static_shape - clear_dynamic_shape() - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield 
self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), (1e-3, 1e-3) - - # for dynamic_shape - generate_dynamic_shape(attrs) - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), (1e-3, 1e-3) - - def test(self): - self.run_test() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/ir/inference/test_trt_convert_share_data.py b/test/ir/inference/test_trt_convert_share_data.py new file mode 100644 index 00000000000000..168ef72b6e590b --- /dev/null +++ b/test/ir/inference/test_trt_convert_share_data.py @@ -0,0 +1,155 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from functools import partial +from typing import List + +import numpy as np +from program_config import ProgramConfig, TensorConfig +from trt_layer_auto_scan_test import TrtLayerAutoScanTest + +import paddle.inference as paddle_infer + + +class TrtConvertShareDataTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + compile_version = paddle_infer.get_trt_compile_version() + runtime_version = paddle_infer.get_trt_runtime_version() + if ( + compile_version[0] * 1000 + + compile_version[1] * 100 + + compile_version[2] * 10 + < 8400 + ): + return False + if ( + runtime_version[0] * 1000 + + runtime_version[1] * 100 + + runtime_version[2] * 10 + < 8400 + ): + return False + return True + + def sample_program_configs(self): + def generate_input(type): + if self.dims == 1: + return np.ones([1]).astype(type) + else: + return np.ones([1, 3, 64, 64]).astype(type) + + for dims in [1, 4]: + self.dims = dims + for dtype in [ + np.int32, + np.float32, + np.int64, + ]: + self.has_bool_dtype = dtype == np.bool_ + ops_config = [ + { + "op_type": "share_data", + "op_inputs": {"X": ["input_data"]}, + "op_outputs": {"Out": ["output_data0"]}, + "op_attrs": {}, + }, + { + "op_type": "share_data", + "op_inputs": {"X": ["output_data0"]}, + "op_outputs": {"Out": ["output_data1"]}, + "op_attrs": {}, + }, + ] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input, dtype) + ) + }, + outputs=["output_data1"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.dims == 1: + self.dynamic_shape.min_input_shape = {"input_data": [1]} + 
self.dynamic_shape.max_input_shape = {"input_data": [1]} + self.dynamic_shape.opt_input_shape = {"input_data": [1]} + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 64, 64] + } + self.dynamic_shape.max_input_shape = { + "input_data": [1, 3, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 64, 64] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if not dynamic_shape and self.dims == 1: + return 0, 4 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + program_config.set_input_type(np.float32) + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + program_config.set_input_type(np.float16) + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-2 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + program_config.set_input_type(np.float32) + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + program_config.set_input_type(np.float16) + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-2 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/ir/inference/test_trt_ops_fp32_mix_precision.py b/test/ir/inference/test_trt_ops_fp32_mix_precision.py new file mode 100644 index 00000000000000..c2fcb2255c95c6 --- /dev/null +++ b/test/ir/inference/test_trt_ops_fp32_mix_precision.py @@ -0,0 +1,182 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
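+# This test builds a conv2d -> elementwise -> matmul_v2 program, runs the +# TensorRT engine in FP16 precision, and uses +# InternalUtils.disable_tensorrt_half_ops to keep the three named layer +# outputs in FP32, exercising the FP32/FP16 mixed-precision path.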
+ +import unittest +from functools import partial +from typing import List + +import numpy as np +from program_config import ProgramConfig, TensorConfig +from trt_layer_auto_scan_test import TrtLayerAutoScanTest + +import paddle.inference as paddle_infer +from paddle.inference import InternalUtils + + +class TestTrtFp32MixPrecision(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_conv2d_input(): + return np.ones([1, 3, 64, 64]).astype(np.float32) + + def generate_conv2d_weight(): + return np.ones([9, 3, 3, 3]).astype(np.float32) + + def generate_elementwise_input(op_type): + # elementwise_floordiv is integer only, so it is not covered here; + # elementwise_mod draws strictly positive inputs to avoid mod by zero + if op_type == "elementwise_mod": + return np.random.uniform( + low=0.1, high=1.0, size=[33, 10] + ).astype(np.float32) + else: + return np.random.random([33, 10]).astype(np.float32) + + def generate_elementwise_weight(op_type): + if op_type == "elementwise_mod": + return np.random.uniform( + low=0.1, high=1.0, size=[33, 1] + ).astype(np.float32) + else: + return np.random.randn(33, 1).astype(np.float32) + + attrs = [ + { + "data_format": 'NCHW', + "dilations": [1, 2], + "padding_algorithm": 'EXPLICIT', + "groups": 1, + "paddings": [0, 3], + "strides": [2, 2], + }, + {"axis": -1}, + { + "trans_x": False, + "trans_y": False, + }, + ] + for op_type in [ + "elementwise_add", + "elementwise_mul", + "elementwise_sub", + "elementwise_div", + "elementwise_pow", + "elementwise_min", + "elementwise_max", + "elementwise_mod", + ]: + ops_config = [ + { + "op_type": "conv2d", + "op_inputs": { + "Input": ["conv2d_input"], + "Filter": ["conv2d_weight"], + }, + "op_outputs": {"Output": ["conv_output_data"]}, + "op_attrs": attrs[0], + }, + { + "op_type": op_type, + "op_inputs": { + "X": ["elementwise_input"], + "Y": ["elementwise_weight"], + }, + "op_outputs": {"Out": ["elementwise_output_data"]}, + "op_attrs": attrs[1], + "outputs_dtype": {"elementwise_output_data": np.float32}, + }, + { + "op_type": "matmul_v2", + "op_inputs": { + "X": ["conv_output_data"], + "Y": ["elementwise_output_data"], + }, + "op_outputs": {"Out": ["matmul_v2_output_data"]}, + "op_attrs": attrs[2], + }, + ] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "conv2d_weight": TensorConfig( + data_gen=partial(generate_conv2d_weight) + ), + "elementwise_weight": TensorConfig( + data_gen=partial(generate_elementwise_weight, op_type) + ), + }, + inputs={ + "conv2d_input": TensorConfig( + data_gen=partial(generate_conv2d_input) + ), + "elementwise_input": TensorConfig( + data_gen=partial(generate_elementwise_input, op_type) + ), + }, + outputs=["matmul_v2_output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "conv2d_input": [1, 3, 64, 64], + "elementwise_input": [33, 10], + } + self.dynamic_shape.max_input_shape = { + "conv2d_input": [1, 3, 64, 64], + "elementwise_input": [33, 10], + } + self.dynamic_shape.opt_input_shape = { + "conv2d_input": [1, 3, 64, 64], + "elementwise_input": [33, 10], + } + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 3 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = 
paddle_infer.PrecisionType.Half + program_config.set_input_type(np.float16) + config = self.create_inference_config() + InternalUtils.disable_tensorrt_half_ops( + config, + { + "conv_output_data", + "elementwise_output_data", + "matmul_v2_output_data", + }, + ) + yield config, generate_trt_nodes_num(attrs, True), (1e-3, 1e-3) + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/ir/new_ir/test_new_ir_to_static.py b/test/ir/new_ir/test_new_ir_to_static.py index 5516b3bca04c1a..2331dc0ea22e63 100644 --- a/test/ir/new_ir/test_new_ir_to_static.py +++ b/test/ir/new_ir/test_new_ir_to_static.py @@ -25,7 +25,7 @@ def func(x): out = paddle.mean(x) return out - static_func = paddle.jit.to_static(func) + static_func = paddle.jit.to_static(func, full_graph=True) x = paddle.randn((3, 3)) y = paddle.randn((3, 3)) x.stop_gradient = False @@ -43,7 +43,7 @@ def func(x): return out # ==== dygraph computation ==== - static_func = paddle.jit.to_static(func) + static_func = paddle.jit.to_static(func, full_graph=True) x = paddle.randn((3, 3)) y = paddle.randn((3, 3)) x.stop_gradient = False @@ -78,7 +78,7 @@ def forward(self, x): x = paddle.randn((10, 10)) x.stop_gradient = False ans = net(x) - net = paddle.jit.to_static(net) + net = paddle.jit.to_static(net, full_graph=True) out = net(x) np.testing.assert_allclose( out.numpy(), ans.numpy(), rtol=1e-05, atol=1e-8 @@ -101,7 +101,7 @@ def run_function(to_static=True): y.stop_gradient = True func = output_pure_func if to_static: - func = paddle.jit.to_static(func) + func = paddle.jit.to_static(func, full_graph=True) y, y_mean = func(x, y) loss = y.mean() loss.backward() @@ -134,7 +134,7 @@ def train_step(to_static=True): learning_rate=0.1, parameters=net.parameters() ) if to_static: - net = paddle.jit.to_static(net) + net = paddle.jit.to_static(net, full_graph=True) losses = [] for step in range(100): y_pred = net(x) @@ -177,7 +177,7 @@ def train_step(to_static=True): learning_rate=0.1, parameters=net.parameters() ) if to_static: - net = paddle.jit.to_static(net) + net = paddle.jit.to_static(net, full_graph=True) losses = [] for step in range(100): y_pred = net(x, step % 2 == 1) @@ -195,5 +195,22 @@ def train_step(to_static=True): ) +class TestDy2staticNewIR6(unittest.TestCase): + # test basic-indexing __getitem__ for OpResult + def test_basic_network(self): + def func(x): + shape = paddle.shape(x) + out = shape[1:] + return out + + static_func = paddle.jit.to_static(func, full_graph=True) + x = paddle.randn((2, 3, 4)) + x.stop_gradient = False + ans = func(x) + out = static_func(x) + + np.testing.assert_allclose(out.numpy(), ans.numpy()) + + if __name__ == "__main__": unittest.main() diff --git a/test/ir/new_ir/test_pass_manager.py b/test/ir/new_ir/test_pass_manager.py index 5849b0bbdfeffa..44689f485af818 100644 --- a/test/ir/new_ir/test_pass_manager.py +++ b/test/ir/new_ir/test_pass_manager.py @@ -51,12 +51,12 @@ def test_op(self): self.assertTrue('pd_op.uniform' in op_names) pm = pir.PassManager() pm.add_pass( - 'dead_code_elimination' + 'dead_code_elimination_pass' ) # apply pass to elimitate dead code pm.run(new_program) op_names = [op.name() for op in new_program.global_block().ops] # print(op_names) - self.assertEqual(pm.passes(), ['dead_code_elimination']) + self.assertEqual(pm.passes(), ['dead_code_elimination_pass']) self.assertFalse(pm.empty()) self.assertTrue( 'pd_op.uniform' not in op_names diff --git a/test/ir/new_ir/test_special_op_translator.py b/test/ir/new_ir/test_special_op_translator.py index 
a2a17feb1275fa..a56282c023bf09 100644 --- a/test/ir/new_ir/test_special_op_translator.py +++ b/test/ir/new_ir/test_special_op_translator.py @@ -35,7 +35,73 @@ def test_op(self): x = paddle.to_tensor([2, 3, 4], 'float64') y = paddle.cast(x, 'uint8') - _ = pir.translate_to_new_ir(main_program.desc) + _, mappings = pir.translate_to_new_ir_with_param_map(main_program.desc) + assert len(str(mappings)) > 0, "no mapping found" + + +class TestCondWithInplace(unittest.TestCase): + def test_op(self): + def cond_with_inplace(): + x = paddle.ones(shape=[2, 1, 2, 3], dtype="float32") + y = paddle.ones(shape=[2, 1, 2, 3], dtype="float32") + running_mean = paddle.to_tensor([0], dtype="float32") + running_variance = paddle.to_tensor([1], dtype="float32") + weight = paddle.to_tensor([2], dtype="float32") + bias = paddle.to_tensor([1], dtype="float32") + if x > y: + y = paddle.nn.functional.batch_norm( + x, running_mean, running_variance, weight, bias + ) + else: + y = paddle.nn.functional.batch_norm( + x, running_mean, running_variance, weight, bias + ) + + legacy_program = paddle.jit.to_static( + cond_with_inplace, + input_spec=[], + full_graph=True, + ) + + l = pir.translate_to_new_ir(legacy_program.main_program.desc) + assert l is not None + + def test_nested_op(self): + def cond_with_inplace(): + x = paddle.ones(shape=[2, 1, 2, 3], dtype="float32") + y = paddle.ones(shape=[2, 1, 2, 3], dtype="float32") + z = paddle.ones(shape=[2, 1, 2, 3], dtype="float32") + running_mean = paddle.to_tensor([0], dtype="float32") + running_variance = paddle.to_tensor([1], dtype="float32") + weight = paddle.to_tensor([2], dtype="float32") + bias = paddle.to_tensor([1], dtype="float32") + if x > y: + if y > z: + z = paddle.nn.functional.batch_norm( + z, running_mean, running_variance, weight, bias + ) + else: + y = paddle.nn.functional.batch_norm( + x, running_mean, running_variance, weight, bias + ) + else: + if y > z: + z = paddle.nn.functional.batch_norm( + z, running_mean, running_variance, weight, bias + ) + else: + y = paddle.nn.functional.batch_norm( + x, running_mean, running_variance, weight, bias + ) + + legacy_program = paddle.jit.to_static( + cond_with_inplace, + input_spec=[], + full_graph=True, + ) + + l = pir.translate_to_new_ir(legacy_program.main_program.desc) + assert l is not None class TestElementwiseOpTranscriber(unittest.TestCase): @@ -100,6 +166,27 @@ def test_elementwise_with_y_grad(self): atol=1e-6, ) + def test_add_inplace(self): + place = core.Place() + place.set_place(paddle.CPUPlace()) + exe = paddle.static.Executor(place) + + new_scope = paddle.static.Scope() + main_program = paddle.static.Program() + with paddle.static.scope_guard(new_scope): + with paddle.static.program_guard(main_program): + x = paddle.ones(shape=(100, 2, 3), dtype='float32') + y = paddle.ones(shape=(100, 2, 3), dtype='float32') + + helper = LayerHelper('elementwise_add') + helper.append_op( + type="elementwise_add", + inputs={"X": x, "Y": y}, + outputs={"Out": y}, + attrs={"axis": -1}, + ) + _ = pir.translate_to_new_ir(main_program.desc) + class TestEmbeddingOpTranscriber(unittest.TestCase): def test_op(self): @@ -408,6 +495,30 @@ def test_grad(self): self.assertTrue((ret[0][6:0:-4] == 0).all()) +class TestShareBufferOpTranscriber(unittest.TestCase): + def test_program(self): + place = core.Place() + place.set_place(paddle.CPUPlace()) + + new_scope = paddle.static.Scope() + main_program = paddle.static.Program() + with paddle.static.scope_guard(new_scope): + with paddle.static.program_guard(main_program): + x = 
paddle.ones(shape=(100, 2, 3), dtype='float32') + y = paddle.ones(shape=(100, 2, 3), dtype='float32') + + helper = LayerHelper('share_buffer') + helper.append_op( + type="share_buffer", + inputs={"X": x}, + outputs={"Out": y, "XOut": x}, + ) + l = pir.translate_to_new_ir(main_program.desc) + assert ( + l.global_block().ops[2].name() == "pd_op.share_data" + ), "share_buffer should be translated to share_data" + + class TestCheckUnregisteredOp(unittest.TestCase): def test_program(self): main_program = paddle.static.Program() diff --git a/test/ir/test_ir_embedding_eltwise_layernorm_fuse_pass.py b/test/ir/test_ir_embedding_eltwise_layernorm_fuse_pass.py index 4c0b5d5689885b..ce96268f788b4b 100644 --- a/test/ir/test_ir_embedding_eltwise_layernorm_fuse_pass.py +++ b/test/ir/test_ir_embedding_eltwise_layernorm_fuse_pass.py @@ -121,14 +121,13 @@ def setUp(self): self.num_fused_ops = 2 def test_check_output(self): - use_gpu_set = [True] if not core.is_compiled_with_cuda(): return self.pass_attrs = { "embedding_eltwise_layernorm_fuse_pass": {"use_gpu": True} } place = base.CUDAPlace(0) - self.check_output_with_place(place, startup_on_cpu=True) + self.check_output_with_place(place) if __name__ == "__main__": diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 7aa25386076e54..96a15b04ab8a2e 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -521,11 +521,8 @@ set(TEST_OPS_WITH_GC test_lod_reset_op test_lookup_table_op test_mean_op - test_pad2d_op test_scatter_op - test_slice_op - test_space_to_depth_op - test_squared_l2_distance_op) + test_slice_op) foreach(TEST_OP ${TEST_OPS_WITH_GC}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) @@ -854,7 +851,6 @@ set_tests_properties( test_buffer_shared_memory_reuse_pass PROPERTIES LABELS "RUN_TYPE=DIST") set_tests_properties( test_sync_batch_norm_op - test_inplace_abn_op test_parallel_executor_seresnext_base_gpu test_parallel_executor_seresnext_with_reduce_gpu test_parallel_executor_seresnext_with_fuse_all_reduce_gpu @@ -1047,7 +1043,6 @@ set_tests_properties(test_sigmoid_cross_entropy_with_logits_op set_tests_properties(test_imperative_optimizer_v2 PROPERTIES TIMEOUT 150) set_tests_properties(test_partial_sum_op PROPERTIES TIMEOUT 120) set_tests_properties(test_cond PROPERTIES TIMEOUT 120) -set_tests_properties(test_space_to_depth_op PROPERTIES TIMEOUT 200) set_tests_properties(test_sgd_op PROPERTIES TIMEOUT 250) set_tests_properties(test_parallel_executor_seresnext_base_gpu PROPERTIES TIMEOUT 120) @@ -1072,7 +1067,6 @@ set_tests_properties(test_pool2d_op PROPERTIES TIMEOUT 120) set_tests_properties(test_transpose_op PROPERTIES TIMEOUT 120) set_tests_properties(test_activation_op PROPERTIES TIMEOUT 270) set_tests_properties(test_normal PROPERTIES TIMEOUT 120) -set_tests_properties(test_lstmp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_bilinear_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_decoupled_py_reader PROPERTIES TIMEOUT 120) set_tests_properties(test_fuse_bn_act_pass PROPERTIES TIMEOUT 120) @@ -1283,6 +1277,7 @@ set(STATIC_BUILD_TESTS test_adamw_op test_arg_min_max_op test_assign_pos_op + test_batch_norm_op test_bucketize_api test_bincount_op test_c_embedding_op @@ -1290,6 +1285,7 @@ set(STATIC_BUILD_TESTS test_decoupled_py_reader test_eig_op test_eigh_op + test_fake_dequantize_op test_fake_quantize_op test_fetch_lod_tensor_array test_ftrl_op @@ -1410,43 +1406,9 @@ foreach(IR_OP_TEST ${NEW_IR_OP_RELAXED_TESTS}) endif() endforeach() -set(STRIED_TESTS - test_complex_getitem - 
test_complex_grad_accumulated - test_complex_simplenet - test_conv1d_layer - test_conv1d_transpose_layer - test_conv2d_layer - test_diagonal_op - test_imperative_ocr_attention_model - test_imperative_ptb_rnn - test_imperative_ptb_rnn_sorted_gradient - test_initializer - test_inplace - test_real_imag_op - test_reshape_op - test_set_value_op - test_signal - test_slice_op - test_solve_op - test_squeeze_op - test_squeeze2_op - test_unbind_op - test_unsqueeze_op - test_unsqueeze2_op - test_var_base) - -foreach(STRIED_TEST ${STRIED_TESTS}) - py_test_modules(${STRIED_TEST}_with_stride MODULES ${STRIED_TEST} ENVS - FLAGS_use_stride_kernel=true) - set_tests_properties(${STRIED_TEST}_with_stride PROPERTIES TIMEOUT 120) -endforeach() - py_test_modules(test_stride MODULES test_stride ENVS FLAGS_use_stride_kernel=true) -set_tests_properties(test_slice_op_with_stride PROPERTIES TIMEOUT 300) - if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) # These UTs are specially designed for FleetExecutor set_tests_properties( diff --git a/test/legacy_test/auto_parallel_data_unshard.py b/test/legacy_test/auto_parallel_data_unshard.py index 2d8552e9d24c83..3c5a2637240cd6 100644 --- a/test/legacy_test/auto_parallel_data_unshard.py +++ b/test/legacy_test/auto_parallel_data_unshard.py @@ -106,7 +106,7 @@ def create_model(train_program, start_program): desired = input_data[worker_index].reshape(shard_data_np.shape) np.testing.assert_allclose(shard_data_np, desired) - def dp1pp1mp2(self): + def test_dp1pp1mp2(self): def create_model(train_program, start_program): with paddle.static.program_guard(train_program, start_program): MESH_0 = auto.ProcessMesh([0, 1], dim_names=["x"]) @@ -172,7 +172,6 @@ def create_model(train_program, start_program): input_data = np.array(range(8 * 8)).reshape([8, 8]).astype("float32") label_data = np.random.randint(0, 10, [8, 8]).astype("float32") - fetchs = [loss.name, 'input'] loss_np, shard_data_np = exe.run( distributed_main_program, diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py index eae8a200212dfe..77ca5512d2b4fa 100644 --- a/test/legacy_test/op_test.py +++ b/test/legacy_test/op_test.py @@ -1967,7 +1967,7 @@ def check_output_with_place( only_check_prim=False, inplace_atol=None, check_cinn=False, - check_new_ir=False, + check_pir=False, ): core._set_prim_all_enabled(False) core.set_prim_eager_enabled(False) @@ -2538,7 +2538,7 @@ def _is_skip_name(self, name): dygraph_checker.check() dygraph_dygraph_outs = dygraph_checker.outputs - if check_new_ir: + if check_pir: if ( type(place) is paddle.base.libpaddle.CPUPlace or type(place) is paddle.base.libpaddle.CUDAPlace @@ -2657,7 +2657,7 @@ def check_output( inplace_atol=None, check_cinn=False, only_check_prim=False, - check_new_ir=False, + check_pir=False, ): self.__class__.op_type = self.op_type if self.is_mkldnn_op(): @@ -2683,7 +2683,7 @@ def check_output( only_check_prim=only_check_prim, inplace_atol=inplace_atol, check_cinn=check_cinn, - check_new_ir=check_new_ir, + check_pir=check_pir, ) if not res and only_check_prim: continue @@ -2700,7 +2700,7 @@ def check_output( self.check_compile_vs_runtime(fetch_list, outs) def check_output_customized( - self, checker, custom_place=None, check_new_ir=False + self, checker, custom_place=None, check_pir=False ): self.__class__.op_type = self.op_type places = self._get_places() @@ -2711,7 +2711,7 @@ def check_output_customized( outs = [np.array(out) for out in outs] outs.sort(key=len) checker(outs) - if check_new_ir: + if check_pir: with paddle.pir_utils.IrGuard(): outs_p = 
self._calc_new_ir_output(place) outs_p = [outs_p[out] for out in outs_p] @@ -2719,18 +2719,18 @@ def check_output_customized( checker(outs_p[0]) def check_output_with_place_customized( - self, checker, place, check_new_ir=False + self, checker, place, check_pir=False ): outs = self.calc_output(place) outs = [np.array(out) for out in outs] outs.sort(key=len) checker(outs) - if check_new_ir: + if check_pir: with paddle.pir_utils.IrGuard(): outs_p = self._calc_new_ir_output(place) - outs_p = [outs_p[out] for out in outs_p] + outs_p = [outs_p[out][0] for out in outs_p] outs_p.sort(key=len) - checker(outs_p[0]) + checker(outs_p) def _assert_is_close( self, @@ -2867,7 +2867,7 @@ def check_grad( only_check_prim=False, atol=1e-5, check_cinn=False, - check_new_ir=False, + check_pir=False, ): if hasattr(self, "use_custom_device") and self.use_custom_device: check_dygraph = False @@ -2891,7 +2891,7 @@ def check_grad( only_check_prim=only_check_prim, atol=atol, check_cinn=check_cinn, - check_new_ir=check_new_ir, + check_pir=check_pir, ) def check_grad_with_place( @@ -2912,7 +2912,7 @@ def check_grad_with_place( numeric_place=None, atol=1e-5, check_cinn=False, - check_new_ir=False, + check_pir=False, ): if hasattr(self, "use_custom_device") and self.use_custom_device: check_dygraph = False @@ -3126,7 +3126,7 @@ def check_grad_with_place( ) # get pir gradient - if check_new_ir: + if check_pir: if ( type(place) is paddle.base.libpaddle.CPUPlace or type(place) is paddle.base.libpaddle.CUDAPlace diff --git a/test/legacy_test/prim_op_test.py b/test/legacy_test/prim_op_test.py index bf32aefcebeae0..88843a1e550819 100644 --- a/test/legacy_test/prim_op_test.py +++ b/test/legacy_test/prim_op_test.py @@ -235,7 +235,9 @@ def is_bfloat16_type(cls, np_type): def apply_to_static(net, use_cinn): build_strategy = paddle.static.BuildStrategy() build_strategy.build_cinn_pass = use_cinn - return paddle.jit.to_static(net, build_strategy=build_strategy) + return paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) class PrimNet(paddle.nn.Layer): diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index 96af19e4b77d74..d1da7d941a679e 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -27,6 +27,7 @@ from paddle import base, static from paddle.base import Program, core, program_guard from paddle.base.layer_helper import LayerHelper +from paddle.pir_utils import test_with_pir_api @contextmanager @@ -127,10 +128,16 @@ def setUp(self): self.convert_input_output() def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) def init_dtype(self): self.dtype = np.float32 @@ -174,12 +181,10 @@ def setUp(self): self.convert_input_output() def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.006, check_new_ir=True - ) + self.check_grad(['X'], 'Out', max_relative_error=0.006, check_pir=True) def init_dtype(self): self.dtype = np.complex64 @@ -249,10 +254,10 @@ def setUp(self): self.convert_input_output() def test_check_grad(self): - self.check_grad(['X'], 'Out', check_new_ir=True) + self.check_grad(['X'], 'Out', check_pir=True) def 
test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class TestExpm1_Complex64(TestExpm1): @@ -260,10 +265,10 @@ def init_dtype(self): self.dtype = np.complex64 def test_check_grad(self): - self.check_grad(['X'], 'Out', check_new_ir=True) + self.check_grad(['X'], 'Out', check_pir=True) def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class TestExpm1_Complex128(TestExpm1_Complex64): @@ -383,10 +388,19 @@ def init_dtype(self): def if_enable_cinn(self): pass + def test_check_output(self): + self.check_output(check_pir=True) + def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', max_relative_error=0.01, check_prim=True) + self.check_grad( + ['X'], + 'Out', + max_relative_error=0.01, + check_prim=True, + check_pir=True, + ) class TestSigmoid_Complex64(TestSigmoid): @@ -394,7 +408,7 @@ def init_dtype(self): self.dtype = np.complex64 def test_check_output(self): - self.check_output(check_prim=False) + self.check_output(check_prim=False, check_pir=True) def test_check_grad(self): self.check_grad( @@ -402,6 +416,7 @@ def test_check_grad(self): 'Out', max_relative_error=0.006, check_prim=False, + check_pir=True, ) @@ -410,11 +425,7 @@ def init_dtype(self): self.dtype = np.complex128 def test_check_grad(self): - self.check_grad( - ['X'], - 'Out', - check_prim=False, - ) + self.check_grad(['X'], 'Out', check_prim=False, check_pir=True) class TestSigmoid_ZeroDim(TestSigmoid): @@ -455,12 +466,13 @@ def if_enable_cinn(self): def test_check_output(self): place = core.CUDAPlace(0) - # elementwise_pow doesn't support bfloat16, skip check_prim here. - self.check_output_with_place(place, check_prim=True) + self.check_output_with_place(place, check_prim=True, check_pir=True) def test_check_grad(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X'], 'Out', check_prim=True) + self.check_grad_with_place( + place, ['X'], 'Out', check_prim=True, check_pir=True + ) ''' @@ -501,14 +513,25 @@ def if_enable_cinn(self): pass def test_check_output(self): - self.check_output(check_new_ir=True) + if self.dtype == np.complex64 or self.dtype == np.complex128: + self.check_output(check_pir=True) + else: + self.check_output( + check_prim=True, check_pir=True, check_prim_pir=True + ) def test_check_grad(self): # TODO(BeingGod): set `check_prim=True` when `fill_constant` supports `complex` dtype if self.dtype == np.complex64 or self.dtype == np.complex128: - self.check_grad(['X'], 'Out', check_prim=False, check_new_ir=True) + self.check_grad(['X'], 'Out', check_pir=True) else: - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) class TestSilu_ZeroDim(TestSilu): @@ -694,7 +717,7 @@ def setUp(self): self.convert_input_output() def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): if self.dtype == np.float16: @@ -706,14 +729,14 @@ def test_check_grad(self): 'Out', check_prim=False, check_prim_pir=False, - check_new_ir=True, + check_pir=True, ) else: self.check_grad( ['X'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) @@ -1439,7 +1462,7 @@ def test_errors(self): class TestSqrt(TestActivation, TestParameter): def setUp(self): self.op_type = "sqrt" - self.prim_op_type = "prim" + self.prim_op_type = "comp" self.python_api = paddle.sqrt 
self.public_python_api = paddle.sqrt @@ -1461,16 +1484,22 @@ def if_enable_cinn(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_prim=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) def test_check_output(self): - self.check_output() + self.check_output(check_prim=True, check_pir=True, check_prim_pir=True) class TestSqrtPrimFp32(TestActivation): def setUp(self): self.op_type = "sqrt" - self.prim_op_type = "prim" + self.prim_op_type = "comp" self.python_api = paddle.sqrt self.public_python_api = paddle.sqrt self.init_dtype() @@ -1486,10 +1515,16 @@ def setUp(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_prim=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) def test_check_output(self): - self.check_output() + self.check_output(check_pir=True, check_prim_pir=True) def init_dtype(self): self.dtype = np.float32 @@ -1510,7 +1545,7 @@ def init_shape(self): class TestSqrtBF16(OpTest): def setUp(self): self.op_type = "sqrt" - self.prim_op_type = "prim" + self.prim_op_type = "comp" self.python_api = paddle.sqrt self.public_python_api = paddle.sqrt self.init_dtype() @@ -1537,11 +1572,18 @@ def if_enable_cinn(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place) + self.check_output_with_place(place, check_pir=True, check_prim_pir=True) def test_check_grad(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X'], 'Out', check_prim=True) + self.check_grad_with_place( + place, + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) class TestSqrtComp(TestActivation, TestParameter): @@ -1568,10 +1610,22 @@ def if_enable_cinn(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_dygraph=True, check_prim=True) + self.check_grad( + ['X'], + 'Out', + check_dygraph=True, + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) def test_check_output(self): - self.check_output(check_dygraph=True, check_prim=True) + self.check_output( + check_dygraph=True, + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) class TestSqrtCompFp32(TestActivation): @@ -1596,10 +1650,22 @@ def if_enable_cinn(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_dygraph=True, check_prim=True) + self.check_grad( + ['X'], + 'Out', + check_dygraph=True, + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) def test_check_output(self): - self.check_output(check_dygraph=True, check_prim=True) + self.check_output( + check_dygraph=True, + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) def init_dtype(self): self.dtype = np.float32 @@ -1635,9 +1701,7 @@ def if_enable_cinn(self): pass def test_check_output(self): - self.check_output( - check_prim=True, check_new_ir=True, check_prim_pir=True - ) + self.check_output(check_prim=True, check_pir=True, check_prim_pir=True) def test_check_grad(self): if self.dtype == np.float16: @@ -1647,7 +1711,7 @@ def test_check_grad(self): 'Out', max_relative_error=0.0005, check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) @@ -1689,10 +1753,13 @@ def init_shape(self): def if_enable_cinn(self): pass + def test_check_output(self): + self.check_output(check_pir=True) + def test_check_grad(self): if self.dtype == 
np.float16: return - self.check_grad(['X'], 'Out', check_prim=True) + self.check_grad(['X'], 'Out', check_prim=True, check_pir=True) class TestAbs_ZeroDim(TestAbs): @@ -1718,6 +1785,9 @@ def setUp(self): def init_shape(self): self.shape = [10, 12] + def test_check_output(self): + self.check_output(check_pir=True) + # The same reason with TestFloor def test_check_grad(self): pass @@ -1752,6 +1822,9 @@ def init_shape(self): def if_enable_cinn(self): pass + def test_check_output(self): + self.check_output(check_pir=True) + # the gradient on floor, ceil, round is undefined. # we return zero as gradient, but the numpy return nan # The same reason with TestFloor @@ -1770,6 +1843,7 @@ def test_check_grad_for_prim(self): 'Out', check_prim=True, only_check_prim=True, + check_pir=True, ) @@ -1803,6 +1877,9 @@ def setUp(self): def init_shape(self): self.shape = [10, 12] + def test_check_output(self): + self.check_output(check_pir=True) + def test_check_grad(self): if self.dtype == np.float16: return @@ -1810,10 +1887,14 @@ def test_check_grad(self): if self.dtype == np.complex64 or self.dtype == np.complex128: # Complex64 [GPU]: AssertionError: 0.0057843705 not less than or equal to 0.005 self.check_grad( - ['X'], 'Out', check_prim=False, max_relative_error=0.006 + ['X'], + 'Out', + check_prim=False, + max_relative_error=0.006, + check_pir=True, ) else: - self.check_grad(['X'], 'Out', check_prim=True) + self.check_grad(['X'], 'Out', check_prim=True, check_pir=True) def if_enable_cinn(self): pass @@ -2000,14 +2081,22 @@ def setUp(self): def init_shape(self): self.shape = [10, 12] + @test_with_pir_api + def test_out_name(self): + # inherit from `TestParameter` + super().test_out_name() + + def test_check_output(self): + self.check_output(check_pir=True) + def test_check_grad(self): if self.dtype == np.float16: return # TODO(ScottWong98): set `check_prim=False` when `fill_any_like` supports `complex` dtype if self.dtype == np.complex64 or self.dtype == np.complex128: - self.check_grad(['X'], 'Out', check_prim=False) + self.check_grad(['X'], 'Out', check_prim=False, check_pir=True) else: - self.check_grad(['X'], 'Out', check_prim=True) + self.check_grad(['X'], 'Out', check_prim=True, check_pir=True) def if_enable_cinn(self): pass @@ -2230,6 +2319,9 @@ def setUp(self): def init_shape(self): self.shape = [10, 12] + def test_check_output(self): + self.check_output(check_pir=True) + def test_check_grad(self): pass @@ -2262,10 +2354,10 @@ def setUp(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_prim=True) + self.check_grad(['X'], 'Out', check_prim=True, check_pir=True) def test_check_output(self): - self.check_output(check_prim=True) + self.check_output(check_prim=True, check_pir=True) def if_enable_cinn(self): pass @@ -2291,6 +2383,7 @@ def setUp(self): def executed_api(self): self.relu = F.relu + @test_with_pir_api def test_static_api(self): with static_guard(): with paddle.static.program_guard(paddle.static.Program()): @@ -2505,7 +2598,7 @@ def setUp(self): def test_check_output(self): self.check_output( check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=False, ) @@ -2516,7 +2609,7 @@ def test_check_grad(self): ['X'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) @@ -2551,9 +2644,7 @@ def if_enable_cinn(self): pass def test_check_output(self): - self.check_output( - check_prim=True, check_new_ir=True, check_prim_pir=False - ) + self.check_output(check_prim=True, check_pir=True, 
check_prim_pir=False) def test_check_grad(self): if self.dtype == np.float16: @@ -2562,7 +2653,7 @@ def test_check_grad(self): ['X'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) @@ -2589,6 +2680,7 @@ def setUp(self): self.rev_comp_rtol = 1e-8 self.rev_comp_atol = 1e-8 + @test_with_pir_api def test_static_api(self): with static_guard(): with paddle.static.program_guard(paddle.static.Program()): @@ -3244,10 +3336,19 @@ def setUp(self): def if_enable_cinn(self): pass + def test_check_output(self): + self.check_output(check_pir=True) + def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_prim=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) class Test_Log_Op_Fp16(unittest.TestCase): @@ -3573,10 +3674,10 @@ def setUp(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', max_relative_error=0.007) + self.check_grad(['X'], 'Out', max_relative_error=0.007, check_pir=True) def test_check_output(self): - self.check_output() + self.check_output(check_pir=True) class TestSquare_ZeroDim(TestSquare): @@ -3608,11 +3709,13 @@ def init_dtype(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place) + self.check_output_with_place(place, check_pir=True) def test_check_grad(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X'], 'Out', numeric_grad_delta=0.5) + self.check_grad_with_place( + place, ['X'], 'Out', numeric_grad_delta=0.5, check_pir=True + ) class TestPow(TestActivation): @@ -3638,9 +3741,7 @@ def if_enable_cinn(self): pass def test_check_output(self): - self.check_output( - check_prim=True, check_prim_pir=True, check_new_ir=True - ) + self.check_output(check_prim=True, check_prim_pir=True, check_pir=True) def test_check_grad(self): if self.dtype == np.float16: @@ -3650,7 +3751,7 @@ def test_check_grad(self): 'Out', check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -3858,6 +3959,11 @@ def setUp(self): np.random.seed(1024) x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + if self.dtype == np.complex64 or self.dtype == np.complex128: + x = ( + np.random.uniform(-1, 1, self.shape) + + 1j * np.random.uniform(-1, 1, self.shape) + ).astype(self.dtype) out = ref_softplus(x, beta, threshold) self.inputs = {'X': x} self.attrs = {'beta': beta, "threshold": threshold} @@ -3872,6 +3978,19 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') +class TestSoftplus_Complex64(TestSoftplus): + def init_dtype(self): + self.dtype = np.complex64 + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.06) + + +class TestSoftplus_Complex128(TestSoftplus): + def init_dtype(self): + self.dtype = np.complex128 + + class TestSoftplus_ZeroDim(TestSoftplus): def init_shape(self): self.shape = [] @@ -4486,6 +4605,7 @@ def create_test_act_fp16_class( check_prim=False, check_prim_pir=False, enable_cinn=False, + check_pir=False, grad_atol=1e-2, **kwargs ): @@ -4514,6 +4634,7 @@ def test_check_output(self): check_dygraph=check_dygraph, check_prim=check_prim, check_prim_pir=check_prim_pir, + check_pir=check_pir, ) def test_check_grad(self): @@ -4528,6 +4649,7 @@ def test_check_grad(self): check_prim=check_prim, check_prim_pir=check_prim_pir, max_relative_error=grad_atol, + check_pir=check_pir, ) cls_name = "{}_{}".format(parent.__name__, "FP16OP") @@ -4536,10 +4658,16 @@ def test_check_grad(self): 
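+# NOTE: each create_test_act_fp16_class call below registers an FP16 variant +# of the corresponding activation test class; the optional flags +# (check_prim, check_pir, check_prim_pir) opt that op into the prim and +# PIR consistency checks.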
create_test_act_fp16_class(TestActivation) -create_test_act_fp16_class(TestExpFp32_Prim, check_prim=True, enable_cinn=True) +create_test_act_fp16_class( + TestExpFp32_Prim, check_prim=True, enable_cinn=True, check_prim_pir=True +) create_test_act_fp16_class(TestExpm1) -create_test_act_fp16_class(TestSigmoid, check_prim=True, enable_cinn=True) -create_test_act_fp16_class(TestSilu, check_prim=True, enable_cinn=True) +create_test_act_fp16_class( + TestSigmoid, check_prim=True, enable_cinn=True, check_pir=True +) +create_test_act_fp16_class( + TestSilu, check_prim=True, enable_cinn=True, check_prim_pir=True +) create_test_act_fp16_class(TestLogSigmoid) create_test_act_fp16_class( TestTanh, check_prim=True, check_prim_pir=True, enable_cinn=True @@ -4547,31 +4675,51 @@ def test_check_grad(self): create_test_act_fp16_class(TestTanhshrink) create_test_act_fp16_class(TestHardShrink) create_test_act_fp16_class(TestSoftshrink) -create_test_act_fp16_class(TestSqrt, check_prim=True, enable_cinn=True) -create_test_act_fp16_class(TestSqrtComp, check_prim=True, enable_cinn=True) -create_test_act_fp16_class(TestAbs, check_prim=True, enable_cinn=True) -create_test_act_fp16_class(TestCeil, grad_check=False) create_test_act_fp16_class( - TestFloor, check_prim=True, grad_check=False, enable_cinn=True + TestSqrt, + check_prim=True, + enable_cinn=True, + check_pir=True, + check_prim_pir=True, +) +create_test_act_fp16_class( + TestSqrtComp, + check_prim=True, + enable_cinn=True, + check_pir=True, + check_prim_pir=True, +) +create_test_act_fp16_class( + TestAbs, check_prim=True, enable_cinn=True, check_pir=True +) +create_test_act_fp16_class(TestCeil, grad_check=False, check_pir=True) +create_test_act_fp16_class( + TestFloor, + check_prim=True, + grad_check=False, + enable_cinn=True, + check_pir=True, ) -create_test_act_fp16_class(TestCos) +create_test_act_fp16_class(TestCos, check_pir=True) create_test_act_fp16_class(TestTan) create_test_act_fp16_class(TestCosh) create_test_act_fp16_class(TestAcos) -create_test_act_fp16_class(TestSin) +create_test_act_fp16_class(TestSin, check_pir=True) create_test_act_fp16_class(TestSinh) create_test_act_fp16_class(TestAsin) create_test_act_fp16_class(TestAtan) create_test_act_fp16_class(TestAcosh) create_test_act_fp16_class(TestAsinh) create_test_act_fp16_class(TestAtanh) -create_test_act_fp16_class(TestRound, grad_check=False) -create_test_act_fp16_class(TestRelu, check_prim=True, enable_cinn=True) +create_test_act_fp16_class(TestRound, grad_check=False, check_pir=True) +create_test_act_fp16_class( + TestRelu, check_prim=True, enable_cinn=True, check_pir=True +) create_test_act_fp16_class( TestGelu, check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, enable_cinn=True, rev_comp_rtol=1e-3, rev_comp_atol=1e-3, @@ -4584,14 +4732,14 @@ def test_check_grad(self): create_test_act_fp16_class(TestELU) create_test_act_fp16_class(TestCELU) create_test_act_fp16_class(TestReciprocal) -create_test_act_fp16_class(TestLog, check_prim=True) +create_test_act_fp16_class(TestLog, check_prim=True, check_pir=True) if core.is_compiled_with_rocm(): create_test_act_fp16_class(TestLog2) else: create_test_act_fp16_class(TestLog2) create_test_act_fp16_class(TestLog10) create_test_act_fp16_class(TestLog1p) -create_test_act_fp16_class(TestSquare) +create_test_act_fp16_class(TestSquare, check_pir=True) create_test_act_fp16_class(TestPow, check_prim=True, check_prim_pir=True) create_test_act_fp16_class(TestPow_API) create_test_act_fp16_class(TestSTanh) @@ -4617,7 +4765,7 @@ def 
test_check_grad(self): TestRsqrt, check_prim=True, enable_cinn=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) @@ -4629,6 +4777,7 @@ def create_test_act_bf16_class( check_dygraph=True, check_prim=False, enable_cinn=False, + check_pir=False, grad_atol=1e-2, **kwargs ): @@ -4657,7 +4806,10 @@ def convert_input_output(self): def test_check_output(self): place = core.CUDAPlace(0) self.check_output_with_place( - place, atol=atol, check_prim=check_prim + place, + atol=atol, + check_prim=check_prim, + check_pir=check_pir, ) def test_check_grad(self): @@ -4669,6 +4821,7 @@ def test_check_grad(self): 'Out', max_relative_error=grad_atol, check_prim=check_prim, + check_pir=check_pir, ) cls_name = "{}_{}".format(parent.__name__, "BF16OP") @@ -4677,37 +4830,45 @@ def test_check_grad(self): create_test_act_bf16_class(TestActivation) -create_test_act_bf16_class(TestExpFp32_Prim, check_prim=True) +create_test_act_bf16_class( + TestExpFp32_Prim, check_prim=True, check_prim_pir=True +) create_test_act_bf16_class(TestExpm1) -create_test_act_bf16_class(TestSigmoid, check_prim=True) -create_test_act_bf16_class(TestSilu, check_prim=True) +create_test_act_bf16_class(TestSigmoid, check_prim=True, check_pir=True) +create_test_act_bf16_class(TestSilu, check_prim=True, check_prim_pir=True) create_test_act_bf16_class(TestLogSigmoid) -create_test_act_bf16_class(TestTanh, check_prim=True) +create_test_act_bf16_class(TestTanh, check_prim=True, check_prim_pir=True) create_test_act_bf16_class(TestTanhshrink) create_test_act_bf16_class(TestHardShrink) create_test_act_bf16_class(TestSoftshrink) -create_test_act_bf16_class(TestSqrt, check_prim=True) -create_test_act_bf16_class(TestSqrtComp, check_prim=True) -create_test_act_bf16_class(TestAbs, check_prim=True) -create_test_act_bf16_class(TestCeil, grad_check=False) -create_test_act_bf16_class(TestFloor, grad_check=False, check_prim=True) -create_test_act_bf16_class(TestCos) +create_test_act_bf16_class( + TestSqrt, check_prim=True, check_pir=True, check_prim_pir=True +) +create_test_act_bf16_class( + TestSqrtComp, check_prim=True, check_pir=True, check_prim_pir=True +) +create_test_act_bf16_class(TestAbs, check_prim=True, check_pir=True) +create_test_act_bf16_class(TestCeil, grad_check=False, check_pir=True) +create_test_act_bf16_class( + TestFloor, grad_check=False, check_prim=True, check_pir=True +) +create_test_act_bf16_class(TestCos, check_pir=True) create_test_act_bf16_class(TestTan) create_test_act_bf16_class(TestCosh) create_test_act_bf16_class(TestAcos) -create_test_act_bf16_class(TestSin) +create_test_act_bf16_class(TestSin, check_pir=True) create_test_act_bf16_class(TestSinh) create_test_act_bf16_class(TestAsin) create_test_act_bf16_class(TestAtan) create_test_act_bf16_class(TestAcosh) create_test_act_bf16_class(TestAsinh) create_test_act_bf16_class(TestAtanh) -create_test_act_bf16_class(TestRound, grad_check=False) -create_test_act_bf16_class(TestRelu, check_prim=True) +create_test_act_bf16_class(TestRound, grad_check=False, check_pir=True) +create_test_act_bf16_class(TestRelu, check_prim=True, check_pir=True) create_test_act_bf16_class( TestGelu, check_prim=True, - check_new_ir=True, + check_pir=True, rev_comp_rtol=1e-2, rev_comp_atol=1e-2, cinn_rtol=1e-2, @@ -4719,14 +4880,14 @@ def test_check_grad(self): create_test_act_bf16_class(TestELU) create_test_act_bf16_class(TestCELU) create_test_act_bf16_class(TestReciprocal) -create_test_act_bf16_class(TestLog, check_prim=True) +create_test_act_bf16_class(TestLog, check_prim=True, check_pir=True) if 
core.is_compiled_with_rocm(): create_test_act_bf16_class(TestLog2) else: create_test_act_bf16_class(TestLog2) create_test_act_bf16_class(TestLog10) create_test_act_bf16_class(TestLog1p) -create_test_act_bf16_class(TestSquare) +create_test_act_bf16_class(TestSquare, check_pir=True) create_test_act_bf16_class(TestPow, check_prim=True) create_test_act_bf16_class(TestPow_API) create_test_act_bf16_class(TestSTanh) @@ -4743,7 +4904,7 @@ def test_check_grad(self): create_test_act_bf16_class(TestLeakyReluAlpha3, check_prim=True) create_test_act_bf16_class(TestLeakyRelu_ZeroDim, check_prim=True) create_test_act_bf16_class( - TestRsqrt, check_prim=True, check_new_ir=True, check_prim_pir=True + TestRsqrt, check_prim=True, check_pir=True, check_prim_pir=True ) if __name__ == "__main__": diff --git a/test/legacy_test/test_adaptive_avg_pool2d.py b/test/legacy_test/test_adaptive_avg_pool2d.py index 9c6c0c96287a45..137e943fa5e892 100644 --- a/test/legacy_test/test_adaptive_avg_pool2d.py +++ b/test/legacy_test/test_adaptive_avg_pool2d.py @@ -19,8 +19,8 @@ from test_attribute_var import UnittestBase import paddle -from paddle import base from paddle.base import Program, core, program_guard +from paddle.pir_utils import test_with_pir_api def adaptive_start_index(index, input_size, output_size): @@ -113,37 +113,45 @@ def setUp(self): x=self.x_np, output_size=[None, 3], pool_type="avg" ) + @test_with_pir_api def test_static_graph(self): for use_cuda in ( [False, True] if core.is_compiled_with_cuda() else [False] ): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x = paddle.static.data( - name="x", shape=[2, 3, 7, 7], dtype="float32" - ) - out_1 = paddle.nn.functional.adaptive_avg_pool2d( - x=x, output_size=[3, 3] - ) + main_program = paddle.static.Program() + startup_program = paddle.static.Program() - out_2 = paddle.nn.functional.adaptive_avg_pool2d(x=x, output_size=5) + with paddle.static.program_guard(main_program, startup_program): + x = paddle.static.data( + name="x", shape=[2, 3, 7, 7], dtype="float32" + ) - out_3 = paddle.nn.functional.adaptive_avg_pool2d( - x=x, output_size=[2, 5] - ) + out_1 = paddle.nn.functional.adaptive_avg_pool2d( + x=x, output_size=[3, 3] + ) - out_4 = paddle.nn.functional.adaptive_avg_pool2d( - x=x, output_size=[3, 3], data_format="NHWC" - ) + out_2 = paddle.nn.functional.adaptive_avg_pool2d( + x=x, output_size=5 + ) - out_5 = paddle.nn.functional.adaptive_avg_pool2d( - x=x, output_size=[None, 3] - ) + out_3 = paddle.nn.functional.adaptive_avg_pool2d( + x=x, output_size=[2, 5] + ) + + out_4 = paddle.nn.functional.adaptive_avg_pool2d( + x=x, output_size=[3, 3], data_format="NHWC" + ) + + out_5 = paddle.nn.functional.adaptive_avg_pool2d( + x=x, output_size=[None, 3] + ) exe = paddle.static.Executor(place=place) [res_1, res_2, res_3, res_4, res_5] = exe.run( - base.default_main_program(), + main_program, feed={"x": self.x_np}, fetch_list=[out_1, out_2, out_3, out_4, out_5], ) @@ -232,38 +240,47 @@ def setUp(self): x=self.x_np, output_size=[None, 3], pool_type="avg" ) + @test_with_pir_api def test_static_graph(self): for use_cuda in ( [False, True] if core.is_compiled_with_cuda() else [False] ): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x = paddle.static.data( - name="x", shape=[2, 3, 7, 7], dtype="float32" - ) + main_program = paddle.static.Program() + startup_program = paddle.static.Program() - adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=[3, 3]) - out_1 = adaptive_avg_pool(x=x) + 
with paddle.static.program_guard(main_program, startup_program): + x = paddle.static.data( + name="x", shape=[2, 3, 7, 7], dtype="float32" + ) - adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=5) - out_2 = adaptive_avg_pool(x=x) + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D( + output_size=[3, 3] + ) + out_1 = adaptive_avg_pool(x=x) - adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=[2, 5]) - out_3 = adaptive_avg_pool(x=x) + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=5) + out_2 = adaptive_avg_pool(x=x) - adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D( - output_size=[3, 3], data_format="NHWC" - ) - out_4 = adaptive_avg_pool(x=x) + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D( + output_size=[2, 5] + ) + out_3 = adaptive_avg_pool(x=x) - adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D( - output_size=[None, 3] - ) - out_5 = adaptive_avg_pool(x=x) + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D( + output_size=[3, 3], data_format="NHWC" + ) + out_4 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D( + output_size=[None, 3] + ) + out_5 = adaptive_avg_pool(x=x) exe = paddle.static.Executor(place=place) [res_1, res_2, res_3, res_4, res_5] = exe.run( - base.default_main_program(), + main_program, feed={"x": self.x_np}, fetch_list=[out_1, out_2, out_3, out_4, out_5], ) diff --git a/test/legacy_test/test_allclose_op.py b/test/legacy_test/test_allclose_op.py index 754a5c81509794..474f3edb3063f4 100644 --- a/test/legacy_test/test_allclose_op.py +++ b/test/legacy_test/test_allclose_op.py @@ -19,6 +19,7 @@ import paddle from paddle.base import core +from paddle.pir_utils import test_with_pir_api class TestAllcloseOp(OpTest): @@ -53,7 +54,7 @@ def setUp(self): } def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class TestAllcloseOpException(TestAllcloseOp): @@ -61,28 +62,28 @@ def test_check_output(self): def test_rtol_num(): self.inputs['Rtol'] = np.array([1e-05, 1e-05]).astype("float64") self.inputs['Atol'] = np.array([1e-08]).astype("float64") - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) self.assertRaises(ValueError, test_rtol_num) def test_rtol_type(): self.inputs['Rtol'] = np.array([5]).astype("int32") self.inputs['Atol'] = np.array([1e-08]).astype("float64") - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) self.assertRaises(ValueError, test_rtol_type) def test_atol_num(): self.inputs['Rtol'] = np.array([1e-05]).astype("float64") self.inputs['Atol'] = np.array([1e-08, 1e-08]).astype("float64") - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) self.assertRaises(ValueError, test_atol_num) def test_atol_type(): self.inputs['Rtol'] = np.array([1e-05]).astype("float64") self.inputs['Atol'] = np.array([8]).astype("int32") - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) self.assertRaises(ValueError, test_atol_type) @@ -174,6 +175,7 @@ def test_equal_nan(): class TestAllcloseOpFp16(unittest.TestCase): + @test_with_pir_api def test_fp16(self): x_data = np.random.rand(10, 10).astype('float16') y_data = np.random.rand(10, 10).astype('float16') @@ -200,7 +202,7 @@ def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) class TestAllcloseOpFloat32(TestAllcloseOp): diff --git 
a/test/legacy_test/test_arange.py b/test/legacy_test/test_arange.py index d22ec561e00012..e71402518696ba 100644 --- a/test/legacy_test/test_arange.py +++ b/test/legacy_test/test_arange.py @@ -48,7 +48,7 @@ def init_config(self): self.case = (0, 1, 0.2) def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class TestFloatArangeOp(TestArangeOp): @@ -65,7 +65,7 @@ def init_config(self): self.case = (0, 5, 1) def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) @unittest.skipIf( @@ -99,7 +99,7 @@ def init_config(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) class TestInt32ArangeOp(TestArangeOp): diff --git a/test/legacy_test/test_arg_min_max_op.py b/test/legacy_test/test_arg_min_max_op.py index 09425be02fc53e..ede4a54a244ed5 100644 --- a/test/legacy_test/test_arg_min_max_op.py +++ b/test/legacy_test/test_arg_min_max_op.py @@ -42,7 +42,7 @@ def setUp(self): self.outputs = {'Out': np.argmax(self.x, axis=self.axis)} def test_check_output(self): - self.check_output(check_cinn=True) + self.check_output(check_cinn=True, check_pir=True) class TestCase0(BaseTestCase): @@ -122,7 +122,7 @@ def setUp(self): self.outputs = {'Out': np.argmax(x, axis=self.axis)} def test_check_output(self): - self.check_output_with_place(paddle.CUDAPlace(0)) + self.check_output_with_place(paddle.CUDAPlace(0), check_pir=True) class TestArgMaxBF16OP(TestArgMinBF16OP): diff --git a/test/legacy_test/test_assign_op.py b/test/legacy_test/test_assign_op.py index 4a9ff9308f7b82..50f9e5e0548694 100644 --- a/test/legacy_test/test_assign_op.py +++ b/test/legacy_test/test_assign_op.py @@ -24,6 +24,7 @@ from paddle import base from paddle.base import Program, core, program_guard from paddle.base.backward import append_backward +from paddle.pir_utils import test_with_pir_api class TestAssignOp(op_test.OpTest): @@ -42,12 +43,12 @@ def init_input_configs(self): def test_forward(self): paddle.enable_static() - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) paddle.disable_static() def test_backward(self): paddle.enable_static() - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad(['X'], 'Out', check_prim=True, check_pir=True) paddle.disable_static() @@ -71,12 +72,12 @@ def setUp(self): def test_forward(self): paddle.enable_static() - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) paddle.disable_static() def test_backward(self): paddle.enable_static() - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad(['X'], 'Out', check_prim=True, check_pir=True) paddle.disable_static() @@ -97,12 +98,12 @@ def setUp(self): def test_forward(self): paddle.enable_static() - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) paddle.disable_static() def test_backward(self): paddle.enable_static() - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad(['X'], 'Out', check_prim=True, check_pir=True) paddle.disable_static() @@ -275,9 +276,12 @@ def test_assign_bfp16(self): class TestAssignOpErrorApi(unittest.TestCase): + @test_with_pir_api def test_errors(self): paddle.enable_static() - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): # The type of input must be Variable or numpy.ndarray. 
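# [Editor's note] Several test files in this patch import test_with_pir_api from
# paddle.pir_utils and apply it to static-graph tests so each test body runs under
# both the legacy IR and the new PIR. A minimal sketch of what such a decorator
# could look like is below, assuming paddle.pir_utils.IrGuard (used elsewhere in
# this patch) as the mode switch; the name run_under_both_irs and the exact
# mechanics are illustrative assumptions, not Paddle's actual implementation.
import functools

import paddle


def run_under_both_irs(test_func):  # hypothetical stand-in for test_with_pir_api
    @functools.wraps(test_func)
    def wrapper(*args, **kwargs):
        test_func(*args, **kwargs)  # first pass: legacy program IR
        with paddle.pir_utils.IrGuard():  # second pass: PIR enabled
            test_func(*args, **kwargs)

    return wrapper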
x1 = base.create_lod_tensor( np.array([[-1]]), [[1]], base.CPUPlace() @@ -288,9 +292,12 @@ def test_errors(self): self.assertRaises(TypeError, paddle.assign, x2) paddle.disable_static() + @test_with_pir_api def test_type_error(self): paddle.enable_static() - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x = [paddle.randn([3, 3]), paddle.randn([3, 3])] # not support to assign list(var) self.assertRaises(TypeError, paddle.assign, x) diff --git a/test/legacy_test/test_assign_value_op.py b/test/legacy_test/test_assign_value_op.py index b0963b51b24856..6ff4282d9fc553 100644 --- a/test/legacy_test/test_assign_value_op.py +++ b/test/legacy_test/test_assign_value_op.py @@ -54,7 +54,7 @@ def init_data(self): self.attrs["fp32_values"] = [float(v) for v in self.value.flat] def test_forward(self): - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) class TestAssignValueOp2(TestAssignValueOp): @@ -105,6 +105,18 @@ def test_assign(self): np.testing.assert_array_equal(fetched_x, self.value) self.assertEqual(fetched_x.dtype, self.value.dtype) + def test_pir_assign(self): + with paddle.pir_utils.IrGuard(): + main_program = paddle.pir.Program() + with paddle.static.program_guard(main_program): + x = paddle.zeros(shape=[1], dtype=self.dtype) + paddle.assign(self.value, output=x) + + exe = base.Executor(self.place) + [fetched_x] = exe.run(main_program, feed={}, fetch_list=[x]) + np.testing.assert_array_equal(fetched_x, self.value) + self.assertEqual(fetched_x.dtype, self.value.dtype) + class TestAssignApi2(TestAssignApi): def init_dtype(self): diff --git a/test/legacy_test/test_auto_parallel_partitioner.py b/test/legacy_test/test_auto_parallel_partitioner.py index 43039d093a8ad0..1a0d70c232b366 100644 --- a/test/legacy_test/test_auto_parallel_partitioner.py +++ b/test/legacy_test/test_auto_parallel_partitioner.py @@ -46,6 +46,7 @@ def get_programs(annotated_func): complete_train_program = completer.complete_forward_annotation( train_program ) + dist_context.block_state.parse_forward_blocks(complete_train_program) rank_id = 3 @@ -212,14 +213,6 @@ def check_equal_dist_op_attr( ) if tensor_dims_mapping != out_var_dims_mapping: equal = False - dist_op_process_mesh = op_dist_attr.process_mesh - dist_op_impl_idx = op_dist_attr.impl_idx - if ( - serial_op.desc.id() == dist_ops[i].desc.id() - or serial_process_mesh != dist_op_process_mesh - or serial_impl_idx != dist_op_impl_idx - ): - equal = False return equal @@ -879,6 +872,7 @@ def test_attn_mp(self): # check distribured attr for dist op serial_op_idx = [0, 4, 6, 18] dist_op_idx = [[0, 1], [4, 5], [6, 7], [18, 19]] + self.assertTrue( distributed_attr_check_for_dist_op( serial_main_prog, @@ -985,6 +979,7 @@ def test_attn_dp_mp(self): # check distribured attr for dist op serial_op_idx = [0, 4, 6, 18] dist_op_idx = [[0, 1], [4, 5], [6, 7], [18, 19]] + self.assertTrue( distributed_attr_check_for_dist_op( serial_main_prog, diff --git a/test/legacy_test/test_batch_norm_op.py b/test/legacy_test/test_batch_norm_op.py index cfbb33c2a29338..284826d7b4e530 100644 --- a/test/legacy_test/test_batch_norm_op.py +++ b/test/legacy_test/test_batch_norm_op.py @@ -28,6 +28,7 @@ from paddle import base from paddle.base import Program, core, program_guard from paddle.base.framework import grad_var_name +from paddle.pir_utils import test_with_pir_api _set_use_system_allocator(True) @@ -857,6 +858,7 @@ def compute(x, is_test, 
trainable_statistics): y2 = compute(x, True, True) np.testing.assert_allclose(y1, y2, rtol=1e-05) + @test_with_pir_api def test_static(self): places = [base.CPUPlace()] if core.is_compiled_with_cuda(): @@ -866,7 +868,9 @@ def test_static(self): shape = [4, 10, 16, 16] def compute(x_np, is_test, trainable_statistics): - with program_guard(Program(), Program()): + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): bn = paddle.nn.BatchNorm( shape[1], is_test=is_test, @@ -876,7 +880,7 @@ def compute(x_np, is_test, trainable_statistics): name='x', shape=x_np.shape, dtype=x_np.dtype ) y = bn(x) - exe.run(base.default_startup_program()) + exe.run(startup_program) r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] return r @@ -887,8 +891,11 @@ def compute(x_np, is_test, trainable_statistics): class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): + @test_with_pir_api def test_reservespace(self): - with program_guard(Program(), Program()): + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): paddle.enable_static() x = np.random.random(size=(3, 10, 3, 7)).astype('float32') x = paddle.static.data(name='x', shape=x.shape, dtype=x.dtype) diff --git a/test/legacy_test/test_batch_norm_op_v2.py b/test/legacy_test/test_batch_norm_op_v2.py index b53bfb9e73373d..639011460b102c 100644 --- a/test/legacy_test/test_batch_norm_op_v2.py +++ b/test/legacy_test/test_batch_norm_op_v2.py @@ -18,7 +18,8 @@ import paddle from paddle import base -from paddle.base import Program, core, program_guard +from paddle.base import core +from paddle.pir_utils import test_with_pir_api class TestBatchNorm(unittest.TestCase): @@ -210,6 +211,7 @@ def compute_v4(x): np.testing.assert_allclose(y1, y2, rtol=1e-05) np.testing.assert_allclose(y3, y4, rtol=1e-05) + @test_with_pir_api def test_static(self): places = [base.CPUPlace()] if core.is_compiled_with_cuda(): @@ -219,7 +221,9 @@ def test_static(self): shape = [4, 10, 16, 16] def compute_v1(x_np, is_test, trainable_statistics): - with program_guard(Program(), Program()): + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with base.program_guard(main_program, startup_program): bn = paddle.nn.BatchNorm( shape[1], is_test=is_test, @@ -229,18 +233,20 @@ def compute_v1(x_np, is_test, trainable_statistics): name='x', shape=x_np.shape, dtype=x_np.dtype ) y = bn(x) - exe.run(base.default_startup_program()) + exe.run(startup_program) r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] return r def compute_v2(x_np): - with program_guard(Program(), Program()): + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with base.program_guard(main_program, startup_program): bn = paddle.nn.BatchNorm2D(shape[1]) x = paddle.static.data( name='x', shape=x_np.shape, dtype=x_np.dtype ) y = bn(x) - exe.run(base.default_startup_program()) + exe.run(startup_program) r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] return r diff --git a/test/legacy_test/test_bitwise_op.py b/test/legacy_test/test_bitwise_op.py index a5040b434b260a..21a7abe812ad7a 100644 --- a/test/legacy_test/test_bitwise_op.py +++ b/test/legacy_test/test_bitwise_op.py @@ -43,7 +43,7 @@ def setUp(self): self.outputs = {'Out': out} def test_check_output(self): - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) def 
test_check_grad(self): pass diff --git a/test/legacy_test/test_calc_gradient.py b/test/legacy_test/test_calc_gradient.py index 945acf18bb9320..41f3772260c771 100644 --- a/test/legacy_test/test_calc_gradient.py +++ b/test/legacy_test/test_calc_gradient.py @@ -85,7 +85,11 @@ def test2(self): self.assertEqual(12, out[0]) +from paddle.pir_utils import test_with_pir_api + + class TestGradientWithPrune(unittest.TestCase): + @test_with_pir_api def test_prune(self): with paddle.base.scope_guard(paddle.static.Scope()): x = paddle.static.data(name='x', shape=[3], dtype='float32') @@ -95,8 +99,8 @@ def test_prune(self): x1_grad = base.gradients(y, x) exe = base.Executor(base.CPUPlace()) - main = base.default_main_program() - exe.run(base.default_startup_program()) + main = paddle.static.default_main_program() + exe.run(paddle.static.default_startup_program()) out = exe.run( main, feed={'x': np.ones([3]).astype('float32')}, diff --git a/test/legacy_test/test_cast_op.py b/test/legacy_test/test_cast_op.py index 79a8926162fa40..d9999aad5f9ddc 100644 --- a/test/legacy_test/test_cast_op.py +++ b/test/legacy_test/test_cast_op.py @@ -52,7 +52,7 @@ def init_shapes(self): self.input_shape = [10, 10] def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_grad(self): self.check_grad( @@ -60,7 +60,7 @@ def test_grad(self): ['Out'], check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -84,7 +84,7 @@ def setUp(self): self.public_python_api = cast_wrapper def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_grad(self): self.check_grad( @@ -92,7 +92,7 @@ def test_grad(self): ['Out'], check_prim=True, only_check_prim=True, - check_new_ir=True, + check_pir=True, ) @@ -111,7 +111,7 @@ def setUp(self): self.public_python_api = cast_wrapper def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_grad(self): self.check_grad( @@ -119,7 +119,7 @@ def test_grad(self): ['Out'], check_prim=True, only_check_prim=True, - check_new_ir=True, + check_pir=True, ) @@ -146,7 +146,7 @@ def if_enable_cinn(self): self.enable_cinn = False def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_grad(self): self.check_grad( @@ -154,7 +154,7 @@ def test_grad(self): ['Out'], check_prim=True, only_check_prim=True, - check_new_ir=True, + check_pir=True, ) @@ -181,7 +181,7 @@ def if_enable_cinn(self): self.enable_cinn = False def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_grad(self): self.check_grad( @@ -189,7 +189,7 @@ def test_grad(self): ['Out'], check_prim=True, only_check_prim=True, - check_new_ir=True, + check_pir=True, ) diff --git a/test/legacy_test/test_center_loss.py b/test/legacy_test/test_center_loss.py deleted file mode 100644 index 31863cd93f7670..00000000000000 --- a/test/legacy_test/test_center_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - -import paddle - - -class TestCenterLossOp(OpTest): - def setUp(self): - self.op_type = "center_loss" - self.dtype = np.float64 - self.init_dtype_type() - batch_size = 12 - feet_dim = 10 - cluster_num = 8 - self.attrs = {} - self.attrs['cluster_num'] = cluster_num - self.attrs['lambda'] = 0.1 - self.config() - self.attrs['need_update'] = self.need_update - labels = np.random.randint(cluster_num, size=batch_size, dtype='int64') - feat = np.random.random((batch_size, feet_dim)).astype(np.float64) - centers = np.random.random((cluster_num, feet_dim)).astype(np.float64) - var_sum = np.zeros((cluster_num, feet_dim), dtype=np.float64) - centers_select = centers[labels] - output = feat - centers_select - diff_square = np.square(output).reshape(batch_size, feet_dim) - loss = 0.5 * np.sum(diff_square, axis=1).reshape(batch_size, 1) - cout = [] - for i in range(cluster_num): - cout.append(0) - for i in range(batch_size): - cout[labels[i]] += 1 - var_sum[labels[i]] += output[i] - for i in range(cluster_num): - var_sum[i] /= 1 + cout[i] - var_sum *= 0.1 - result = centers + var_sum - rate = np.array([0.1]).astype(np.float64) - - self.inputs = { - 'X': feat, - 'Label': labels, - 'Centers': centers, - 'CenterUpdateRate': rate, - } - - if self.need_update: - self.outputs = { - 'SampleCenterDiff': output, - 'Loss': loss, - 'CentersOut': result, - } - else: - self.outputs = { - 'SampleCenterDiff': output, - 'Loss': loss, - 'CentersOut': centers, - } - - def config(self): - self.need_update = True - - def init_dtype_type(self): - pass - - def test_check_output(self): - self.check_output(check_dygraph=False) - - def test_check_grad(self): - self.check_grad(['X'], 'Loss', check_dygraph=False) - - -class TestCenterLossOpNoUpdate(TestCenterLossOp): - def config(self): - self.need_update = False - - -if __name__ == "__main__": - paddle.enable_static() - unittest.main() diff --git a/test/legacy_test/test_clip_op.py b/test/legacy_test/test_clip_op.py index 1c8366bbdf5efc..1fad87de2d1dce 100644 --- a/test/legacy_test/test_clip_op.py +++ b/test/legacy_test/test_clip_op.py @@ -20,6 +20,7 @@ import paddle from paddle import base from paddle.base import Program, core, program_guard +from paddle.pir_utils import test_with_pir_api class TestClipOp(OpTest): @@ -55,12 +56,12 @@ def setUp(self): def test_check_output(self): paddle.enable_static() - self.check_output(check_cinn=self.check_cinn) + self.check_output(check_cinn=self.check_cinn, check_pir=True) paddle.disable_static() def test_check_grad_normal(self): paddle.enable_static() - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_pir=True) paddle.disable_static() def initTestCase(self): @@ -194,14 +195,14 @@ def test_check_output(self): if paddle.is_compiled_with_cuda(): place = paddle.CUDAPlace(0) paddle.enable_static() - self.check_output_with_place(place) + self.check_output_with_place(place, check_pir=True) paddle.disable_static() def test_check_grad_normal(self): if paddle.is_compiled_with_cuda(): place = paddle.CUDAPlace(0) paddle.enable_static() - 
self.check_grad_with_place(place, ['X'], 'Out') + self.check_grad_with_place(place, ['X'], 'Out', check_pir=True) paddle.disable_static() def initTestCase(self): @@ -266,16 +267,11 @@ class TestClipAPI(unittest.TestCase): def _executed_api(self, x, min=None, max=None): return paddle.clip(x, min, max) + @test_with_pir_api def test_clip(self): paddle.enable_static() data_shape = [1, 9, 9, 4] data = np.random.random(data_shape).astype('float32') - images = paddle.static.data( - name='image', shape=data_shape, dtype='float32' - ) - min = paddle.static.data(name='min', shape=[1], dtype='float32') - max = paddle.static.data(name='max', shape=[1], dtype='float32') - place = ( base.CUDAPlace(0) if base.core.is_compiled_with_cuda() @@ -283,23 +279,31 @@ def test_clip(self): ) exe = base.Executor(place) - out_1 = self._executed_api(images, min=min, max=max) - out_2 = self._executed_api(images, min=0.2, max=0.9) - out_3 = self._executed_api(images, min=0.3) - out_4 = self._executed_api(images, max=0.7) - out_5 = self._executed_api(images, min=min) - out_6 = self._executed_api(images, max=max) - out_7 = self._executed_api(images, max=-1.0) - out_8 = self._executed_api(images) - out_9 = self._executed_api( - paddle.cast(images, 'float64'), min=0.2, max=0.9 - ) - out_10 = self._executed_api( - paddle.cast(images * 10, 'int32'), min=2, max=8 - ) - out_11 = self._executed_api( - paddle.cast(images * 10, 'int64'), min=2, max=8 - ) + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + images = paddle.static.data( + name='image', shape=data_shape, dtype='float32' + ) + min = paddle.static.data(name='min', shape=[1], dtype='float32') + max = paddle.static.data(name='max', shape=[1], dtype='float32') + out_1 = self._executed_api(images, min=min, max=max) + out_2 = self._executed_api(images, min=0.2, max=0.9) + out_3 = self._executed_api(images, min=0.3) + out_4 = self._executed_api(images, max=0.7) + out_5 = self._executed_api(images, min=min) + out_6 = self._executed_api(images, max=max) + out_7 = self._executed_api(images, max=-1.0) + out_8 = self._executed_api(images) + out_9 = self._executed_api( + paddle.cast(images, 'float64'), min=0.2, max=0.9 + ) + out_10 = self._executed_api( + paddle.cast(images * 10, 'int32'), min=2, max=8 + ) + out_11 = self._executed_api( + paddle.cast(images * 10, 'int64'), min=2, max=8 + ) ( res1, @@ -314,7 +318,7 @@ def test_clip(self): res10, res11, ) = exe.run( - base.default_main_program(), + main, feed={ "image": data, "min": np.array([0.2]).astype('float32'), @@ -430,6 +434,7 @@ def test_errors(self): class TestClipOpFp16(unittest.TestCase): + @test_with_pir_api def test_fp16(self): paddle.enable_static() data_shape = [1, 9, 9, 4] diff --git a/test/legacy_test/test_collective_api_base.py b/test/legacy_test/test_collective_api_base.py index 08de4a1be9a322..a431d77cdfe713 100644 --- a/test/legacy_test/test_collective_api_base.py +++ b/test/legacy_test/test_collective_api_base.py @@ -359,6 +359,7 @@ def check_with_place( "PATH_ID": path_id, "DTYPE": dtype, "REDUCE_TYPE": str(reduce_type), + "FLAGS_dynamic_static_unified_comm": "0", } required_envs.update(additional_envs) required_envs.update(need_envs) @@ -608,16 +609,23 @@ def convertbf16(origin): send_ptr2 = send_ptr2 + global_expert_count2[idx] result1 = [] result2 = [] + + def is_empty_list(x): + if isinstance(x, list) and len(x) == 0: + return True + return False + for i in range(tot_expert): for arr in output1[i]: - if arr == []: + if
is_empty_list(arr): continue result1.append(arr) for i in range(tot_expert): for arr in output2[i]: - if arr == []: + if is_empty_list(arr): continue result2.append(arr) + if result1 == []: output1 = np.array([]) else: diff --git a/test/legacy_test/test_collective_base.py b/test/legacy_test/test_collective_base.py index 9d3a602b8d051a..544cee3ac0e7ec 100644 --- a/test/legacy_test/test_collective_base.py +++ b/test/legacy_test/test_collective_base.py @@ -266,7 +266,7 @@ def check_with_place( "LD_PRELOAD": os.getenv("LD_PRELOAD", ""), "GLOG_v": "3", "NCCL_P2P_DISABLE": "1", - "Flags_dynamic_static_unified_comm": "False", + "FLAGS_dynamic_static_unified_comm": "0", "DTYPE": "float32", } required_envs.update(need_envs) diff --git a/test/legacy_test/test_compare_op.py b/test/legacy_test/test_compare_op.py index 2bae19d180e2c6..91dce088ef88ef 100755 --- a/test/legacy_test/test_compare_op.py +++ b/test/legacy_test/test_compare_op.py @@ -20,10 +20,11 @@ import paddle from paddle import base -from paddle.base import Program, core, program_guard +from paddle.base import core +from paddle.pir_utils import test_with_pir_api -def create_test_class(op_type, typename, callback, check_new_ir=False): +def create_test_class(op_type, typename, callback, check_pir=False): class Cls(op_test.OpTest): def setUp(self): a = numpy.random.random(size=(10, 7)).astype(typename) @@ -35,11 +36,13 @@ def setUp(self): self.op_type = op_type def test_output(self): - self.check_output(check_cinn=True, check_new_ir=check_new_ir) + self.check_output(check_cinn=True, check_pir=check_pir) def test_errors(self): paddle.enable_static() - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x = paddle.static.data(name='x', shape=[-1, 2], dtype='int32') y = paddle.static.data(name='y', shape=[-1, 2], dtype='int32') a = paddle.static.data(name='a', shape=[-1, 2], dtype='int16') @@ -58,14 +61,14 @@ def test_errors(self): if _type_name == 'float16' and (not core.is_compiled_with_cuda()): continue - create_test_class('less_than', _type_name, lambda _a, _b: _a < _b) - create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b) - create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b) + create_test_class('less_than', _type_name, lambda _a, _b: _a < _b, True) + create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b, True) + create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b, True) create_test_class( 'greater_equal', _type_name, lambda _a, _b: _a >= _b, True ) create_test_class('equal', _type_name, lambda _a, _b: _a == _b, True) - create_test_class('not_equal', _type_name, lambda _a, _b: _a != _b) + create_test_class('not_equal', _type_name, lambda _a, _b: _a != _b, True) def create_paddle_case(op_type, callback): @@ -79,9 +82,12 @@ def setUp(self): if core.is_compiled_with_cuda(): self.place = paddle.CUDAPlace(0) + @test_with_pir_api def test_api(self): paddle.enable_static() - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x = paddle.static.data(name='x', shape=[4], dtype='int64') y = paddle.static.data(name='y', shape=[4], dtype='int64') op = eval("paddle.%s" % (self.op_type)) @@ -93,10 +99,13 @@ def test_api(self): ) self.assertEqual((res == self.real_result).all(), True) + @test_with_pir_api def test_api_float(self): if self.op_type == "equal": paddle.enable_static() - with program_guard(Program(), Program()): + with
paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x = paddle.static.data(name='x', shape=[4], dtype='int64') y = paddle.static.data(name='y', shape=[], dtype='int64') op = eval("paddle.%s" % (self.op_type)) @@ -290,9 +299,12 @@ def test_dynamic_api_bool(self): self.assertEqual((out.numpy() == self.real_result).all(), True) paddle.enable_static() + @test_with_pir_api def test_broadcast_api_1(self): paddle.enable_static() - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x = paddle.static.data( name='x', shape=[1, 2, 1, 3], dtype='int32' ) @@ -308,9 +320,12 @@ def test_broadcast_api_1(self): ) self.assertEqual((res == real_result).all(), True) + @test_with_pir_api def test_broadcast_api_2(self): paddle.enable_static() - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x = paddle.static.data(name='x', shape=[1, 2, 3], dtype='int32') y = paddle.static.data( name='y', shape=[1, 2, 1, 3], dtype='int32' @@ -326,9 +341,12 @@ def test_broadcast_api_2(self): ) self.assertEqual((res == real_result).all(), True) + @test_with_pir_api def test_broadcast_api_3(self): paddle.enable_static() - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x = paddle.static.data(name='x', shape=[5], dtype='int32') y = paddle.static.data(name='y', shape=[3, 1], dtype='int32') op = eval("paddle.%s" % (self.op_type)) @@ -342,9 +360,12 @@ def test_broadcast_api_3(self): ) self.assertEqual((res == real_result).all(), True) + @test_with_pir_api def test_zero_dim_api_1(self): paddle.enable_static() - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x = paddle.randint(-3, 3, shape=[], dtype='int32') y = paddle.randint(-3, 3, shape=[], dtype='int32') op = eval("paddle.%s" % (self.op_type)) @@ -358,9 +379,12 @@ def test_zero_dim_api_1(self): real_result = callback(x_np, y_np) self.assertEqual((res == real_result).all(), True) + @test_with_pir_api def test_zero_dim_api_2(self): paddle.enable_static() - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x = paddle.randint(-3, 3, shape=[2, 3, 4], dtype='int32') y = paddle.randint(-3, 3, shape=[], dtype='int32') op = eval("paddle.%s" % (self.op_type)) @@ -374,9 +398,12 @@ def test_zero_dim_api_2(self): real_result = callback(x_np, y_np) self.assertEqual((res == real_result).all(), True) + @test_with_pir_api def test_zero_dim_api_3(self): paddle.enable_static() - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x = paddle.randint(-3, 3, shape=[], dtype='int32') y = paddle.randint(-3, 3, shape=[2, 3, 4], dtype='int32') op = eval("paddle.%s" % (self.op_type)) @@ -390,9 +417,12 @@ def test_zero_dim_api_3(self): real_result = callback(x_np, y_np) self.assertEqual((res == real_result).all(), True) + @test_with_pir_api def test_bool_api_4(self): paddle.enable_static() - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x = paddle.static.data(name='x', shape=[3, 1], dtype='bool') y = paddle.static.data(name='y', shape=[3, 1], dtype='bool') op = eval("paddle.%s" % 
(self.op_type)) @@ -406,9 +436,12 @@ def test_bool_api_4(self): ) self.assertEqual((res == real_result).all(), True) + @test_with_pir_api def test_bool_broadcast_api_4(self): paddle.enable_static() - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x = paddle.static.data(name='x', shape=[3, 1], dtype='bool') y = paddle.static.data(name='y', shape=[1], dtype='bool') op = eval("paddle.%s" % (self.op_type)) @@ -424,7 +457,9 @@ def test_bool_broadcast_api_4(self): def test_attr_name(self): paddle.enable_static() - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x = paddle.static.data(name='x', shape=[-1, 4], dtype='int32') y = paddle.static.data(name='y', shape=[-1, 4], dtype='int32') op = eval("paddle.%s" % (self.op_type)) @@ -445,7 +480,7 @@ def test_attr_name(self): # add bf16 tests -def create_bf16_case(op_type, callback, check_new_ir=False): +def create_bf16_case(op_type, callback, check_pir=False): class TestCompareOpBF16Op(op_test.OpTest): def setUp(self): self.op_type = op_type @@ -462,25 +497,27 @@ def setUp(self): self.outputs = {'Out': real_result} def test_check_output(self): - self.check_output(check_cinn=True, check_new_ir=check_new_ir) + self.check_output(check_cinn=True, check_pir=check_pir) cls_name = f"BF16TestCase_{op_type}" TestCompareOpBF16Op.__name__ = cls_name globals()[cls_name] = TestCompareOpBF16Op -create_bf16_case('less_than', lambda _a, _b: _a < _b) -create_bf16_case('less_equal', lambda _a, _b: _a <= _b) -create_bf16_case('greater_than', lambda _a, _b: _a > _b) +create_bf16_case('less_than', lambda _a, _b: _a < _b, True) +create_bf16_case('less_equal', lambda _a, _b: _a <= _b, True) +create_bf16_case('greater_than', lambda _a, _b: _a > _b, True) create_bf16_case('greater_equal', lambda _a, _b: _a >= _b, True) create_bf16_case('equal', lambda _a, _b: _a == _b, True) -create_bf16_case('not_equal', lambda _a, _b: _a != _b) +create_bf16_case('not_equal', lambda _a, _b: _a != _b, True) class TestCompareOpError(unittest.TestCase): def test_errors(self): paddle.enable_static() - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): # The input x and y of compare_op must be Variable. 
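# [Editor's note] create_test_class and create_bf16_case above are class
# factories: each call manufactures an OpTest subclass for one (comparison op,
# dtype) pair, renames it, and registers it in the module globals so unittest
# discovery picks it up. A condensed, self-contained sketch of the same pattern
# with illustrative names (not part of the real test file):
import unittest

import numpy as np


def make_compare_case(op_name, callback):
    class Case(unittest.TestCase):
        def test_callback(self):
            a = np.random.random((10, 7)).astype("float32")
            b = np.random.random((10, 7)).astype("float32")
            out = callback(a, b)  # elementwise comparison result
            self.assertEqual(out.shape, a.shape)
            self.assertEqual(out.dtype, np.bool_)

    Case.__name__ = f"SketchCase_{op_name}"
    globals()[Case.__name__] = Case  # register so test discovery finds it


make_compare_case("less_than", lambda _a, _b: _a < _b)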
x = paddle.static.data(name='x', shape=[-1, 1], dtype="float32") y = base.create_lod_tensor( @@ -490,9 +527,12 @@ def test_errors(self): class API_TestElementwise_Equal(unittest.TestCase): + @test_with_pir_api def test_api(self): paddle.enable_static() - with base.program_guard(base.Program(), base.Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): label = paddle.assign(np.array([3, 3], dtype="int32")) limit = paddle.assign(np.array([3, 2], dtype="int32")) out = paddle.equal(x=label, y=limit) @@ -501,7 +541,9 @@ def test_api(self): (res,) = exe.run(fetch_list=[out]) self.assertEqual((res == np.array([True, False])).all(), True) - with base.program_guard(base.Program(), base.Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): label = paddle.assign(np.array([3, 3], dtype="int32")) limit = paddle.assign(np.array([3, 3], dtype="int32")) out = paddle.equal(x=label, y=limit) @@ -510,9 +552,12 @@ def test_api(self): (res,) = exe.run(fetch_list=[out]) self.assertEqual((res == np.array([True, True])).all(), True) + @test_with_pir_api def test_api_fp16(self): paddle.enable_static() - with base.program_guard(base.Program(), base.Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): label = paddle.to_tensor([3, 3], dtype="float16") limit = paddle.to_tensor([3, 2], dtype="float16") out = paddle.equal(x=label, y=limit) @@ -524,6 +569,7 @@ def test_api_fp16(self): class API_TestElementwise_Greater_Than(unittest.TestCase): + @test_with_pir_api def test_api_fp16(self): paddle.enable_static() with paddle.static.program_guard( @@ -540,17 +586,21 @@ def test_api_fp16(self): class TestCompareOpPlace(unittest.TestCase): + @test_with_pir_api def test_place_1(self): paddle.enable_static() place = paddle.CPUPlace() if core.is_compiled_with_cuda(): place = paddle.CUDAPlace(0) - label = paddle.assign(np.array([3, 3], dtype="int32")) - limit = paddle.assign(np.array([3, 2], dtype="int32")) - out = paddle.less_than(label, limit) - exe = base.Executor(place) - (res,) = exe.run(fetch_list=[out]) - self.assertEqual((res == np.array([False, False])).all(), True) + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + label = paddle.assign(np.array([3, 3], dtype="int32")) + limit = paddle.assign(np.array([3, 2], dtype="int32")) + out = paddle.less_than(label, limit) + exe = base.Executor(place) + (res,) = exe.run(fetch_list=[out]) + self.assertEqual((res == np.array([False, False])).all(), True) def test_place_2(self): place = paddle.CPUPlace() diff --git a/test/legacy_test/test_concat_op.py b/test/legacy_test/test_concat_op.py index 153e1cc06d3085..efa87c36095706 100644 --- a/test/legacy_test/test_concat_op.py +++ b/test/legacy_test/test_concat_op.py @@ -53,9 +53,9 @@ def get_dtype(self): def test_check_output(self): if self.dtype == np.uint16: place = core.CUDAPlace(0) - self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) else: - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): if self.dtype == np.uint16: @@ -65,7 +65,7 @@ def test_check_grad(self): ['x0'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) self.check_grad_with_place( @@ -73,7 +73,7 @@ def test_check_grad(self): ['x1'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) 
self.check_grad_with_place( @@ -81,7 +81,7 @@ def test_check_grad(self): ['x2'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) else: @@ -89,21 +89,21 @@ def test_check_grad(self): ['x0'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) self.check_grad( ['x1'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) self.check_grad( ['x2'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) @@ -199,12 +199,12 @@ def if_enable_cinn(self): pass def test_check_output(self): - self.check_output(check_new_ir=False) + self.check_output(check_pir=False) def test_check_grad(self): - self.check_grad(['x0'], 'Out', check_new_ir=False) - self.check_grad(['x1'], 'Out', check_new_ir=False) - self.check_grad(['x2'], 'Out', check_new_ir=False) + self.check_grad(['x0'], 'Out', check_pir=False) + self.check_grad(['x1'], 'Out', check_pir=False) + self.check_grad(['x2'], 'Out', check_pir=False) def init_test_data(self): self.x0 = np.random.random([100]).astype(self.dtype) @@ -243,28 +243,28 @@ def get_dtype(self): return "float64" def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): self.check_grad( ['x0'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) self.check_grad( ['x1'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) self.check_grad( ['x2'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) @@ -319,19 +319,13 @@ def test_check_grad(self): return if self.dtype == np.uint16: place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['x0'], 'Out', check_new_ir=True - ) - self.check_grad_with_place( - place, ['x1'], 'Out', check_new_ir=True - ) - self.check_grad_with_place( - place, ['x2'], 'Out', check_new_ir=True - ) + self.check_grad_with_place(place, ['x0'], 'Out', check_pir=True) + self.check_grad_with_place(place, ['x1'], 'Out', check_pir=True) + self.check_grad_with_place(place, ['x2'], 'Out', check_pir=True) else: - self.check_grad(['x0'], 'Out', check_new_ir=True) - self.check_grad(['x1'], 'Out', check_new_ir=True) - self.check_grad(['x2'], 'Out', check_new_ir=True) + self.check_grad(['x0'], 'Out', check_pir=True) + self.check_grad(['x1'], 'Out', check_pir=True) + self.check_grad(['x2'], 'Out', check_pir=True) cls_name = "{}_{}".format(parent.__name__, "AxisTensor") TestConcatAxisTensor.__name__ = cls_name @@ -388,7 +382,7 @@ def test_check_grad(self): place, ['x0'], 'Out', - check_new_ir=True, + check_pir=True, check_prim=True, check_prim_pir=True, ) @@ -396,7 +390,7 @@ def test_check_grad(self): place, ['x1'], 'Out', - check_new_ir=True, + check_pir=True, check_prim=True, check_prim_pir=True, ) @@ -404,7 +398,7 @@ def test_check_grad(self): place, ['x2'], 'Out', - check_new_ir=True, + check_pir=True, check_prim=True, check_prim_pir=True, ) @@ -412,21 +406,21 @@ def test_check_grad(self): self.check_grad( ['x0'], 'Out', - check_new_ir=True, + check_pir=True, check_prim=True, check_prim_pir=True, ) self.check_grad( ['x1'], 'Out', - check_new_ir=True, + check_pir=True, check_prim=True, check_prim_pir=True, ) self.check_grad( ['x2'], 'Out', - check_new_ir=True, + check_pir=True, check_prim=True, check_prim_pir=True, ) @@ -493,7 +487,7 @@ def test_check_grad(self): place, ['x0'], 'Out', - check_new_ir=True, + check_pir=True, check_prim=True, check_prim_pir=True, ) 
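# [Editor's note] In the test_concat_op.py hunks above, the same gradient check
# is repeated once per input, differing only in the input name and the check_pir
# flag now threaded through. A hypothetical helper expressing that pattern once
# (a sketch, not part of the OpTest API):
def check_all_input_grads(test, place=None, names=("x0", "x1", "x2"), **flags):
    # flags would carry check_pir=True, check_prim=True, check_prim_pir=True
    for name in names:
        if place is None:
            test.check_grad([name], "Out", **flags)
        else:
            test.check_grad_with_place(place, [name], "Out", **flags)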
@@ -501,7 +495,7 @@ def test_check_grad(self): place, ['x1'], 'Out', - check_new_ir=True, + check_pir=True, check_prim=True, check_prim_pir=True, ) @@ -509,7 +503,7 @@ def test_check_grad(self): place, ['x2'], 'Out', - check_new_ir=True, + check_pir=True, check_prim=True, check_prim_pir=True, ) @@ -517,21 +511,21 @@ def test_check_grad(self): self.check_grad( ['x0'], 'Out', - check_new_ir=True, + check_pir=True, check_prim=True, check_prim_pir=True, ) self.check_grad( ['x1'], 'Out', - check_new_ir=True, + check_pir=True, check_prim=True, check_prim_pir=True, ) self.check_grad( ['x2'], 'Out', - check_new_ir=True, + check_pir=True, check_prim=True, check_prim_pir=True, ) diff --git a/test/legacy_test/test_cond.py b/test/legacy_test/test_cond.py index 55e6f8116cf33f..cec7664ae6cb63 100644 --- a/test/legacy_test/test_cond.py +++ b/test/legacy_test/test_cond.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys import unittest import numpy as np from simple_nets import batchnorm_fc_with_inputs, simple_fc_net_with_inputs +sys.path.append("../dygraph_to_static") +from dygraph_to_static_util import test_and_compare_with_new_ir + import paddle from paddle import base from paddle.base import core, framework @@ -27,6 +31,7 @@ class TestCondInputOutput(unittest.TestCase): + @test_and_compare_with_new_ir() def test_return_single_var(self): """ pseudocode: @@ -73,6 +78,7 @@ def false_func(): np.asarray(ret), np.full((3, 2), -1, np.int32), rtol=1e-05 ) + @test_and_compare_with_new_ir() def test_return_0d_tensor(self): """ pseudocode: @@ -110,6 +116,7 @@ def false_func(): np.testing.assert_allclose(np.asarray(ret), np.array(2), rtol=1e-05) self.assertEqual(ret.shape, ()) + @test_and_compare_with_new_ir() def test_0d_tensor_as_cond(self): """ pseudocode: @@ -210,6 +217,7 @@ def test_0d_tensor_dygraph(self): ) self.assertEqual(a.grad.shape, []) + @test_and_compare_with_new_ir() def test_return_var_tuple(self): """ pseudocode: @@ -257,6 +265,7 @@ def false_func(): np.asarray(ret[1]), np.full((2, 3), True, bool), rtol=1e-05 ) + @test_and_compare_with_new_ir() def test_pass_and_modify_var(self): """ pseudocode: @@ -347,6 +356,7 @@ def false_func(): self.assertIsNone(out2) self.assertIsNone(out3) + @test_and_compare_with_new_ir() def test_wrong_structure_exception(self): """ test returning different number of tensors cannot merge into output diff --git a/test/legacy_test/test_conv2d_layer.py b/test/legacy_test/test_conv2d_layer.py index 4290a7352afed9..a347472bd2a873 100644 --- a/test/legacy_test/test_conv2d_layer.py +++ b/test/legacy_test/test_conv2d_layer.py @@ -218,8 +218,53 @@ def paddle_nn_layer(self): t1 = x_var.gradient() return y_np, t1 + def run_Conv2D_static(self, place): + paddle.seed(2023) + main = base.Program() + start = base.Program() + with base.unique_name.guard(): + with base.program_guard(main, start): + x_var = paddle.static.data( + "input", self.input.shape, dtype=self.dtype + ) + conv = nn.Conv2D( + self.num_channels, + self.num_filters, + self.filter_size, + padding=self.padding, + padding_mode=self.padding_mode, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format, + ) + y_var = conv(x_var) + feed_dict = {"input": self.input} + exe = base.Executor(place) + exe.run(start) + (y_np,) = exe.run(main, feed=feed_dict, fetch_list=[y_var]) + return y_np + + def run_Conv2D_dygraph(self): + paddle.seed(2023) + x_var = paddle.to_tensor(self.input) + x_var.stop_gradient 
= False + conv = nn.Conv2D( + self.num_channels, + self.num_filters, + self.filter_size, + padding=self.padding, + padding_mode=self.padding_mode, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format, + ) + y_var = conv(x_var) + y_np = y_var.numpy() + return y_np + def _test_equivalence(self, place): - place = base.CPUPlace() result1 = self.base_layer(place) result2 = self.functional(place) with dg.guard(place): @@ -227,13 +272,22 @@ def _test_equivalence(self, place): np.testing.assert_array_almost_equal(result1, result2) np.testing.assert_array_almost_equal(result2, result3) + def _test_equivalence_in_pir(self, place): + with paddle.pir_utils.IrGuard(): + result1 = self.run_Conv2D_static(place) + with dg.guard(place): + result2 = self.run_Conv2D_dygraph() + np.testing.assert_array_almost_equal(result1, result2) + def runTest(self): place = base.CPUPlace() self._test_equivalence(place) + self._test_equivalence_in_pir(place) if base.core.is_compiled_with_cuda(): place = base.CUDAPlace(0) self._test_equivalence(place) + self._test_equivalence_in_pir(place) class Conv2DErrorTestCase(Conv2DTestCase): diff --git a/test/legacy_test/test_conv_shift_op.py b/test/legacy_test/test_conv_shift_op.py deleted file mode 100644 index 26965d9b393cb1..00000000000000 --- a/test/legacy_test/test_conv_shift_op.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -def conv_shift_forward(x, y): - out = np.zeros_like(x) - M = x.shape[1] - N = y.shape[1] - y_half_width = (N - 1) // 2 - for i in range(M): - for j in range(N): - out[:, i] += x[:, (i + j + M - y_half_width) % M] * y[:, j] - return out - - -class TestConvShiftOp(OpTest): - def setUp(self): - self.op_type = "conv_shift" - - batch_size = 10 - x_dim = 17 - y_dim = 11 # must be odd and <= x_dim - x = np.random.random((batch_size, x_dim)).astype("float32") - y = np.random.random((batch_size, y_dim)).astype("float32") - self.inputs = {'X': x, 'Y': y} - - out = conv_shift_forward(x, y) - self.outputs = {'Out': out} - - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out') - - def test_check_grad_ignore_x(self): - self.check_grad(['Y'], 'Out') - - def test_check_grad_ignore_y(self): - self.check_grad(['X'], 'Out') - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_cos_sim_op.py b/test/legacy_test/test_cos_sim_op.py deleted file mode 100644 index f9c761c9eedf3f..00000000000000 --- a/test/legacy_test/test_cos_sim_op.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -class TestCosSimOp(OpTest): - def setUp(self): - self.op_type = "cos_sim" - self.inputs = { - 'X': np.random.random((6, 20)).astype("float32"), - 'Y': np.random.random((6, 20)).astype("float32"), - } - expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1) - expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1) - expect_out = ( - (self.inputs['X'] * self.inputs['Y']).sum(axis=1) - / expect_x_norm - / expect_y_norm - ) - self.outputs = { - 'XNorm': np.expand_dims(expect_x_norm, 1), - 'YNorm': np.expand_dims(expect_y_norm, 1), - 'Out': np.expand_dims(expect_out, 1), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.06) - - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.06, no_grad_set=set("X") - ) - - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Y') - ) - - -class TestCosSimOp2(TestCosSimOp): - def setUp(self): - self.op_type = "cos_sim" - self.inputs = { - 'X': np.random.random((6, 100)).astype("float32"), - 'Y': np.random.random((1, 100)).astype("float32"), - } - expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1) - expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1) - expect_out = ( - (self.inputs['X'] * self.inputs['Y']).sum(axis=1) - / expect_x_norm - / expect_y_norm - ) - self.outputs = { - 'XNorm': np.expand_dims(expect_x_norm, 1), - 'YNorm': np.expand_dims(expect_y_norm, 1), - 'Out': np.expand_dims(expect_out, 1), - } - - -class TestCosSimOp3(TestCosSimOp): - def setUp(self): - self.op_type = "cos_sim" - self.inputs = { - 'X': np.random.random((6, 5, 4)).astype("float32"), - 'Y': np.random.random((6, 5, 4)).astype("float32"), - } - expect_x_norm = np.linalg.norm(self.inputs['X'], axis=(1, 2)) - expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=(1, 2)) - expect_out = ( - (self.inputs['X'] * self.inputs['Y']).sum(axis=(1, 2)) - / expect_x_norm - / expect_y_norm - ) - self.outputs = { - 'XNorm': np.expand_dims(expect_x_norm, 1), - 'YNorm': np.expand_dims(expect_y_norm, 1), - 'Out': np.expand_dims(expect_out, 1), - } - - -class TestCosSimOp4(TestCosSimOp): - def setUp(self): - self.op_type = "cos_sim" - self.inputs = { - 'X': np.random.random((6, 5, 20)).astype("float32"), - 'Y': np.random.random((1, 5, 20)).astype("float32"), - } - expect_x_norm = np.linalg.norm(self.inputs['X'], axis=(1, 2)) - expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=(1, 2)) - expect_out = ( - (self.inputs['X'] * self.inputs['Y']).sum(axis=(1, 2)) - / expect_x_norm - / expect_y_norm - ) - self.outputs = { - 'XNorm': np.expand_dims(expect_x_norm, 1), - 'YNorm': np.expand_dims(expect_y_norm, 1), - 'Out': np.expand_dims(expect_out, 1), - } - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_diag.py b/test/legacy_test/test_diag.py deleted file mode 100644 index 4f713488b8206c..00000000000000 --- a/test/legacy_test/test_diag.py +++ /dev/null @@ -1,70 +0,0 @@ -# 
Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - -import paddle -from paddle.base import Program, core, program_guard - - -class TestDiagOp(OpTest): - def setUp(self): - self.op_type = "diag" - self.init_config() - self.inputs = {'Diagonal': self.case} - - self.outputs = {'Out': np.diag(self.inputs['Diagonal'])} - - def test_check_output(self): - paddle.enable_static() - self.check_output() - - def init_config(self): - self.case = np.arange(3, 6) - - -class TestDiagOpCase1(TestDiagOp): - def init_config(self): - self.case = np.array([3], dtype='int32') - - -class TestDiagOpFp16(unittest.TestCase): - def test_fp16(self): - x_np = np.array([3], dtype='float16') - with paddle.static.program_guard(paddle.static.Program()): - x = paddle.static.data(shape=[1, 0], name='x', dtype='float16') - out = paddle.diag(x) - if core.is_compiled_with_cuda(): - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - exe.run(paddle.static.default_startup_program()) - out = exe.run(feed={'x': x_np}, fetch_list=[out]) - - -class TestDiagError(unittest.TestCase): - def test_errors(self): - paddle.enable_static() - with program_guard(Program(), Program()): - - def test_diag_type(): - return paddle.diag(x=[1, 2, 3]) - - self.assertRaises(TypeError, test_diag_type) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_dist_base.py b/test/legacy_test/test_dist_base.py index db7d490e3a5afe..b4d8257503d401 100755 --- a/test/legacy_test/test_dist_base.py +++ b/test/legacy_test/test_dist_base.py @@ -1692,6 +1692,7 @@ def _get_required_envs(self, check_error_log=False, need_envs={}): "NCCL_P2P_DISABLE": "1", "NCCL_SHM_DISABLE": "1", "FLAGS_new_executor_static_build": "1", + "FLAGS_dynamic_static_unified_comm": "0", } if check_error_log: diff --git a/test/legacy_test/test_dist_hapi_model.py b/test/legacy_test/test_dist_hapi_model.py index 1e5ec1d341f71f..03a92d6f3cbc91 100644 --- a/test/legacy_test/test_dist_hapi_model.py +++ b/test/legacy_test/test_dist_hapi_model.py @@ -75,6 +75,7 @@ def start_local_trainers( "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), + "FLAGS_dynamic_static_unified_comm": "0", } current_env.update(proc_env) diff --git a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py index 32ee6fd8b39581..62a94832d1ae9e 100644 --- a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py +++ b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py @@ -68,6 +68,7 @@ def run_test( os.environ['MAX_GLOBAL_NORM'] = str(max_global_norm) os.environ['GRADIENT_MERGE_STEPS'] = str(gradient_merge_steps) os.environ['USE_MASTER_ACC_GRAD'] = str(1 if use_master_acc_grad else 0) + os.environ["FLAGS_dynamic_static_unified_comm"] = "0" 
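# [Editor's note] Several launchers in this patch pin
# FLAGS_dynamic_static_unified_comm to "0" in the test environment. The spelling
# matters: environment variables are case-sensitive, so the old
# "Flags_dynamic_static_unified_comm" spelling (corrected in the
# test_collective_base.py hunk above) would never be recognized as a Paddle
# flag; the value is also normalized to "0". A minimal sketch of the launch
# pattern, with a hypothetical script name:
import os
import subprocess

env = dict(os.environ)
env["FLAGS_dynamic_static_unified_comm"] = "0"  # keep the legacy comm path
subprocess.check_call(["python", "some_collective_test.py"], env=env)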
os.environ.update(need_env) touch_file_env = 'SUCCESS_TOUCH_FILE' diff --git a/test/legacy_test/test_dot_op.py b/test/legacy_test/test_dot_op.py index 17c928be250713..3b1a216add6da3 100644 --- a/test/legacy_test/test_dot_op.py +++ b/test/legacy_test/test_dot_op.py @@ -37,7 +37,7 @@ def setUp(self): self.attrs = {} def test_check_output(self): - self.check_output() + self.check_output(check_pir=True) def test_check_grad_normal(self): if core.is_compiled_with_rocm(): @@ -45,12 +45,10 @@ def test_check_grad_normal(self): ['X', 'Y'], 'Out', user_defined_grads=[self.inputs['Y'], self.inputs['X']], + check_pir=True, ) else: - self.check_grad( - ['X', 'Y'], - 'Out', - ) + self.check_grad(['X', 'Y'], 'Out', check_pir=True) def test_check_grad_ingore_x(self): if core.is_compiled_with_rocm(): @@ -59,13 +57,10 @@ def test_check_grad_ingore_x(self): 'Out', no_grad_set=set("X"), user_defined_grads=[self.inputs['X']], + check_pir=True, ) else: - self.check_grad( - ['Y'], - 'Out', - no_grad_set=set("X"), - ) + self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_pir=True) def test_check_grad_ingore_y(self): if core.is_compiled_with_rocm(): @@ -74,13 +69,10 @@ def test_check_grad_ingore_y(self): 'Out', no_grad_set=set('Y'), user_defined_grads=[self.inputs['Y']], + check_pir=True, ) else: - self.check_grad( - ['X'], - 'Out', - no_grad_set=set('Y'), - ) + self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_pir=True) def init_input_output(self): self.x = np.random.uniform(0.1, 1, [121]).astype(self.dtype) @@ -129,13 +121,13 @@ def init_input_output(self): self.out = np.sum(self.x * self.y, axis=1) def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out') + self.check_grad(['X', 'Y'], 'Out', check_pir=True) def test_check_grad_ingore_x(self): - self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_pir=True) def test_check_grad_ingore_y(self): - self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_pir=True) class TestDotOpError(unittest.TestCase): @@ -238,20 +230,22 @@ def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_output_with_place(place, atol=0.125) + self.check_output_with_place(place, atol=0.125, check_pir=True) def test_check_grad_normal(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_grad_with_place(place, ['X', 'Y'], 'Out') + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', check_pir=True + ) def test_check_grad_ingore_x(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) if core.is_float16_supported(place): self.check_grad_with_place( - place, ['Y'], 'Out', no_grad_set=set("X") + place, ['Y'], 'Out', no_grad_set=set("X"), check_pir=True ) def test_check_grad_ingore_y(self): @@ -259,7 +253,7 @@ def test_check_grad_ingore_y(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): self.check_grad_with_place( - place, ['X'], 'Out', no_grad_set=set("Y") + place, ['X'], 'Out', no_grad_set=set("Y"), check_pir=True ) def init_input_output(self): @@ -310,7 +304,7 @@ def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) if core.is_bfloat16_supported(place): - self.check_output_with_place(place, atol=0.5) + self.check_output_with_place(place, atol=0.5, check_pir=True) def test_check_grad_normal(self): if core.is_compiled_with_cuda(): @@ -321,6 +315,7 
@@ def test_check_grad_normal(self): ['X', 'Y'], 'Out', user_defined_grads=[self.inputs['Y'], self.inputs['X']], + check_pir=True, ) def test_check_grad_ingore_x(self): @@ -333,6 +328,7 @@ def test_check_grad_ingore_x(self): 'Out', no_grad_set=set("X"), user_defined_grads=[self.inputs['X']], + check_pir=True, ) def test_check_grad_ingore_y(self): @@ -345,6 +341,7 @@ def test_check_grad_ingore_y(self): 'Out', no_grad_set=set("Y"), user_defined_grads=[self.inputs['Y']], + check_pir=True, ) def init_input_output(self): @@ -382,6 +379,7 @@ def test_check_grad_normal(self): self.y / self.y.shape[0], self.x / self.x.shape[0], ], + check_pir=True, ) def test_check_grad_ingore_x(self): @@ -394,6 +392,7 @@ def test_check_grad_ingore_x(self): 'Out', no_grad_set=set("X"), user_defined_grads=[self.x / self.x.shape[0]], + check_pir=True, ) def test_check_grad_ingore_y(self): @@ -406,6 +405,7 @@ def test_check_grad_ingore_y(self): 'Out', no_grad_set=set("Y"), user_defined_grads=[self.y / self.y.shape[0]], + check_pir=True, ) diff --git a/test/legacy_test/test_dropout_op.py b/test/legacy_test/test_dropout_op.py index f65e4d2b4b855b..433b9eeff7056d 100644 --- a/test/legacy_test/test_dropout_op.py +++ b/test/legacy_test/test_dropout_op.py @@ -26,6 +26,7 @@ from paddle.base.executor import scope_guard from paddle.decomposition import decompose from paddle.incubate.autograd import primapi +from paddle.pir_utils import test_with_pir_api def dropout_wapper( @@ -84,13 +85,11 @@ def setUp(self): self.enable_check_static_comp = False def test_check_output(self): - self.check_output( - check_prim=True, check_prim_pir=True, check_new_ir=True - ) + self.check_output(check_prim=True, check_prim_pir=True, check_pir=True) def test_check_grad_normal(self): # Now in dy2st mode x_grad = [], so set check_prim=False - self.check_grad(['X'], 'Out', check_prim=False, check_new_ir=True) + self.check_grad(['X'], 'Out', check_prim=False, check_pir=True) class TestDropoutOp_ZeroDim(TestDropoutOp): @@ -129,13 +128,11 @@ def setUp(self): self.enable_check_static_comp = False def test_check_output(self): - self.check_output( - check_prim=True, check_prim_pir=True, check_new_ir=True - ) + self.check_output(check_prim=True, check_prim_pir=True, check_pir=True) def test_check_grad_normal(self): # Now in dy2st mode x_grad = [], so set check_prim=False - self.check_grad(['X'], 'Out', check_prim=False, check_new_ir=True) + self.check_grad(['X'], 'Out', check_prim=False, check_pir=True) class TestDropoutOp2(TestDropoutOp): @@ -198,9 +195,7 @@ def setUp(self): } def test_check_output(self): - self.check_output( - check_prim=True, check_prim_pir=True, check_new_ir=True - ) + self.check_output(check_prim=True, check_prim_pir=True, check_pir=True) @skip_check_grad_ci(reason="For inference, check_grad is not required.") @@ -217,9 +212,7 @@ def setUp(self): } def test_check_output(self): - self.check_output( - check_prim=True, check_prim_pir=True, check_new_ir=True - ) + self.check_output(check_prim=True, check_prim_pir=True, check_pir=True) class TestDropoutOp6(TestDropoutOp): @@ -281,9 +274,7 @@ def setUp(self): self.outputs = {'Out': self.inputs['X']} def test_check_output(self): - self.check_output( - check_prim=True, check_prim_pir=True, check_new_ir=True - ) + self.check_output(check_prim=True, check_prim_pir=True, check_pir=True) @skip_check_grad_ci(reason="For inference, check_grad is not required.") @@ -302,9 +293,7 @@ def setUp(self): self.outputs = {'Out': self.inputs['X']} def test_check_output(self): - self.check_output( - 
check_prim=True, check_prim_pir=True, check_new_ir=True - ) + self.check_output(check_prim=True, check_prim_pir=True, check_pir=True) class TestDropoutOpWithSeed(OpTest): @@ -331,9 +320,7 @@ def setUp(self): def test_check_output(self): # ir backward don't support of variable derivation of itself - self.check_output( - check_prim=True, check_prim_pir=False, check_new_ir=True - ) + self.check_output(check_prim=True, check_prim_pir=False, check_pir=True) def test_check_grad_normal(self): # Now in dy2st mode x_grad = [], so set check_prim=False @@ -342,7 +329,7 @@ def test_check_grad_normal(self): 'Out', max_relative_error=0.05, check_prim=False, - check_new_ir=True, + check_pir=True, ) @@ -380,11 +367,11 @@ def test_check_output(self): atol=1e-3, check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) def test_check_grad_normal(self): - self.check_grad(['X'], 'Out', check_new_ir=True) + self.check_grad(['X'], 'Out', check_pir=True) @unittest.skipIf( @@ -419,9 +406,7 @@ def setUp(self): } def test_check_output(self): - self.check_output( - check_prim=True, check_prim_pir=True, check_new_ir=True - ) + self.check_output(check_prim=True, check_prim_pir=True, check_pir=True) def test_check_grad_normal(self): self.check_grad( @@ -429,7 +414,7 @@ def test_check_grad_normal(self): 'Out', check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -539,9 +524,11 @@ def setUp(self): if core.is_compiled_with_cuda(): self.places.append(base.CUDAPlace(0)) + @test_with_pir_api def check_static_result(self, place): paddle.enable_static() - with base.program_guard(base.Program(), base.Program()): + main_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog): input = paddle.static.data( name="input", shape=[-1, -1], dtype="float32" ) @@ -590,7 +577,6 @@ def check_static_result(self, place): training=False, mode='downscale_in_infer', ) - res10 = paddle.nn.functional.dropout(x=input, p=1.0, training=True) res11 = paddle.nn.functional.dropout(x=input, p=0.0) res12 = paddle.nn.functional.dropout( x=input, @@ -600,13 +586,8 @@ def check_static_result(self, place): mode='upscale_in_train', ) - res13 = paddle.nn.functional.dropout( - x=input, p=0.7, axis=1, training=True, mode='upscale_in_train' - ) - in_np = np.ones([40, 40]).astype("float32") res_np = in_np - res_np2 = np.zeros_like(in_np) exe = base.Executor(place) res_list = [ @@ -624,26 +605,39 @@ def check_static_result(self, place): ] for res in res_list: fetches = exe.run( - base.default_main_program(), + main_prog, feed={"input": in_np}, fetch_list=[res], ) np.testing.assert_allclose(fetches[0], res_np, rtol=1e-05) + + @test_with_pir_api + def check_static_result2(self, place): + paddle.enable_static() + main_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog): + input = paddle.static.data( + name="input", shape=[-1, -1], dtype="float32" + ) + res10 = paddle.nn.functional.dropout(x=input, p=1.0, training=True) + res13 = paddle.nn.functional.dropout( + x=input, p=0.7, axis=1, training=True, mode='upscale_in_train' + ) + in_np = np.ones([40, 40]).astype("float32") + res_np2 = np.zeros_like(in_np) + + exe = base.Executor(place) fetches2 = exe.run( - base.default_main_program(), + main_prog, feed={"input": in_np}, - fetch_list=[res10], + fetch_list=[res10, res13], ) np.testing.assert_allclose(fetches2[0], res_np2, rtol=1e-05) - fetches3 = exe.run( - base.default_main_program(), - feed={"input": in_np}, - fetch_list=[res13], - ) def test_static(self): for place in 
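Alongside the flag rename, the dropout hunks refactor every static check away from base.program_guard(base.Program(), base.Program()) plus base.default_main_program() toward an explicit paddle.static.Program() handle, and decorate the checker with test_with_pir_api so it is replayed under PIR. A condensed sketch of the pattern, hedged in that the class name is invented and only the p=0.0 identity case is shown:

import unittest

import numpy as np

import paddle
from paddle import base
from paddle.pir_utils import test_with_pir_api


class TestDropoutStaticSketch(unittest.TestCase):
    @test_with_pir_api
    def check_static_result(self, place):
        paddle.enable_static()
        main_prog = paddle.static.Program()
        with paddle.static.program_guard(main_prog):
            x = paddle.static.data(name="input", shape=[-1, -1], dtype="float32")
            res = paddle.nn.functional.dropout(x=x, p=0.0)
            in_np = np.ones([40, 40]).astype("float32")
            exe = base.Executor(place)
            # Run the explicit handle, not base.default_main_program(): the
            # decorator re-executes this body under PIR, and the program built
            # inside the guard is the one that must be run in both modes.
            (fetched,) = exe.run(main_prog, feed={"input": in_np}, fetch_list=[res])
            np.testing.assert_allclose(fetched, in_np, rtol=1e-05)

    def test_static(self):
        self.check_static_result(place=base.CPUPlace())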
self.places: self.check_static_result(place=place) + self.check_static_result2(place=place) def test_dygraph(self): for place in self.places: @@ -785,6 +779,13 @@ def test_dtype(): self.assertRaises(TypeError, test_dtype) + @test_with_pir_api + def test_errors2(self): + paddle.enable_static() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, startup_prog): + def test_pdtype(): # p should be int or float x2 = paddle.static.data( @@ -877,9 +878,12 @@ def setUp(self): if core.is_compiled_with_cuda(): self.places.append(base.CUDAPlace(0)) + @test_with_pir_api def check_static_result(self, place): paddle.enable_static() - with base.program_guard(base.Program(), base.Program()): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, startup_prog): input = paddle.static.data( name="input", shape=[2, 3, 4, 5], dtype="float32" ) @@ -897,7 +901,7 @@ def check_static_result(self, place): res_list = [res1, res2] for res in res_list: fetches = exe.run( - base.default_main_program(), + main_prog, feed={"input": in_np}, fetch_list=[res], ) @@ -927,9 +931,12 @@ def test_dygraph(self): class TestDropout2DFAPIError(unittest.TestCase): + @test_with_pir_api def test_errors(self): paddle.enable_static() - with program_guard(Program(), Program()): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, startup_prog): def test_xdim(): # dimentions of x should be 4 @@ -970,6 +977,7 @@ def test_dygraph(self): result.numpy(), result_np, rtol=1e-05 ) + @test_with_pir_api def test_static_fp16_with_gpu(self): if paddle.base.core.is_compiled_with_cuda(): place = paddle.CUDAPlace(0) @@ -1002,9 +1010,12 @@ def setUp(self): if core.is_compiled_with_cuda(): self.places.append(base.CUDAPlace(0)) + @test_with_pir_api def check_static_result(self, place): paddle.enable_static() - with base.program_guard(base.Program(), base.Program()): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, startup_prog): input = paddle.static.data( name="input", shape=[2, 3, 4, 5, 6], dtype="float32" ) @@ -1022,7 +1033,7 @@ def check_static_result(self, place): res_list = [res1, res2] for res in res_list: fetches = exe.run( - base.default_main_program(), + main_prog, feed={"input": in_np}, fetch_list=[res], ) @@ -1052,9 +1063,12 @@ def test_dygraph(self): class TestDropout3DFAPIError(unittest.TestCase): + @test_with_pir_api def test_errors(self): paddle.enable_static() - with program_guard(Program(), Program()): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, startup_prog): def test_xdim(): # dimentions of x should be 5 @@ -1103,8 +1117,12 @@ def setUp(self): if core.is_compiled_with_cuda(): self.places.append(base.CUDAPlace(0)) + @test_with_pir_api def check_static_result(self, place): - with base.program_guard(base.Program(), base.Program()): + paddle.enable_static() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, startup_prog): input = paddle.static.data( name="input", shape=[40, 40], dtype="float32" ) @@ -1119,20 +1137,15 @@ def check_static_result(self, place): res_np3 = np.zeros_like(in_np) exe = base.Executor(place) - res_list = [res1, res2] - for res in res_list: - fetches = exe.run( - 
base.default_main_program(), - feed={"input": in_np}, - fetch_list=[res], - ) - np.testing.assert_allclose(fetches[0], res_np, rtol=1e-05) + fetches = exe.run( - base.default_main_program(), + main_prog, feed={"input": in_np}, - fetch_list=[res3], + fetch_list=[res1, res2, res3], ) - np.testing.assert_allclose(fetches[0], res_np3, rtol=1e-05) + np.testing.assert_allclose(fetches[0], res_np, rtol=1e-05) + np.testing.assert_allclose(fetches[1], res_np, rtol=1e-05) + np.testing.assert_allclose(fetches[2], res_np3, rtol=1e-05) def test_static(self): for place in self.places: @@ -1171,6 +1184,13 @@ def test_Variable(): self.assertRaises(TypeError, test_Variable) + @test_with_pir_api + def test_errors2(self): + paddle.enable_static() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, startup_prog): + def test_dtype(): # the input dtype of dropout must be float32 or float64 xr = paddle.static.data( @@ -1219,6 +1239,7 @@ def test_dygraph(self): result.numpy(), result_np, rtol=1e-05 ) + @test_with_pir_api def test_static_fp16_gpu(self): if paddle.base.core.is_compiled_with_cuda(): place = paddle.CUDAPlace(0) @@ -1378,9 +1399,9 @@ def api_case(self, x): def run_static(self, x): paddle.seed(2022) - main_program = Program() paddle.enable_static() - with program_guard(main_program): + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program): input = paddle.static.data(shape=x.shape, name='x', dtype='float32') out = self.api_case(input) sgd = paddle.optimizer.SGD(learning_rate=0.1) @@ -2098,7 +2119,9 @@ def test_static_comp(self): ) core._set_prim_forward_enabled(True) - [output] = decompose(mp, [output]) # decompose forward + [output] = decompose( + mp, [output], whitelist={"pd_op.dropout"} + ) # decompose forward self.assertTrue( 'pd_op.dropout' not in [op.name() for op in mp.global_block().ops] diff --git a/test/legacy_test/test_eig_op.py b/test/legacy_test/test_eig_op.py index c5ba7262902c77..c6b57258fc8208 100644 --- a/test/legacy_test/test_eig_op.py +++ b/test/legacy_test/test_eig_op.py @@ -183,7 +183,7 @@ def init_grad(self): def test_check_output(self): self.check_output_with_place_customized( - checker=self.checker, place=core.CPUPlace() + checker=self.checker, place=core.CPUPlace(), check_pir=True ) def test_check_grad(self): @@ -193,6 +193,7 @@ def test_check_grad(self): ['Eigenvalues', 'Eigenvectors'], user_defined_grads=[self.grad_x], user_defined_grad_outputs=[self.grad_w, self.grad_v], + check_pir=True, ) @@ -319,6 +320,7 @@ def test_check_grad(self): test_type = 'float64' paddle.set_device("cpu") + np.random.seed(1024) input_np = np.random.random(test_shape).astype(test_type) real_w, real_v = np.linalg.eig(input_np) diff --git a/test/legacy_test/test_eigvals_op.py b/test/legacy_test/test_eigvals_op.py index 6f3f126b2db3ed..379603234d5afe 100644 --- a/test/legacy_test/test_eigvals_op.py +++ b/test/legacy_test/test_eigvals_op.py @@ -37,6 +37,7 @@ class TestEigvalsOp(OpTest): def setUp(self): np.random.seed(0) paddle.enable_static() + self.python_api = paddle.linalg.eigvals self.op_type = "eigvals" self.set_dtype() self.set_input_dims() @@ -67,7 +68,7 @@ def set_input_data(self): def test_check_output(self): self.__class__.no_need_check_grad = True self.check_output_with_place_customized( - checker=self.verify_output, place=core.CPUPlace() + checker=self.verify_output, place=core.CPUPlace(), check_pir=True ) def verify_output(self, outs): diff --git 
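The eig and eigvals hunks lean on the customized-checker entry points (check_output_with_place_customized) because an eigendecomposition is only defined up to ordering and phase, and eigvals additionally gains a self.python_api binding, which the harness needs to trace the op in dygraph and PIR modes. A compact sketch of the customized-checker style; the sorted comparison and tolerances are illustrative, not the patch's exact checker:

import numpy as np
from op_test import OpTest

import paddle
from paddle.base import core


class TestEigvalsPirSketch(OpTest):
    def setUp(self):
        paddle.enable_static()
        self.op_type = "eigvals"
        self.python_api = paddle.linalg.eigvals
        x = np.random.random([3, 3]).astype("float64")
        self.inputs = {'X': x}
        self.outputs = {'Out': np.linalg.eigvals(x)}

    def test_check_output(self):
        self.__class__.no_need_check_grad = True
        self.check_output_with_place_customized(
            checker=self.verify_output, place=core.CPUPlace(), check_pir=True
        )

    def verify_output(self, outs):
        # Eigenvalues can come back in any order, so sort both sides
        # instead of relying on the default elementwise comparison.
        actual = np.sort_complex(outs[0])
        expected = np.sort_complex(self.outputs['Out'])
        np.testing.assert_allclose(actual, expected, rtol=1e-6, atol=1e-6)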
a/test/legacy_test/test_elementwise_add_op.py b/test/legacy_test/test_elementwise_add_op.py index f5013d298e170a..d3039ca365d34c 100644 --- a/test/legacy_test/test_elementwise_add_op.py +++ b/test/legacy_test/test_elementwise_add_op.py @@ -56,7 +56,7 @@ def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_output( check_dygraph=self.check_dygraph(), - check_new_ir=self.check_dygraph(), + check_pir=self.check_dygraph(), ) def test_check_grad_normal(self): @@ -69,7 +69,7 @@ def test_check_grad_normal(self): check_dygraph=self.check_dygraph(), check_prim=self.check_prim, check_prim_pir=self.check_dygraph(), - check_new_ir=self.check_dygraph(), + check_pir=self.check_dygraph(), ) def test_check_grad_ingore_x(self): @@ -83,7 +83,7 @@ def test_check_grad_ingore_x(self): check_dygraph=self.check_dygraph(), check_prim=self.check_prim, check_prim_pir=self.check_dygraph(), - check_new_ir=self.check_dygraph(), + check_pir=self.check_dygraph(), ) def test_check_grad_ingore_y(self): @@ -97,7 +97,7 @@ def test_check_grad_ingore_y(self): check_dygraph=self.check_dygraph(), check_prim=self.check_prim, check_prim_pir=self.check_dygraph(), - check_new_ir=self.check_dygraph(), + check_pir=self.check_dygraph(), ) def init_input_output(self): @@ -153,7 +153,7 @@ def test_check_output(self): place, atol=1e-3, check_dygraph=self.check_dygraph(), - check_new_ir=self.check_dygraph(), + check_pir=self.check_dygraph(), ) def test_check_grad_normal(self): @@ -169,7 +169,7 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) def test_check_grad_ingore_y(self): @@ -181,7 +181,7 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -215,7 +215,7 @@ def setUp(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) def test_check_grad_normal(self): place = core.CUDAPlace(0) @@ -225,7 +225,7 @@ def test_check_grad_normal(self): 'Out', check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) def test_check_grad_ingore_x(self): @@ -237,7 +237,7 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) def test_check_grad_ingore_y(self): @@ -249,7 +249,7 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) def if_enable_cinn(self): @@ -744,16 +744,16 @@ def init_input_output(self): self.out = self.x + self.y def test_check_output(self): - self.check_output(check_new_ir=False) + self.check_output(check_pir=False) def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', check_new_ir=False) + self.check_grad(['X', 'Y'], 'Out', check_pir=False) def test_check_grad_ingore_x(self): - self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_new_ir=False) + self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_pir=False) def test_check_grad_ingore_y(self): - self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_new_ir=False) + self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_pir=False) class TestRealComplexElementwiseAddOp(TestComplexElementwiseAddOp): @@ -772,7 +772,11 @@ def test_static_add(self): b = paddle.full([4, 5, 6], True, dtype='bool') c = a + b self.assertTrue(c.dtype == 
core.VarDesc.VarType.FP32) - paddle.enable_static() + with paddle.pir_utils.IrGuard(): + a = 1.5 + b = paddle.full([4, 5, 6], True, dtype='bool') + c = a + b + self.assertTrue(c.dtype == core.DataType.FLOAT32) def test_dygraph_add(self): paddle.disable_static() diff --git a/test/legacy_test/test_elementwise_div_op.py b/test/legacy_test/test_elementwise_div_op.py index bb1676bb00afbe..c17d2b8946a7c5 100644 --- a/test/legacy_test/test_elementwise_div_op.py +++ b/test/legacy_test/test_elementwise_div_op.py @@ -20,6 +20,7 @@ import paddle from paddle import base from paddle.base import core +from paddle.pir_utils import test_with_pir_api def broadcast_wrapper(shape=[1, 10, 12, 1]): @@ -98,9 +99,9 @@ def compute_gradient_y(self, grad_out, out, y): def test_check_output(self): if self.place is None: - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) else: - self.check_output_with_place(self.place, check_new_ir=True) + self.check_output_with_place(self.place, check_pir=True) def test_check_gradient(self): check_list = [] @@ -128,11 +129,11 @@ def test_check_gradient(self): 'check_prim_pir': self.check_prim_pir, } if self.place is None: - self.check_grad(*check_args, **check_kwargs, check_new_ir=True) + self.check_grad(*check_args, **check_kwargs, check_pir=True) else: check_args.insert(0, self.place) self.check_grad_with_place( - *check_args, **check_kwargs, check_new_ir=True + *check_args, **check_kwargs, check_pir=True ) @@ -221,11 +222,11 @@ def test_check_gradient(self): 'check_prim_pir': self.check_prim_pir, } if self.place is None: - self.check_grad(*check_args, **check_kwargs, check_new_ir=True) + self.check_grad(*check_args, **check_kwargs, check_pir=True) else: check_args.insert(0, self.place) self.check_grad_with_place( - *check_args, **check_kwargs, check_new_ir=True + *check_args, **check_kwargs, check_pir=True ) def if_check_prim(self): @@ -279,11 +280,11 @@ def test_check_gradient(self): 'check_dygraph': self.check_dygraph, } if self.place is None: - self.check_grad(*check_args, **check_kwargs, check_new_ir=True) + self.check_grad(*check_args, **check_kwargs, check_pir=True) else: check_args.insert(0, self.place) self.check_grad_with_place( - *check_args, **check_kwargs, check_new_ir=True + *check_args, **check_kwargs, check_pir=True ) @@ -454,15 +455,13 @@ def test_check_gradient(self): 'max_relative_error': max_relative_error, } if self.place is None: - self.check_grad( - *check_args, **check_kwargs, check_new_ir=True - ) + self.check_grad(*check_args, **check_kwargs, check_pir=True) else: check_args.insert(0, self.place) self.check_grad_with_place( *check_args, **check_kwargs, - check_new_ir=True, + check_pir=True, check_prim=True, check_prim_pir=True ) @@ -490,6 +489,7 @@ def test_check_gradient(self): class TestElementwiseDivBroadcast(unittest.TestCase): + @test_with_pir_api def test_shape_with_batch_sizes(self): paddle.enable_static() with base.program_guard(base.Program()): @@ -514,6 +514,17 @@ def test_name(self): y_1 = paddle.divide(x, y, name='div_res') self.assertEqual(('div_res' in y_1.name), True) + + with paddle.pir_utils.IrGuard(), base.program_guard(base.Program()): + x = paddle.static.data(name="x", shape=[2, 3], dtype="float32") + y = paddle.static.data(name='y', shape=[2, 3], dtype='float32') + + y_1 = paddle.divide(x, y, name='div_res') + + def name_call(): + self.assertEqual(('div_res' in y_1.name), True) + + self.assertRaises(ValueError, name_call) paddle.disable_static() def test_dygraph(self): @@ -556,7 +567,7 @@ def 
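The add-op hunk above also documents the dtype enum split between the two IRs: a legacy static result compares against core.VarDesc.VarType.FP32, while the same expression built inside paddle.pir_utils.IrGuard() carries core.DataType.FLOAT32. A standalone sketch of that check, mirroring the scalar-plus-bool promotion the test asserts:

import paddle
from paddle.base import core

paddle.enable_static()

# Legacy program: dtype is a VarDesc.VarType member.
b = paddle.full([4, 5, 6], True, dtype='bool')
c = 1.5 + b
assert c.dtype == core.VarDesc.VarType.FP32

# Same expression under PIR: the enum lives on core.DataType instead.
with paddle.pir_utils.IrGuard():
    b = paddle.full([4, 5, 6], True, dtype='bool')
    c = 1.5 + b
    assert c.dtype == core.DataType.FLOAT32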
init_input_output(self): self.out = self.x / self.y def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): self.check_grad( @@ -564,7 +575,7 @@ def test_check_grad_normal(self): 'Out', numeric_grad_delta=1e-5, max_relative_error=1e-6, - check_new_ir=True, + check_pir=True, ) def test_check_grad_ingore_x(self): @@ -574,7 +585,7 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), numeric_grad_delta=1e-5, max_relative_error=1e-6, - check_new_ir=True, + check_pir=True, ) def test_check_grad_ingore_y(self): @@ -584,7 +595,7 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), numeric_grad_delta=1e-5, max_relative_error=1e-6, - check_new_ir=True, + check_pir=True, ) diff --git a/test/legacy_test/test_elementwise_mod_op.py b/test/legacy_test/test_elementwise_mod_op.py index bb9348b358ebdf..ba6a75c9e6ac87 100644 --- a/test/legacy_test/test_elementwise_mod_op.py +++ b/test/legacy_test/test_elementwise_mod_op.py @@ -45,9 +45,9 @@ def setUp(self): def test_check_output(self): if self.attrs['axis'] == -1: - self.check_output() + self.check_output(check_pir=True) else: - self.check_output() + self.check_output(check_pir=True) def init_input_output(self): self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype) @@ -102,9 +102,9 @@ def init_input_output(self): def test_check_output(self): if self.attrs['axis'] == -1: - self.check_output() + self.check_output(check_pir=True) else: - self.check_output() + self.check_output(check_pir=True) @unittest.skipIf( @@ -121,9 +121,9 @@ def init_input_output(self): def test_check_output(self): if self.attrs['axis'] == -1: - self.check_output() + self.check_output(check_pir=True) else: - self.check_output() + self.check_output(check_pir=True) class TestElementwiseModFP16Op_ZeroDim1(TestElementwiseModFP16Op): @@ -181,7 +181,7 @@ def setUp(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place) + self.check_output_with_place(place, check_pir=True) def init_dtype(self): self.dtype = np.uint16 diff --git a/test/legacy_test/test_elementwise_mul_op.py b/test/legacy_test/test_elementwise_mul_op.py index b5a4689c2d40de..0787bf4f5104ae 100644 --- a/test/legacy_test/test_elementwise_mul_op.py +++ b/test/legacy_test/test_elementwise_mul_op.py @@ -49,7 +49,7 @@ def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_output( check_dygraph=(not self.use_mkldnn), - check_new_ir=(not self.use_mkldnn), + check_pir=(not self.use_mkldnn), ) def test_check_grad_normal(self): @@ -60,7 +60,7 @@ def test_check_grad_normal(self): check_dygraph=(not self.use_mkldnn), check_prim=True, check_prim_pir=(not self.use_mkldnn), - check_new_ir=(not self.use_mkldnn), + check_pir=(not self.use_mkldnn), ) def test_check_grad_ingore_x(self): @@ -72,7 +72,7 @@ def test_check_grad_ingore_x(self): check_dygraph=(not self.use_mkldnn), check_prim=True, check_prim_pir=(not self.use_mkldnn), - check_new_ir=(not self.use_mkldnn), + check_pir=(not self.use_mkldnn), ) def test_check_grad_ingore_y(self): @@ -84,7 +84,7 @@ def test_check_grad_ingore_y(self): check_dygraph=(not self.use_mkldnn), check_prim=True, check_prim_pir=(not self.use_mkldnn), - check_new_ir=(not self.use_mkldnn), + check_pir=(not self.use_mkldnn), ) def init_input_output(self): @@ -132,13 +132,13 @@ def if_enable_cinn(self): self.enable_cinn = False def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', check_new_ir=True) + 
self.check_grad(['X', 'Y'], 'Out', check_pir=True) def test_check_grad_ingore_x(self): - self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_new_ir=True) + self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_pir=True) def test_check_grad_ingore_y(self): - self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_new_ir=True) + self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_pir=True) class TestElementwiseMulOp_ZeroDim1(ElementwiseMulOp): @@ -189,7 +189,7 @@ def setUp(self): self.if_enable_cinn() def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): self.check_grad( @@ -197,7 +197,7 @@ def test_check_grad_normal(self): 'Out', check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) def test_check_grad_ingore_x(self): @@ -207,7 +207,7 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) def test_check_grad_ingore_y(self): @@ -217,7 +217,7 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) def if_enable_cinn(self): @@ -274,7 +274,7 @@ def setUp(self): def test_check_output(self): self.check_output( check_dygraph=self.check_dygraph, - check_new_ir=self.check_dygraph, + check_pir=self.check_dygraph, ) def test_check_grad_normal(self): @@ -283,7 +283,7 @@ def test_check_grad_normal(self): 'Out', check_dygraph=self.check_dygraph, check_prim=self.check_prim, - check_new_ir=self.check_dygraph, + check_pir=self.check_dygraph, ) def test_check_grad_ingore_x(self): @@ -293,7 +293,7 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), check_dygraph=self.check_dygraph, check_prim=self.check_prim, - check_new_ir=self.check_dygraph, + check_pir=self.check_dygraph, ) def test_check_grad_ingore_y(self): @@ -303,7 +303,7 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), check_dygraph=self.check_dygraph, check_prim=self.check_prim, - check_new_ir=self.check_dygraph, + check_pir=self.check_dygraph, ) def init_input_attr_output(self): @@ -432,7 +432,7 @@ def test_check_grad_normal(self): check_dygraph=(not self.use_mkldnn), check_prim=True, check_prim_pir=(not self.use_mkldnn), - check_new_ir=(not self.use_mkldnn), + check_pir=(not self.use_mkldnn), ) def test_check_grad_ingore_x(self): @@ -444,7 +444,7 @@ def test_check_grad_ingore_x(self): check_dygraph=(not self.use_mkldnn), check_prim=True, check_prim_pir=(not self.use_mkldnn), - check_new_ir=(not self.use_mkldnn), + check_pir=(not self.use_mkldnn), ) def test_check_grad_ingore_y(self): @@ -456,7 +456,7 @@ def test_check_grad_ingore_y(self): check_dygraph=(not self.use_mkldnn), check_prim=True, check_prim_pir=(not self.use_mkldnn), - check_new_ir=(not self.use_mkldnn), + check_pir=(not self.use_mkldnn), ) @@ -535,16 +535,16 @@ def init_input_output(self): self.out = self.x * self.y def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', check_new_ir=True) + self.check_grad(['X', 'Y'], 'Out', check_pir=True) def test_check_grad_ingore_x(self): - self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_new_ir=True) + self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_pir=True) def test_check_grad_ingore_y(self): - self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_new_ir=True) + self.check_grad(['X'], 'Out', 
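In the elementwise_mul hunks the PIR switches are computed rather than hard-coded: check_pir=(not self.use_mkldnn), like the dygraph and prim variants, turns the PIR comparison off for oneDNN-flavoured test instances, whose kernel path the PIR executor check does not cover. A trimmed sketch of the gating; the class name is invented and the attrs layout is assumed from the surrounding tests:

import numpy as np
from op_test import OpTest

import paddle


class TestMulOneDnnGateSketch(OpTest):
    def setUp(self):
        self.op_type = "elementwise_mul"
        self.python_api = paddle.multiply  # assumed binding
        self.use_mkldnn = False  # oneDNN subclasses flip this to True
        x = np.random.uniform(0.1, 1, [13, 17]).astype("float64")
        y = np.random.uniform(0.1, 1, [13, 17]).astype("float64")
        self.inputs = {'X': x, 'Y': y}
        self.outputs = {'Out': x * y}
        self.attrs = {'use_mkldnn': self.use_mkldnn}

    def test_check_output(self):
        # Dygraph and PIR comparisons are both skipped on the oneDNN path.
        self.check_output(
            check_dygraph=(not self.use_mkldnn),
            check_pir=(not self.use_mkldnn),
        )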
no_grad_set=set('Y'), check_pir=True) class TestRealComplexElementwiseMulOp(TestComplexElementwiseMulOp): diff --git a/test/legacy_test/test_elementwise_pow_op.py b/test/legacy_test/test_elementwise_pow_op.py index c83676f686d7a0..82d4f889b28a15 100644 --- a/test/legacy_test/test_elementwise_pow_op.py +++ b/test/legacy_test/test_elementwise_pow_op.py @@ -44,7 +44,7 @@ def test_check_output(self): if hasattr(self, 'attrs'): self.check_output(check_dygraph=False) else: - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): if hasattr(self, 'attrs'): @@ -57,7 +57,7 @@ def test_check_grad_normal(self): 'Out', check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -204,7 +204,7 @@ def test_check_output(self): if hasattr(self, 'attrs'): self.check_output(check_dygraph=False) else: - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class TestElementwisePowGradOpInt(unittest.TestCase): @@ -260,7 +260,7 @@ def test_check_output(self): if hasattr(self, 'attrs'): self.check_output(check_dygraph=False) else: - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): self.check_grad( @@ -271,7 +271,7 @@ def test_check_grad(self): ), check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -297,7 +297,7 @@ def setUp(self): self.outputs = {'Out': convert_float_to_uint16(out)} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): self.check_grad(['X', 'Y'], 'Out') diff --git a/test/legacy_test/test_elementwise_sub_op.py b/test/legacy_test/test_elementwise_sub_op.py index 9058c6e79e2b72..29185c1844bf4d 100644 --- a/test/legacy_test/test_elementwise_sub_op.py +++ b/test/legacy_test/test_elementwise_sub_op.py @@ -23,6 +23,7 @@ from paddle import base from paddle.base import core from paddle.base.layer_helper import LayerHelper +from paddle.pir_utils import test_with_pir_api class TestElementwiseOp(OpTest): @@ -44,7 +45,7 @@ def init_dtype(self): self.dtype = np.float64 def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): self.check_grad( @@ -52,7 +53,7 @@ def test_check_grad_normal(self): 'Out', check_prim=self.check_prim, check_prim_pir=self.check_prim_pir, - check_new_ir=True, + check_pir=True, ) def test_check_grad_ingore_x(self): @@ -63,7 +64,7 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), check_prim=self.check_prim, check_prim_pir=self.check_prim_pir, - check_new_ir=True, + check_pir=True, ) def test_check_grad_ingore_y(self): @@ -74,7 +75,7 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), check_prim=self.check_prim, check_prim_pir=self.check_prim_pir, - check_new_ir=True, + check_pir=True, ) def if_check_prim(self): @@ -134,7 +135,7 @@ def test_check_grad_ingore_x(self): max_relative_error=0.1, check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) def test_check_grad_ingore_y(self): @@ -147,7 +148,7 @@ def test_check_grad_ingore_y(self): max_relative_error=0.1, check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -392,12 +393,10 @@ def setUp(self): } def test_check_output(self): - self.check_output(check_dygraph=False, check_new_ir=False) + self.check_output(check_dygraph=False, check_pir=False) def test_check_grad_normal(self): - self.check_grad( - ['X', 'Y'], 'Out', check_dygraph=False, 
check_new_ir=False - ) + self.check_grad(['X', 'Y'], 'Out', check_dygraph=False, check_pir=False) def test_check_grad_ingore_x(self): self.check_grad( @@ -406,7 +405,7 @@ def test_check_grad_ingore_x(self): max_relative_error=0.005, no_grad_set=set("X"), check_dygraph=False, - check_new_ir=False, + check_pir=False, ) def test_check_grad_ingore_y(self): @@ -416,7 +415,7 @@ def test_check_grad_ingore_y(self): max_relative_error=0.005, no_grad_set=set('Y'), check_dygraph=False, - check_new_ir=False, + check_pir=False, ) @@ -452,13 +451,13 @@ def setUp(self): def test_check_output(self): place = core.CUDAPlace(0) self.check_output_with_place( - place, check_dygraph=False, check_new_ir=False + place, check_dygraph=False, check_pir=False ) def test_check_grad_normal(self): place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['X', 'Y'], 'Out', check_dygraph=False, check_new_ir=False + place, ['X', 'Y'], 'Out', check_dygraph=False, check_pir=False ) def test_check_grad_ingore_x(self): @@ -469,7 +468,7 @@ def test_check_grad_ingore_x(self): 'Out', no_grad_set=set("X"), check_dygraph=False, - check_new_ir=False, + check_pir=False, ) def test_check_grad_ingore_y(self): @@ -480,7 +479,7 @@ def test_check_grad_ingore_y(self): 'Out', no_grad_set=set('Y'), check_dygraph=False, - check_new_ir=False, + check_pir=False, ) @@ -846,11 +845,11 @@ def init_input_output(self): self.out = self.x - self.y def test_check_output(self): - self.check_output(check_new_ir=False) + self.check_output(check_pir=False) def test_check_grad_normal(self): self.check_grad( - ['X', 'Y'], 'Out', check_prim=self.check_prim, check_new_ir=False + ['X', 'Y'], 'Out', check_prim=self.check_prim, check_pir=False ) def test_check_grad_ingore_x(self): @@ -859,7 +858,7 @@ def test_check_grad_ingore_x(self): 'Out', no_grad_set=set("X"), check_prim=self.check_prim, - check_new_ir=False, + check_pir=False, ) def test_check_grad_ingore_y(self): @@ -868,7 +867,7 @@ def test_check_grad_ingore_y(self): 'Out', no_grad_set=set('Y'), check_prim=self.check_prim, - check_new_ir=False, + check_pir=False, ) def if_enable_cinn(self): @@ -905,8 +904,9 @@ def test_name(self): y_1 = self._executed_api(x, y, name='subtract_res') self.assertEqual(('subtract_res' in y_1.name), True) + @test_with_pir_api def test_declarative(self): - with base.program_guard(base.Program()): + with paddle.static.program_guard(paddle.static.Program()): def gen_data(): return { @@ -919,7 +919,10 @@ def gen_data(): z = self._executed_api(x, y) place = base.CPUPlace() exe = base.Executor(place) - z_value = exe.run(feed=gen_data(), fetch_list=[z.name]) + if paddle.framework.in_pir_mode(): + z_value = exe.run(feed=gen_data(), fetch_list=[z]) + else: + z_value = exe.run(feed=gen_data(), fetch_list=[z.name]) z_expected = np.array([1.0, -2.0, 2.0]) self.assertEqual((z_value == z_expected).all(), True) diff --git a/test/legacy_test/test_elementwise_tensor_split.py b/test/legacy_test/test_elementwise_tensor_split.py new file mode 100644 index 00000000000000..870dd70f4a5c7e --- /dev/null +++ b/test/legacy_test/test_elementwise_tensor_split.py @@ -0,0 +1,67 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
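The test_declarative hunk above captures the other user-visible executor difference: under in_pir_mode() a fetch target is the Value object itself, whereas the legacy executor expects the variable's string name (PIR values raise on .name access, as the divide test earlier asserts). A minimal sketch of the branch on a CPU executor, with the feed values chosen to reproduce the test's expected [1.0, -2.0, 2.0]:

import unittest

import numpy as np

import paddle
from paddle import base
from paddle.pir_utils import test_with_pir_api


class TestFetchListSketch(unittest.TestCase):
    @test_with_pir_api
    def test_declarative_fetch(self):
        paddle.enable_static()
        with paddle.static.program_guard(paddle.static.Program()):
            x = paddle.static.data(name="x", shape=[3], dtype='float32')
            y = paddle.static.data(name="y", shape=[3], dtype='float32')
            z = paddle.subtract(x, y)
            exe = base.Executor(base.CPUPlace())
            feed = {
                "x": np.array([2, 3, 4], dtype='float32'),
                "y": np.array([1, 5, 2], dtype='float32'),
            }
            if paddle.framework.in_pir_mode():
                # PIR: fetch the Value; string names are unavailable here.
                (z_value,) = exe.run(feed=feed, fetch_list=[z])
            else:
                # Legacy IR: fetch by the variable's name.
                (z_value,) = exe.run(feed=feed, fetch_list=[z.name])
            np.testing.assert_allclose(z_value, [1.0, -2.0, 2.0])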
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import unittest + +import numpy as np + +import paddle +from paddle.base import core + + +class TestElementwiseOp(unittest.TestCase): + def setUp(self): + self.op_type = "elementwise_sub" + self.python_api = paddle.subtract + self.public_python_api = paddle.subtract + self.prim_op_type = "prim" + + def test_float16_sub(self): + if not core.is_compiled_with_cuda(): + return + + gpu_info = paddle.device.cuda.get_device_properties() + + gpu_name = gpu_info.name + try: + re_result = re.split(r'[ , -]', gpu_name) + memory = int(re_result[-1][:-2]) + except: + memory = int(gpu_info.total_memory) // (1000**3) + if memory < 37: # 37GB + return + + paddle.disable_static() + tensor_a = paddle.rand(shape=[5120, 4, 384, 384], dtype="float16") + tensor_b = paddle.rand(shape=[5120, 1, 384, 384], dtype="float16") + tensor_z = paddle.subtract(tensor_a, tensor_b) + + in0, in1 = paddle.split(tensor_a, num_or_sections=2, axis=1) + ( + out0, + out1, + ) = paddle.split(tensor_z, num_or_sections=2, axis=1) + + split_add0 = paddle.subtract(tensor_b, in0) + split_add1 = paddle.subtract(tensor_b, in1) + + result1 = paddle.any(paddle.equal(out0, split_add0), [0, 1, 2, 3]) + result2 = paddle.any(paddle.equal(out1, split_add1), [0, 1, 2, 3]) + np.testing.assert_equal(result1.numpy(), True) + np.testing.assert_equal(result2.numpy(), True) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_empty_op.py b/test/legacy_test/test_empty_op.py index 44e1f2fe30fb62..a49489417878ac 100644 --- a/test/legacy_test/test_empty_op.py +++ b/test/legacy_test/test_empty_op.py @@ -31,7 +31,7 @@ def setUp(self): self.init_config() def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def verify_output(self, outs): data_type = outs[0].dtype @@ -121,7 +121,7 @@ def init_config(self): self.outputs = {'Out': np.zeros(self.shape).astype(dtype)} def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def verify_output(self, outs): data_type = outs[0].dtype @@ -172,7 +172,7 @@ def init_config(self): self.outputs = {'Out': np.zeros(self.shape).astype(dtype)} def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def verify_output(self, outs): data_type = outs[0].dtype @@ -312,7 +312,7 @@ def setUp(self): self.outputs = {'Out': convert_float_to_uint16(output)} def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def verify_output(self, outs): max_value = np.nanmax(outs[0]) diff --git a/test/legacy_test/test_erf_op.py b/test/legacy_test/test_erf_op.py index 24f32175151d65..d66cdc3ce11793 100644 --- a/test/legacy_test/test_erf_op.py +++ b/test/legacy_test/test_erf_op.py @@ -44,10 +44,18 @@ def _init_dtype(self): return "float64" def test_check_output(self): 
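Worth noting in the new split test above: before allocating several multi-gigabyte float16 tensors it checks the GPU's capacity, first by parsing a size out of the device name and then falling back to total_memory from the device properties. Just the fallback guard, as a hedged sketch with the same ~37 GB threshold:

import paddle
from paddle.base import core


def has_enough_gpu_memory(min_gb=37):
    # False on CPU-only builds or on devices below the threshold.
    if not core.is_compiled_with_cuda():
        return False
    gpu_info = paddle.device.cuda.get_device_properties()
    return gpu_info.total_memory // (1000**3) >= min_gb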
- self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad(['X'], 'Out', check_prim=True, check_pir=True) + + def test_check_grad_prim_pir(self): + # Todo(CZ): float64 loss greater than 1e-8 + if self.dtype == "float64": + self.dtype = "float32" + self.rev_comp_atol = 1e-7 + self.rev_comp_rtol = 1e-7 + self.check_grad(['X'], 'Out', check_prim_pir=True) class TestErfOp_ZeroDim(TestErfOp): @@ -93,10 +101,16 @@ def setUp(self): self.outputs = {'Out': y_ref} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) @unittest.skipIf( @@ -121,12 +135,17 @@ def setUp(self): def test_check_output(self): place = paddle.base.core.CUDAPlace(0) - self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) def test_check_grad(self): place = paddle.base.core.CUDAPlace(0) self.check_grad_with_place( - place, ['X'], 'Out', check_prim=True, check_new_ir=True + place, + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, ) diff --git a/test/legacy_test/test_expand_as_op.py b/test/legacy_test/test_expand_as_op.py deleted file mode 100755 index b5b8013a2c9c63..00000000000000 --- a/test/legacy_test/test_expand_as_op.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
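The new test_check_grad_prim_pir in the erf test shows the escape hatch used when the composite (prim) backward replayed under PIR cannot meet the harness's default float64 thresholds: drop to float32 and relax rev_comp_atol / rev_comp_rtol before calling check_grad with check_prim_pir=True. A sketch of the idea, assuming scipy's erf as the reference implementation the way such tests typically do:

import numpy as np
from op_test import OpTest
from scipy.special import erf  # assumed reference implementation

import paddle


class TestErfPrimPirSketch(OpTest):
    def setUp(self):
        self.op_type = "erf"
        self.python_api = paddle.erf
        self.public_python_api = paddle.erf
        self.prim_op_type = "prim"  # enables composite (prim) checking
        x = np.random.uniform(-1, 1, [11, 17]).astype("float32")
        self.inputs = {'X': x}
        self.outputs = {'Out': erf(x)}

    def test_check_grad_prim_pir(self):
        # Relaxed reverse-composite tolerances, mirroring the patch above.
        self.rev_comp_atol = 1e-7
        self.rev_comp_rtol = 1e-7
        self.check_grad(['X'], 'Out', check_prim_pir=True)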
- -import unittest - -import numpy as np -from op_test import OpTest - - -def bcast(x, target_tensor): - x_dims = x.shape - y_dims = target_tensor.shape - bcast_dims = [] - for i in range(len(x_dims)): - bcast_dims.append(int(y_dims[i] / x_dims[i])) - bcast_dims = np.array(bcast_dims).astype("int64") - return bcast_dims - - -class TestExpandAsOpRank1(OpTest): - def setUp(self): - self.op_type = "expand_as" - x = np.random.rand(100).astype("float64") - target_tensor = np.random.rand(200).astype("float64") - self.inputs = {'X': x, 'target_tensor': target_tensor} - self.attrs = {} - bcast_dims = bcast(x, target_tensor) - output = np.tile(self.inputs['X'], bcast_dims) - self.outputs = {'Out': output} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class TestExpandAsOpRank2(OpTest): - def setUp(self): - self.op_type = "expand_as" - x = np.random.rand(10, 12).astype("float64") - target_tensor = np.random.rand(20, 24).astype("float64") - self.inputs = {'X': x, 'target_tensor': target_tensor} - self.attrs = {} - bcast_dims = bcast(x, target_tensor) - output = np.tile(self.inputs['X'], bcast_dims) - self.outputs = {'Out': output} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class TestExpandAsOpRank3(OpTest): - def setUp(self): - self.op_type = "expand_as" - x = np.random.rand(2, 3, 20).astype("float64") - target_tensor = np.random.rand(4, 6, 40).astype("float64") - self.inputs = {'X': x, 'target_tensor': target_tensor} - self.attrs = {} - bcast_dims = bcast(x, target_tensor) - output = np.tile(self.inputs['X'], bcast_dims) - self.outputs = {'Out': output} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class TestExpandAsOpRank4(OpTest): - def setUp(self): - self.op_type = "expand_as" - x = np.random.rand(1, 1, 7, 16).astype("float64") - target_tensor = np.random.rand(4, 6, 14, 32).astype("float64") - self.inputs = {'X': x, 'target_tensor': target_tensor} - self.attrs = {} - bcast_dims = bcast(x, target_tensor) - output = np.tile(self.inputs['X'], bcast_dims) - self.outputs = {'Out': output} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_expand_v2_op.py b/test/legacy_test/test_expand_v2_op.py index f7ba37fb60cbb4..988043d472e252 100644 --- a/test/legacy_test/test_expand_v2_op.py +++ b/test/legacy_test/test_expand_v2_op.py @@ -47,10 +47,16 @@ def if_enable_cinn(self): pass def test_check_output(self): - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) class TestExpandV2OpRank1_ZeroDim1(TestExpandV2OpRank1): @@ -130,10 +136,10 @@ def init_data(self): self.infer_expand_shape = [-1] def test_check_output(self): - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_cinn=True, check_new_ir=True) + self.check_grad(['X'], 'Out', check_cinn=True, check_pir=True) class TestExpandV2OpRank2_Corner_tensor_attr(TestExpandV2OpRank1_tensor_attr): @@ -167,10 +173,10 @@ def 
init_data(self): self.expand_shape = [2, 100] def test_check_output(self): - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_cinn=True, check_new_ir=True) + self.check_grad(['X'], 'Out', check_cinn=True, check_pir=True) # Situation 4: input x is Integer @@ -188,7 +194,7 @@ def setUp(self): self.outputs = {'Out': output} def test_check_output(self): - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) # Situation 5: input x is Bool @@ -204,7 +210,7 @@ def setUp(self): self.outputs = {'Out': output} def test_check_output(self): - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) # Situation 6: input x is Integer @@ -222,7 +228,7 @@ def setUp(self): self.outputs = {'Out': output} def test_check_output(self): - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) # Situation 7: input x is Float16 @@ -244,7 +250,13 @@ def test_check_output(self): self.check_output(check_cinn=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) # Situation 8: input x is BF16 @@ -268,12 +280,17 @@ def setUp(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_cinn=True, check_new_ir=True) + self.check_output_with_place(place, check_cinn=True, check_pir=True) def test_check_grad(self): place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['X'], 'Out', check_prim=True, check_new_ir=True + place, + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, ) @@ -438,7 +455,7 @@ def test_check_output(self): self.check_output(check_prim=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True) + self.check_grad(['X'], 'Out', check_prim=True, check_prim_pir=True) class TestExpandV2OpCompRank2_DimExpanding(TestExpandV2CompOpRank1): diff --git a/test/legacy_test/test_exponential_op.py b/test/legacy_test/test_exponential_op.py index de92243084ffbe..1df9276590a0f2 100644 --- a/test/legacy_test/test_exponential_op.py +++ b/test/legacy_test/test_exponential_op.py @@ -37,7 +37,7 @@ def config(self): self.dtype = "float64" def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def verify_output(self, outs): hist1, _ = np.histogram(outs[0], range=(0, 5)) @@ -360,7 +360,7 @@ def config(self): self.dtype = np.float16 def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def verify_output(self, outs): hist1, _ = np.histogram(outs[0], range=(0, 5)) @@ -411,7 +411,7 @@ def config(self): def test_check_output(self): place = core.CUDAPlace(0) self.check_output_with_place_customized( - checker=self.verify_output, place=place + checker=self.verify_output, place=place, check_pir=True ) def verify_output(self, outs): diff --git a/test/legacy_test/test_fake_dequantize_op.py b/test/legacy_test/test_fake_dequantize_op.py index ee2f7f7b0820ab..9fc5f3500844f1 100644 --- a/test/legacy_test/test_fake_dequantize_op.py +++ b/test/legacy_test/test_fake_dequantize_op.py @@ -247,7 +247,7 @@ def 
setUp(self): self.outputs = {'Y': ydq} def test_check_output(self): - self.check_output() + self.check_output(check_dygraph=False) class TestChannelWiseDequantizeOp1(TestChannelWiseDequantizeOp): @@ -281,7 +281,7 @@ def setUp(self): self.outputs = {'Y': ydq} def test_check_output(self): - self.check_output() + self.check_output(check_dygraph=False) class TestDequantizeOpDouble(TestDequantizeOp): diff --git a/test/legacy_test/test_fill_any_like_op.py b/test/legacy_test/test_fill_any_like_op.py index ebcbd575384212..a60ab183e36cd8 100644 --- a/test/legacy_test/test_fill_any_like_op.py +++ b/test/legacy_test/test_fill_any_like_op.py @@ -58,7 +58,7 @@ def init(self): pass def test_check_output(self): - self.check_output(check_prim=True, check_new_ir=True) + self.check_output(check_prim=True, check_pir=True) def if_enable_cinn(self): pass @@ -96,7 +96,7 @@ def setUp(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_prim=True, check_new_ir=True) + self.check_output_with_place(place, check_prim=True, check_pir=True) def if_enable_cinn(self): pass diff --git a/test/legacy_test/test_fill_constant_op.py b/test/legacy_test/test_fill_constant_op.py index 9f354b5d992767..7ea153d627cbdb 100644 --- a/test/legacy_test/test_fill_constant_op.py +++ b/test/legacy_test/test_fill_constant_op.py @@ -21,6 +21,7 @@ import paddle from paddle import base from paddle.base import Program, core, program_guard +from paddle.pir_utils import test_with_pir_api def fill_wrapper(shape, value=0.0): @@ -44,7 +45,7 @@ def setUp(self): self.outputs = {'Out': np.full(self.shape, self.value)} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def init_dtype(self): self.dtype = np.float64 @@ -115,7 +116,7 @@ def setUp(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) class TestFillConstantOpWithSelectedRows(unittest.TestCase): @@ -168,7 +169,7 @@ def init_data(self): self.value = 3.8 def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class TestFillConstantOp2_ShapeTensorList(OpTest): @@ -192,7 +193,7 @@ def init_data(self): self.infer_shape = [-1, -1] def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class TestFillConstantOp3_ShapeTensorList(TestFillConstantOp1_ShapeTensorList): @@ -226,7 +227,7 @@ def init_data(self): self.value = 3.8 def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) # Situation 4: value is a tensor @@ -250,7 +251,7 @@ def init_data(self): self.dtype = np.float32 def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) # Situation 5: value is a tensor @@ -274,12 +275,14 @@ def init_data(self): self.dtype = np.int32 def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) # Test python API class TestFillConstantAPI(unittest.TestCase): + @test_with_pir_api def test_api(self): + paddle.enable_static() positive_2_int32 = paddle.tensor.fill_constant([1], "int32", 2) positive_2_int64 = paddle.tensor.fill_constant([1], "int64", 2) @@ -330,7 +333,7 @@ def test_api(self): exe = base.Executor(place=base.CPUPlace()) res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8 = exe.run( - base.default_main_program(), + 
paddle.static.default_main_program(), feed={ "shape_tensor_int32": np.array([1, 2]).astype("int32"), "shape_tensor_int64": np.array([1, 2]).astype("int64"), @@ -487,6 +490,58 @@ def test_shape_tensor_list_dtype(): self.assertRaises(TypeError, test_shape_tensor_list_dtype) + with paddle.pir_utils.IrGuard(), program_guard(Program()): + x1 = paddle.static.data(name='x1', shape=[-1, 1], dtype="int16") + self.assertRaises( + TypeError, + paddle.tensor.fill_constant, + shape=[1], + value=5, + dtype='uint4', + ) + + self.assertRaises( + ValueError, + paddle.tensor.fill_constant, + shape=[1.1], + value=5, + dtype='float32', + out=x1, + ) + + x3 = np.random.randn(100, 100).astype('int32') + self.assertRaises( + ValueError, + paddle.tensor.fill_constant, + shape=[100, 100], + value=5, + dtype='float64', + out=x3, + ) + + def test_pir_errors(self): + def test_shape_type(): + # The shape dtype of fill_constant_op must be int32 or int64. + # test_shape_tensor_dtype: + with paddle.pir_utils.IrGuard(): + new_ir_program = paddle.static.Program() + with paddle.static.program_guard(new_ir_program): + shape = paddle.static.data( + name="shape_tensor", shape=[2], dtype="int32" + ) + out = paddle.tensor.fill_constant( + shape=shape, dtype="float32", value=1 + ) + exe = base.Executor(place=base.CPUPlace()) + exe.run( + feed={ + "shape_tensor": np.array([1, 2]).astype("float32") + }, + fetch_list=[out], + ) + + self.assertRaises(ValueError, test_shape_type) + class TestFillConstantOp_ValueTensorBf16(OpTest): def setUp(self): @@ -513,7 +568,7 @@ def init_data(self): def test_check_output(self): # no dynamic graph test for mkldnn self.check_output_with_place( - core.CPUPlace(), check_dygraph=False, check_new_ir=False + core.CPUPlace(), check_dygraph=False, check_pir=False ) diff --git a/test/legacy_test/test_fill_op.py b/test/legacy_test/test_fill_op.py deleted file mode 100644 index 679ee25e041ab5..00000000000000 --- a/test/legacy_test/test_fill_op.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
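The fill_constant additions are the one spot in this patch that exercises PIR's error behaviour directly: inside IrGuard() the op is expected to reject bad dtype strings and non-integer shapes eagerly, and the separate test_pir_errors case expects a run-time ValueError when the shape tensor is fed float data. A boiled-down sketch of the eager-rejection style, with the dtype strings taken from the patch and the rest illustrative:

import unittest

import paddle


class TestFillConstantPirErrorsSketch(unittest.TestCase):
    def test_errors(self):
        paddle.enable_static()
        with paddle.pir_utils.IrGuard(), paddle.static.program_guard(
            paddle.static.Program()
        ):
            x1 = paddle.static.data(name='x1', shape=[-1, 1], dtype="int16")
            # An unsupported dtype string is rejected up front.
            self.assertRaises(
                TypeError,
                paddle.tensor.fill_constant,
                shape=[1],
                value=5,
                dtype='uint4',
            )
            # A non-integer shape entry is rejected as well.
            self.assertRaises(
                ValueError,
                paddle.tensor.fill_constant,
                shape=[1.1],
                value=5,
                dtype='float32',
                out=x1,
            )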
- -import unittest - -import numpy as np -from op import Operator -from op_test import OpTest, convert_float_to_uint16 - -import paddle -from paddle import base -from paddle.base import core - - -class TestFillOp1(OpTest): - def setUp(self): - self.op_type = "fill" - self.init_dtype() - self.init_shape() - self.init_value() - self.inputs = {} - self.attrs = { - 'value': self.val.flatten().tolist(), - 'shape': self.shape, - 'dtype': int(core.VarDesc.VarType.FP64), - 'force_cpu': False, - } - self.outputs = {'Out': self.val.astype('float64')} - - def init_shape(self): - self.shape = [100, 200] - - def init_value(self): - self.val = np.random.random(size=self.shape) - - def init_dtype(self): - self.dtype = np.float64 - - def test_check_output(self): - self.check_output() - - -class TestFillOp2(OpTest): - def setUp(self): - self.op_type = "fill" - self.val = np.random.random(size=[100, 200]) - self.inputs = {} - self.attrs = { - 'value': self.val.flatten().tolist(), - 'shape': [100, 200], - 'dtype': int(core.VarDesc.VarType.FP64), - 'force_cpu': True, - } - self.outputs = {'Out': self.val.astype('float64')} - - def test_check_output(self): - self.check_output() - - -class TestFillOp3(unittest.TestCase): - def check_with_place(self, place, f_cpu): - scope = core.Scope() - # create Out Variable - out = scope.var('Out').get_tensor() - - # create and run fill_op operator - val = np.random.random(size=[300, 200]) - fill_op = Operator( - "fill", - value=val.flatten(), - shape=[300, 200], - dtype=int(core.VarDesc.VarType.FP32), - force_cpu=f_cpu, - Out='Out', - ) - fill_op.run(scope, place) - - # get result from Out - result_array = np.array(out) - full_array = np.array(val, 'float32') - - np.testing.assert_array_equal(result_array, full_array) - - def test_fill_op(self): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - - for place in places: - self.check_with_place(place, True) - self.check_with_place(place, False) - - -class TestFillFP16OP(TestFillOp1): - def init_dtype(self): - self.dtype = np.float16 - - -class TestFillInf(TestFillOp1): - def init_value(self): - self.val = np.full(fill_value=np.inf, shape=self.shape) - - -class TestFillOpError(unittest.TestCase): - def test_errors(self): - with base.dygraph.base.guard(): - - def test_nan_fill_value(): - tensor = paddle.zeros(shape=[100, 200]) - tensor.fill_(np.nan) - - self.assertRaises(ValueError, test_nan_fill_value) - - -@unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_bfloat16_supported(core.CUDAPlace(0)), - "core is not complied with CUDA and not support the bfloat16", -) -class TestFillBF16OP(OpTest): - def setUp(self): - self.op_type = "fill" - self.dtype = np.uint16 - val = np.random.random(size=[100, 200]) - self.inputs = {} - self.attrs = { - 'value': val.flatten().tolist(), - 'shape': [100, 200], - 'dtype': int(core.VarDesc.VarType.BF16), - 'force_cpu': False, - } - self.outputs = {'Out': convert_float_to_uint16(val)} - - def test_check_output(self): - place = core.CUDAPlace(0) - self.check_output_with_place(place) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_filter_by_instag_op.py b/test/legacy_test/test_filter_by_instag_op.py deleted file mode 100644 index 211889feaa06b0..00000000000000 --- a/test/legacy_test/test_filter_by_instag_op.py +++ /dev/null @@ -1,279 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""This is unit test of Test filter_instag Op.""" - -import unittest - -import numpy as np -from op_test import OpTest - -"""This is Test Case 1""" - - -class TestFilterByInstagOp(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - x1 = np.zeros((36, 4), dtype=np.float64) - for i in range(36): - for j in range(4): - x1[i, j] = i - x1_lod = [[1, 2, 3, 4, 5, 6, 7, 8]] - - x2 = np.array([[1], [2], [1], [2], [1], [2], [1], [2]]).astype('int64') - x2_lod = [[1, 1, 1, 1, 1, 1, 1, 1]] - - x3 = np.array([2]).astype('int64') - - out = np.zeros((20, 4), dtype=np.float64) - out_lod = [[2, 4, 6, 8]] - start_num_lst = [1, 6, 15, 28] - - ln = 0 - for i in range(4): - start = start_num_lst[i] - len = out_lod[0][i] - for j in range(len): - cur = start + j - for k in range(4): - out[ln, k] = cur - ln += 1 - - mmap = np.array([[0, 1, 2], [2, 6, 4], [6, 15, 6], [12, 28, 8]]).astype( - 'int64' - ) - mmap_lod = [[1, 1, 1, 1]] - - loss_weight = np.array([[1], [1], [1], [1]]).astype('double') - - self.inputs = { - 'Ins': (x1, x1_lod), - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - - self.attrs = {'is_lod': True, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Ins'], 'Out', no_grad_set={'Ins_tag', 'Filter_tag'}) - - -"""This is Test Case 2""" - - -class TestFilterByInstagOp2(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - - x1 = np.random.random((4, 36)).astype('double') - x1_lod = [[1, 1, 1, 1]] - - x2 = np.array([[2], [1], [2], [1]]).astype('int64') - x2_lod = [[1, 1, 1, 1]] - - x3 = np.array([1]).astype('int64') - - out = np.zeros([2, 36]).astype('double') - out[0] = x1[1] - out[1] = x1[3] - out_lod = [[1, 1]] - - mmap = np.array([[0, 1, 1], [1, 3, 1]]).astype('int64') - mmap_lod = [[1, 1]] - - loss_weight = np.array([[1], [1]]).astype('double') - self.inputs = { - 'Ins': (x1, x1_lod), - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - self.attrs = {'is_lod': True, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Ins'], 'Out', no_grad_set={'Ins_tag', 'Filter_tag'}) - - -"""This is Test Case 3""" - - -class TestFilterByInstagOp3(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - - x1 = np.random.random((4, 36)).astype('double') - x1_lod = [[1, 1, 1, 1]] - - x2 = np.array([[2], [1], [2], [1]]).astype('int64') - x2_lod = [[1, 1, 1, 1]] - - x3 = np.array([3]).astype('int64') - - out = np.zeros((1, 36)).astype('double') - out_lod = [[1]] - - mmap = np.array([[0, 1, 1]]).astype('int64') - mmap_lod = [[1]] - - loss_weight = np.array([[0]]).astype('double') - self.inputs = { - 'Ins': (x1, x1_lod), - 'Ins_tag': (x2, 
x2_lod), - 'Filter_tag': x3, - } - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - self.attrs = {'is_lod': True, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Ins'], 'Out', no_grad_set={'Ins_tag', 'Filter_tag'}) - - -"""This is Test Case 4""" - - -class TestFilterByInstagOp4(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - - x1 = np.random.random((4, 36)).astype('double') - - x2 = np.array([[2], [1], [2], [1]]).astype('int64') - x2_lod = [[1, 1, 1, 1]] - - x3 = np.array([3]).astype('int64') - - out = np.zeros((1, 36)).astype('double') - out_lod = [[1]] - - mmap = np.array([[0, 1, 1]]).astype('int64') - mmap_lod = [[1]] - - loss_weight = np.array([[0]]).astype('double') - self.inputs = { - 'Ins': x1, - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - self.attrs = {'is_lod': False, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Ins'], 'Out', no_grad_set={'Ins_tag', 'Filter_tag'}) - - -class TestFilterByInstagOp6(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - - x1 = np.random.random((4, 36)).astype('int64') - - x2 = np.array([[2], [1], [2], [1]]).astype('int64') - x2_lod = [[1, 1, 1, 1]] - - x3 = np.array([3]).astype('int64') - - out = np.zeros((1, 36)).astype('double') - out_lod = [[1]] - - mmap = np.array([[0, 1, 1]]).astype('int64') - mmap_lod = [[1]] - - loss_weight = np.array([[0]]).astype('double') - self.inputs = { - 'Ins': x1, - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - self.attrs = {'is_lod': False, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - pass - - -class TestFilterByInstagOp7(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - - x1 = np.random.random((4, 36)).astype('int32') - - x2 = np.array([[2], [1], [2], [1]]).astype('int64') - x2_lod = [[1, 1, 1, 1]] - - x3 = np.array([3]).astype('int64') - - out = np.zeros((1, 36)).astype('double') - out_lod = [[1]] - - mmap = np.array([[0, 1, 1]]).astype('int64') - mmap_lod = [[1]] - - loss_weight = np.array([[0]]).astype('double') - self.inputs = { - 'Ins': x1, - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - self.attrs = {'is_lod': False, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - pass - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_flatten_contiguous_range_op.py b/test/legacy_test/test_flatten_contiguous_range_op.py index d497da1cd27582..82ba03f559efc2 100644 --- a/test/legacy_test/test_flatten_contiguous_range_op.py +++ b/test/legacy_test/test_flatten_contiguous_range_op.py @@ -19,6 +19,7 @@ import paddle from paddle.base import core +from paddle.pir_utils import test_with_pir_api class TestFlattenOp(OpTest): @@ -46,18 +47,27 @@ def if_enable_cinn(self): def test_check_output(self): if str(self.dtype) in {"float16", "uint16"}: self.check_output_with_place( - core.CUDAPlace(0), no_check_set=["XShape"], 
check_prim=True + core.CUDAPlace(0), + no_check_set=["XShape"], + check_prim=True, + check_pir=True, ) else: - self.check_output(no_check_set=["XShape"], check_prim=True) + self.check_output( + no_check_set=["XShape"], check_prim=True, check_pir=True + ) def test_check_grad(self): if str(self.dtype) in {"float16", "uint16"}: self.check_grad_with_place( - core.CUDAPlace(0), ["X"], "Out", check_prim=True + core.CUDAPlace(0), + ["X"], + "Out", + check_prim=True, + check_pir=True, ) else: - self.check_grad(["X"], "Out", check_prim=True) + self.check_grad(["X"], "Out", check_prim=True, check_pir=True) def init_test_case(self): self.in_shape = (3, 2, 5, 4) @@ -452,6 +462,7 @@ class TestStaticFlattenPythonAPI(unittest.TestCase): def execute_api(self, x, start_axis=0, stop_axis=-1): return paddle.flatten(x, start_axis, stop_axis) + @test_with_pir_api def test_static_api(self): paddle.enable_static() np_x = np.random.rand(2, 3, 4, 4).astype('float32') @@ -472,6 +483,7 @@ class TestStaticFlattenInferShapePythonAPI(unittest.TestCase): def execute_api(self, x, start_axis=0, stop_axis=-1): return paddle.flatten(x, start_axis, stop_axis) + @test_with_pir_api def test_static_api(self): paddle.enable_static() main_prog = paddle.static.Program() @@ -480,7 +492,7 @@ def test_static_api(self): name="x", shape=[-1, 3, -1, -1], dtype='float32' ) out = self.execute_api(x, start_axis=2, stop_axis=3) - self.assertTrue((-1, 3, -1) == out.shape) + self.assertTrue((-1, 3, -1) == tuple(out.shape)) class TestStaticInplaceFlattenPythonAPI(TestStaticFlattenPythonAPI): diff --git a/test/legacy_test/test_flatten_op.py b/test/legacy_test/test_flatten_op.py deleted file mode 100644 index f59c6a91028d1e..00000000000000 --- a/test/legacy_test/test_flatten_op.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
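
A note on the tuple(out.shape) change in test_flatten_contiguous_range_op.py above: with @test_with_pir_api the assertion also runs under PIR, where the static shape is presumably surfaced as a list rather than a tuple, and Python treats a tuple and a list as unequal even when their elements match. A minimal sketch of the pitfall (plain Python, no Paddle behavior assumed):

    # A tuple never compares equal to a list, even when elementwise identical:
    assert (-1, 3, -1) != [-1, 3, -1]
    # Normalizing one side makes the shape assertion IR-agnostic:
    assert (-1, 3, -1) == tuple([-1, 3, -1])
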
- -import unittest - -import numpy as np -from op_test import OpTest, paddle_static_guard - -import paddle - - -class TestFlattenOp(OpTest): - def setUp(self): - self.op_type = "flatten" - self.init_test_case() - self.inputs = {"X": np.random.random(self.in_shape).astype("float64")} - self.init_attrs() - self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - def init_test_case(self): - self.in_shape = (3, 2, 2, 10) - self.axis = 1 - self.new_shape = (3, 40) - - def init_attrs(self): - self.attrs = {"axis": self.axis} - - -class TestFlattenOp1(TestFlattenOp): - def init_test_case(self): - self.in_shape = (3, 2, 2, 10) - self.axis = 0 - self.new_shape = (1, 120) - - -class TestFlattenOpWithDefaultAxis(TestFlattenOp): - def init_test_case(self): - self.in_shape = (10, 2, 2, 3) - self.new_shape = (10, 12) - - def init_attrs(self): - self.attrs = {} - - -class TestFlattenOpSixDims(TestFlattenOp): - def init_test_case(self): - self.in_shape = (3, 2, 3, 2, 4, 4) - self.axis = 4 - self.new_shape = (36, 16) - - -class TestFlattenOpFP16(unittest.TestCase): - def test_fp16_with_gpu(self): - if paddle.base.core.is_compiled_with_cuda(): - with paddle_static_guard(): - place = paddle.CUDAPlace(0) - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - input = np.random.random([12, 14]).astype("float16") - x = paddle.static.data( - name="x", shape=[12, 14], dtype="float16" - ) - - y = paddle.flatten(x) - - exe = paddle.static.Executor(place) - res = exe.run( - paddle.static.default_main_program(), - feed={ - "x": input, - }, - fetch_list=[y], - ) - - np.testing.assert_array_equal(res[0].shape, [12 * 14]) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_full_like_op.py b/test/legacy_test/test_full_like_op.py index 137e536126bb46..5cbcc3f5c78aa1 100644 --- a/test/legacy_test/test_full_like_op.py +++ b/test/legacy_test/test_full_like_op.py @@ -148,7 +148,7 @@ def init_data(self): self.dtype = np.float32 def test_check_output(self): - self.check_output(check_prim=True, check_new_ir=True) + self.check_output(check_prim=True, check_pir=True, check_prim_pir=True) def if_enable_cinn(self): pass diff --git a/test/legacy_test/test_fused_rotary_position_embedding.py b/test/legacy_test/test_fused_rotary_position_embedding.py index 5be92f6f9b7056..d201b9d76e8d3a 100644 --- a/test/legacy_test/test_fused_rotary_position_embedding.py +++ b/test/legacy_test/test_fused_rotary_position_embedding.py @@ -31,7 +31,7 @@ def deal_qkv(init_q, init_k, init_v): def mult_qkv(value, cos_tensor, sin_tensor): rotate_half_q = paddle.reshape( - paddle.stack([value[:, :, :, 1::2], value[:, :, :, 0::2]], axis=-1), + paddle.stack([-value[:, :, :, 1::2], value[:, :, :, 0::2]], axis=-1), paddle.shape(value), ) query = paddle.add( @@ -59,7 +59,7 @@ def mult_qkv_rotate_half(value, cos_tensor, sin_tensor): return query -def get_sin_cos_tensor(seq_len, head_dim, sign): +def get_sin_cos_tensor(seq_len, head_dim, sign=1): pos_seq = paddle.arange(0, seq_len, 1, dtype="float32") indices = paddle.arange(0, head_dim, 2, dtype="float32") @@ -93,15 +93,18 @@ def get_sin_cos_tensor(seq_len, head_dim, sign): def paddle_fused_rotary_position_embedding( - init_q, init_k, init_v, position_ids=None, use_neox_rotary_style=True + init_q, + init_k, + init_v, + sin_tensor=None, + cos_tensor=None, + position_ids=None, + use_neox_rotary_style=True, ): # permute 
q, k, v from [batch_size, seq_len, num_heads, head_dim] # to [batch_size, num_heads, seq_len, head_dim] q, k, v = deal_qkv(init_q, init_k, init_v) - sign = -1 if use_neox_rotary_style else 1 - sin_tensor, cos_tensor = get_sin_cos_tensor(q.shape[2], q.shape[3], sign) - if position_ids is not None: sin_tensor = sin_tensor.squeeze(axis=[0, 2]) # [seq_len, dim] cos_tensor = cos_tensor.squeeze(axis=[0, 2]) # [seq_len, dim] @@ -146,60 +149,45 @@ def get_paddle_tensor(self): tmp.stop_gradient = False return tmp + def get_inputs(self, seed, with_sin_cos): + paddle.disable_static() + paddle.seed(seed) + tensor_q = self.get_paddle_tensor() + tensor_k = self.get_paddle_tensor() + tensor_v = self.get_paddle_tensor() + + tensor_sin, tensor_cos = ( + get_sin_cos_tensor(tensor_q.shape[1], tensor_q.shape[3], 1) + if with_sin_cos + else (None, None) + ) + return tensor_q, tensor_k, tensor_v, tensor_sin, tensor_cos + def get_forward_backward( self, rope_function, seed, - flag=False, + with_sin_cos=True, use_neox_rotary_style=True, position_ids=None, ): paddle.disable_static() - paddle.seed(seed) fw = [] bw = [] - tensor_q = self.get_paddle_tensor() - tensor_k = self.get_paddle_tensor() - tensor_v = self.get_paddle_tensor() - if use_neox_rotary_style: - if flag: - tensor_sin, tensor_cos = get_sin_cos_tensor( - tensor_q.shape[1], tensor_q.shape[3], 1 - ) - out_q, out_k, out_v = rope_function( - tensor_q, - tensor_k, - tensor_v, - tensor_sin, - tensor_cos, - position_ids=position_ids, - ) - else: - out_q, out_k, out_v = rope_function( - tensor_q, tensor_k, tensor_v, position_ids=position_ids - ) - else: - if flag: - tensor_sin, tensor_cos = get_sin_cos_tensor( - tensor_q.shape[1], tensor_q.shape[3], 1 - ) - out_q, out_k, out_v = rope_function( - tensor_q, - tensor_k, - tensor_v, - tensor_sin, - tensor_cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) - else: - out_q, out_k, out_v = rope_function( - tensor_q, - tensor_k, - tensor_v, - position_ids=position_ids, - use_neox_rotary_style=False, - ) + + tensor_q, tensor_k, tensor_v, tensor_sin, tensor_cos = self.get_inputs( + seed, with_sin_cos + ) + + out_q, out_k, out_v = rope_function( + tensor_q, + tensor_k, + tensor_v, + tensor_sin, + tensor_cos, + position_ids=position_ids, + use_neox_rotary_style=use_neox_rotary_style, + ) fw.append(out_q) fw.append(out_k) @@ -208,6 +196,7 @@ def get_forward_backward( out_gq = paddle.randn(out_q.shape, self.dtype) out_gk = paddle.randn(out_q.shape, self.dtype) out_gv = paddle.randn(out_q.shape, self.dtype) + paddle.autograd.backward( [out_q, out_k, out_v], [out_gq, out_gk, out_gv], True ) @@ -234,10 +223,14 @@ def test_fused_rope(self): def test_fused_rope_with_sin_cos(self): p_fw, p_bw = self.get_forward_backward( - paddle_fused_rotary_position_embedding, seed=self.seed + paddle_fused_rotary_position_embedding, + seed=self.seed, + with_sin_cos=True, ) f_fw, f_bw = self.get_forward_backward( - fused_rotary_position_embedding, seed=self.seed, flag=True + fused_rotary_position_embedding, + seed=self.seed, + with_sin_cos=True, ) for i in range(len(p_fw)): np.testing.assert_allclose( @@ -278,7 +271,6 @@ def test_fused_rope_position_ids(self): f_fw, f_bw = self.get_forward_backward( fused_rotary_position_embedding, seed=self.seed, - flag=True, position_ids=position_ids, ) for i in range(len(p_fw)): @@ -289,13 +281,59 @@ def test_fused_rope_position_ids(self): p_bw[i].numpy(), f_bw[i].numpy(), rtol=1e-05 ) - def test_error(self): + def test_static(self): + tensor_q, tensor_k, tensor_v, tensor_sin, tensor_cos = 
self.get_inputs( + self.seed, True + ) + p_fw, p_bw = self.get_forward_backward( + paddle_fused_rotary_position_embedding, + seed=self.seed, + use_neox_rotary_style=False, + ) + paddle.enable_static() - with self.assertRaises(RuntimeError): - static_q = paddle.static.data( - name="q", shape=self.shape, dtype=self.dtype - ) - fused_rotary_position_embedding(static_q, static_q, static_q) + + q = paddle.static.data(name="q", shape=self.shape, dtype=self.dtype) + k = paddle.static.data(name="k", shape=self.shape, dtype=self.dtype) + v = paddle.static.data(name="v", shape=self.shape, dtype=self.dtype) + sin = paddle.static.data( + name="sin", + shape=(1, tensor_q.shape[1], 1, tensor_q.shape[3]), + dtype=self.dtype, + ) + cos = paddle.static.data( + name="cos", + shape=(1, tensor_q.shape[1], 1, tensor_q.shape[3]), + dtype=self.dtype, + ) + + out_q, out_k, out_v = fused_rotary_position_embedding( + q, + k, + v, + sin, + cos, + position_ids=None, + use_neox_rotary_style=False, + ) + + exe = paddle.static.Executor() + + feed = { + 'q': tensor_q.numpy(), + 'k': tensor_k.numpy(), + 'v': tensor_v.numpy(), + 'sin': tensor_sin.numpy(), + 'cos': tensor_cos.numpy(), + } + outs = exe.run( + paddle.static.default_main_program(), + feed=feed, + fetch_list=[out_q, out_k, out_v], + ) + + for i in range(3): + np.testing.assert_allclose(p_fw[i].numpy(), outs[i], rtol=1e-05) paddle.disable_static() diff --git a/test/legacy_test/test_fusion_transpose_flatten_concat_op.py b/test/legacy_test/test_fusion_transpose_flatten_concat_op.py index de557e4c4a52ed..a0ef5e25b58b69 100644 --- a/test/legacy_test/test_fusion_transpose_flatten_concat_op.py +++ b/test/legacy_test/test_fusion_transpose_flatten_concat_op.py @@ -54,7 +54,7 @@ def setUp(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, 1e-6) + self.check_output_with_place(place, 1e-6, check_dygraph=False) def init_test_case(self): self.shapes = [(3, 4, 17, 17), (3, 8, 7, 7), (3, 12, 5, 5)] diff --git a/test/legacy_test/test_gather_nd_op.py b/test/legacy_test/test_gather_nd_op.py index a10faff2ac1f35..3a27faf99cb6b8 100644 --- a/test/legacy_test/test_gather_nd_op.py +++ b/test/legacy_test/test_gather_nd_op.py @@ -53,10 +53,16 @@ def config_dtype(self): self.dtype = np.float64 def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) class TestGatherNdOpWithEmptyIndexFP16(TestGatherNdOpWithEmptyIndex): @@ -75,12 +81,17 @@ def config_dtype(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) def test_check_grad(self): place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['X'], 'Out', check_prim=True, check_new_ir=True + place, + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, ) @@ -114,10 +125,16 @@ def config_dtype(self): self.dtype = np.float64 def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) class TestGatherNdOpWithIndex1_ZeroDim(TestGatherNdOpWithIndex1): @@ -163,12 +180,17 @@ def 
config_dtype(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) def test_check_grad(self): place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['X'], 'Out', check_prim=True, check_new_ir=True + place, + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, ) @@ -202,10 +224,16 @@ def config_dtype(self): self.dtype = np.float64 def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) class TestGatherNdOpWithLowIndexFP16(TestGatherNdOpWithLowIndex): @@ -224,7 +252,7 @@ def config_dtype(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) def test_check_grad(self): place = core.CUDAPlace(0) @@ -233,8 +261,9 @@ def test_check_grad(self): ['X'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, numeric_grad_delta=0.5, + check_prim_pir=True, ) @@ -273,15 +302,16 @@ def config_dtype(self): self.dtype = np.float64 def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): self.check_grad( ['X'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, numeric_grad_delta=0.05, + check_prim_pir=True, ) @@ -301,7 +331,7 @@ def config_dtype(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) def test_check_grad(self): place = core.CUDAPlace(0) @@ -310,8 +340,9 @@ def test_check_grad(self): ['X'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, numeric_grad_delta=0.5, + check_prim_pir=True, ) @@ -342,10 +373,16 @@ def config_dtype(self): self.dtype = np.float64 def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) class TestGatherNdOpWithSameIndexAsXFP16(TestGatherNdOpWithSameIndexAsX): @@ -364,7 +401,7 @@ def config_dtype(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) def test_check_grad(self): place = core.CUDAPlace(0) @@ -373,8 +410,9 @@ def test_check_grad(self): ['X'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, numeric_grad_delta=0.5, + check_prim_pir=True, ) @@ -407,10 +445,16 @@ def config_dtype(self): self.dtype = np.float64 def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) class TestGatherNdOpWithHighRankSameFP16(TestGatherNdOpWithHighRankSame): @@ -429,12 +473,17 @@ def config_dtype(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_new_ir=True) + 
self.check_output_with_place(place, check_pir=True) def test_check_grad(self): place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['X'], 'Out', check_prim=True, check_new_ir=True + place, + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, ) @@ -468,10 +517,16 @@ def config_dtype(self): self.dtype = np.float64 def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) class TestGatherNdOpWithHighRankDiffFP16(TestGatherNdOpWithHighRankDiff): @@ -490,12 +545,17 @@ def config_dtype(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) def test_check_grad(self): place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['X'], 'Out', check_prim=True, check_new_ir=True + place, + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, ) diff --git a/test/legacy_test/test_gaussian_random_op.py b/test/legacy_test/test_gaussian_random_op.py index 8f03e0f547e8de..2a0f30a84e03c9 100644 --- a/test/legacy_test/test_gaussian_random_op.py +++ b/test/legacy_test/test_gaussian_random_op.py @@ -46,7 +46,7 @@ def set_attrs(self): self.std = 2.0 def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def verify_output(self, outs): self.assertEqual(outs[0].shape, (123, 92)) @@ -88,7 +88,7 @@ def set_attrs(self): def test_check_output(self): self.check_output_with_place_customized( - self.verify_output, place=core.CUDAPlace(0), check_new_ir=True + self.verify_output, place=core.CUDAPlace(0), check_pir=True ) def verify_output(self, outs): @@ -141,7 +141,7 @@ def set_attrs(self): def test_check_output(self): self.check_output_with_place_customized( - self.verify_output, place=core.CUDAPlace(0), check_new_ir=True + self.verify_output, place=core.CUDAPlace(0), check_pir=True ) def verify_output(self, outs): @@ -196,7 +196,7 @@ def init_data(self): self.seed = 10 def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) class TestGaussianRandomOp2_ShapeTensorList( diff --git a/test/legacy_test/test_gumbel_softmax_op.py b/test/legacy_test/test_gumbel_softmax_op.py index e3fbf15a299d8c..97751840e687e4 100644 --- a/test/legacy_test/test_gumbel_softmax_op.py +++ b/test/legacy_test/test_gumbel_softmax_op.py @@ -46,10 +46,10 @@ def setUp(self): self.outputs = {'Out': out} def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def test_check_grad(self): - self.check_grad(["X"], "Out", check_new_ir=True) + self.check_grad(["X"], "Out", check_pir=True) class TestGumbelSoftmax_ZeroDim(OpTest): @@ -68,10 +68,10 @@ def setUp(self): self.attrs = {"hard": True, "axis": -1} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): - self.check_grad(["X"], "Out", check_new_ir=True) + self.check_grad(["X"], "Out", check_pir=True) class TestGumbelSoftmaxOp2(TestGumbelSoftmaxOp): @@ -176,7 +176,7 @@ def setUp(self): 
self.outputs = {'Out': out} def test_check_output(self): - self.check_output_customized(self.accumulate_output, check_new_ir=True) + self.check_output_customized(self.accumulate_output, check_pir=True) # Experiment should result in batch num . self.assertEqual(self.counts.sum(), self.shape[0]) @@ -192,7 +192,7 @@ def test_check_output(self): self.assertLess(np.max(np.abs(z)).item(), 2.58) def test_check_grad(self): - self.check_grad(["X"], "Out", check_new_ir=True) + self.check_grad(["X"], "Out", check_pir=True) class TestGumbelSoftmaxOpGrad(unittest.TestCase): diff --git a/test/legacy_test/test_hypot.py b/test/legacy_test/test_hypot.py new file mode 100644 index 00000000000000..66a049038eb5ae --- /dev/null +++ b/test/legacy_test/test_hypot.py @@ -0,0 +1,104 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle import base +from paddle.base import core + +paddle.enable_static() + + +class TestHypotAPI(unittest.TestCase): + def setUp(self): + self.x_shape = [10, 10] + self.y_shape = [10, 1] + self.x_np = np.random.uniform(-10, 10, self.x_shape).astype(np.float32) + self.y_np = np.random.uniform(-10, 10, self.y_shape).astype(np.float32) + + def test_static_graph(self): + paddle.enable_static() + startup_program = base.Program() + train_program = base.Program() + with base.program_guard(startup_program, train_program): + x = paddle.static.data( + name='input1', dtype='float32', shape=self.x_shape + ) + y = paddle.static.data( + name='input2', dtype='float32', shape=self.y_shape + ) + out = paddle.hypot(x, y) + + place = ( + base.CUDAPlace(0) + if core.is_compiled_with_cuda() + else base.CPUPlace() + ) + exe = base.Executor(place) + res = exe.run( + base.default_main_program(), + feed={'input1': self.x_np, 'input2': self.y_np}, + fetch_list=[out], + ) + np_out = np.hypot(self.x_np, self.y_np) + np.testing.assert_allclose(res[0], np_out, atol=1e-5, rtol=1e-5) + paddle.disable_static() + + def test_dygraph(self): + paddle.disable_static() + x = paddle.to_tensor(self.x_np) + y = paddle.to_tensor(self.y_np) + result = paddle.hypot(x, y) + np.testing.assert_allclose( + np.hypot(self.x_np, self.y_np), result.numpy(), rtol=1e-05 + ) + + paddle.enable_static() + + def test_error(self): + x = paddle.to_tensor(self.x_np) + y = 3.8 + self.assertRaises(TypeError, paddle.hypot, x, y) + self.assertRaises(TypeError, paddle.hypot, y, x) + + +class TestHypotAPIBroadCast(TestHypotAPI): + def setUp(self): + self.x_np = np.arange(6).astype(np.float32) + self.y_np = np.array([20]).astype(np.float32) + self.x_shape = [6] + self.y_shape = [1] + + +class TestHypotAPI3(TestHypotAPI): + def setUp(self): + self.x_shape = [] + self.y_shape = [] + self.x_np = np.random.uniform(-10, 10, self.x_shape).astype(np.float32) + self.y_np = np.random.uniform(-10, 10, self.y_shape).astype(np.float32) + + +class TestHypotAPI4(TestHypotAPI): + def setUp(self): + self.x_shape = [1] + self.y_shape = [1] + 
self.x_np = np.random.uniform(-10, 10, self.x_shape).astype(np.float32) + self.y_np = np.random.uniform(-10, 10, self.y_shape).astype(np.float32) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/legacy_test/test_increment.py b/test/legacy_test/test_increment.py index 4887564e9b9bb2..3055ffe1bdcf3a 100755 --- a/test/legacy_test/test_increment.py +++ b/test/legacy_test/test_increment.py @@ -18,9 +18,11 @@ import paddle from paddle import base +from paddle.pir_utils import test_with_pir_api class TestIncrement(unittest.TestCase): + @test_with_pir_api def test_api(self): with base.program_guard(base.Program(), base.Program()): input = paddle.tensor.fill_constant( @@ -41,6 +43,7 @@ def test_api(self): class TestInplaceApiWithDataTransform(unittest.TestCase): + @test_with_pir_api def test_increment(self): if base.core.is_compiled_with_cuda(): paddle.enable_static() diff --git a/test/legacy_test/test_inplace.py b/test/legacy_test/test_inplace.py index e3f1de1048e113..cb45f2fd8969f2 100644 --- a/test/legacy_test/test_inplace.py +++ b/test/legacy_test/test_inplace.py @@ -56,7 +56,7 @@ def test_backward_error(self): loss = paddle.nn.functional.relu(var_c + var_d) with self.assertRaisesRegex( RuntimeError, - f"received tensor_version:{1} != wrapper_version_snapshot:{0}", + "received tensor_version:1 != wrapper_version_snapshot:0", ): loss.backward() @@ -171,7 +171,7 @@ def test_backward_error(self): loss = paddle.nn.functional.relu(var_c) with self.assertRaisesRegex( RuntimeError, - f"received tensor_version:{1} != wrapper_version_snapshot:{0}", + "received tensor_version:1 != wrapper_version_snapshot:0", ): loss.backward() @@ -834,6 +834,59 @@ def test_error(self): self.assertRaises(ValueError, paddle.gcd_, x, y) +class TestDygraphInplaceHypot(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.random.randint(2, size=200) + self.input_var_numpy = self.input_var_numpy.reshape([10, 20]) + self.dtype = "float32" + self.y = paddle.randn(shape=[10, 20], dtype="float32") + + def inplace_api_processing(self, var): + return paddle.hypot_(var, self.y) + + def non_inplace_api_processing(self, var): + return paddle.hypot(var, self.y) + + def test_errors(self): + x = 3.0 + self.assertRaises(TypeError, paddle.hypot_, x, self.y) + self.assertRaises(TypeError, paddle.hypot_, self.y, x) + + def test_forward_version(self): + with paddle.base.dygraph.guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + self.assertEqual(var.inplace_version, 0) + + inplace_var = self.inplace_api_processing(var) + self.assertEqual(var.inplace_version, 3) + + inplace_var[0] = 2.0 + self.assertEqual(var.inplace_version, 4) + + inplace_var = self.inplace_api_processing(inplace_var) + self.assertEqual(var.inplace_version, 7) + + def test_backward_error(self): + # It raises an error because the inplace operator will result + # in incorrect gradient computation. 
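+        # A single hypot_ call advances var_b's tensor_version by 3 (see
+        # test_forward_version above), presumably because hypot_ is composed
+        # of several in-place kernel calls; the wrapper snapshot recorded
+        # when var_b was consumed stays at 0, hence the version-mismatch
+        # message asserted below.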
+ with paddle.base.dygraph.guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + # Here, the gradient computation will use the value of var_b + var_c = var_b**2 + self.inplace_api_processing(var_b) + var_c = paddle.cast(var_c, "float32") + + loss = paddle.nn.functional.relu(var_c) + with self.assertRaisesRegex( + RuntimeError, + f"received tensor_version:{3} != wrapper_version_snapshot:{0}", + ): + loss.backward() + + class TestDygraphInplaceNanToNum(TestDygraphInplace): def init_data(self): self.input_var_numpy = np.array( @@ -886,7 +939,7 @@ def test_backward_error(self): loss = paddle.nn.functional.relu(var_c) with self.assertRaisesRegex( RuntimeError, - f"received tensor_version:{3} != wrapper_version_snapshot:{0}", + "received tensor_version:3 != wrapper_version_snapshot:0", ): loss.backward() @@ -975,7 +1028,7 @@ def test_backward_error(self): loss = paddle.nn.functional.relu(var_c) with self.assertRaisesRegex( RuntimeError, - f"received tensor_version:{2} != wrapper_version_snapshot:{0}", + "received tensor_version:2 != wrapper_version_snapshot:0", ): loss.backward() @@ -1051,7 +1104,7 @@ def test_backward_error(self): loss = paddle.nn.functional.relu(var_c) with self.assertRaisesRegex( RuntimeError, - f"received tensor_version:{2} != wrapper_version_snapshot:{0}", + "received tensor_version:2 != wrapper_version_snapshot:0", ): loss.backward() @@ -1347,7 +1400,7 @@ def test_backward_error(self): loss = paddle.nn.functional.relu(var_c) with self.assertRaisesRegex( RuntimeError, - f"received tensor_version:{2} != wrapper_version_snapshot:{0}", + "received tensor_version:2 != wrapper_version_snapshot:0", ): loss.backward() @@ -1389,7 +1442,7 @@ def test_backward_error(self): loss = paddle.nn.functional.relu(var_c) with self.assertRaisesRegex( RuntimeError, - f"received tensor_version:{2} != wrapper_version_snapshot:{0}", + "received tensor_version:2 != wrapper_version_snapshot:0", ): loss.backward() diff --git a/test/legacy_test/test_inplace_abn_op.py b/test/legacy_test/test_inplace_abn_op.py deleted file mode 100644 index d56a467a2ed79d..00000000000000 --- a/test/legacy_test/test_inplace_abn_op.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
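
For reference, a minimal dygraph sketch of the semantics exercised by the TestDygraphInplaceHypot cases added to test_inplace.py above: paddle.hypot(x, y) computes the elementwise hypotenuse sqrt(x**2 + y**2), mirroring np.hypot, and paddle.hypot_ is the in-place variant that writes the result into its first argument:

    import paddle

    x = paddle.to_tensor([3.0, 5.0])
    y = paddle.to_tensor([4.0, 12.0])
    out = paddle.hypot(x, y)  # Tensor([5., 13.])
    paddle.hypot_(x, y)       # in-place: x now holds [5., 13.]
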
- -import os -import unittest - -import numpy as np - -import paddle -from paddle import base -from paddle.base import core - - -class TestInplaceANBOpTraining(unittest.TestCase): - def setUp(self): - self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 - self.N = 4 - self.C = 5 - self.H = 7 - self.W = 9 - self.dshape = [self.N, self.C, self.H, self.W] - - def build_program( - self, - place, - layout, - seed, - only_forward=False, - activation="identity", - alpha=1.0, - use_cuda=False, - inplace=False, - ): - main = base.Program() - startup = base.Program() - main.random_seed = seed - startup.random_seed = seed - with base.unique_name.guard(): - with base.program_guard(main, startup): - data = paddle.static.data( - name='input', - shape=self.dshape, - dtype=self.dtype, - ) - data.stop_gradient = False - data.desc.set_need_check_feed(False) - - bn = paddle.static.nn.batch_norm( - data, - param_attr=base.ParamAttr(name='bn_scale'), - bias_attr=base.ParamAttr(name='bn_bias'), - moving_mean_name='bn_moving_mean', - moving_variance_name='bn_moving_variance', - data_layout=layout, - is_test=only_forward, - in_place=inplace, - ) - if activation == 'leaky_relu': - bn = paddle.nn.functional.leaky_relu(bn, alpha) - if activation == 'elu': - bn = paddle.nn.functional.elu(bn, alpha) - - # NOTE: in inplace mode input and output of bn - # may have same name, multiply 1. to generate - # a new Variable for fetch - bn = bn * 1.0 - sigmoid = paddle.nn.functional.sigmoid(bn) - out = paddle.sum(sigmoid) - if not only_forward: - sgd_opt = paddle.optimizer.SGD(learning_rate=0.0) - sgd_opt.backward(out) - return main, startup, [out, bn] - - def test_all_branches(self): - seed = 10 - os.environ['FLAGS_cudnn_deterministic'] = "1" - data = np.random.random(size=self.dshape).astype(self.dtype) * 4.0 - 2 - use_cudas = [False, True] if core.is_compiled_with_cuda() else [False] - alpha = 0.1 - layouts = ["NCHW", "NHWC"] - for use_cuda in use_cudas: - place = core.CUDAPlace(0) if use_cuda else core.CPUPlace() - for layout in layouts: - for activation in ['identity', 'leaky_relu']: - main, startup, outs = self.build_program( - place, - layout, - seed, - False, - activation, - alpha, - use_cuda, - False, - ) - exe = base.Executor(place) - exe.run(startup) - exe.run(program=main, feed={'input': data}) - - -if __name__ == '__main__': - paddle.enable_static() - unittest.main() diff --git a/test/legacy_test/test_input_spec.py b/test/legacy_test/test_input_spec.py index 47c461a2a1eab4..a1e8c5e8522959 100644 --- a/test/legacy_test/test_input_spec.py +++ b/test/legacy_test/test_input_spec.py @@ -200,7 +200,7 @@ def check_result(self, specs, path): np.testing.assert_allclose(dy_out, pred_out, rtol=1e-05) # @to_static by InputSpec - net = paddle.jit.to_static(net, input_spec=specs) + net = paddle.jit.to_static(net, input_spec=specs, full_graph=True) st_out = net(self.x, *specs[1:]) np.testing.assert_allclose(dy_out, st_out, rtol=1e-05) @@ -217,7 +217,7 @@ def test_spec_compatible(self): net = NetWithNonTensorSpec(self.in_num, self.out_num) specs = [self.x_spec, False, "bn", -10] - net = paddle.jit.to_static(net, input_spec=specs) + net = paddle.jit.to_static(net, input_spec=specs, full_graph=True) net.eval() path = os.path.join(self.temp_dir.name, './net_twice') @@ -288,7 +288,7 @@ def test_non_tensor_with_prune(self): np.testing.assert_allclose(dy_out, pred_out, rtol=1e-05) # @to_static by InputSpec - net = paddle.jit.to_static(net, input_spec=specs) + net = paddle.jit.to_static(net, input_spec=specs, 
full_graph=True) st_out, _ = net(self.x, self.y, *specs[2:]) np.testing.assert_allclose(dy_out, st_out, rtol=1e-05) @@ -351,7 +351,9 @@ def tearDown(self): def test_run(self): net = NegSpecNet() net = paddle.jit.to_static( - net, input_spec=[paddle.static.InputSpec(shape=[-1, 10])] + net, + input_spec=[paddle.static.InputSpec(shape=[-1, 10])], + full_graph=True, ) x = paddle.randn([2, 10]) out = net(x) diff --git a/test/legacy_test/test_isfinite_op.py b/test/legacy_test/test_isfinite_op.py deleted file mode 100755 index 94e52ee9f2e96c..00000000000000 --- a/test/legacy_test/test_isfinite_op.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest, convert_float_to_uint16 - -from paddle.base import core - - -class TestInf(OpTest): - def setUp(self): - self.op_type = "isinf" - self.dtype = np.float32 - self.init_dtype() - - x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) - x[0] = np.inf - x[-1] = np.inf - - self.inputs = {'X': x} - self.outputs = {'Out': np.array([True]).astype(self.dtype)} - - def init_dtype(self): - pass - - def test_output(self): - self.check_output() - - -@unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -) -class TestFP16Inf(TestInf): - def init_dtype(self): - self.dtype = np.float16 - - -# BFP16 isinf Test -@unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_bfloat16_supported(core.CUDAPlace(0)), - "core is not compiled with CUDA or not support the bfloat16", -) -class TestInfBF16(OpTest): - def setUp(self): - self.op_type = "isinf" - self.dtype = np.uint16 - x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float32) - x[0] = np.inf - x[-1] = np.inf - - out = np.array([True]) - self.inputs = {'X': convert_float_to_uint16(x)} - self.outputs = {'Out': out} - - def test_output(self): - self.check_output_with_place(core.CUDAPlace(0)) - - -class TestNAN(OpTest): - def setUp(self): - self.op_type = "isnan" - self.dtype = np.float32 - self.init_dtype() - - x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) - x[0] = np.nan - x[-1] = np.nan - - self.inputs = {'X': x} - self.outputs = {'Out': np.array([True]).astype(self.dtype)} - - def init_dtype(self): - pass - - def test_output(self): - self.check_output() - - -@unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -) -class TestFP16NAN(TestNAN): - def init_dtype(self): - self.dtype = np.float16 - - -# BFP16 isnan Test -@unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_bfloat16_supported(core.CUDAPlace(0)), - "core is not compiled with CUDA or not support the bfloat16", -) -class TestNANBF16(OpTest): - def setUp(self): - self.op_type = "isnan" - self.dtype = np.uint16 - x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float32) - x[0] = np.nan - x[-1] = np.nan - - out = np.array([True]) - self.inputs = {'X': convert_float_to_uint16(x)} - self.outputs 
= {'Out': out} - - def test_output(self): - self.check_output_with_place(core.CUDAPlace(0)) - - -class TestIsfinite(OpTest): - def setUp(self): - self.op_type = "isfinite" - self.dtype = np.float32 - self.init_dtype() - - x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) - x[0] = np.inf - x[-1] = np.nan - out = np.isinf(x) | np.isnan(x) - - self.inputs = {'X': x} - self.outputs = {'Out': np.array([False]).astype(self.dtype)} - - def init_dtype(self): - pass - - def test_output(self): - self.check_output() - - -@unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -) -class TestFP16Isfinite(TestIsfinite): - def init_dtype(self): - self.dtype = np.float16 - - -# BFP16 isfinite Test -@unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_bfloat16_supported(core.CUDAPlace(0)), - "core is not compiled with CUDA or not support the bfloat16", -) -class TestIsfiniteBF16(OpTest): - def setUp(self): - self.op_type = "isfinite" - self.dtype = np.uint16 - x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float32) - x[0] = np.inf - x[-1] = np.nan - - out = np.array([False]) - self.inputs = {'X': convert_float_to_uint16(x)} - self.outputs = {'Out': out} - - def test_output(self): - self.check_output_with_place(core.CUDAPlace(0)) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_layer_norm_op.py b/test/legacy_test/test_layer_norm_op.py index 3fb01bb3d0b62a..cc4726f3458cf4 100644 --- a/test/legacy_test/test_layer_norm_op.py +++ b/test/legacy_test/test_layer_norm_op.py @@ -143,7 +143,7 @@ def test_check_output(self): rtol=self.ori_rtol, check_prim=self.check_prim, check_prim_pir=self.check_prim_pir, - check_new_ir=self.check_new_ir, + check_pir=self.check_pir, ) def test_check_grad(self): @@ -153,7 +153,7 @@ def test_check_grad(self): max_relative_error=self.max_relative_error, check_prim=self.check_prim, check_prim_pir=self.check_prim_pir, - check_new_ir=self.check_new_ir, + check_pir=self.check_pir, ) def initConfig(self): @@ -177,7 +177,7 @@ def initConfig(self): self.has_bias = True self.check_prim = True self.check_prim_pir = True - self.check_new_ir = True + self.check_pir = True def initTestCase(self): np.random.seed(123) @@ -247,7 +247,7 @@ def test_check_output(self): rtol=self.ori_rtol, check_prim=self.check_prim, check_prim_pir=self.check_prim_pir, - check_new_ir=self.check_new_ir, + check_pir=self.check_pir, ) def test_check_grad(self): @@ -258,7 +258,7 @@ def test_check_grad(self): max_relative_error=self.max_relative_error, check_prim=self.check_prim, check_prim_pir=self.check_prim_pir, - check_new_ir=self.check_new_ir, + check_pir=self.check_pir, ) def initConfig(self): @@ -275,7 +275,7 @@ def initConfig(self): self.has_bias = True self.check_prim = True self.check_prim_pir = True - self.check_new_ir = True + self.check_pir = True def initTestCase(self): np.random.seed(123) @@ -347,7 +347,7 @@ def initConfig(self): self.has_bias = False self.check_prim = False self.check_prim_pir = False - self.check_new_ir = True + self.check_pir = True @unittest.skipIf( @@ -371,7 +371,7 @@ def initConfig(self): self.has_bias = False self.check_prim = False self.check_prim_pir = False - self.check_new_ir = True + self.check_pir = True @unittest.skipIf( @@ -400,7 +400,7 @@ def initConfig(self): self.has_bias = False self.check_prim = False self.check_prim_pir = False - self.check_new_ir = True + self.check_pir = True @unittest.skipIf( @@ -424,7 +424,7 @@ def initConfig(self): self.has_bias = False self.check_prim = 
False self.check_prim_pir = False - self.check_new_ir = True + self.check_pir = True @unittest.skipIf( @@ -453,7 +453,7 @@ def initConfig(self): self.has_bias = True self.check_prim = False self.check_prim_pir = False - self.check_new_ir = True + self.check_pir = True @unittest.skipIf( @@ -477,7 +477,7 @@ def initConfig(self): self.has_bias = True self.check_prim = False self.check_prim_pir = False - self.check_new_ir = True + self.check_pir = True class TestLayerNormOpByOpTestFP32(TestLayerNormOpByOpTest): @@ -497,7 +497,7 @@ def initConfig(self): self.has_bias = True self.check_prim = True self.check_prim_pir = True - self.check_new_ir = True + self.check_pir = True class TestLayerNormOpByOpTestFP32_case2(TestLayerNormOpByOpTest): @@ -517,7 +517,7 @@ def initConfig(self): self.has_bias = False self.check_prim = False self.check_prim_pir = False - self.check_new_ir = True + self.check_pir = True class TestLayerNormOpByOpTestFP32_case3(TestLayerNormOpByOpTest): @@ -537,7 +537,7 @@ def initConfig(self): self.has_bias = False self.check_prim = False self.check_prim_pir = False - self.check_new_ir = True + self.check_pir = True class TestLayerNormOpByOpTestFP32_case4(TestLayerNormOpByOpTest): @@ -557,7 +557,7 @@ def initConfig(self): self.has_bias = True self.check_prim = False self.check_prim_pir = False - self.check_new_ir = True + self.check_pir = True class TestLayerNormOp(unittest.TestCase): @@ -838,6 +838,11 @@ def test_errors(self): name='x2', shape=[-1, 3, 32, 32], dtype="int32" ) self.assertRaises(TypeError, layer_norm, x2) + with paddle.pir_utils.IrGuard(), program_guard(Program(), Program()): + layer_norm = paddle.nn.LayerNorm([32, 32]) + # the input of LayerNorm must be Variable. + x1 = np.random.random((3, 32, 32)).astype('float32') + self.assertRaises(ValueError, layer_norm, x1) @unittest.skipIf( diff --git a/test/legacy_test/test_logcumsumexp_op.py b/test/legacy_test/test_logcumsumexp_op.py index 373548f679b88b..0be9a6f4d450b9 100644 --- a/test/legacy_test/test_logcumsumexp_op.py +++ b/test/legacy_test/test_logcumsumexp_op.py @@ -232,7 +232,7 @@ def setUp(self): self.outputs = {'Out': np_logcumsumexp(input, **attrs)} def test_check_output(self): - self.check_output() + self.check_output(check_pir=True) def test_check_grad(self): self.check_grad( @@ -245,6 +245,7 @@ def test_check_grad(self): **self.attrs ) ], + check_pir=True, ) def input_and_attrs(self): @@ -332,7 +333,7 @@ def test_check_output(self): place = core.CUDAPlace(0) place = core.CUDAPlace(0) self.check_output_with_place_customized( - checker=self.verify_output, place=place + checker=self.verify_output, place=place, check_pir=True ) def verify_output(self, outs): @@ -352,7 +353,12 @@ def verify_output(self, outs): def test_check_grad(self): place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['X'], 'Out', numeric_grad_delta=0.5, max_relative_error=0.5 + place, + ['X'], + 'Out', + numeric_grad_delta=0.5, + max_relative_error=0.5, + check_pir=True, ) diff --git a/test/legacy_test/test_lookup_table_v2_op.py b/test/legacy_test/test_lookup_table_v2_op.py index 035aef9f7576c9..ad708eb137bb1f 100644 --- a/test/legacy_test/test_lookup_table_v2_op.py +++ b/test/legacy_test/test_lookup_table_v2_op.py @@ -62,7 +62,7 @@ def id_dtype(self): return "int64" def test_check_output(self): - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) def test_check_grad(self): self.check_grad( @@ -70,7 +70,7 @@ def test_check_grad(self): 'Out', no_grad_set=set('Ids'), 
check_cinn=True, - check_new_ir=True, + check_pir=True, ) @@ -99,7 +99,7 @@ def setUp(self): self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))} def test_check_output(self): - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) def test_check_grad(self): self.check_grad( @@ -107,7 +107,7 @@ def test_check_grad(self): 'Out', no_grad_set=set('Ids'), check_cinn=True, - check_new_ir=True, + check_pir=True, ) @@ -122,7 +122,7 @@ def test_check_output(self): padding_idx = np.random.choice(ids, 1)[0] self.outputs['Out'][ids == padding_idx] = np.zeros(31) self.attrs = {'padding_idx': int(padding_idx)} - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) @skip_check_grad_ci( @@ -137,7 +137,7 @@ def test_check_output(self): padding_idx = np.random.choice(flatten_idx, 1)[0] self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31) self.attrs = {'padding_idx': padding_idx} - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) class TestLookupTableWIsSelectedRows(unittest.TestCase): @@ -355,7 +355,7 @@ def id_dtype(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_cinn=True, check_new_ir=True) + self.check_output_with_place(place, check_cinn=True, check_pir=True) def test_check_grad(self): place = core.CUDAPlace(0) @@ -365,7 +365,7 @@ def test_check_grad(self): 'Out', no_grad_set=set('Ids'), check_cinn=True, - check_new_ir=True, + check_pir=True, ) diff --git a/test/legacy_test/test_lr_scheduler.py b/test/legacy_test/test_lr_scheduler.py index 54484ecc6ad2c2..ba1f712dce2fd8 100644 --- a/test/legacy_test/test_lr_scheduler.py +++ b/test/legacy_test/test_lr_scheduler.py @@ -464,6 +464,31 @@ def exp_range(x): return base_learning_rate + base_height * scale_fn(eval(scale_mode)) +linear_last_lr = None + + +def linear_lr( + epoch_num, + learning_rate, + total_steps, + start_factor=1.0 / 3, + end_factor=1.0, + verbose=False, +): + global linear_last_lr + if epoch_num == 0: + linear_last_lr = learning_rate * start_factor + return linear_last_lr + elif epoch_num > total_steps: + return linear_last_lr + else: + base_lr = total_steps * start_factor + cur_factor = end_factor - start_factor + factor = 1.0 + cur_factor / (base_lr + (epoch_num - 1) * cur_factor) + linear_last_lr *= factor + return linear_last_lr + + class TestLRScheduler(unittest.TestCase): def _test_static(self, python_func, paddle_api, kwarg, place): scheduler = paddle_api(**kwarg) @@ -711,6 +736,19 @@ def test_scheduler(self): paddle.optimizer.lr.PiecewiseDecay( boundaries=[100, 200], values=[0.5, 0.1] ) + # check minus total_steps + with self.assertRaises(ValueError): + paddle.optimizer.lr.LinearLR(learning_rate=1, total_steps=-1) + # check start_factor + with self.assertRaises(ValueError): + paddle.optimizer.lr.LinearLR( + learning_rate=1, total_steps=5, start_factor=2 + ) + # check end_factor + with self.assertRaises(ValueError): + paddle.optimizer.lr.LinearLR( + learning_rate=1, total_steps=5, end_factor=2 + ) func_api_kwargs = [ ( @@ -944,6 +982,28 @@ def test_scheduler(self): "verbose": False, }, ), + ( + linear_lr, + paddle.optimizer.lr.LinearLR, + { + "learning_rate": 0.2, + "total_steps": 40, + "start_factor": 0.5, + "end_factor": 1, + "verbose": False, + }, + ), + ( + linear_lr, + paddle.optimizer.lr.LinearLR, + { + "learning_rate": 0.2, + "total_steps": 5, + "start_factor": 0.2, + "end_factor": 
0.5, + "verbose": False, + }, + ), ] for python_func, paddle_api, kwarg in func_api_kwargs: diff --git a/test/legacy_test/test_lstm_unit_op.py b/test/legacy_test/test_lstm_unit_op.py deleted file mode 100644 index 8a1b2fc238b229..00000000000000 --- a/test/legacy_test/test_lstm_unit_op.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -def sigmoid_np(x): - return 1.0 / (1.0 + np.exp(-x)) - - -def tanh_np(x): - return 2 * sigmoid_np(2.0 * x) - 1.0 - - -class LstmUnitTest(OpTest): - def setUp(self): - self.op_type = "lstm_unit" - x_np = np.random.normal(size=(15, 160)).astype("float64") - c_np = np.random.normal(size=(15, 40)).astype("float64") - i_np, f_np, o_np, j_np = np.split(x_np, 4, axis=1) - forget_bias_np = 0.0 - self.attrs = {'forget_bias': 0.0} - - new_c = c_np * sigmoid_np(f_np + forget_bias_np) + sigmoid_np( - i_np - ) * tanh_np(j_np) - new_h = tanh_np(new_c) * sigmoid_np(o_np) - - self.inputs = {'X': x_np, 'C_prev': c_np} - self.outputs = {'C': new_c, 'H': new_h} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X', 'C_prev'], ['C', 'H']) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_lstmp_op.py b/test/legacy_test/test_lstmp_op.py deleted file mode 100644 index f1af2191409356..00000000000000 --- a/test/legacy_test/test_lstmp_op.py +++ /dev/null @@ -1,379 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
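
To make the linear_lr reference function added to test_lr_scheduler.py above concrete: with one of the parameter sets registered in func_api_kwargs (learning_rate=0.2, total_steps=5, start_factor=0.2, end_factor=0.5), the learning rate ramps linearly from 0.2 * 0.2 = 0.04 up to 0.2 * 0.5 = 0.10 in equal increments of (0.5 - 0.2) * 0.2 / 5 = 0.012 per epoch, then holds flat once total_steps is passed. Recomputing the schedule by hand with the reference function:

    # epochs 0..6 -> [0.04, 0.052, 0.064, 0.076, 0.088, 0.1, 0.1]
    # (up to float rounding); epochs beyond total_steps repeat the last value
    lrs = [linear_lr(e, 0.2, 5, start_factor=0.2, end_factor=0.5)
           for e in range(7)]
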
- -import unittest - -import numpy as np -import test_lstm_op as LstmTest - -ACTIVATION = { - 'identity': LstmTest.identity, - 'sigmoid': LstmTest.sigmoid, - 'tanh': LstmTest.tanh, - 'relu': LstmTest.relu, -} - - -# LSTM with recurrent projection Layer -def lstmp( - input, # T x 4D - lod, # 1 x N - h0=None, # N x D - c0=None, # N x D - w_r=None, # P x 4D - w_rh=None, # D x P - w_b=None, # 1 x 4D - w_c=None, # 1 x 3D - is_reverse=False, - proj_clip=0.0, - cell_clip=0.0, - act_gate=None, - act_cell=None, - act_cand=None, - act_proj=None, -): - def _step( - x, - w_r, - w_rh, - w_c, - r_pre, - c_pre, - proj_clip, - cell_clip, - act_gate, - act_cell, - act_cand, - act_proj, - ): - g = np.dot(r_pre, w_r) # 1 x 4D - g = g + x - g = np.reshape(g, (1, g.size)) - c, g_i, g_f, g_o = np.split(g, 4, axis=1) - if w_c is None: - g_i = act_gate(g_i) # 1 x D - g_f = act_gate(g_f) # 1 x D - else: - w_ic, w_fc, _ = np.split(w_c, 3, axis=1) - g_i = act_gate(g_i + w_ic * c_pre) # 1 x D - g_f = act_gate(g_f + w_fc * c_pre) # 1 x D - c = g_f * c_pre + g_i * act_cand(c) # 1 x D - - def array_clip(a, clip): - size = np.prod(a.shape) - new_a = np.reshape(a, (size)) - for i in range(size): - new_a[i] = max(new_a[i], -1.0 * clip) - new_a[i] = min(new_a[i], clip) - new_a = np.reshape(new_a, a.shape) - return new_a - - if cell_clip > 0.0: - c = array_clip(c, cell_clip) - if w_c is None: - g_o = act_gate(g_o) # 1 x D - else: - _, _, w_oc = np.split(w_c, 3, axis=1) - g_o = act_gate(g_o + w_oc * c) # 1 x D - h = g_o * act_cell(c) - # projection - r = np.dot(h, w_rh) - r = act_proj(r) - if proj_clip > 0.0: - r = array_clip(r, proj_clip) - return r, c - - def _reverse(x, offset): - y = np.zeros_like(x) - for i in range(len(offset) - 1): - b, e = offset[i], offset[i + 1] - y[b:e, :] = np.flip(x[b:e, :], 0) - return y - - offset = [0] - for l in lod[0]: - offset.append(offset[-1] + l) - batch_size = len(lod[0]) - # recurrent projection state - projection = [] - cell = [] - input = _reverse(input, offset) if is_reverse else input - if w_b is not None: - input = input + np.tile(w_b, (offset[-1], 1)) - for i in range(batch_size): - # compute one sequence - seq_len = lod[0][i] - x = input[offset[i] : offset[i + 1], :] - r_pre = h0[i] - c_pre = c0[i] # 1 x D - for j in range(seq_len): - # compute one step - r_pre, c_pre = _step( - x[j], - w_r, - w_rh, - w_c, - r_pre, - c_pre, - proj_clip, - cell_clip, - act_gate, - act_cell, - act_cand, - act_proj, - ) - projection.append(r_pre.flatten()) - cell.append(c_pre.flatten()) - - projection = np.array(projection).astype('float64') - cell = np.array(cell).astype('float64') - - projection = _reverse(projection, offset) if is_reverse else projection - cell = _reverse(cell, offset) if is_reverse else cell - - assert projection.shape == (input.shape[0], w_r.shape[0]) # T x P - assert cell.shape == (input.shape[0], input.shape[1] / 4) # T x D - return projection, cell - - -class TestLstmpOp(LstmTest.TestLstmOp): - def reset_argument(self): - pass - - def setUp(self): - self.set_argument() - # projection size - self.P = 10 - self.act_proj = self.act_cell - - self.reset_argument() - self.op_type = 'lstmp' - - T = sum(self.lod[0]) - N = len(self.lod[0]) - x = np.random.normal(size=(T, 4 * self.D)).astype('float64') - if self.has_initial_state: - h0 = np.random.normal(size=(N, self.P)).astype('float64') - c0 = np.random.normal(size=(N, self.D)).astype('float64') - else: - h0 = np.zeros((N, self.P)).astype('float64') - c0 = np.zeros((N, self.D)).astype('float64') - w = 
np.random.normal(size=(self.P, 4 * self.D)).astype('float64') - if self.use_peepholes: - b = np.random.normal(size=(1, 7 * self.D)).astype('float64') - else: - b = np.random.normal(size=(1, 4 * self.D)).astype('float64') - - w_b = b[:, 0 : 4 * self.D] - w_c = b[:, 4 * self.D :] if self.use_peepholes else None - w_rh = np.random.normal(size=(self.D, self.P)).astype('float64') - proj_clip = 0.1 - cell_clip = 0.1 - r, c = lstmp( - x, - self.lod, - h0, - c0, - w, - w_rh, - w_b, - w_c, - self.is_reverse, - proj_clip, - cell_clip, - ACTIVATION[self.act_gate], - ACTIVATION[self.act_cell], - ACTIVATION[self.act_cand], - ACTIVATION[self.act_proj], - ) - - self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh} - - self.inputs['Bias'] = b - - if self.has_initial_state: - self.inputs['H0'] = h0 - self.inputs['C0'] = c0 - - self.outputs = { - 'Projection': (r, self.lod), - 'Cell': (c, self.lod), - } - self.attrs = { - 'use_peepholes': self.use_peepholes, - 'is_reverse': self.is_reverse, - 'proj_clip': proj_clip, - 'cell_clip': cell_clip, - 'gate_activation': self.act_gate, - 'cell_activation': self.act_cell, - 'candidate_activation': self.act_cand, - 'proj_activation': self.act_proj, - } - - def test_check_output(self): - self.check_output(atol=1e-8, check_dygraph=False) - - def test_check_grad(self): - # TODO(qingqing) remove folowing lines after the check_grad is refined. - N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Input', 'Weight', 'ProjWeight', 'Bias'], - ['Projection'], - numeric_grad_delta=0.0000005, - check_dygraph=False, - ) - - -class TestLstmpOpHasInitial(TestLstmpOp): - def reset_argument(self): - self.has_initial_state = True - - def test_check_grad(self): - # TODO(qingqing) remove folowing lines after the check_grad is refined. 
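Editor's note: the array_clip helper in the deleted lstmp reference above clips element by element in a Python loop; assuming clip > 0, the vectorized NumPy equivalent is a one-liner:

    import numpy as np

    def array_clip(a, clip):
        # bound every element of `a` to [-clip, clip], as the deleted loop did
        return np.clip(a, -clip, clip)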
- N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'], - ['Projection'], - numeric_grad_delta=0.0000005, - check_dygraph=False, - ) - - def test_check_grad_ingore_bias(self): - N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Input', 'ProjWeight', 'Weight'], - ['Projection'], - numeric_grad_delta=0.0000005, - no_grad_set=set('Bias'), - check_dygraph=False, - ) - - def test_check_grad_ingore_weight(self): - N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Input', 'ProjWeight', 'Bias'], - ['Projection'], - numeric_grad_delta=0.0000005, - no_grad_set=set('Weight'), - check_dygraph=False, - ) - - def test_check_grad_ingore_proj_weight(self): - N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Input', 'Weight', 'Bias'], - ['Projection'], - numeric_grad_delta=0.0000005, - no_grad_set=set('ProjWeight'), - check_dygraph=False, - ) - - def test_check_grad_ingore_input(self): - N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Weight', 'ProjWeight', 'Bias'], - ['Projection'], - numeric_grad_delta=0.0000005, - no_grad_set=set('Input'), - check_dygraph=False, - ) - - def test_check_grad_ingore_h0(self): - N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], - ['Projection'], - numeric_grad_delta=0.0000005, - no_grad_set=set('H0'), - check_dygraph=False, - ) - - def test_check_grad_ingore_c0(self): - N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], - ['Projection'], - numeric_grad_delta=0.0000005, - no_grad_set=set('C0'), - check_dygraph=False, - ) - - -class TestLstmpOpRerverse(TestLstmpOp): - def reset_argument(self): - self.is_reverse = True - - -class TestLstmpOpNotUsePeepholes(TestLstmpOp): - def reset_argument(self): - self.use_peepholes = False - - -class TestLstmpOpLinearProjection(TestLstmpOp): - def reset_argument(self): - self.act_proj = 'identity' - - -class TestLstmpOpLen0Case1(TestLstmpOp): - def reset_argument(self): - 
self.lod = [[0, 4, 0]] - - -class TestLstmpOpLen0Case2(TestLstmpOp): - def reset_argument(self): - self.lod = [[2, 0, 3]] - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_margin_rank_loss_op.py b/test/legacy_test/test_margin_rank_loss_op.py deleted file mode 100644 index a795bc23694b37..00000000000000 --- a/test/legacy_test/test_margin_rank_loss_op.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest, paddle_static_guard - -import paddle -from paddle import base - - -class TestMarginRankLossOp(OpTest): - def setUp(self): - self.op_type = "margin_rank_loss" - batch_size = 5 - margin = 0.5 - # labels_{i} = {-1, 1} - label = ( - 2 * np.random.randint(0, 2, size=(batch_size, 1)).astype("float32") - - 1 - ) - x1 = np.random.random((batch_size, 1)).astype("float32") - x2 = np.random.random((batch_size, 1)).astype("float32") - # loss = max(0, -label * (x1 - x2) + margin) - loss = -label * (x1 - x2) + margin - loss = np.where(loss > 0, loss, 0) - act = np.where(loss > 0, 1.0, 0.0) - - self.attrs = {'margin': margin} - self.inputs = {'Label': label, 'X1': x1, 'X2': x2} - self.outputs = {'Activated': act, 'Out': loss} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X1", "X2"], "Out") - - def test_check_grad_ignore_x1(self): - self.check_grad(["X2"], "Out", no_grad_set=set('X1')) - - def test_check_grad_ignore_x2(self): - self.check_grad(["X1"], "Out", no_grad_set=set('X2')) - - -class TestMarginRankLossLayer(unittest.TestCase): - def setUp(self): - self.batch_size = 5 - self.margin = 0.5 - # labels_{i} = {-1, 1} - self.label = ( - 2 - * np.random.randint(0, 2, size=(self.batch_size, 1)).astype( - "float32" - ) - - 1 - ) - self.x1 = np.random.random((self.batch_size, 1)).astype("float32") - self.x2 = np.random.random((self.batch_size, 1)).astype("float32") - # loss = max(0, -label * (x1 - x2) + margin) - loss = -self.label * (self.x1 - self.x2) + self.margin - loss = np.where(loss > 0, loss, 0) - self.loss = loss - - def test_identity(self): - place = base.CPUPlace() - self.check_identity(place) - - if base.is_compiled_with_cuda(): - place = base.CUDAPlace(0) - self.check_identity(place) - - def check_identity(self, place): - with paddle_static_guard(): - main = base.Program() - start = base.Program() - with base.unique_name.guard(): - with base.program_guard(main, start): - label = paddle.static.data( - "label", (self.batch_size, 1), "float32" - ) - x1 = paddle.static.data( - "x1", (self.batch_size, 1), "float32" - ) - x2 = paddle.static.data( - "x2", (self.batch_size, 1), "float32" - ) - out = paddle.nn.functional.margin_ranking_loss( - x1, x2, label, self.margin, 'none' - ) - - exe = base.Executor(place) - exe.run(start) - (out_np,) = exe.run( - main, - feed={"label": self.label, "x1": self.x1, "x2": self.x2}, - fetch_list=[out], - ) - 
np.testing.assert_allclose(out_np, self.loss) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_math_op_patch_pir.py b/test/legacy_test/test_math_op_patch_pir.py new file mode 100644 index 00000000000000..9a7ab29ea4451e --- /dev/null +++ b/test/legacy_test/test_math_op_patch_pir.py @@ -0,0 +1,274 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import unittest +import warnings + +import numpy as np + +import paddle +from paddle import base + +paddle.enable_static() +paddle.device.set_device("cpu") + + +def new_program(): + # TODO(gouzil): Optimize program code + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + place = base.CPUPlace() + exe = base.Executor(place) + return ( + main_program, + exe, + paddle.static.program_guard( + main_program=main_program, startup_program=startup_program + ), + ) + + +class TestMathOpPatchesPir(unittest.TestCase): + def test_pow(self): + # Calculate results in dynamic graphs + paddle.disable_static() + x_np = np.random.random([10, 1024]).astype('float32') + y_np = np.random.random([10, 1024]).astype('float32') + res_np_b = x_np**y_np + res_np_c = paddle.pow(paddle.to_tensor(x_np), 2) + # TODO(gouzil): solve paddle.fill_constant problem + # res_np_d = x_np.__pow__(2) + # res_np_e = x_np.__rpow__(2) + paddle.enable_static() + # Calculate results under pir + with paddle.pir_utils.IrGuard(): + main_program, exe, program_guard = new_program() + with program_guard: + x = paddle.static.data( + name='x', shape=[10, 1024], dtype='float32' + ) + y = paddle.static.data( + name='y', shape=[10, 1024], dtype='float32' + ) + b = x**y + c = x.pow(2) + # d = x.__pow__(2) + # e = x.__rpow__(2) + # TODO(gouzil): Why not use `paddle.static.default_main_program()`? 
+ # Because different case do not isolate parameters (This is a known problem) + (b_np, c_np) = exe.run( + main_program, + feed={"x": x_np, "y": y_np}, + fetch_list=[b, c], + ) + np.testing.assert_allclose(res_np_b, b_np, rtol=1e-05) + np.testing.assert_allclose(res_np_c, c_np, rtol=1e-05) + # np.testing.assert_allclose(res_np_d, d_np, rtol=1e-05) + # np.testing.assert_allclose(res_np_e, e_np, rtol=1e-05) + + def test_mod(self): + paddle.disable_static() + x_np = np.random.randint(1, 100, size=[10, 1024], dtype=np.int64) + y_np = np.random.randint(1, 100, size=[10, 1024], dtype=np.int64) + res_np_b = x_np % y_np + res_np_c = paddle.mod(paddle.to_tensor(x_np), paddle.to_tensor(y_np)) + res_np_d = x_np.__mod__(y_np) + paddle.enable_static() + with paddle.pir_utils.IrGuard(): + main_program, exe, program_guard = new_program() + with program_guard: + x = paddle.static.data( + name='x', shape=[10, 1024], dtype='int64' + ) + y = paddle.static.data( + name='y', shape=[10, 1024], dtype='int64' + ) + b = x % y + c = x.mod(y) + d = x.__mod__(y) + (b_np, c_np, d_np) = exe.run( + main_program, + feed={"x": x_np, "y": y_np}, + fetch_list=[b, c, d], + ) + np.testing.assert_allclose(res_np_b, b_np, atol=1e-05) + np.testing.assert_allclose(res_np_c, c_np, atol=1e-05) + np.testing.assert_allclose(res_np_d, d_np, atol=1e-05) + + def test_matmul(self): + paddle.disable_static() + x_np = np.random.uniform(-1, 1, [2, 3]).astype('float32') + y_np = np.random.uniform(-1, 1, [3, 5]).astype('float32') + res_np_b = x_np @ y_np # __matmul__ + res_np_c = paddle.matmul(paddle.to_tensor(x_np), paddle.to_tensor(y_np)) + res_np_d = x_np.__matmul__(y_np) + paddle.enable_static() + with paddle.pir_utils.IrGuard(): + main_program, exe, program_guard = new_program() + with program_guard: + x = paddle.static.data(name='x', shape=[2, 3], dtype='float32') + y = paddle.static.data(name='y', shape=[3, 5], dtype='float32') + b = x @ y + c = x.matmul(y) + d = x.__matmul__(y) + (b_np, c_np, d_np) = exe.run( + main_program, + feed={"x": x_np, "y": y_np}, + fetch_list=[b, c, d], + ) + np.testing.assert_allclose(res_np_b, b_np, atol=1e-05) + np.testing.assert_allclose(res_np_c, c_np, atol=1e-05) + np.testing.assert_allclose(res_np_d, d_np, atol=1e-05) + + def test_floordiv(self): + paddle.disable_static() + x_np = np.full([10, 1024], 10, np.int64) + y_np = np.full([10, 1024], 2, np.int64) + res_np_b = x_np // y_np + res_np_c = paddle.floor_divide( + paddle.to_tensor(x_np), paddle.to_tensor(y_np) + ) + res_np_d = x_np.__floordiv__(y_np) + paddle.enable_static() + with paddle.pir_utils.IrGuard(): + main_program, exe, program_guard = new_program() + with program_guard: + x = paddle.static.data( + name='x', shape=[10, 1024], dtype='int64' + ) + y = paddle.static.data( + name='y', shape=[10, 1024], dtype='int64' + ) + b = x // y + c = x.floor_divide(y) + d = x.__floordiv__(y) + (b_np, c_np, d_np) = exe.run( + main_program, + feed={"x": x_np, "y": y_np}, + fetch_list=[b, c, d], + ) + np.testing.assert_allclose(res_np_b, b_np, atol=1e-05) + np.testing.assert_allclose(res_np_c, c_np, atol=1e-05) + np.testing.assert_allclose(res_np_d, d_np, atol=1e-05) + + def test_item(self): + with paddle.pir_utils.IrGuard(): + x = paddle.static.data(name='x', shape=[3, 2, 1]) + y = paddle.static.data( + name='y', + shape=[ + 3, + ], + ) + self.assertTrue(y.item() == y) + with self.assertRaises(TypeError): + x.item() + + def test_place(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + with paddle.pir_utils.IrGuard(): 
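+ # NOTE(editor, added comment): under IrGuard this block runs in PIR mode,
+ # so `x` below is an OpResult rather than a Variable; the test expects the
+ # `place()` call to emit exactly one warning whose message mentions "place",
+ # which catch_warnings(record=True) above captures for the assertions.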
+ x = paddle.static.data(name='x', shape=[3, 2, 1]) + x.place() + self.assertTrue(len(w) == 1) + self.assertTrue("place" in str(w[-1].message)) + + def test_some_dim(self): + with paddle.pir_utils.IrGuard(): + x = paddle.static.data(name='x', shape=[3, 2, 1]) + self.assertEqual(x.dim(), 3) + self.assertEqual(x.ndimension(), 3) + self.assertEqual(x.ndim, 3) + + def test_math_exists(self): + with paddle.pir_utils.IrGuard(): + a = paddle.static.data(name='a', shape=[1], dtype='float32') + self.assertTrue(isinstance(a, paddle.pir.OpResult)) + self.assertTrue(inspect.ismethod(a.dot)) + self.assertTrue(inspect.ismethod(a.logsumexp)) + self.assertTrue(inspect.ismethod(a.multiplex)) + self.assertTrue(inspect.ismethod(a.prod)) + self.assertTrue(inspect.ismethod(a.scale)) + self.assertTrue(inspect.ismethod(a.stanh)) + self.assertTrue(inspect.ismethod(a.add_n)) + self.assertTrue(inspect.ismethod(a.max)) + self.assertTrue(inspect.ismethod(a.maximum)) + self.assertTrue(inspect.ismethod(a.min)) + self.assertTrue(inspect.ismethod(a.minimum)) + self.assertTrue(inspect.ismethod(a.floor_divide)) + self.assertTrue(inspect.ismethod(a.remainder)) + self.assertTrue(inspect.ismethod(a.floor_mod)) + self.assertTrue(inspect.ismethod(a.multiply)) + self.assertTrue(inspect.ismethod(a.inverse)) + self.assertTrue(inspect.ismethod(a.log1p)) + self.assertTrue(inspect.ismethod(a.erf)) + self.assertTrue(inspect.ismethod(a.addmm)) + self.assertTrue(inspect.ismethod(a.clip)) + self.assertTrue(inspect.ismethod(a.trace)) + self.assertTrue(inspect.ismethod(a.kron)) + self.assertTrue(inspect.ismethod(a.isinf)) + self.assertTrue(inspect.ismethod(a.isnan)) + self.assertTrue(inspect.ismethod(a.concat)) + self.assertTrue(inspect.ismethod(a.broadcast_to)) + self.assertTrue(inspect.ismethod(a.scatter_nd_add)) + self.assertTrue(inspect.ismethod(a.scatter_nd)) + self.assertTrue(inspect.ismethod(a.shard_index)) + self.assertTrue(inspect.ismethod(a.chunk)) + self.assertTrue(inspect.ismethod(a.stack)) + self.assertTrue(inspect.ismethod(a.strided_slice)) + self.assertTrue(inspect.ismethod(a.unsqueeze)) + self.assertTrue(inspect.ismethod(a.unstack)) + self.assertTrue(inspect.ismethod(a.argmax)) + self.assertTrue(inspect.ismethod(a.argmin)) + self.assertTrue(inspect.ismethod(a.argsort)) + self.assertTrue(inspect.ismethod(a.masked_select)) + self.assertTrue(inspect.ismethod(a.topk)) + self.assertTrue(inspect.ismethod(a.index_select)) + self.assertTrue(inspect.ismethod(a.nonzero)) + self.assertTrue(inspect.ismethod(a.sort)) + self.assertTrue(inspect.ismethod(a.index_sample)) + self.assertTrue(inspect.ismethod(a.mean)) + self.assertTrue(inspect.ismethod(a.std)) + self.assertTrue(inspect.ismethod(a.numel)) + self.assertTrue(inspect.ismethod(a.asin_)) + self.assertTrue(inspect.ismethod(a.atan2)) + self.assertTrue(inspect.ismethod(a.atanh_)) + self.assertTrue(inspect.ismethod(a.diagflat)) + self.assertTrue(inspect.ismethod(a.multinomial)) + self.assertTrue(inspect.ismethod(a.pinv)) + self.assertTrue(inspect.ismethod(a.renorm)) + self.assertTrue(inspect.ismethod(a.renorm_)) + self.assertTrue(inspect.ismethod(a.tan)) + self.assertTrue(inspect.ismethod(a.tan_)) + self.assertTrue(inspect.ismethod(a.tril)) + self.assertTrue(inspect.ismethod(a.tril_)) + self.assertTrue(inspect.ismethod(a.triu)) + self.assertTrue(inspect.ismethod(a.triu_)) + self.assertTrue(inspect.ismethod(a.stft)) + self.assertTrue(inspect.ismethod(a.istft)) + self.assertTrue(inspect.ismethod(a.abs_)) + self.assertTrue(inspect.ismethod(a.acos_)) + 
self.assertTrue(inspect.ismethod(a.atan_)) + self.assertTrue(inspect.ismethod(a.cos_)) + self.assertTrue(inspect.ismethod(a.cosh_)) + self.assertTrue(inspect.ismethod(a.sin_)) + self.assertTrue(inspect.ismethod(a.sinh_)) + self.assertTrue(inspect.ismethod(a.acosh_)) + self.assertTrue(inspect.ismethod(a.asinh_)) + self.assertTrue(inspect.ismethod(a.diag)) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_math_op_patch_var_base.py b/test/legacy_test/test_math_op_patch_var_base.py index a49c2000de92d3..af5fbd9ba9ca1c 100644 --- a/test/legacy_test/test_math_op_patch_var_base.py +++ b/test/legacy_test/test_math_op_patch_var_base.py @@ -576,7 +576,6 @@ def test_tensor_patch_method(self): self.assertTrue(inspect.ismethod(a.remainder)) self.assertTrue(inspect.ismethod(a.floor_mod)) self.assertTrue(inspect.ismethod(a.multiply)) - self.assertTrue(inspect.ismethod(a.logsumexp)) self.assertTrue(inspect.ismethod(a.inverse)) self.assertTrue(inspect.ismethod(a.log1p)) self.assertTrue(inspect.ismethod(a.erf)) @@ -635,37 +634,6 @@ def test_tensor_patch_method(self): self.assertTrue(inspect.ismethod(x.acosh_)) self.assertTrue(inspect.ismethod(x.asinh_)) self.assertTrue(inspect.ismethod(x.diag)) - self.assertTrue(inspect.ismethod(x.eye)) - self.assertTrue(inspect.ismethod(x.linspace)) - self.assertTrue(inspect.ismethod(x.fill_constant)) - self.assertTrue(inspect.ismethod(x.ones)) - self.assertTrue(inspect.ismethod(x.ones_like)) - self.assertTrue(inspect.ismethod(x.zeros)) - self.assertTrue(inspect.ismethod(x.zeros_like)) - self.assertTrue(inspect.ismethod(x.arange)) - self.assertTrue(inspect.ismethod(x.full)) - self.assertTrue(inspect.ismethod(x.full_like)) - self.assertTrue(inspect.ismethod(x.meshgrid)) - self.assertTrue(inspect.ismethod(x.empty)) - self.assertTrue(inspect.ismethod(x.empty_like)) - self.assertTrue(inspect.ismethod(x.complex)) - self.assertTrue(inspect.ismethod(x.eigh)) - self.assertTrue(inspect.ismethod(x.standard_normal)) - self.assertTrue(inspect.ismethod(x.normal)) - self.assertTrue(inspect.ismethod(x.uniform)) - self.assertTrue(inspect.ismethod(x.randn)) - self.assertTrue(inspect.ismethod(x.rand)) - self.assertTrue(inspect.ismethod(x.randint)) - self.assertTrue(inspect.ismethod(x.randint_like)) - self.assertTrue(inspect.ismethod(x.randperm)) - self.assertTrue(inspect.ismethod(x.poisson)) - self.assertTrue(inspect.ismethod(x.searchsorted)) - self.assertTrue(inspect.ismethod(x.set_printoptions)) - self.assertTrue(inspect.ismethod(x.array_length)) - self.assertTrue(inspect.ismethod(x.array_read)) - self.assertTrue(inspect.ismethod(x.array_write)) - self.assertTrue(inspect.ismethod(x.create_array)) - self.assertTrue(inspect.ismethod(x.einsum)) def test_complex_scalar(self): a_np = np.random.random(self.shape).astype(self.dtype) diff --git a/test/legacy_test/test_matmul_v2_op.py b/test/legacy_test/test_matmul_v2_op.py index 0293e0414a23ea..eb893971e026b2 100644 --- a/test/legacy_test/test_matmul_v2_op.py +++ b/test/legacy_test/test_matmul_v2_op.py @@ -99,7 +99,7 @@ def setUp(self): def test_check_output(self): self.check_output( check_cinn=self.check_cinn if hasattr(self, 'check_cinn') else True, - check_new_ir=True, + check_pir=True, ) def test_check_grad(self): @@ -111,7 +111,7 @@ def test_check_grad(self): check_cinn=self.check_cinn if hasattr(self, 'check_cinn') else True, - check_new_ir=True, + check_pir=True, ) else: self.check_grad( @@ -120,7 +120,7 @@ def test_check_grad(self): check_cinn=self.check_cinn if hasattr(self, 'check_cinn') else True, - 
check_new_ir=True, + check_pir=True, ) @@ -362,7 +362,7 @@ def test_check_output(self): check_cinn=self.check_cinn if hasattr(self, 'check_cinn') else True, - check_new_ir=True, + check_pir=True, ) def test_check_grad(self): @@ -376,7 +376,7 @@ def test_check_grad(self): check_cinn=self.check_cinn if hasattr(self, 'check_cinn') else True, - check_new_ir=True, + check_pir=True, ) cls_name = "{}_{}".format(parent.__name__, "Fp16") @@ -436,7 +436,7 @@ def test_check_output(self): check_cinn=self.check_cinn if hasattr(self, 'check_cinn') else True, - check_new_ir=True, + check_pir=True, ) def test_check_grad_x(self): @@ -453,7 +453,7 @@ def test_check_grad_x(self): check_cinn=self.check_cinn if hasattr(self, 'check_cinn') else True, - check_new_ir=True, + check_pir=True, ) def test_check_grad_y(self): @@ -470,7 +470,7 @@ def test_check_grad_y(self): check_cinn=self.check_cinn if hasattr(self, 'check_cinn') else True, - check_new_ir=True, + check_pir=True, ) def test_check_grad(self): @@ -745,7 +745,7 @@ def init_input_output(self): self.out = np.matmul(self.x, self.y) def test_check_output(self): - self.check_output(check_cinn=False, check_new_ir=True) + self.check_output(check_cinn=False, check_pir=True) class TestInt32MatMulOpBroadcast(OpTest): @@ -797,7 +797,7 @@ def init_input_output(self): self.out = np.matmul(self.x, self.y) def test_check_output(self): - self.check_output(check_cinn=False, check_new_ir=True) + self.check_output(check_cinn=False, check_pir=True) class TestInt64MatMulOpBroadcast(OpTest): diff --git a/test/legacy_test/test_maximum_op.py b/test/legacy_test/test_maximum_op.py index 818bdb65fee682..a0e660112bd030 100644 --- a/test/legacy_test/test_maximum_op.py +++ b/test/legacy_test/test_maximum_op.py @@ -18,6 +18,7 @@ import paddle from paddle.base import core +from paddle.pir_utils import test_with_pir_api class ApiMaximumTest(unittest.TestCase): @@ -39,6 +40,7 @@ def setUp(self): self.np_expected3 = np.maximum(self.input_a, self.input_c) self.np_expected4 = np.maximum(self.input_b, self.input_c) + @test_with_pir_api def test_static_api(self): paddle.enable_static() with paddle.static.program_guard( @@ -119,3 +121,7 @@ def test_dynamic_api(self): res = paddle.maximum(b, c) res = res.numpy() np.testing.assert_allclose(res, self.np_expected4, rtol=1e-05) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_mean_iou.py b/test/legacy_test/test_mean_iou.py deleted file mode 100644 index f50a8beb010f91..00000000000000 --- a/test/legacy_test/test_mean_iou.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
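Editor's note: test_mean_iou.py, deleted here, carried the NumPy reference that follows below. Stripped of the InWrongs/InCorrects/InMeanIou accumulators, the core computation it checked is, in sketch form (function name illustrative):

    import numpy as np

    def mean_iou_ref(pred, label, num_classes):
        wrong = np.zeros(num_classes, dtype=np.int64)
        correct = np.zeros(num_classes, dtype=np.int64)
        for p, t in zip(pred.flatten(), label.flatten()):
            if p == t:
                correct[p] += 1
            else:
                wrong[p] += 1  # a miss counts against the predicted class...
                wrong[t] += 1  # ...and against the true class
        denom = wrong + correct
        valid = (denom != 0).sum()             # classes that appear at all
        denom = np.where(denom > 0, denom, 1)  # avoid divide-by-zero
        return (correct / denom).sum() / valid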
- -import unittest - -import numpy as np -from op_test import OpTest - - -def compute_mean_iou( - predictions, labels, num_classes, in_wrongs, in_corrects, in_mean_ious -): - assert predictions.shape == labels.shape - predictions = predictions.flatten() - labels = labels.flatten() - - out_wrong = np.zeros([num_classes]).astype("int32") - for _, wrong in in_wrongs: - out_wrong += wrong - out_correct = np.zeros([num_classes]).astype("int32") - for _, correct in in_corrects: - out_correct += correct - - for pred, label in zip(predictions, labels): - if pred == label: - out_correct[pred] += 1 - else: - out_wrong[pred] += 1 - out_wrong[label] += 1 - - denominator = out_wrong + out_correct - valid_count = (denominator != 0).sum() - denominator = np.where( - denominator > 0, denominator, np.ones(denominator.shape) - ) - mean_iou = (out_correct / denominator).sum() / valid_count - - for _, in_mean_iou in in_mean_ious: - mean_iou += float(in_mean_iou) - return mean_iou, out_wrong, out_correct - - -class TestMeanIOUOp(OpTest): - def setUp(self): - self.config() - self.op_type = "mean_iou" - predictions = np.random.randint( - 0, self.num_classes, self.image_size - ).astype("int32") - labels = np.random.randint(0, self.num_classes, self.image_size).astype( - "int32" - ) - - in_wrongs = [] - for i in range(self.in_wrong_num): - in_wrongs.append( - ( - "in_wrong_%d" % i, - np.random.randint(0, 10, [self.num_classes]).astype( - "int32" - ), - ) - ) - - in_corrects = [] - for i in range(self.in_correct_num): - in_corrects.append( - ( - "in_correct_%d" % i, - np.random.randint(0, 10, [self.num_classes]).astype( - "int32" - ), - ) - ) - - self.inputs = { - 'Predictions': predictions, - 'Labels': labels, - 'InWrongs': in_wrongs, - 'InCorrects': in_corrects, - 'InMeanIou': self.in_mean_ious, - } - self.attrs = {'num_classes': int(self.num_classes)} - mean_iou, out_wrong, out_correct = compute_mean_iou( - predictions, - labels, - self.num_classes, - in_wrongs, - in_corrects, - self.in_mean_ious, - ) - self.outputs = { - 'OutMeanIou': mean_iou, - 'OutWrong': out_wrong, - 'OutCorrect': out_correct, - } - - def config(self): - self.num_classes = 10 - self.image_size = [128, 128] - self.in_wrong_num = 0 - self.in_correct_num = 0 - self.in_mean_ious = [] - - def test_check_output(self): - self.check_output() - - -class TestCase1(TestMeanIOUOp): - def config(self): - self.num_classes = 5 - self.image_size = [100, 128] - self.in_wrong_num = 2 - self.in_correct_num = 2 - self.in_mean_ious = [] - for i in range(2): - self.in_mean_ious.append( - ( - "in_mean_iou_%d" % i, - np.random.uniform(0, 1, []).astype("float32"), - ) - ) - - # NOTE(dev): Skip check_dygraph becuase Python API doesn't expose - # in_wrong_num/in_correct_num/in_mean_iou_num argument - def test_check_output(self): - self.check_output(check_dygraph=False) - - -class TestCase2(TestCase1): - def config(self): - self.num_classes = 5 - self.image_size = [100, 128] - self.in_wrong_num = 2 - self.in_correct_num = 2 - self.in_mean_ious = [] - for i in range(2): - self.in_mean_ious.append( - ( - "in_mean_iou_%d" % i, - np.random.uniform(0, 1, [1]).astype("float32"), - ) - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_mean_op.py b/test/legacy_test/test_mean_op.py index ee8cf92ffcb0a5..e217b31d980d65 100644 --- a/test/legacy_test/test_mean_op.py +++ b/test/legacy_test/test_mean_op.py @@ -23,6 +23,7 @@ import paddle from paddle import base from paddle.base import Program, core, program_guard +from paddle.pir_utils import 
test_with_pir_api np.random.seed(10) @@ -52,10 +53,10 @@ def init_dtype_type(self): pass def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_checkout_grad(self): - self.check_grad(['X'], 'Out', check_new_ir=True) + self.check_grad(['X'], 'Out', check_pir=True) class TestMeanOp_ZeroDim(OpTest): @@ -67,18 +68,26 @@ def setUp(self): self.outputs = {'Out': np.mean(self.inputs["X"])} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_checkout_grad(self): - self.check_grad(['X'], 'Out', check_new_ir=True) + self.check_grad(['X'], 'Out', check_pir=True) class TestMeanOpError(unittest.TestCase): + def setUp(self): + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.int32) + self.place = ( + paddle.CUDAPlace(0) + if core.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + def test_errors(self): paddle.enable_static() with program_guard(Program(), Program()): # The input type of mean_op must be Variable. - input1 = 12 self.assertRaises(TypeError, paddle.mean, input1) # The input dtype of mean_op must be float16, float32, float64. @@ -90,6 +99,20 @@ def test_errors(self): name='input3', shape=[-1, 4], dtype="float16" ) paddle.nn.functional.softmax(input3) + + with paddle.pir_utils.IrGuard(), program_guard(Program(), Program()): + input1 = 12 + self.assertRaises(ValueError, paddle.mean, input1) + + input2 = paddle.static.data( + name='input2', shape=[2, 3, 4, 5], dtype="int32" + ) + + out = paddle.mean(input2) + + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'input2': self.x}, fetch_list=[out]) + paddle.disable_static() @@ -104,7 +127,7 @@ def init_dtype_type(self): def test_check_output(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) def test_checkout_grad(self): place = core.CUDAPlace(0) @@ -128,11 +151,11 @@ def init_dtype_type(self): def test_check_output(self): paddle.enable_static() - self.check_output_with_place(core.CPUPlace(), check_new_ir=True) + self.check_output_with_place(core.CPUPlace(), check_pir=True) def test_checkout_grad(self): place = core.CPUPlace() - self.check_grad_with_place(place, ['X'], 'Out', check_new_ir=True) + self.check_grad_with_place(place, ['X'], 'Out', check_pir=True) def ref_reduce_mean(x, axis=None, keepdim=False, reduce_all=False): @@ -190,7 +213,7 @@ def if_enable_cinn(self): def test_check_output(self): if self.dtype != 'float16': self.check_output( - check_prim=True, check_prim_pir=True, check_new_ir=True + check_prim=True, check_prim_pir=True, check_pir=True ) else: place = paddle.CUDAPlace(0) @@ -198,7 +221,7 @@ def test_check_output(self): place=place, check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) def test_check_grad(self): @@ -208,7 +231,7 @@ def test_check_grad(self): ['Out'], check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) else: place = paddle.CUDAPlace(0) @@ -219,7 +242,7 @@ def test_check_grad(self): numeric_grad_delta=0.5, check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -446,6 +469,7 @@ def setUp(self): else paddle.CPUPlace() ) + @test_with_pir_api def test_api_static(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): diff --git a/test/legacy_test/test_min_op.py b/test/legacy_test/test_min_op.py index 
e24471b20dca8f..78601c77ecf069 100644 --- a/test/legacy_test/test_min_op.py +++ b/test/legacy_test/test_min_op.py @@ -21,6 +21,7 @@ import paddle from paddle import base from paddle.base import core +from paddle.pir_utils import test_with_pir_api class ApiMinTest(unittest.TestCase): @@ -30,6 +31,7 @@ def setUp(self): else: self.place = core.CPUPlace() + @test_with_pir_api def test_api(self): paddle.enable_static() with paddle.static.program_guard( diff --git a/test/legacy_test/test_minimum_op.py b/test/legacy_test/test_minimum_op.py index 6267b78b4cf9db..79970ce77f406b 100644 --- a/test/legacy_test/test_minimum_op.py +++ b/test/legacy_test/test_minimum_op.py @@ -18,6 +18,7 @@ import paddle from paddle.base import core +from paddle.pir_utils import test_with_pir_api class ApiMinimumTest(unittest.TestCase): @@ -39,6 +40,7 @@ def setUp(self): self.np_expected3 = np.minimum(self.input_a, self.input_c) self.np_expected4 = np.minimum(self.input_b, self.input_c) + @test_with_pir_api def test_static_api(self): paddle.enable_static() with paddle.static.program_guard( @@ -119,3 +121,7 @@ def test_dynamic_api(self): res = paddle.minimum(b, c) res = res.numpy() np.testing.assert_allclose(res, self.np_expected4, rtol=1e-05) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_multinomial_op.py b/test/legacy_test/test_multinomial_op.py index bb4c53fb348217..e886876b27583a 100644 --- a/test/legacy_test/test_multinomial_op.py +++ b/test/legacy_test/test_multinomial_op.py @@ -59,7 +59,7 @@ def init_data(self): self.attrs = {"num_samples": 100000, "replacement": True} def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def sample_output(self, out): return sample_output_one_dimension(out, 4) @@ -122,7 +122,7 @@ def init_data(self): self.attrs = {"num_samples": 100000, "replacement": True} def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def sample_output(self, out): return sample_output_one_dimension(out, 4) @@ -178,6 +178,7 @@ class TestMultinomialBF16OP(OpTest): def setUp(self): paddle.enable_static() self.op_type = "multinomial" + self.python_api = paddle.multinomial self.dtype = np.uint16 self.init_data() self.inputs = {"X": convert_float_to_uint16(self.input_np)} @@ -190,7 +191,9 @@ def init_data(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place_customized(self.verify_output, place) + self.check_output_with_place_customized( + self.verify_output, place, check_pir=True + ) def sample_output(self, out): return sample_output_one_dimension(out, 4) diff --git a/test/legacy_test/test_numel_op.py b/test/legacy_test/test_numel_op.py index 33f1dc7cf4c2cc..7e0f75c8650775 100644 --- a/test/legacy_test/test_numel_op.py +++ b/test/legacy_test/test_numel_op.py @@ -18,8 +18,8 @@ from op_test import OpTest, convert_float_to_uint16 import paddle -from paddle import base from paddle.base import core +from paddle.pir_utils import test_with_pir_api class TestNumelOp(OpTest): @@ -34,7 +34,7 @@ def setUp(self): self.outputs = {'Out': np.array(np.size(x))} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def init(self): self.shape = (6, 56, 8, 55) @@ -136,7 +136,7 @@ def setUp(self): def test_check_output(self): place = paddle.CUDAPlace(0) - 
self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) def init(self): self.shape = (6, 56, 8, 55) @@ -148,10 +148,11 @@ def init(self): class TestNumelAPI(unittest.TestCase): + @test_with_pir_api def test_numel_static(self): - main_program = base.Program() - startup_program = base.Program() - with base.program_guard(main_program, startup_program): + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): shape1 = [2, 1, 4, 5] shape2 = [1, 4, 5] x_1 = paddle.static.data(shape=shape1, dtype='int32', name='x_1') @@ -188,9 +189,9 @@ def test_numel_imperative(self): paddle.enable_static() def test_error(self): - main_program = base.Program() - startup_program = base.Program() - with base.program_guard(main_program, startup_program): + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): def test_x_type(): shape = [1, 4, 5] @@ -199,6 +200,16 @@ def test_x_type(): self.assertRaises(TypeError, test_x_type) + def test_pir_error(self): + with paddle.pir_utils.IrGuard(): + + def test_x_type(): + shape = [1, 4, 5] + input_1 = np.random.random(shape).astype("int32") + out_1 = paddle.numel(input_1) + + self.assertRaises(ValueError, test_x_type) + if __name__ == '__main__': paddle.enable_static() diff --git a/test/legacy_test/test_pad2d_op.py b/test/legacy_test/test_pad2d_op.py deleted file mode 100644 index 8c1545ec718bce..00000000000000 --- a/test/legacy_test/test_pad2d_op.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
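Editor's note: the deleted test_pad2d_op.py below validated pad2d against np.pad; the heart of its reference is expanding the flat [top, bottom, left, right] paddings into per-axis tuples keyed by data format. A condensed sketch (helper name illustrative):

    import numpy as np

    def pad2d_ref(x, paddings, mode="constant", pad_value=0.0, data_format="NCHW"):
        t, b, l, r = paddings
        if data_format == "NCHW":
            pads = [(0, 0), (0, 0), (t, b), (l, r)]
        else:  # NHWC: pad the spatial axes, leave batch and channels alone
            pads = [(0, 0), (t, b), (l, r), (0, 0)]
        if mode == "constant":
            return np.pad(x, pads, mode=mode, constant_values=pad_value)
        return np.pad(x, pads, mode=mode)  # "reflect" or "edge"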
- -import unittest - -import numpy as np -from op_test import OpTest - - -class TestPad2dOp(OpTest): - def setUp(self): - self.pad_value = 0.0 - self.variable_paddings = False - self.initTestCase() - self.op_type = "pad2d" - self.inputs = {'X': np.random.random(self.shape).astype("float64")} - self.attrs = {} - if self.variable_paddings: - self.attrs['paddings'] = [] - self.inputs['Paddings'] = ( - np.array(self.paddings).flatten().astype("int32") - ) - else: - self.attrs['paddings'] = ( - np.array(self.paddings).flatten().astype("int32") - ) - self.attrs['pad_value'] = self.pad_value - self.attrs['mode'] = self.mode - self.attrs['data_format'] = self.data_format - if self.data_format == "NCHW": - paddings = [ - (0, 0), - (0, 0), - (self.paddings[0], self.paddings[1]), - (self.paddings[2], self.paddings[3]), - ] - else: - paddings = [ - (0, 0), - (self.paddings[0], self.paddings[1]), - (self.paddings[2], self.paddings[3]), - (0, 0), - ] - if self.mode == "constant": - out = np.pad( - self.inputs['X'], - paddings, - mode=self.mode, - constant_values=self.pad_value, - ) - else: - out = np.pad(self.inputs['X'], paddings, mode=self.mode) - self.outputs = {'Out': out} - - def test_check_output(self): - self.check_output(check_dygraph=False) - - def test_check_grad_normal(self): - self.check_grad(['X'], 'Out', check_dygraph=False) - - def initTestCase(self): - self.shape = (2, 3, 4, 5) - self.paddings = [0, 1, 2, 3] - self.mode = "constant" - self.data_format = "NCHW" - self.pad_value = 0.0 - - -class TestCase1(TestPad2dOp): - def initTestCase(self): - self.shape = (2, 3, 4, 5) - self.paddings = [0, 1, 2, 3] - self.mode = "reflect" - self.data_format = "NCHW" - - -class TestCase2(TestPad2dOp): - def initTestCase(self): - self.shape = (2, 3, 4, 5) - self.paddings = [0, 1, 2, 3] - self.mode = "edge" - self.data_format = "NCHW" - - -class TestCase3(TestPad2dOp): - def initTestCase(self): - self.shape = (2, 4, 4, 4) - self.paddings = [0, 1, 2, 3] - self.mode = "reflect" - self.data_format = "NHWC" - - -class TestCase4(TestPad2dOp): - def initTestCase(self): - self.shape = (2, 4, 4, 4) - self.paddings = [0, 1, 2, 3] - self.mode = "edge" - self.data_format = "NHWC" - - -class TestCase5(TestPad2dOp): - def initTestCase(self): - self.shape = (2, 4, 4, 4) - self.paddings = [0, 1, 2, 3] - self.mode = "constant" - self.pad_value = 1.2 - self.data_format = "NHWC" - - -class TestCase6(TestPad2dOp): - def initTestCase(self): - self.shape = (2, 4, 4, 4) - self.paddings = [0, 1, 2, 3] - self.mode = "constant" - self.pad_value = 1.2 - self.data_format = "NHWC" - self.variable_paddings = True - - -class TestCase7(TestPad2dOp): - def initTestCase(self): - self.shape = (2, 3, 4, 5) - self.paddings = [0, 1, 2, 3] - self.mode = "reflect" - self.data_format = "NCHW" - self.variable_paddings = True - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_pad3d_op.py b/test/legacy_test/test_pad3d_op.py index 42efb91a166d17..52c9557766914c 100644 --- a/test/legacy_test/test_pad3d_op.py +++ b/test/legacy_test/test_pad3d_op.py @@ -91,10 +91,10 @@ def setUp(self): self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): - self.check_grad(['X'], 'Out', check_new_ir=True) + self.check_grad(['X'], 'Out', check_pir=True) def get_dtype(self): return np.float64 @@ -214,11 +214,11 @@ def get_dtype(self): return np.float16 def test_check_output(self): - 
self.check_output(atol=1e-3, check_new_ir=True) + self.check_output(atol=1e-3, check_pir=True) def test_check_grad_normal(self): self.check_grad( - ['X'], 'Out', max_relative_error=1.5e-3, check_new_ir=True + ['X'], 'Out', max_relative_error=1.5e-3, check_pir=True ) cls_name = "{}_{}".format(parent.__name__, "FP16OP") @@ -253,12 +253,12 @@ def get_dtype(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-2, check_new_ir=True) + self.check_output_with_place(place, atol=1e-2, check_pir=True) def test_check_grad_normal(self): place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=1e-2, check_new_ir=True + place, ['X'], 'Out', max_relative_error=1e-2, check_pir=True ) cls_name = "{}_{}".format(parent.__name__, "BF16OP") diff --git a/test/legacy_test/test_pad_constant_like.py b/test/legacy_test/test_pad_constant_like.py deleted file mode 100644 index e304bdf29e4c2a..00000000000000 --- a/test/legacy_test/test_pad_constant_like.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -class TestPadConstantLikeOp(OpTest): - def setUp(self): - self.initTestCase() - self.op_type = "pad_constant_like" - self.inputs = { - 'X': np.random.random(self.x_shape).astype("float64"), - 'Y': np.random.random(self.y_shape).astype("float64"), - } - self.attrs = {} - self.attrs['pad_value'] = self.pad_value - self.outputs = { - 'Out': np.pad( - self.inputs['Y'], - self.paddings, - mode='constant', - constant_values=self.pad_value, - ) - } - - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['Y'], 'Out') - - def initTestCase(self): - self.x_shape = (16, 40) - self.y_shape = (3, 40) - self.pad_value = 0.1 - self.paddings = [(0, 13), (0, 0)] - - -class TestCase1(TestPadConstantLikeOp): - def initTestCase(self): - self.x_shape = (4, 3, 4, 5) - self.y_shape = (2, 3, 4, 5) - self.paddings = [(0, 2), (0, 0), (0, 0), (0, 0)] - self.pad_value = 0.5 - - -class TestCase2(TestPadConstantLikeOp): - def initTestCase(self): - self.x_shape = (4, 3, 4, 10) - self.y_shape = (2, 3, 2, 10) - self.paddings = [(0, 2), (0, 0), (0, 2), (0, 0)] - self.pad_value = 0.5 - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_pad_op.py b/test/legacy_test/test_pad_op.py index 8054d7c75ffb11..81efa838178e8f 100644 --- a/test/legacy_test/test_pad_op.py +++ b/test/legacy_test/test_pad_op.py @@ -21,7 +21,8 @@ from utils import static_guard import paddle -from paddle.base import Program, core, program_guard +from paddle.base import core +from paddle.pir_utils import test_with_pir_api def pad_wrapper(x, paddings, pad_value): @@ -57,10 +58,16 @@ def get_dtype(self): return np.float64 def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def 
test_check_grad_normal(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) def initTestCase(self): self.shape = (16, 16) @@ -101,7 +108,13 @@ def get_dtype(self): return np.float16 def test_check_grad_normal(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) cls_name = "{}_{}".format(parent.__name__, "Fp16") TestPadFp16.__name__ = cls_name @@ -117,7 +130,9 @@ def test_check_grad_normal(self): class TestPadOpError(unittest.TestCase): def test_errors(self): with static_guard(): - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): input_data = np.random.random((2, 2)).astype("float32") def test_Variable(): @@ -138,9 +153,9 @@ def init_info(self): def test_static(self): with static_guard(): - main_prog = Program() - starup_prog = Program() - with program_guard(main_prog, starup_prog): + main_prog = paddle.static.Program() + starup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, starup_prog): fc = paddle.nn.Linear(4, 10) x = paddle.randn([2, 4]) x.stop_gradient = False @@ -159,6 +174,7 @@ def test_static(self): res[0], [1, 1], 'constant', constant_values=[1.0, 1.0] ) np.testing.assert_allclose(res[1], gt) + paddle.static.save_inference_model( self.save_path, [x], [feat, out], exe ) @@ -172,6 +188,29 @@ def test_static(self): ) np.testing.assert_allclose(infer_outs[1], gt) + def test_pir_static(self): + with paddle.pir_utils.IrGuard(): + main_prog = paddle.static.Program() + starup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, starup_prog): + fc = paddle.nn.Linear(4, 10) + x = paddle.randn([2, 4]) + x.stop_gradient = False + feat = fc(x) # [2,3,10] + + out = self.call_func(feat) + + sgd = paddle.optimizer.SGD() + sgd.minimize(paddle.mean(out)) + + exe = paddle.static.Executor() + exe.run(starup_prog) + res = exe.run(fetch_list=[feat, out]) + gt = np.pad( + res[0], [1, 1], 'constant', constant_values=[1.0, 1.0] + ) + np.testing.assert_allclose(res[1], gt) + def path_prefix(self): return 'padding_value' @@ -196,12 +235,13 @@ def call_func(self, x): class TestPaddingValueTensor3(unittest.TestCase): + @test_with_pir_api def test_static(self): with static_guard(): np_x = np.random.random((16, 16)).astype('float32') - main_prog = Program() - starup_prog = Program() - with program_guard(main_prog, starup_prog): + main_prog = paddle.static.Program() + starup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, starup_prog): x = paddle.assign(np_x).astype('float32') pad_value = paddle.assign([0.0]).astype('float64') y = paddle.nn.functional.pad(x, [0, 1, 2, 3], value=pad_value) @@ -253,12 +293,17 @@ def initTestCase(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) def test_check_grad(self): place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['X'], 'Out', check_prim=True, check_new_ir=True + place, + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, ) diff --git a/test/legacy_test/test_parallel_dygraph_dataparallel.py b/test/legacy_test/test_parallel_dygraph_dataparallel.py index de3160e9c6f9c9..b3cbfbf0966f89 100644 --- 
a/test/legacy_test/test_parallel_dygraph_dataparallel.py +++ b/test/legacy_test/test_parallel_dygraph_dataparallel.py @@ -121,6 +121,7 @@ def start_local_trainers( "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), + "FLAGS_dynamic_static_unified_comm": "0", } proc_env["FLAGS_allocator_strategy"] = allocator_strategy diff --git a/test/legacy_test/test_poisson_op.py b/test/legacy_test/test_poisson_op.py index 2002b94ac8013a..b2b889645ddfc8 100644 --- a/test/legacy_test/test_poisson_op.py +++ b/test/legacy_test/test_poisson_op.py @@ -63,7 +63,7 @@ def verify_output(self, outs): np.testing.assert_allclose(hist, prob, rtol=0.01) def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def test_check_grad_normal(self): self.check_grad( @@ -73,6 +73,7 @@ def test_check_grad_normal(self): user_defined_grad_outputs=[ np.random.rand(2048, 1024).astype(self.dtype) ], + check_pir=True, ) @@ -409,7 +410,7 @@ def verify_output(self, outs): def test_check_output(self): place = core.CUDAPlace(0) self.check_output_with_place_customized( - self.verify_output, place, check_new_ir=True + self.verify_output, place, check_pir=True ) def test_check_grad(self): @@ -422,7 +423,7 @@ def test_check_grad(self): user_defined_grad_outputs=[ np.random.rand(2048, 1024).astype("float32") ], - check_new_ir=True, + check_pir=True, ) diff --git a/test/legacy_test/test_pool2d_api.py b/test/legacy_test/test_pool2d_api.py index fcca5381fa4f06..84615340fe051e 100644 --- a/test/legacy_test/test_pool2d_api.py +++ b/test/legacy_test/test_pool2d_api.py @@ -25,6 +25,7 @@ from paddle import base from paddle.base import core from paddle.nn.functional import avg_pool2d, max_pool2d +from paddle.pir_utils import test_with_pir_api class TestPool2D_API(unittest.TestCase): @@ -52,7 +53,7 @@ def check_avg_static_results(self, place): exe = base.Executor(place) fetches = exe.run( - base.default_main_program(), + paddle.static.default_main_program(), feed={"input": input_np}, fetch_list=[result], ) @@ -144,7 +145,7 @@ def check_max_static_results(self, place): exe = base.Executor(place) fetches = exe.run( - base.default_main_program(), + paddle.static.default_main_program(), feed={"input": input_np}, fetch_list=[result], ) @@ -360,8 +361,6 @@ def test_pool2d(self): for place in self.places: self.check_max_dygraph_results(place) self.check_avg_dygraph_results(place) - self.check_max_static_results(place) - self.check_avg_static_results(place) self.check_max_dygraph_stride_is_none(place) self.check_avg_dygraph_stride_is_none(place) self.check_max_dygraph_padding(place) @@ -370,6 +369,14 @@ def test_pool2d(self): self.check_max_dygraph_ceilmode_results(place) self.check_max_dygraph_nhwc_results(place) + @test_with_pir_api + def test_pool2d_static(self): + paddle.enable_static() + for place in self.places: + self.check_max_static_results(place) + self.check_avg_static_results(place) + paddle.disable_static() + class TestPool2DError_API(unittest.TestCase): def test_error_api(self): diff --git a/test/legacy_test/test_print_op.py b/test/legacy_test/test_print_op.py index 3352d2b23ef937..c4390d76bb9ffd 100755 --- a/test/legacy_test/test_print_op.py +++ b/test/legacy_test/test_print_op.py @@ -97,8 +97,8 @@ def test_errors(self): np.array([[-1]]), [[1]], paddle.CPUPlace() ) self.assertRaises(TypeError, paddle.static.Print, x1) - 
# The input dtype of Print_op must be float32, float64, int32_t, int64_t or bool. - x2 = paddle.static.data(name='x2', shape=[4], dtype="float16") + # The input dtype of Print_op must be uint16, float16, float32, float64, int32_t, int64_t or bool. + x2 = paddle.static.data(name='x2', shape=[4], dtype="int8") self.assertRaises(TypeError, paddle.static.Print, x2) diff --git a/test/legacy_test/test_proximal_adagrad_op.py b/test/legacy_test/test_proximal_adagrad_op.py deleted file mode 100644 index 45d25d3a213502..00000000000000 --- a/test/legacy_test/test_proximal_adagrad_op.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -class TestProximalAdagradOp(OpTest): - def setUp(self): - self.op_type = "proximal_adagrad" - w = np.random.random((102, 105)).astype("float32") - m = np.random.random((102, 105)).astype("float32") - g = np.random.random((102, 105)).astype("float32") - lr = np.array([0.1]).astype("float32") - l1 = 0.1 - l2 = 0.2 - - self.inputs = {'Param': w, 'Grad': g, 'Moment': m, 'LearningRate': lr} - self.attrs = {'l1': l1, 'l2': l2} - param_out = 0.0 - - moment_out = m + g * g - prox_param = w - lr * g / np.sqrt(moment_out) - if l1 > 0.0: - x = np.abs(prox_param) - lr * l1 - x[x < 0] = 0 - param_out = np.sign(prox_param) * (x / (1.0 + lr * l2)) - else: - param_out = prox_param / (1.0 + lr * l2) - - self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out} - - def test_check_output(self): - self.check_output() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_randint_op.py b/test/legacy_test/test_randint_op.py index a48750eebdc7d3..fefae2c4d81648 100644 --- a/test/legacy_test/test_randint_op.py +++ b/test/legacy_test/test_randint_op.py @@ -46,7 +46,7 @@ def init_attrs(self): self.output_hist = output_hist def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def verify_output(self, outs): hist, prob = self.output_hist(np.array(outs[0])) @@ -87,7 +87,7 @@ def init_attrs(self): self.output_hist = output_hist def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def verify_output(self, outs): hist, prob = self.output_hist(np.array(outs[0])) @@ -107,7 +107,7 @@ def init_attrs(self): self.output_hist = output_hist def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def verify_output(self, outs): hist, prob = self.output_hist(np.array(outs[0])) diff --git a/test/legacy_test/test_random_crop_op.py b/test/legacy_test/test_random_crop_op.py deleted file mode 100644 index 08355378207c13..00000000000000 --- a/test/legacy_test/test_random_crop_op.py +++ 
/dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -class TestRandomCropOp(OpTest): - def setUp(self): - to_crop = np.array( - [[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]] * 5 - ).astype(np.int32) - self.possible_res = [ - np.array([[1, 2, 3], [5, 6, 7]]).astype(np.int32), - np.array([[2, 3, 4], [6, 7, 8]]).astype(np.int32), - np.array([[5, 6, 7], [9, 10, 11]]).astype(np.int32), - np.array([[6, 7, 8], [10, 11, 12]]).astype(np.int32), - ] - self.op_type = "random_crop" - self.inputs = {'X': to_crop, 'Seed': np.array([10]).astype('int64')} - self.outputs = {'Out': np.array([]), 'SeedOut': np.array([])} - self.attrs = {'shape': [2, 3]} - - def test_check_output(self): - self.check_output_customized(self.verify_output) - - def verify_output(self, outs): - out = np.array(outs[1]) - for ins in out[:]: - is_equal = [(ins == res).all() for res in self.possible_res] - self.assertIn(True, is_equal) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_randperm_op.py b/test/legacy_test/test_randperm_op.py index ceb8b82aa0f55d..9cb270801fece9 100644 --- a/test/legacy_test/test_randperm_op.py +++ b/test/legacy_test/test_randperm_op.py @@ -83,7 +83,7 @@ def init_attrs(self): pass def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def verify_output(self, outs): out_np = np.array(outs[0]) @@ -144,7 +144,9 @@ def init_attrs(self): self.np_dtype = np.float32 def test_check_output(self): - self.check_output_with_place_customized(self.verify_output, self.place) + self.check_output_with_place_customized( + self.verify_output, self.place, check_pir=True + ) def verify_output(self, outs): out_np = convert_uint16_to_float(np.array(outs[0])) diff --git a/test/legacy_test/test_reduce_op.py b/test/legacy_test/test_reduce_op.py index d60fb8bfeb1468..a88f2650a005dc 100644 --- a/test/legacy_test/test_reduce_op.py +++ b/test/legacy_test/test_reduce_op.py @@ -22,6 +22,7 @@ from paddle import base from paddle.base import Program, core, program_guard from paddle.base.framework import convert_np_dtype_to_dtype_, in_pir_mode +from paddle.pir_utils import test_with_pir_api class TestSumOp(OpTest): @@ -55,14 +56,14 @@ def calc_output(self): self.out = self.x.sum(axis=tuple(self.attrs['dim'])) def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): self.check_grad( ['X'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) @@ -95,7 +96,7 @@ def test_check_grad(self): self.check_grad( ['X'], 'Out', - check_new_ir=True, + check_pir=True, check_prim=True, check_prim_pir=True, ) @@ -125,10 +126,10 @@ def init_attrs(self): self.attrs = {'dim': (0, 3)} def test_check_output(self): - self.check_output(check_new_ir=True) + 
self.check_output(check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_new_ir=True) + self.check_grad(['X'], 'Out', check_pir=True) class TestSumOp_withInt(TestSumOp): @@ -141,7 +142,7 @@ def init_attrs(self): self.attrs = {'dim': (0, 1)} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def calc_gradient(self): x = self.inputs["X"] @@ -155,7 +156,7 @@ def test_check_grad(self): user_defined_grads=self.calc_gradient(), check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -167,7 +168,7 @@ def init_attrs(self): self.attrs = {'dim': (0, 1, 2)} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def calc_gradient(self): x = self.inputs["X"] @@ -181,7 +182,7 @@ def test_check_grad(self): user_defined_grads=self.calc_gradient(), check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -194,7 +195,7 @@ def init_dtype(self): self.dtype = np.float16 def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): self.check_grad( @@ -202,7 +203,7 @@ def test_check_grad(self): 'Out', check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -231,7 +232,7 @@ def init_dtype(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_new_ir=True) + self.check_output_with_place(place, check_pir=True) def test_check_grad(self): place = core.CUDAPlace(0) @@ -242,7 +243,7 @@ def test_check_grad(self): user_defined_grads=self.gradient, check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) def calc_gradient(self): @@ -279,7 +280,7 @@ def setUp(self): } def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): # only composite op support gradient check of reduce_max @@ -288,7 +289,7 @@ def test_check_grad(self): 'Out', check_prim=True, only_check_prim=True, - check_new_ir=True, + check_pir=True, ) @@ -314,7 +315,7 @@ def init_inputs_and_outputs(self): } def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): # only composite op support gradient check of reduce_max @@ -323,7 +324,7 @@ def test_check_grad(self): 'Out', check_prim=True, only_check_prim=True, - check_new_ir=True, + check_pir=True, ) @@ -368,7 +369,7 @@ def if_enable_cinn(self): pass def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): # only composite op support gradient check of reduce_max @@ -377,7 +378,7 @@ def test_check_grad(self): 'Out', check_prim=True, only_check_prim=True, - check_new_ir=True, + check_pir=True, ) def init_dtype(self): @@ -403,7 +404,7 @@ def if_enable_cinn(self): self.enable_cinn = False def test_check_output(self): - self.check_output_with_place(core.CUDAPlace(0), check_new_ir=True) + self.check_output_with_place(core.CUDAPlace(0), check_pir=True) def test_check_grad(self): # only composite op support gradient check of reduce_max @@ -413,7 +414,7 @@ def test_check_grad(self): 'Out', check_prim=True, only_check_prim=True, - check_new_ir=True, + check_pir=True, ) @@ -826,7 +827,7 @@ def setUp(self): self.attrs = {'reduce_all': True} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class 
TestAllFloatOp(OpTest): @@ -838,7 +839,7 @@ def setUp(self): self.attrs = {'reduce_all': True} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class TestAllIntOp(OpTest): @@ -850,7 +851,7 @@ def setUp(self): self.attrs = {'reduce_all': True} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class TestAllOp_ZeroDim(OpTest): @@ -862,7 +863,7 @@ def setUp(self): self.attrs = {'dim': []} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class TestAll8DOp(OpTest): @@ -878,7 +879,7 @@ def setUp(self): self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class TestAllOpWithDim(OpTest): @@ -890,7 +891,7 @@ def setUp(self): self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class TestAll8DOpWithDim(OpTest): @@ -906,7 +907,7 @@ def setUp(self): self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class TestAllOpWithKeepDim(OpTest): @@ -920,7 +921,7 @@ def setUp(self): } def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class TestAll8DOpWithKeepDim(OpTest): @@ -940,7 +941,7 @@ def setUp(self): } def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) class TestAllOpError(unittest.TestCase): @@ -964,7 +965,7 @@ def setUp(self): self.attrs = {'reduce_all': True} def test_check_output(self): - self.check_output() + self.check_output(check_pir=True) class TestAnyFloatOp(OpTest): @@ -976,7 +977,7 @@ def setUp(self): self.attrs = {'reduce_all': True} def test_check_output(self): - self.check_output() + self.check_output(check_pir=True) class TestAnyIntOp(OpTest): @@ -988,7 +989,7 @@ def setUp(self): self.attrs = {'reduce_all': True} def test_check_output(self): - self.check_output() + self.check_output(check_pir=True) class TestAnyOp_ZeroDim(OpTest): @@ -1000,7 +1001,7 @@ def setUp(self): self.attrs = {'dim': []} def test_check_output(self): - self.check_output() + self.check_output(check_pir=True) class TestAny8DOp(OpTest): @@ -1016,7 +1017,7 @@ def setUp(self): self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])} def test_check_output(self): - self.check_output() + self.check_output(check_pir=True) class TestAnyOpWithDim(OpTest): @@ -1028,7 +1029,7 @@ def setUp(self): self.outputs = {'Out': self.inputs['X'].any(axis=1)} def test_check_output(self): - self.check_output() + self.check_output(check_pir=True) class TestAny8DOpWithDim(OpTest): @@ -1044,7 +1045,7 @@ def setUp(self): self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])} def test_check_output(self): - self.check_output() + self.check_output(check_pir=True) class TestAnyOpWithKeepDim(OpTest): @@ -1060,7 +1061,7 @@ def setUp(self): } def test_check_output(self): - self.check_output() + self.check_output(check_pir=True) class TestAny8DOpWithKeepDim(OpTest): @@ -1080,7 +1081,7 @@ def setUp(self): } def test_check_output(self): - self.check_output() + self.check_output(check_pir=True) class TestAnyOpError(unittest.TestCase): @@ -1303,7 +1304,7 @@ def setUp(self): } def test_check_output(self): - 
self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): # only composite op support gradient check of reduce_max @@ -1312,7 +1313,7 @@ def test_check_grad(self): 'Out', check_prim=True, only_check_prim=True, - check_new_ir=True, + check_pir=True, ) @@ -1615,6 +1616,15 @@ def test_errors(self): x2 = paddle.static.data(name='x2', shape=[-1, 4], dtype="uint8") self.assertRaises(TypeError, paddle.sum, x2) + with paddle.pir_utils.IrGuard(), program_guard( + Program(), Program() + ): + # The input type of reduce_sum_op must be Variable. + x1 = base.create_lod_tensor( + np.array([[-1]]), [[1]], base.CPUPlace() + ) + self.assertRaises(ValueError, paddle.sum, x1) + class API_TestSumOp(unittest.TestCase): def run_static( @@ -1645,6 +1655,7 @@ def run_static( rtol=1e-05, ) + @test_with_pir_api def test_static(self): shape = [10, 10] axis = 1 @@ -1702,21 +1713,25 @@ def setUp(self): self.places.append(base.CUDAPlace(0)) def check_static_result(self, place): - with base.program_guard(base.Program(), base.Program()): + main = paddle.static.Program() + startup = paddle.static.Program() + with base.program_guard(main, startup): input = paddle.static.data(name="input", shape=[4, 4], dtype="bool") result = paddle.all(x=input) input_np = np.random.randint(0, 2, [4, 4]).astype("bool") exe = base.Executor(place) fetches = exe.run( - base.default_main_program(), + main, feed={"input": input_np}, fetch_list=[result], ) self.assertTrue((fetches[0] == np.all(input_np)).all()) def check_static_float_result(self, place): - with base.program_guard(base.Program(), base.Program()): + main = paddle.static.Program() + startup = paddle.static.Program() + with base.program_guard(main, startup): input = paddle.static.data( name="input", shape=[4, 4], dtype="float" ) @@ -1725,26 +1740,29 @@ def check_static_float_result(self, place): exe = base.Executor(place) fetches = exe.run( - base.default_main_program(), + main, feed={"input": input_np}, fetch_list=[result], ) self.assertTrue((fetches[0] == np.all(input_np)).all()) def check_static_int_result(self, place): - with base.program_guard(base.Program(), base.Program()): + main = paddle.static.Program() + startup = paddle.static.Program() + with base.program_guard(main, startup): input = paddle.static.data(name="input", shape=[4, 4], dtype="int") result = paddle.all(x=input) input_np = np.random.randint(0, 2, [4, 4]).astype("int") exe = base.Executor(place) fetches = exe.run( - base.default_main_program(), + main, feed={"input": input_np}, fetch_list=[result], ) self.assertTrue((fetches[0] == np.all(input_np)).all()) + @test_with_pir_api def test_static(self): for place in self.places: self.check_static_result(place=place) @@ -1803,21 +1821,25 @@ def setUp(self): self.places.append(base.CUDAPlace(0)) def check_static_result(self, place): - with base.program_guard(base.Program(), base.Program()): + main = paddle.static.Program() + startup = paddle.static.Program() + with base.program_guard(main, startup): input = paddle.static.data(name="input", shape=[4, 4], dtype="bool") result = paddle.any(x=input) input_np = np.random.randint(0, 2, [4, 4]).astype("bool") exe = base.Executor(place) fetches = exe.run( - base.default_main_program(), + main, feed={"input": input_np}, fetch_list=[result], ) self.assertTrue((fetches[0] == np.any(input_np)).all()) def check_static_float_result(self, place): - with base.program_guard(base.Program(), base.Program()): + main = paddle.static.Program() + startup = paddle.static.Program() + with 
base.program_guard(main, startup): input = paddle.static.data( name="input", shape=[4, 4], dtype="float" ) @@ -1826,26 +1848,29 @@ def check_static_float_result(self, place): exe = base.Executor(place) fetches = exe.run( - base.default_main_program(), + main, feed={"input": input_np}, fetch_list=[result], ) self.assertTrue((fetches[0] == np.any(input_np)).all()) def check_static_int_result(self, place): - with base.program_guard(base.Program(), base.Program()): + main = paddle.static.Program() + startup = paddle.static.Program() + with base.program_guard(main, startup): input = paddle.static.data(name="input", shape=[4, 4], dtype="int") result = paddle.any(x=input) input_np = np.random.randint(0, 2, [4, 4]).astype("int") exe = base.Executor(place) fetches = exe.run( - base.default_main_program(), + main, feed={"input": input_np}, fetch_list=[result], ) self.assertTrue((fetches[0] == np.any(input_np)).all()) + @test_with_pir_api def test_static(self): for place in self.places: self.check_static_result(place=place) diff --git a/test/legacy_test/test_reshape_op.py b/test/legacy_test/test_reshape_op.py index f0128173b44894..dd1f7e00447343 100755 --- a/test/legacy_test/test_reshape_op.py +++ b/test/legacy_test/test_reshape_op.py @@ -44,14 +44,14 @@ def init_data(self): self.infered_shape = (12, 10) def test_check_output(self): - self.check_output(no_check_set=['XShape'], check_new_ir=True) + self.check_output(no_check_set=['XShape'], check_pir=True) def test_check_grad(self): self.check_grad( ["X"], "Out", check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) @@ -123,7 +123,7 @@ def init_data(self): self.infered_shape = (12, 10) def test_check_output(self): - self.check_output(no_check_set=['XShape'], check_new_ir=True) + self.check_output(no_check_set=['XShape'], check_pir=True) def test_check_grad(self): self.check_grad( @@ -131,7 +131,7 @@ def test_check_grad(self): "Out", check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -157,7 +157,7 @@ def init_data(self): self.infered_shape = (12, 10) def test_check_output(self): - self.check_output(no_check_set=['XShape'], check_new_ir=True) + self.check_output(no_check_set=['XShape'], check_pir=True) def test_check_grad(self): self.check_grad( @@ -165,7 +165,7 @@ def test_check_grad(self): "Out", check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -209,7 +209,7 @@ def init_data(self): self.actual_shape = (2, 3, 20) def test_check_output(self): - self.check_output(no_check_set=['XShape'], check_new_ir=True) + self.check_output(no_check_set=['XShape'], check_pir=True) def test_check_grad(self): self.check_grad( @@ -217,7 +217,7 @@ def test_check_grad(self): "Out", check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -254,7 +254,7 @@ def init_data(self): self.shape = (-1, -1) def test_check_output(self): - self.check_output(no_check_set=['XShape'], check_new_ir=True) + self.check_output(no_check_set=['XShape'], check_pir=True) def test_check_grad(self): self.check_grad( @@ -262,7 +262,7 @@ def test_check_grad(self): "Out", check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -308,7 +308,7 @@ def init_data(self): self.infered_shape = (10, 10) def test_check_output(self): - self.check_output(no_check_set=['XShape'], check_new_ir=True) + self.check_output(no_check_set=['XShape'], check_pir=True) def test_check_grad(self): self.check_grad( @@ -316,7 +316,7 @@ def test_check_grad(self): "Out", check_prim=True, 
check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -370,7 +370,7 @@ def test_check_output(self): base.core.CPUPlace(), atol=1e-5, no_check_set=['XShape'], - check_new_ir=True, + check_pir=True, ) def test_check_grad(self): diff --git a/test/legacy_test/test_rnn_memory_helper_op.py b/test/legacy_test/test_rnn_memory_helper_op.py deleted file mode 100644 index 16a0cccb10d6f8..00000000000000 --- a/test/legacy_test/test_rnn_memory_helper_op.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np - -from paddle.base import core -from paddle.base.executor import Executor -from paddle.base.framework import Program - - -class RNNMemoryHelperOpTest(unittest.TestCase): - def setUp(self): - self.program = Program() - self.place = core.CPUPlace() - - self.X = self.program.global_block().create_var( - name='X', shape=[2, 3], dtype='float32' - ) - self.Out = self.program.global_block().create_var( - name='Out', shape=[2, 3], dtype='float32' - ) - self.program.global_block().append_op( - type='rnn_memory_helper', - inputs={"X": self.X}, - outputs={"Out": self.Out}, - attrs={}, - ) - - def test_forward(self): - x_np = np.random.normal(size=(2, 3)).astype("float32") - self.feed_map = {'X': x_np} - self.fetch_list = [self.Out] - exe = Executor(self.place) - out = exe.run( - self.program, feed=self.feed_map, fetch_list=self.fetch_list - ) - np.testing.assert_allclose(out[0], x_np, rtol=1e-05) - - -class RNNMemoryHelperGradOpTest(unittest.TestCase): - def setUp(self): - self.program = Program() - self.place = core.CPUPlace() - - self.input_names = ['X', 'Out', 'Out@GRAD'] - self.input_vars = { - name: self.program.global_block().create_var( - name=name, shape=[2, 3], dtype='float32' - ) - for name in self.input_names - } - - self.output_names = ['X@GRAD'] - self.output_vars = { - name: self.program.global_block().create_var( - name=name, shape=[2, 3], dtype='float32' - ) - for name in self.output_names - } - - self.program.global_block().append_op( - type='rnn_memory_helper_grad', - inputs=self.input_vars, - outputs=self.output_vars, - attrs={}, - ) - - def test_backward(self): - self.feed_map = { - name: np.random.normal(size=(2, 3)).astype("float32") - for name in self.input_names - } - self.fetch_list = [self.output_vars['X@GRAD']] - - exe = Executor(self.place) - out = exe.run( - self.program, feed=self.feed_map, fetch_list=self.fetch_list - ) - np.isclose(out[0], self.feed_map['Out@GRAD'], rtol=1e-5) - - -class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase): - def setUp(self): - self.program = Program() - self.fake_program = Program() - self.place = core.CPUPlace() - - self.input_names = ['X', 'Out'] - self.input_vars = { - name: self.program.global_block().create_var( - name=name, shape=[2, 3], dtype='float32' - ) - for name in self.input_names - } - self.input_vars[ - "Out@GRAD" - ] = self.fake_program.global_block().create_var( - name="Out@GRAD", 
shape=[2, 3], dtype='float32' - ) - - self.output_names = ['X@GRAD'] - self.output_vars = { - name: self.program.global_block().create_var( - name=name, shape=[2, 3], dtype='float32' - ) - for name in self.output_names - } - - self.program.global_block().append_op( - type='rnn_memory_helper_grad', - inputs=self.input_vars, - outputs=self.output_vars, - attrs={}, - ) - - def test_backward(self): - self.feed_map = { - name: np.random.normal(size=(2, 3)).astype("float32") - for name in ['X', 'Out'] - } - self.fetch_list = [self.output_vars['X@GRAD']] - - exe = Executor(self.place) - out = exe.run( - self.program, feed=self.feed_map, fetch_list=self.fetch_list - ) - np.testing.assert_allclose( - out[0], np.zeros(shape=(2, 3)).astype('float32'), rtol=1e-05 - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_roi_perspective_transform_op.py b/test/legacy_test/test_roi_perspective_transform_op.py deleted file mode 100644 index 59a7a3f3b4a110..00000000000000 --- a/test/legacy_test/test_roi_perspective_transform_op.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License") -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUWARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -from math import floor, sqrt - -import numpy as np - - -def gt_e(a, b): - return a > b or abs(a - b) < 1e-4 - - -def gt(a, b): - return (a - b) > 1e-4 - - -def lt_e(a, b): - return a < b or abs(a - b) < 1e-4 - - -def in_quad(x, y, roi_x, roi_y): - # check if (x, y) is in the boundary of roi - for i in range(4): - xs = roi_x[i] - ys = roi_y[i] - xe = roi_x[(i + 1) % 4] - ye = roi_y[(i + 1) % 4] - if abs(ys - ye) < 1e-4: - if ( - abs(y - ys) < 1e-4 - and abs(y - ye) < 1e-4 - and gt_e(x, min(xs, xe)) - and lt_e(x, max(xs, xe)) - ): - return True - else: - intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs - if ( - abs(intersec_x - x) < 1e-4 - and gt_e(y, min(ys, ye)) - and lt_e(y, max(ys, ye)) - ): - return True - n_cross = 0 - for i in range(4): - xs = roi_x[i] - ys = roi_y[i] - xe = roi_x[(i + 1) % 4] - ye = roi_y[(i + 1) % 4] - if abs(ys - ye) < 1e-4: - continue - if lt_e(y, min(ys, ye)) or gt(y, max(ys, ye)): - continue - intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs - if abs(intersec_x - x) < 1e-4: - return True - if gt(intersec_x, x): - n_cross += 1 - return n_cross % 2 == 1 - - -def get_transform_matrix(transformed_width, transformed_height, roi_x, roi_y): - x0 = roi_x[0] - x1 = roi_x[1] - x2 = roi_x[2] - x3 = roi_x[3] - y0 = roi_y[0] - y1 = roi_y[1] - y2 = roi_y[2] - y3 = roi_y[3] - - len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1)) - len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)) - len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3)) - len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0)) - estimated_height = (len2 + len4) / 2.0 - estimated_width = (len1 + len3) / 2.0 - - normalized_height = max(2, transformed_height) - normalized_width = ( - round(estimated_width * (normalized_height - 1) / estimated_height) + 1 - ) - normalized_width = max(2, 
min(normalized_width, transformed_width)) - - dx1 = x1 - x2 - dx2 = x3 - x2 - dx3 = x0 - x1 + x2 - x3 - dy1 = y1 - y2 - dy2 = y3 - y2 - dy3 = y0 - y1 + y2 - y3 - matrix = np.zeros([9]) - matrix[6] = ( - (dx3 * dy2 - dx2 * dy3) - / (dx1 * dy2 - dx2 * dy1 + 1e-5) - / (normalized_width - 1) - ) - matrix[7] = ( - (dx1 * dy3 - dx3 * dy1) - / (dx1 * dy2 - dx2 * dy1 + 1e-5) - / (normalized_height - 1) - ) - matrix[8] = 1 - - matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) / ( - normalized_width - 1 - ) - matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) / ( - normalized_height - 1 - ) - matrix[5] = y0 - - matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) / ( - normalized_width - 1 - ) - matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) / ( - normalized_height - 1 - ) - matrix[2] = x0 - return matrix - - -def get_source_coords(matrix, out_w, out_h): - u = matrix[0] * out_w + matrix[1] * out_h + matrix[2] - v = matrix[3] * out_w + matrix[4] * out_h + matrix[5] - w = matrix[6] * out_w + matrix[7] * out_h + matrix[8] - in_w = u / w - in_h = v / w - return in_w, in_h - - -def bilinear_interpolate(in_data, in_n, in_c, in_w, in_h): - batch_size = in_data.shape[0] - channels = in_data.shape[1] - height = in_data.shape[2] - width = in_data.shape[3] - - if ( - gt_e(-0.5, in_w) - or gt_e(in_w, width - 0.5) - or gt_e(-0.5, in_h) - or gt_e(in_h, height - 0.5) - ): - return 0.0 - - if gt_e(0, in_w): - in_w = 0 - if gt_e(0, in_h): - in_h = 0 - - in_w_floor = floor(in_w) - in_h_floor = floor(in_h) - - if gt_e(in_w_floor, width - 1): - in_w_ceil = width - 1 - in_w_floor = width - 1 - in_w = in_w_floor - else: - in_w_ceil = in_w_floor + 1 - - if gt_e(in_h_floor, height - 1): - in_h_ceil = height - 1 - in_h_floor = height - 1 - in_h = in_h_floor - else: - in_h_ceil = in_h_floor + 1 - - w_floor = in_w - in_w_floor - h_floor = in_h - in_h_floor - w_ceil = 1 - w_floor - h_ceil = 1 - h_floor - v1 = in_data[in_n][in_c][int(in_h_floor)][int(in_w_floor)] - v2 = in_data[in_n][in_c][int(in_h_ceil)][int(in_w_floor)] - v3 = in_data[in_n][in_c][int(in_h_ceil)][int(in_w_ceil)] - v4 = in_data[in_n][in_c][int(in_h_floor)][int(in_w_ceil)] - w1 = w_ceil * h_ceil - w2 = w_ceil * h_floor - w3 = w_floor * h_floor - w4 = w_floor * h_ceil - val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 - return val - - -def lod_convert(lod): - ret = [0] - for count in lod: - ret.append(ret[-1] + count) - return ret - - -def roi_transform( - in_data, - rois, - rois_lod, - transformed_height, - transformed_width, - spatial_scale, -): - channels = in_data.shape[1] - in_height = in_data.shape[2] - in_width = in_data.shape[3] - rois_num = rois.shape[0] - - roi2image = [0] * rois_num - rois_lod = lod_convert(rois_lod[0]) - for i in range(len(rois_lod) - 1): - for j in range(rois_lod[i], rois_lod[i + 1]): - roi2image[j] = i - - out = np.zeros([rois_num, channels, transformed_height, transformed_width]) - mask = np.zeros( - [rois_num, 1, transformed_height, transformed_width] - ).astype('int') - matrix = np.zeros([rois_num, 9], dtype=in_data.dtype) - for n in range(rois_num): - roi_x = [] - roi_y = [] - for k in range(4): - roi_x.append(rois[n][2 * k] * spatial_scale) - roi_y.append(rois[n][2 * k + 1] * spatial_scale) - image_id = roi2image[n] - transform_matrix = get_transform_matrix( - transformed_width, transformed_height, roi_x, roi_y - ) - matrix[n] = transform_matrix - for c in range(channels): - for out_h in range(transformed_height): - for out_w in range(transformed_width): - in_w, in_h = 
get_source_coords( - transform_matrix, out_w, out_h - ) - if ( - in_quad(in_w, in_h, roi_x, roi_y) - and gt(in_w, -0.5) - and gt(in_width - 0.5, in_w) - and gt(in_h, -0.5) - and gt(in_height - 0.5, in_h) - ): - out[n][c][out_h][out_w] = bilinear_interpolate( - in_data, image_id, c, in_w, in_h - ) - mask[n][0][out_h][out_w] = 1 - else: - out[n][c][out_h][out_w] = 0.0 - mask[n][0][out_h][out_w] = 0 - return out.astype("float32"), mask, matrix - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_run.py b/test/legacy_test/test_run.py index e0ec7c9657fb54..331d45a514a932 100644 --- a/test/legacy_test/test_run.py +++ b/test/legacy_test/test_run.py @@ -207,4 +207,5 @@ def test_ps_4(self): if __name__ == '__main__': + os.environ["FLAGS_dynamic_static_unified_comm"] = "0" unittest.main() diff --git a/test/legacy_test/test_sample_logits_op.py b/test/legacy_test/test_sample_logits_op.py deleted file mode 100644 index 64c70b5a8a07c5..00000000000000 --- a/test/legacy_test/test_sample_logits_op.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections -import unittest - -import numpy as np -from op_test import OpTest - - -class TestSampleLogitsOp(OpTest): - def setUp(self): - self.op_type = "sample_logits" - self.dtype = np.float64 - self.use_mkldnn = False - bs = 2 - K = 20 - NT = 10 - S = 5 - - Samples = np.random.random([bs, NT + S]).astype('int64') - Probabilities = np.random.random([bs, NT + S]).astype('float64') - LogitsDim = np.array([bs, K], dtype=np.int64) - LabelsDim = np.array([bs, NT], dtype=np.int64) - SampledLogits = np.random.random([bs, NT + S]).astype('float64') - SampledLabels = np.random.random([bs, NT]).astype('int64') - - self.bs = bs - self.K = K - self.NT = NT - self.S = S - Labels = np.array(list(range(self.NT)) * self.bs).astype('int64') - self.Labels = Labels.reshape(self.bs, -1) - self.Logits = np.random.random([self.bs, self.K]).astype('float64') - - self.inputs = {"Logits": self.Logits, "Labels": self.Labels} - self.fetch_list = [ - 'Samples', - 'Probabilities', - 'SampledLogits', - 'SampledLabels', - ] - self.outputs = collections.OrderedDict( - ( - ('Samples', Samples), - ('Probabilities', Probabilities), - ('LogitsDim', LogitsDim), - ('LabelsDim', LabelsDim), - ('SampledLogits', SampledLogits), - ('SampledLabels', SampledLabels), - ) - ) - - self.attrs = {'num_samples': self.S} - - def test_check_output(self): - places = self._get_places() - for p in places: - (Samples, Probabilities, SampledLogits, SampledLabels) = ( - np.array(o) for o in self.calc_output(p) - ) - - assert ( - Samples.dtype == np.int64 - ), f"Samples dtype is {Samples.dtype}, not int64" - assert ( - Probabilities.dtype == np.float64 - ), f"Probabilities dtype is {Probabilities.dtype}, not float64" - assert ( - SampledLogits.dtype == np.float64 - ), f"SampledLogits dtype is {SampledLogits.dtype}, not float64" - assert ( - SampledLabels.dtype == np.int64 - ), 
f"SampledLabels dtype is {SampledLabels.dtype}, not int64" - - assert Samples.shape == (self.bs, self.NT + self.S) - assert Probabilities.shape == (self.bs, self.NT + self.S) - assert SampledLogits.shape == (self.bs, self.NT + self.S) - assert SampledLabels.shape == (self.bs, self.NT) - - assert (SampledLabels == self.Labels).all() - sampled_logits = self.Logits[:, Samples[0][: self.NT]] - sampled_logits -= np.log(Probabilities[:, : self.NT]) - np.testing.assert_almost_equal( - sampled_logits, SampledLogits[:, : self.NT] - ) - - def test_check_grad(self): - self._check_grad_helper() - for p in self._get_places(): - grads = self._get_gradient(['Logits'], p, ['SampledLogits'], []) - np.testing.assert_almost_equal(grads[0].sum(), np.array([1.0])) - - -class TestSampleLogitsOpNoUniq(TestSampleLogitsOp): - def setUp(self): - super().setUp() - self.attrs = {'num_samples': self.S, 'uniq': False} - - -class TestSampleLogitsOpWithAccidentalHits(TestSampleLogitsOp): - def setUp(self): - super().setUp() - self.attrs = {'num_samples': self.S, 'remove_accidental_hits': False} - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_scale_op.py b/test/legacy_test/test_scale_op.py index a6cea49a2bce32..5f33de74b3b614 100644 --- a/test/legacy_test/test_scale_op.py +++ b/test/legacy_test/test_scale_op.py @@ -42,10 +42,10 @@ def init_dtype_type(self): pass def test_check_output(self): - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_new_ir=True) + self.check_grad(['X'], 'Out', check_pir=True) class TestScaleOpScaleVariable(OpTest): @@ -66,10 +66,10 @@ def init_dtype_type(self): pass def test_check_output(self): - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_new_ir=True) + self.check_grad(['X'], 'Out', check_pir=True) class TestScaleOpSelectedRows(unittest.TestCase): @@ -150,10 +150,10 @@ def init_dtype_type(self): self.dtype = np.float16 def test_check_output(self): - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) def test_check_grad(self): - self.check_grad(["X"], "Out", check_new_ir=True) + self.check_grad(["X"], "Out", check_pir=True) @unittest.skipIf( @@ -172,10 +172,10 @@ def setUp(self): self.outputs = {'Out': convert_float_to_uint16(out)} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', numeric_grad_delta=0.8, check_new_ir=True) + self.check_grad(['X'], 'Out', numeric_grad_delta=0.8, check_pir=True) @unittest.skipIf( diff --git a/test/legacy_test/test_shape_op.py b/test/legacy_test/test_shape_op.py index 6ced0cfd4a8c89..4ee95e9c4f3bde 100644 --- a/test/legacy_test/test_shape_op.py +++ b/test/legacy_test/test_shape_op.py @@ -36,7 +36,7 @@ def config(self): self.dtype = np.float32 def test_check_output(self): - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_pir=True) class case1(TestShapeOp): @@ -125,7 +125,7 @@ def config(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_cinn=True, check_new_ir=True) + self.check_output_with_place(place, check_cinn=True, check_pir=True) class case1Bf16(TestShapeOpBf16): diff --git 
a/test/legacy_test/test_sigmoid_focal_loss_op.py b/test/legacy_test/test_sigmoid_focal_loss_op.py deleted file mode 100644 index efe1922165fb48..00000000000000 --- a/test/legacy_test/test_sigmoid_focal_loss_op.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import math -import unittest - -import numpy as np -from op_test import OpTest - -import paddle - - -def sigmoid_focal_loss_forward( - x_data, label_data, fg_num_data, gamma, alpha, num_classes -): - x_data_t = copy.deepcopy(x_data) - out_data = copy.deepcopy(x_data) - x_width = len(x_data) - x_height = len(x_data[0, :]) - x_data_t = x_data_t.flatten() - out_data = out_data.flatten() - for idx in range(len(x_data_t)): - x = x_data_t[idx] - a = int(idx / num_classes) - d = int(idx % num_classes) - label = label_data[a] - c_pos = float(int(label) == int(d + 1)) - c_neg = float((int(label) != -1) & (int(label) != (d + 1))) - fg_num = max(fg_num_data, 1) - z_neg = (1.0 - alpha) / fg_num - z_pos = alpha / fg_num - - p = 1.0 / (1.0 + math.exp(-x)) - FLT_MIN = 1.175494351e-38 - term_pos = math.pow((1.0 - p), gamma) * math.log(max(FLT_MIN, p)) - term_neg = math.pow(p, gamma) * ( - -1.0 * x * (x >= 0) - - math.log(1.0 + math.exp(x - 2.0 * x * (x >= 0))) - ) - out_data[idx] = 0.0 - out_data[idx] += -c_pos * term_pos * z_pos - out_data[idx] += -c_neg * term_neg * z_neg - - out_data = out_data.reshape(x_width, x_height) - return out_data - - -class TestSigmoidFocalLossOp1(OpTest): - def set_argument(self): - self.num_anchors = 10 - self.num_classes = 10 - self.gamma = 2.0 - self.alpha = 0.25 - - def setUp(self): - self.set_argument() - - dims = (self.num_anchors, self.num_classes) - X = np.random.standard_normal(dims).astype("float64") - L = np.random.randint(0, self.num_classes + 1, (dims[0], 1)).astype( - "int32" - ) - F = np.zeros(1) - F[0] = len(np.where(L > 0)[0]) - F = F.astype("int32") - - self.op_type = "sigmoid_focal_loss" - self.inputs = { - 'X': X, - 'Label': L, - 'FgNum': F, - } - self.attrs = { - 'gamma': self.gamma, - 'alpha': self.alpha, - } - loss = sigmoid_focal_loss_forward( - self.inputs['X'], - self.inputs['Label'], - self.inputs['FgNum'], - self.gamma, - self.alpha, - self.num_classes, - ) - self.outputs = {'Out': loss.astype('float64')} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -@unittest.skipIf( - not paddle.is_compiled_with_cuda(), "core is not compiled with CUDA" -) -class TestSigmoidFocalLossOp2(TestSigmoidFocalLossOp1): - def test_check_output(self): - place = paddle.CUDAPlace(0) - self.check_output_with_place(place, atol=2e-3) - - def test_check_grad(self): - place = paddle.CUDAPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=0.002 - ) - - -class TestSigmoidFocalLossOp3(TestSigmoidFocalLossOp1): - def set_argument(self): - self.num_anchors = 200 - self.num_classes = 10 - self.gamma = 1.0 - self.alpha 
= 0.5 - - -@unittest.skipIf( - not paddle.is_compiled_with_cuda(), "core is not compiled with CUDA" -) -class TestSigmoidFocalLossOp4(TestSigmoidFocalLossOp3): - def test_check_output(self): - place = paddle.CUDAPlace(0) - self.check_output_with_place(place, atol=2e-3) - - def test_check_grad(self): - place = paddle.CUDAPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=0.002 - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_sign_op.py b/test/legacy_test/test_sign_op.py index 80dcc6909bfb76..404b52ef1d1fce 100644 --- a/test/legacy_test/test_sign_op.py +++ b/test/legacy_test/test_sign_op.py @@ -76,28 +76,12 @@ def test_check_grad(self): self.check_grad_with_place(self.place, ['X'], 'Out') -class TestSignOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - # The input type of sign_op must be Variable or numpy.ndarray. - input1 = 12 - self.assertRaises(TypeError, paddle.sign, input1) - # The input dtype of sign_op must be float16, float32, float64. - input2 = paddle.static.data( - name='input2', shape=[-1, 12, 10], dtype="int32" - ) - input3 = paddle.static.data( - name='input3', shape=[-1, 12, 10], dtype="int64" - ) - self.assertRaises(TypeError, paddle.sign, input2) - self.assertRaises(TypeError, paddle.sign, input3) - input4 = paddle.static.data( - name='input4', shape=[-1, 4], dtype="float16" - ) - paddle.sign(input4) - - class TestSignAPI(unittest.TestCase): + def setUp(self): + self.place = [base.CPUPlace()] + if core.is_compiled_with_cuda(): + self.place.append(base.CUDAPlace(0)) + def test_dygraph(self): with base.dygraph.guard(): np_x = np.array([-1.0, 0.0, -0.0, 1.2, 1.5], dtype='float64') @@ -108,23 +92,51 @@ def test_dygraph(self): self.assertEqual((np_z == z_expected).all(), True) def test_static(self): - with program_guard(Program(), Program()): - # The input type of sign_op must be Variable or numpy.ndarray. - input1 = 12 - self.assertRaises(TypeError, paddle.tensor.math.sign, input1) - # The input dtype of sign_op must be float16, float32, float64. - input2 = paddle.static.data( - name='input2', shape=[-1, 12, 10], dtype="int32" - ) - input3 = paddle.static.data( - name='input3', shape=[-1, 12, 10], dtype="int64" - ) - self.assertRaises(TypeError, paddle.tensor.math.sign, input2) - self.assertRaises(TypeError, paddle.tensor.math.sign, input3) - input4 = paddle.static.data( - name='input4', shape=[-1, 4], dtype="float16" - ) - paddle.sign(input4) + np_input2 = np.random.uniform(-10, 10, (12, 10)).astype("int16") + np_input3 = np.random.uniform(-10, 10, (12, 10)).astype("int32") + np_input4 = np.random.uniform(-10, 10, (12, 10)).astype("int64") + np_out2 = np.sign(np_input2) + np_out3 = np.sign(np_input3) + np_out4 = np.sign(np_input4) + + def run(place): + with program_guard(Program(), Program()): + # The input type of sign_op must be Variable or numpy.ndarray. + input1 = 12 + self.assertRaises(TypeError, paddle.tensor.math.sign, input1) + # The result of sign_op must be correct.
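The rewritten test_static above now checks paddle.sign's values against np.sign for int16/int32/int64 inputs instead of expecting a TypeError. A minimal dygraph sketch of the same comparison (assuming the integer-dtype support that this test exercises):

import numpy as np
import paddle

x_np = np.array([[-3, 0, 7]], dtype="int32")
x = paddle.to_tensor(x_np)
# Elementwise sign: -1 for negative, 0 for zero, +1 for positive values.
np.testing.assert_array_equal(paddle.sign(x).numpy(), np.sign(x_np))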
+ input2 = paddle.static.data( + name='input2', shape=[12, 10], dtype="int16" + ) + input3 = paddle.static.data( + name='input3', shape=[12, 10], dtype="int32" + ) + input4 = paddle.static.data( + name='input4', shape=[12, 10], dtype="int64" + ) + out2 = paddle.sign(input2) + out3 = paddle.sign(input3) + out4 = paddle.sign(input4) + exe = paddle.static.Executor(place) + res2, res3, res4 = exe.run( + paddle.static.default_main_program(), + feed={ + "input2": np_input2, + "input3": np_input3, + "input4": np_input4, + }, + fetch_list=[out2, out3, out4], + ) + self.assertEqual((res2 == np_out2).all(), True) + self.assertEqual((res3 == np_out3).all(), True) + self.assertEqual((res4 == np_out4).all(), True) + input5 = paddle.static.data( + name='input5', shape=[-1, 4], dtype="float16" + ) + paddle.sign(input5) + + for place in self.place: + run(place) class TestSignDoubleGradCheck(unittest.TestCase): diff --git a/test/legacy_test/test_slice_op.py b/test/legacy_test/test_slice_op.py index 065251b246928e..e409287c90b688 100644 --- a/test/legacy_test/test_slice_op.py +++ b/test/legacy_test/test_slice_op.py @@ -67,7 +67,7 @@ def config(self): self.out = self.input[1:3, 0:3, 2:4, :] def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): self.check_grad( @@ -75,7 +75,8 @@ def test_check_grad_normal(self): 'Out', max_relative_error=0.006, check_prim=True, - check_new_ir=True, + check_pir=True, + check_prim_pir=True, ) @@ -125,7 +126,7 @@ def config(self): self.out = self.input[1:2] def test_check_output(self): - self.check_output_with_place(paddle.CPUPlace(), check_new_ir=True) + self.check_output_with_place(paddle.CPUPlace(), check_pir=True) # 1.2 with attr(decrease) @@ -157,7 +158,7 @@ def config(self): self.out = self.input[1:2, 0:3, 2:4, :] def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): self.check_grad( @@ -165,11 +166,11 @@ def test_check_grad_normal(self): 'Out', max_relative_error=0.006, check_prim=True, - check_new_ir=True, + check_pir=True, + check_prim_pir=True, ) -# Situation 2: starts(list, have tensor), ends(list, no tensor) # without attr(decrease) class TestSliceOp_starts_ListTensor(OpTest): def setUp(self): @@ -203,11 +204,11 @@ def config(self): self.starts_infer = [-1, 0, -1] def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): self.check_grad( - ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ['Input'], 'Out', max_relative_error=0.006, check_pir=True ) @@ -248,11 +249,11 @@ def config(self): self.starts_infer = [1, -1, 2] def test_check_output(self): - self.check_output(check_dygraph=True, check_new_ir=True) + self.check_output(check_dygraph=True, check_pir=True) def test_check_grad_normal(self): self.check_grad( - ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ['Input'], 'Out', max_relative_error=0.006, check_pir=True ) @@ -301,11 +302,11 @@ def config(self): self.out = self.input[1, 0:3, 2:4, :] def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): self.check_grad( - ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ['Input'], 'Out', max_relative_error=0.006, check_pir=True ) @@ -339,11 +340,11 @@ def config(self): self.out = self.input[1:3, 0:3, 2:4, :] def test_check_output(self): - 
self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): self.check_grad( - ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ['Input'], 'Out', max_relative_error=0.006, check_pir=True ) @@ -378,11 +379,11 @@ def config(self): self.out = self.input[1, 0, 2:4, :] def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): self.check_grad( - ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ['Input'], 'Out', max_relative_error=0.006, check_pir=True ) @@ -424,11 +425,11 @@ def config(self): self.ends_infer = [-1, 3, 4] def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): self.check_grad( - ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ['Input'], 'Out', max_relative_error=0.006, check_pir=True ) @@ -468,10 +469,10 @@ def config(self): self.out = self.input[0:20, 1:3, 1:3] def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', check_new_ir=True) + self.check_grad(['Input'], 'Out', check_pir=True) # Test CUDA float16 @@ -507,7 +508,7 @@ def test_check_output(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): self.check_output_with_place( - place, check_prim=True, check_new_ir=True + place, check_prim=True, check_pir=True, check_prim_pir=True ) def test_check_grad_normal(self): @@ -519,7 +520,8 @@ def test_check_grad_normal(self): ['Input'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, + check_prim_pir=True, ) @@ -555,7 +557,7 @@ def test_check_output(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): self.check_output_with_place( - place, check_prim=True, check_new_ir=True + place, check_prim=True, check_pir=True, check_prim_pir=True ) def test_check_grad_normal(self): @@ -567,7 +569,8 @@ def test_check_grad_normal(self): 'Out', numeric_grad_delta=0.5, check_prim=True, - check_new_ir=True, + check_pir=True, + check_prim_pir=True, ) @@ -597,10 +600,16 @@ def config(self): self.infer_flags = [1, 1, 1] def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['Input'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) # Test python API @@ -648,7 +657,7 @@ def test_1(self): exe = base.Executor(place=base.CPUPlace()) res_1, res_2, res_3, res_4, res_5, res_6, res_7 = exe.run( - base.default_main_program(), + paddle.static.default_main_program(), feed={ "x": input, 'starts': np.array([-3, 0, 2]).astype("int32"), @@ -665,6 +674,65 @@ def test_1(self): np.testing.assert_array_equal(res_6, input[-3:3, 0:100, :, 2:-1]) np.testing.assert_array_equal(res_7, input[-1, 0:100, :, 2:-1]) + def test_pir(self): + with paddle.pir_utils.IrGuard(), paddle.static.program_guard( + paddle.static.Program() + ): + input = np.random.random([3, 4, 5, 6]).astype("float64") + minus_1 = paddle.tensor.fill_constant([], "int32", -1) + minus_3 = paddle.tensor.fill_constant([], "int64", -3) + starts = paddle.static.data(name='starts', shape=[3], dtype="int32") + ends = paddle.static.data(name='ends', shape=[3], dtype="int32") + x = paddle.static.data( + name="x", + shape=[3, 4, 5, 6], + dtype="float64", + ) + + 
# value_int64 is greater than 2147483647 which is the max of int32 + value_int64 = paddle.tensor.fill_constant([1], "int64", 2147483648) + + out_1 = paddle.slice( + x, + axes=[0, 1, 2], + starts=[-3, 0, 2], + ends=[value_int64, 100, -1], + ) + out_2 = paddle.slice( + x, axes=[0, 1, 3], starts=[minus_3, 0, 2], ends=[3, 100, -1] + ) + out_3 = paddle.slice( + x, + axes=[0, 1, 3], + starts=[minus_3, 0, 2], + ends=[3, 100, minus_1], + ) + out_4 = paddle.slice(x, axes=[0, 1, 2], starts=starts, ends=ends) + + out_5 = x[-3:3, 0:100, 2:-1] + out_6 = x[minus_3:3, 0:100, :, 2:-1] + # open it after supporting control flow + # out_7 = x[minus_1, 0:100, :, 2:minus_1] + + exe = base.Executor(place=base.CPUPlace()) + res_1, res_2, res_3, res_4, res_5, res_6 = exe.run( + paddle.static.default_main_program(), + feed={ + "x": input, + 'starts': np.array([-3, 0, 2]).astype("int32"), + 'ends': np.array([3, 100, -1]).astype("int32"), + }, + fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6], + ) + + np.testing.assert_array_equal(res_1, input[-3:3, 0:100, 2:-1, :]) + np.testing.assert_array_equal(res_2, input[-3:3, 0:100, :, 2:-1]) + np.testing.assert_array_equal(res_3, input[-3:3, 0:100, :, 2:-1]) + np.testing.assert_array_equal(res_4, input[-3:3, 0:100, 2:-1, :]) + np.testing.assert_array_equal(res_5, input[-3:3, 0:100, 2:-1, :]) + np.testing.assert_array_equal(res_6, input[-3:3, 0:100, :, 2:-1]) + # np.testing.assert_array_equal(res_7, input[-1, 0:100, :, 2:-1]) + class TestSliceApiWithTensor(unittest.TestCase): def test_starts_ends_is_tensor(self): @@ -745,7 +813,7 @@ def setUp(self): def set_program_and_run(self, main_program, case_num): with paddle_static_guard(): - with base.program_guard(main_program): + with paddle.static.program_guard(main_program): x = [ paddle.static.data( name='x0', shape=self.shape, dtype="float32" @@ -801,7 +869,7 @@ def set_program_and_run(self, main_program, case_num): ) def test_case_1(self): - main_program = base.Program() + main_program = paddle.static.Program() self.set_program_and_run(main_program, 1) self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR) @@ -813,7 +881,7 @@ def test_case_1(self): def test_case_2(self): with paddle_static_guard(): - main_program = base.Program() + main_program = paddle.static.Program() self.set_program_and_run(main_program, 2) self.assertTrue( @@ -829,7 +897,7 @@ def test_case_2(self): def test_case_3(self): with paddle_static_guard(): - main_program = base.Program() + main_program = paddle.static.Program() self.set_program_and_run(main_program, 3) self.assertTrue( @@ -884,6 +952,13 @@ def test(self): out0 = paddle.slice(x, axes=[1], starts=[0], ends=[3]) self.assertEqual(out0.shape, (3, -1, 5)) + def test_pir(self): + with paddle.pir_utils.IrGuard(): + x = paddle.static.data('x', shape=[3, -1, 5]) + + out0 = paddle.slice(x, axes=[1], starts=[0], ends=[3]) + self.assertEqual(out0.shape, [3, -1, 5]) + def test_axis_less_than_zero(self): # Using paddle.disable_static will make other unittests fail. with base.dygraph.guard(): diff --git a/test/legacy_test/test_smooth_l1_loss_op.py b/test/legacy_test/test_smooth_l1_loss_op.py deleted file mode 100644 index fb3fd40c0a8237..00000000000000 --- a/test/legacy_test/test_smooth_l1_loss_op.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -def smooth_l1_loss_forward(val, sigma2): - abs_val = abs(val) - if abs_val < 1.0 / sigma2: - return 0.5 * val * val * sigma2 - else: - return abs_val - 0.5 / sigma2 - - -class TestSmoothL1LossOp1(OpTest): - def setUp(self): - self.op_type = "smooth_l1_loss" - dims = (5, 20) - self.inputs = { - 'X': np.random.random(dims).astype("float32"), - 'Y': np.random.random(dims).astype("float32"), - } - sigma = 3.0 - self.attrs = {'sigma': sigma} - sigma2 = sigma * sigma - diff = self.inputs['X'] - self.inputs['Y'] - loss = np.vectorize(smooth_l1_loss_forward)(diff, sigma2).sum(1) - loss = loss.reshape((dims[0], 1)) - self.outputs = { - 'Diff': diff.astype('float32'), - 'Out': loss.astype('float32'), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.02) - - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], - 'Out', - max_relative_error=0.03, - no_grad_set=set("X"), - ) - - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], - 'Out', - max_relative_error=0.03, - no_grad_set=set('Y'), - ) - - -class TestSmoothL1LossOp2(OpTest): - def setUp(self): - self.op_type = "smooth_l1_loss" - dims = (5, 20) - self.inputs = { - 'X': np.random.random(dims).astype("float32"), - 'Y': np.random.random(dims).astype("float32"), - 'InsideWeight': np.random.random(dims).astype("float32"), - 'OutsideWeight': np.random.random(dims).astype("float32"), - } - sigma = 3.0 - self.attrs = {'sigma': sigma} - sigma2 = sigma * sigma - diff = self.inputs['X'] - self.inputs['Y'] - diff = diff * self.inputs['InsideWeight'] - loss = np.vectorize(smooth_l1_loss_forward)(diff, sigma2) - loss = loss * self.inputs['OutsideWeight'] - loss = loss.sum(1).reshape((dims[0], 1)) - self.outputs = { - 'Diff': diff.astype('float32'), - 'Out': loss.astype('float32'), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.03) - - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], - 'Out', - max_relative_error=0.03, - no_grad_set={'X', 'InsideWeight', 'OutsideWeight'}, - ) - - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], - 'Out', - max_relative_error=0.03, - no_grad_set={'Y', 'InsideWeight', 'OutsideWeight'}, - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py b/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py index 8b6e944e89a19b..0f5e2aee011737 100644 --- a/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py +++ b/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py @@ -20,6 +20,7 @@ import paddle from paddle import base, incubate from paddle.base import core +from paddle.pir_utils import test_with_pir_api paddle.enable_static() @@ -50,11 +51,11 @@ def setUp(self): self.outputs = {'Out': rst} def test_check_output(self): - self.check_output_with_place(core.CUDAPlace(0), check_new_ir=True) + 
self.check_output_with_place(core.CUDAPlace(0), check_pir=True) def test_check_grad(self): self.check_grad_with_place( - core.CUDAPlace(0), ["X"], "Out", check_new_ir=True + core.CUDAPlace(0), ["X"], "Out", check_pir=True ) @@ -72,14 +73,14 @@ def setUp(self): def test_check_output(self): try: - self.check_output_with_place(core.CPUPlace(), check_new_ir=True) + self.check_output_with_place(core.CPUPlace(), check_pir=True) except (NotImplementedError, RuntimeError): pass def test_check_grad(self): try: self.check_grad_with_place( - core.CPUPlace(), ["X"], "Out", check_new_ir=True + core.CPUPlace(), ["X"], "Out", check_pir=True ) except (NotImplementedError, RuntimeError): pass @@ -92,11 +93,14 @@ class TestDropoutBiasFuseOp2(unittest.TestCase): # test the python side API for softmax_mask_fuse op def setUp(self): np.random.seed(123) - self.dtypes = ['float16', 'float32'] + self.dtypes = ['float32', 'float16'] + @test_with_pir_api def test_static(self): for dtype in self.dtypes: - with base.program_guard(base.Program(), base.Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): input_x = paddle.static.data( name="x", shape=[1, 4, 32, 32], dtype=dtype ) @@ -107,7 +111,7 @@ def test_static(self): exe = base.Executor(base.CUDAPlace(0)) fetches = exe.run( - base.default_main_program(), + paddle.static.default_main_program(), feed={"x": x_in_np}, fetch_list=[rst], ) diff --git a/test/legacy_test/test_softmax_op.py b/test/legacy_test/test_softmax_op.py index e684daa695a23e..ae98b434766192 100644 --- a/test/legacy_test/test_softmax_op.py +++ b/test/legacy_test/test_softmax_op.py @@ -84,9 +84,17 @@ def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode if self.use_cudnn: place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-5, check_new_ir=True) + self.check_output_with_place( + place, + atol=1e-5, + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) else: - self.check_output(check_prim=True, check_new_ir=True) + self.check_output( + check_prim=True, check_pir=True, check_prim_pir=True + ) def test_check_grad(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode @@ -99,7 +107,8 @@ def test_check_grad(self): "Out", max_relative_error=0.01, check_dygraph=(not self.use_mkldnn), - check_new_ir=True, + check_pir=True, + check_prim_pir=True, ) else: self.check_grad( @@ -108,7 +117,8 @@ def test_check_grad(self): max_relative_error=0.01, check_dygraph=(not self.use_mkldnn), check_prim=True, - check_new_ir=True, + check_pir=True, + check_prim_pir=True, ) @@ -146,9 +156,13 @@ def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode if self.use_cudnn: place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-5, check_new_ir=True) + self.check_output_with_place( + place, atol=1e-5, check_pir=True, check_prim_pir=True + ) else: - self.check_output(check_prim=True, check_new_ir=True) + self.check_output( + check_prim=True, check_pir=True, check_prim_pir=True + ) @unittest.skipIf( @@ -158,6 +172,8 @@ class TestSoftmaxOp_ZeroDim2(TestSoftmaxOp): def setUp(self): self.op_type = "softmax" self.python_api = F.softmax + self.public_python_api = F.softmax + self.prim_op_type = "comp" self.use_cudnn = True self.use_mkldnn = False # explicilty use float32 for ROCm, as MIOpen does not yet support float64 @@ -180,9 +196,17 @@ def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode if self.use_cudnn: place = core.CUDAPlace(0) - 
self.check_output_with_place(place, atol=1e-5, check_new_ir=True) + self.check_output_with_place( + place, + check_prim=True, + atol=1e-5, + check_pir=True, + check_prim_pir=True, + ) else: - self.check_output(check_prim=True, check_new_ir=True) + self.check_output( + check_prim=True, check_pir=True, check_prim_pir=True + ) class TestSoftmaxOp2(TestSoftmaxOp): @@ -357,7 +381,11 @@ def test_check_output(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): self.check_output_with_place( - place, atol=1e-3, check_new_ir=True + place, + atol=1e-3, + check_prim=True, + check_pir=True, + check_prim_pir=True, ) # FIXME: If the x_shape is [10, 10], gradient failed. @@ -386,7 +414,11 @@ def test_check_output(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): self.check_output_with_place( - place, atol=1e-3, check_new_ir=True + place, + atol=1e-3, + check_prim=True, + check_pir=True, + check_prim_pir=True, ) @@ -437,7 +469,8 @@ def test_check_output(self): place, check_dygraph=(not self.use_mkldnn), check_prim=True, - check_new_ir=(not self.use_mkldnn), + check_pir=(not self.use_mkldnn), + check_prim_pir=(not self.use_mkldnn), ) def test_check_grad(self): @@ -449,7 +482,8 @@ def test_check_grad(self): numeric_grad_delta=0.05, check_dygraph=(not self.use_mkldnn), check_prim=True, - check_new_ir=(not self.use_mkldnn), + check_pir=(not self.use_mkldnn), + check_prim_pir=(not self.use_mkldnn), ) diff --git a/test/legacy_test/test_sort_op.py b/test/legacy_test/test_sort_op.py index bbae7e75c833b1..6559f966b46859 100644 --- a/test/legacy_test/test_sort_op.py +++ b/test/legacy_test/test_sort_op.py @@ -19,12 +19,14 @@ import paddle from paddle import base from paddle.base import core +from paddle.pir_utils import test_with_pir_api class TestSortOnCPU(unittest.TestCase): def setUp(self): self.place = core.CPUPlace() + @test_with_pir_api def test_api_0(self): with base.program_guard(base.Program()): input = paddle.static.data( @@ -43,6 +45,7 @@ def test_api_0(self): np_result = np.sort(result) self.assertEqual((result == np_result).all(), True) + @test_with_pir_api def test_api_1(self): with base.program_guard(base.Program()): input = paddle.static.data( @@ -93,3 +96,7 @@ def test_api_1(self): (np.sort(self.input_data, axis=-1) == out.numpy()).all(), True ) paddle.enable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_space_to_depth_op.py b/test/legacy_test/test_space_to_depth_op.py deleted file mode 100644 index c7cd6cae179dbf..00000000000000 --- a/test/legacy_test/test_space_to_depth_op.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -from op_test import OpTest - -from paddle import base - - -class TestSpaceToDepthOp(OpTest): - @staticmethod - def helper(in_, width, height, channel, batch, blocksize, forward, out_): - channel_out = channel // (blocksize * blocksize) - for b in range(batch): - for k in range(channel): - for j in range(height): - for i in range(width): - in_index = i + width * (j + height * (k + channel * b)) - channel2 = k % channel_out - offset = k // channel_out - width2 = i * blocksize + offset % blocksize - height2 = j * blocksize + offset // blocksize - out_index = width2 + width * blocksize * ( - height2 - + height * blocksize * (channel2 + channel_out * b) - ) - if forward: - out_[out_index] = in_[in_index] - else: - out_[in_index] = in_[out_index] - - def setUp(self): - self.init_data() - - self.op_type = "space_to_depth" - self.inputs = {"X": self.x} - self.helper( - self.x_1d, - self.x.shape[3], - self.x.shape[2], - self.x.shape[1], - self.x.shape[0], - self.blocksize, - self.forward, - self.out_1d, - ) - self.out = np.reshape(self.out_1d, self.infered_shape) - self.attrs = {"blocksize": self.blocksize} - self.outputs = {"Out": self.out} - - def init_data(self): - self.ori_shape = (32, 12, 6, 6) - self.infered_shape = (32, 48, 3, 3) - self.one_d_len = 32 * 48 * 3 * 3 - - self.blocksize = 2 - self.x = np.random.random(self.ori_shape).astype('float64') - self.x_1d = np.reshape(self.x, self.one_d_len) - self.out = np.zeros(self.infered_shape).astype('float64') - self.out_1d = np.reshape(self.out, self.one_d_len) - self.forward = 1 - - def test_check_output(self): - place = ( - base.core.CUDAPlace(0) - if base.core.is_compiled_with_cuda() - else base.core.CPUPlace() - ) - self.check_output_with_place( - place=place, atol=1e-5, no_check_set=None, equal_nan=False - ) - - def test_check_grad(self): - place = ( - base.core.CUDAPlace(0) - if base.core.is_compiled_with_cuda() - else base.core.CPUPlace() - ) - self.check_grad_with_place(place, ['X'], 'Out') - - -class TestSpaceToDepthOpBasic(TestSpaceToDepthOp): - def init_data(self): - self.ori_shape = (32, 8, 6, 6) - self.infered_shape = (32, 32, 3, 3) - self.one_d_len = 32 * 32 * 3 * 3 - - self.blocksize = 2 - self.x = np.random.random(self.ori_shape).astype('float64') - self.x_1d = np.reshape(self.x, self.one_d_len) - self.out = np.zeros(self.infered_shape).astype('float64') - self.out_1d = np.reshape(self.out, self.one_d_len) - self.forward = 1 - - -class TestSpaceToDepthOpDoubleBasic(TestSpaceToDepthOp): - def init_data(self): - self.ori_shape = (32, 8, 6, 6) - self.infered_shape = (32, 32, 3, 3) - self.one_d_len = 32 * 32 * 3 * 3 - - self.blocksize = 2 - self.x = np.random.random(self.ori_shape).astype('float64') - self.x_1d = np.reshape(self.x, self.one_d_len) - self.out = np.zeros(self.infered_shape).astype('float64') - self.out_1d = np.reshape(self.out, self.one_d_len) - self.forward = 1 - - -class TestSpaceToDepthOpWithStride3(TestSpaceToDepthOp): - def init_data(self): - self.ori_shape = (32, 9, 6, 6) - self.infered_shape = (32, 81, 2, 2) - self.one_d_len = 32 * 81 * 2 * 2 - - self.blocksize = 3 - self.x = np.random.random(self.ori_shape).astype('float64') - self.x_1d = np.reshape(self.x, self.one_d_len) - self.out = np.zeros(self.infered_shape).astype('float64') - self.out_1d = np.reshape(self.out, self.one_d_len) - self.forward = 1 - - -class TestSpaceToDepthOpWithNotSquare(TestSpaceToDepthOp): - def init_data(self): - self.ori_shape = (32, 9, 9, 6) - self.infered_shape = (32, 81, 3, 2) - 
self.one_d_len = 32 * 81 * 3 * 2 - - self.blocksize = 3 - self.x = np.random.random(self.ori_shape).astype('float64') - self.x_1d = np.reshape(self.x, self.one_d_len) - self.out = np.zeros(self.infered_shape).astype('float64') - self.out_1d = np.reshape(self.out, self.one_d_len) - self.forward = 1 - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_split_op.py b/test/legacy_test/test_split_op.py index 92dfe72f8443e3..a192078899dd7c 100644 --- a/test/legacy_test/test_split_op.py +++ b/test/legacy_test/test_split_op.py @@ -57,7 +57,7 @@ def _set_op_type(self): self.op_type = "split" def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): self.check_grad( @@ -65,7 +65,7 @@ def test_check_grad(self): ['out0', 'out1', 'out2'], check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -117,7 +117,7 @@ def _set_op_type(self): self.op_type = "split" def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): self.check_grad( @@ -125,7 +125,7 @@ def test_check_grad(self): ['out0', 'out1', 'out2'], check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -160,10 +160,10 @@ def _set_op_type(self): self.op_type = "split" def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): - self.check_grad(['X'], ['out0', 'out1', 'out2'], check_new_ir=True) + self.check_grad(['X'], ['out0', 'out1', 'out2'], check_pir=True) # attr(sections) is list containing Tensor @@ -208,10 +208,10 @@ def _set_op_type(self): self.op_type = "split" def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): - self.check_grad(['X'], ['out0', 'out1', 'out2'], check_new_ir=True) + self.check_grad(['X'], ['out0', 'out1', 'out2'], check_pir=True) class TestSplitOp_unk_section(OpTest): @@ -247,7 +247,7 @@ def _set_op_type(self): self.op_type = "split" def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): self.check_grad( @@ -255,7 +255,7 @@ def test_check_grad(self): ['out0', 'out1', 'out2'], check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -308,7 +308,7 @@ def test_check_grad(self): 'out2', check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) cls_name = "{}_{}".format(parent.__name__, "BF16Op") diff --git a/test/legacy_test/test_squared_l2_distance_op.py b/test/legacy_test/test_squared_l2_distance_op.py deleted file mode 100644 index 579681ab0c0980..00000000000000 --- a/test/legacy_test/test_squared_l2_distance_op.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
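# Editor's note: the flat-index `helper` in test_space_to_depth_op.py (deleted
# above) is easier to follow as a reshape/transpose. Below is a common NumPy
# formulation of space_to_depth for NCHW input; the deleted helper's exact
# output-channel ordering may differ from this variant, so treat it as a
# sketch of the data movement, not a byte-for-byte replacement. For the 2-D
# cases of the squared_l2_distance reference deleted just below, the whole
# computation is ((x - y) ** 2).sum(axis=1, keepdims=True).

import numpy as np


def space_to_depth(x, blocksize):
    # (N, C, H, W) -> (N, C * b * b, H // b, W // b)
    n, c, h, w = x.shape
    b = blocksize
    y = x.reshape(n, c, h // b, b, w // b, b)
    y = y.transpose(0, 3, 5, 1, 2, 4)  # move the b x b block offsets ahead of C
    return y.reshape(n, c * b * b, h // b, w // b)


# shapes match the deleted test's (32, 12, 6, 6) -> (32, 48, 3, 3) case
assert space_to_depth(np.zeros((32, 12, 6, 6)), 2).shape == (32, 48, 3, 3)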
- -import unittest - -import numpy as np -from op_test import OpTest - - -class TestSquaredL2DistanceOp_f0(OpTest): - def setUp(self): - self.op_type = "squared_l2_distance" - self.inputs = { - 'X': np.random.uniform(0.1, 0.6, (5, 20)).astype("float32"), - 'Y': np.random.uniform(0.1, 0.6, (5, 20)).astype("float32"), - } - sub_res = self.inputs['X'] - self.inputs['Y'] - output = sub_res * sub_res - self.outputs = { - 'sub_result': sub_res, - 'Out': np.expand_dims(output.sum(1), 1), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out') - - -class TestSquaredL2DistanceOp_f1(OpTest): - def setUp(self): - self.op_type = "squared_l2_distance" - self.inputs = { - 'X': np.random.uniform(0.1, 0.6, (2, 3)).astype("float32"), - 'Y': np.random.uniform(0.1, 0.6, (1, 3)).astype("float32"), - } - sub_res = self.inputs['X'] - self.inputs['Y'] - output = sub_res * sub_res - self.outputs = { - 'sub_result': sub_res, - 'Out': np.expand_dims(output.sum(1), 1), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out') - - -class TestSquaredL2DistanceOp_f2(OpTest): - def setUp(self): - self.op_type = "squared_l2_distance" - self.inputs = { - 'X': np.random.uniform(0.1, 0.6, (2, 3, 4)).astype("float32"), - 'Y': np.random.uniform(0.1, 0.6, (1, 3, 4)).astype("float32"), - } - sub_res = self.inputs['X'] - self.inputs['Y'] - sub_res = sub_res.reshape((2, 3 * 4)) - output = sub_res * sub_res - self.outputs = { - 'sub_result': sub_res, - 'Out': np.expand_dims(output.sum(1), 1), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out') - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_squeeze2_op.py b/test/legacy_test/test_squeeze2_op.py index 1ee72ad2a39e0b..f7470e1b0ef016 100755 --- a/test/legacy_test/test_squeeze2_op.py +++ b/test/legacy_test/test_squeeze2_op.py @@ -56,11 +56,20 @@ def if_enable_cinn(self): def test_check_output(self): self.check_output( - no_check_set=['XShape'], check_prim=True, check_new_ir=True + no_check_set=['XShape'], + check_prim=True, + check_pir=True, + check_prim_pir=True, ) def test_check_grad(self): - self.check_grad(["X"], "Out", check_prim=True, check_new_ir=True) + self.check_grad( + ["X"], + "Out", + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) def init_dtype(self): self.dtype = np.float64 @@ -282,6 +291,16 @@ def test_axes_type(): self.assertRaises(TypeError, test_axes_type) + def test_pir_error(self): + def test_axes_type(): + with paddle.pir_utils.IrGuard(): + x2 = paddle.static.data( + name="x2", shape=[2, 1, 25], dtype="int32" + ) + self.squeeze(x2, axis=2.1) + + self.assertRaises(ValueError, test_axes_type) + class TestSqueezeInplaceAPI(TestSqueezeAPI): def executed_api(self): diff --git a/test/legacy_test/test_squeeze_op.py b/test/legacy_test/test_squeeze_op.py deleted file mode 100755 index 294a86db6dd040..00000000000000 --- a/test/legacy_test/test_squeeze_op.py +++ /dev/null @@ -1,307 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import gradient_checker -import numpy as np -from decorator_helper import prog_scope -from op_test import OpTest, convert_float_to_uint16 - -import paddle -from paddle import base -from paddle.base import Program, core, program_guard - -paddle.enable_static() - - -# Correct: General. -class TestSqueezeOp(OpTest): - def setUp(self): - self.op_type = "squeeze" - self.init_test_case() - self.inputs = {"X": np.random.random(self.ori_shape).astype("float64")} - self.init_attrs() - self.outputs = { - "Out": self.inputs["X"].reshape(self.new_shape), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - def init_test_case(self): - self.ori_shape = (1, 3, 1, 40) - self.axes = (0, 2) - self.new_shape = (3, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - -class TestSqueezeFP16Op(OpTest): - def setUp(self): - self.op_type = "squeeze" - self.init_test_case() - self.inputs = {"X": np.random.random(self.ori_shape).astype("float16")} - self.init_attrs() - self.outputs = { - "Out": self.inputs["X"].reshape(self.new_shape), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - def init_test_case(self): - self.ori_shape = (1, 3, 1, 40) - self.axes = (0, 2) - self.new_shape = (3, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - -class TestSqueezeBF16Op(OpTest): - def setUp(self): - self.op_type = "squeeze" - self.dtype = np.uint16 - self.init_test_case() - x = np.random.random(self.ori_shape).astype("float32") - out = x.reshape(self.new_shape) - self.inputs = {"X": convert_float_to_uint16(x)} - self.init_attrs() - self.outputs = {"Out": convert_float_to_uint16(out)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - def init_test_case(self): - self.ori_shape = (1, 3, 1, 40) - self.axes = (0, 2) - self.new_shape = (3, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - -# Correct: There is a minus axis. -class TestSqueezeOp1(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (1, 3, 1, 40) - self.axes = (0, -2) - self.new_shape = (3, 40) - - -# Correct: No axes input. -class TestSqueezeOp2(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (1, 20, 1, 5) - self.axes = () - self.new_shape = (20, 5) - - -# Correct: Just part of the axes are squeezed. -class TestSqueezeOp3(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (6, 1, 5, 1, 4, 1) - self.axes = (1, -1) - self.new_shape = (6, 5, 1, 4) - - -# Correct: The dimension of an axis not of size 1 remains unchanged. -class TestSqueezeOp4(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (6, 1, 5, 1, 4, 1) - self.axes = (1, 2) - self.new_shape = (6, 5, 1, 4, 1) - - -class TestSqueezeOpError(unittest.TestCase): - def test_errors(self): - paddle.enable_static() - with program_guard(Program(), Program()): - # The input type of squeeze_op must be Variable.
- x1 = base.create_lod_tensor( - np.array([[-1]]), [[1]], paddle.CPUPlace() - ) - self.assertRaises(TypeError, paddle.squeeze, x1) - # The input axes of squeeze must be list. - x2 = paddle.static.data(name='x2', shape=[4], dtype="int32") - self.assertRaises(TypeError, paddle.squeeze, x2, axes=0) - # The input dtype of squeeze does not support float16. - x3 = paddle.static.data(name='x3', shape=[4], dtype="float16") - self.assertRaises(TypeError, paddle.squeeze, x3, axes=0) - - -class API_TestSqueeze(unittest.TestCase): - def setUp(self): - self.executed_api() - - def executed_api(self): - self.squeeze = paddle.squeeze - - def test_out(self): - paddle.enable_static() - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - data1 = paddle.static.data( - 'data1', shape=[-1, 1, 10], dtype='float64' - ) - result_squeeze = self.squeeze(data1, axis=[1]) - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - input1 = np.random.random([5, 1, 10]).astype('float64') - (result,) = exe.run( - feed={"data1": input1}, fetch_list=[result_squeeze] - ) - expected_result = np.squeeze(input1, axis=1) - np.testing.assert_allclose(expected_result, result, rtol=1e-05) - - -class API_TestStaticSqueeze_(API_TestSqueeze): - def executed_api(self): - self.squeeze = paddle.squeeze_ - - -class API_TestDygraphSqueeze(unittest.TestCase): - def setUp(self): - self.executed_api() - - def executed_api(self): - self.squeeze = paddle.squeeze - - def test_out(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int32") - input = paddle.to_tensor(input_1) - output = self.squeeze(input, axis=[1]) - out_np = output.numpy() - expected_out = np.squeeze(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_out_int8(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int8") - input = paddle.to_tensor(input_1) - output = self.squeeze(input, axis=[1]) - out_np = output.numpy() - expected_out = np.squeeze(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_out_uint8(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("uint8") - input = paddle.to_tensor(input_1) - output = self.squeeze(input, axis=[1]) - out_np = output.numpy() - expected_out = np.squeeze(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_axis_not_list(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int32") - input = paddle.to_tensor(input_1) - output = self.squeeze(input, axis=1) - out_np = output.numpy() - expected_out = np.squeeze(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_dimension_not_1(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int32") - input = paddle.to_tensor(input_1) - output = self.squeeze(input, axis=(1, 0)) - out_np = output.numpy() - expected_out = np.squeeze(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - -class API_TestDygraphSqueezeInplace(API_TestDygraphSqueeze): - def executed_api(self): - self.squeeze = paddle.squeeze_ - - -class TestSqueezeDoubleGradCheck(unittest.TestCase): - def squeeze_wrapper(self, x): - return paddle.squeeze(x[0]) - - @prog_scope() - def func(self, place): - # the shape of input variable should be clearly specified, not include -1.
- eps = 0.005 - dtype = np.float32 - - data = paddle.static.data('data', [2, 3], dtype) - data.persistable = True - out = paddle.squeeze(data) - data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) - - gradient_checker.double_grad_check( - [data], out, x_init=[data_arr], place=place, eps=eps - ) - gradient_checker.double_grad_check_for_dygraph( - self.squeeze_wrapper, [data], out, x_init=[data_arr], place=place - ) - - def test_grad(self): - paddle.enable_static() - places = [base.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(base.CUDAPlace(0)) - for p in places: - self.func(p) - - -class TestSqueezeTripleGradCheck(unittest.TestCase): - def squeeze_wrapper(self, x): - return paddle.squeeze(x[0]) - - @prog_scope() - def func(self, place): - # the shape of input variable should be clearly specified, not inlcude -1. - eps = 0.005 - dtype = np.float32 - - data = paddle.static.data('data', [2, 3], dtype) - data.persistable = True - out = paddle.squeeze(data) - data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) - - gradient_checker.triple_grad_check( - [data], out, x_init=[data_arr], place=place, eps=eps - ) - gradient_checker.triple_grad_check_for_dygraph( - self.squeeze_wrapper, [data], out, x_init=[data_arr], place=place - ) - - def test_grad(self): - paddle.enable_static() - places = [base.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(base.CUDAPlace(0)) - for p in places: - self.func(p) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_stack_op.py b/test/legacy_test/test_stack_op.py index 44abff4dafeb58..472777b9cfd727 100644 --- a/test/legacy_test/test_stack_op.py +++ b/test/legacy_test/test_stack_op.py @@ -19,7 +19,7 @@ import paddle from paddle import base -from paddle.base.framework import Program, program_guard +from paddle.pir_utils import test_with_pir_api paddle.enable_static() @@ -63,11 +63,15 @@ def setUp(self): self.attrs = {'axis': self.axis} def test_check_output(self): - self.check_output(check_prim=True, check_new_ir=True) + self.check_output(check_prim=True, check_pir=True, check_prim_pir=True) def test_check_grad(self): self.check_grad( - self.get_x_names(), 'Y', check_prim=True, check_new_ir=True + self.get_x_names(), + 'Y', + check_prim=True, + check_pir=True, + check_prim_pir=True, ) @@ -189,11 +193,15 @@ def setUp(self): self.attrs = {'axis': self.axis} def test_check_output(self): - self.check_output(check_prim=True, check_new_ir=True) + self.check_output(check_prim=True, check_pir=True, check_prim_pir=True) def test_check_grad(self): self.check_grad( - self.get_x_names(), 'Y', check_prim=True, check_new_ir=True + self.get_x_names(), + 'Y', + check_prim=True, + check_pir=True, + check_prim_pir=True, ) @@ -212,11 +220,10 @@ def setUp(self): if base.is_compiled_with_cuda() else base.CPUPlace() ) - self.set_program() - def set_program(self): - self.program = base.Program() - with base.program_guard(self.program): + def test_case(self): + self.program = paddle.static.Program() + with paddle.static.program_guard(self.program): input = paddle.assign(self.x) tensor_array = paddle.tensor.create_array(dtype='float32') zero = paddle.tensor.fill_constant( @@ -227,8 +234,6 @@ def set_program(self): paddle.tensor.array_write(input, zero + i, tensor_array) self.out_var = paddle.stack(tensor_array, axis=self.axis) - - def test_case(self): self.assertTrue(self.out_var.shape[self.axis] == -1) exe = base.Executor(self.place) res = exe.run(self.program, fetch_list=self.out_var) @@ -252,11 +257,10 @@ 
def setUp(self): if base.is_compiled_with_cuda() else base.CPUPlace() ) - self.set_program() - def set_program(self): - self.program = base.Program() - with base.program_guard(self.program): + def test_case(self): + self.program = paddle.static.Program() + with paddle.static.program_guard(self.program): input = paddle.assign(self.x) tensor_array = paddle.tensor.create_array(dtype='float32') zero = paddle.tensor.fill_constant( @@ -267,8 +271,6 @@ def set_program(self): paddle.tensor.array_write(input, zero + i, tensor_array) self.out_var = paddle.stack(tensor_array, axis=self.axis) - - def test_case(self): self.assertTrue(self.out_var.shape[self.axis] == -1) exe = base.Executor(self.place) res = exe.run(self.program, fetch_list=self.out_var) @@ -278,8 +280,11 @@ def test_case(self): class API_test(unittest.TestCase): + @test_with_pir_api def test_out(self): - with base.program_guard(base.Program(), base.Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): data1 = paddle.static.data('data1', shape=[1, 2], dtype='float64') data2 = paddle.static.data('data2', shape=[1, 2], dtype='float64') data3 = paddle.static.data('data3', shape=[1, 2], dtype='float64') @@ -301,6 +306,11 @@ def test_single_tensor_error(self): x = paddle.rand([2, 3]) self.assertRaises(TypeError, paddle.stack, x) + def test_pir_single_tensor_error(self): + with paddle.pir_utils.IrGuard(): + x = paddle.rand([2, 3]) + self.assertRaises(ValueError, paddle.stack, x) + class API_DygraphTest(unittest.TestCase): def test_out(self): @@ -330,9 +340,10 @@ def test_single_tensor_error(self): class TestStackOpWithNegativeShape(unittest.TestCase): + @test_with_pir_api def test_out(self): - main_prg, startup_prg = Program(), Program() - with program_guard(main_prg, startup_prg): + main_prg, startup_prg = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(main_prg, startup_prg): b = paddle.static.data(name='b', shape=[-1], dtype='int64') e = paddle.static.data(name='e', shape=[3], dtype='int64') k = paddle.stack([b, e], axis=0) diff --git a/test/legacy_test/test_stride.py b/test/legacy_test/test_stride.py index a80451e36fdc41..ffeeade304ce5c 100644 --- a/test/legacy_test/test_stride.py +++ b/test/legacy_test/test_stride.py @@ -640,7 +640,7 @@ def test_stride_gpu(self): class TestToStaticCheck(unittest.TestCase): def test_error(self): - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def func(): x_np = np.random.random(size=[2, 3, 4]).astype('float32') x = paddle.to_tensor(x_np) @@ -650,7 +650,7 @@ def func(): self.assertRaises(ValueError, func) def test_no_error(self): - @paddle.jit.to_static + @paddle.jit.to_static(full_graph=True) def func(): x_np = np.random.random(size=[2, 3, 4]).astype('float32') x = paddle.to_tensor(x_np) diff --git a/test/legacy_test/test_sum_op.py b/test/legacy_test/test_sum_op.py index 910d8a75e5f9f1..d8536bc7719553 100644 --- a/test/legacy_test/test_sum_op.py +++ b/test/legacy_test/test_sum_op.py @@ -61,7 +61,7 @@ def test_check_output(self): self.check_output( check_prim=True, check_cinn=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) @@ -71,7 +71,7 @@ def test_check_grad(self): 'Out', check_prim=True, check_cinn=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) @@ -310,7 +310,7 @@ def test_check_output(self): check_cinn=True, check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) # FIXME: Because of the precision fp16, max_relative_error @@ -324,7 +324,7 
@@ def test_check_grad(self): check_cinn=True, check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -377,7 +377,7 @@ def test_check_output(self): check_dygraph=False, check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) def test_check_grad(self): @@ -388,7 +388,7 @@ def test_check_grad(self): check_dygraph=False, check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) diff --git a/test/legacy_test/test_target_assign_op.py b/test/legacy_test/test_target_assign_op.py deleted file mode 100644 index 98369d62247df3..00000000000000 --- a/test/legacy_test/test_target_assign_op.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import unittest - -import numpy as np -from op_test import OpTest - - -def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod): - if len(gt_lod) != len(neg_lod): - raise AssertionError("The input arguments are illegal.") - - batch_size = len(gt_lod) - - match_indices = -1 * np.ones((batch_size, num_prior)).astype('int32') - neg_indices = np.zeros((sum(neg_lod), 1)).astype('int32') - - offset = 0 - for n in range(batch_size): - gt_num = gt_lod[n] - ids = random.sample(list(range(num_prior)), gt_num) - match_indices[n, ids] = list(range(gt_num)) - - ret_ids = set(range(num_prior)) - set(ids) - l = neg_lod[n] - neg_ids = random.sample(ret_ids, l) - neg_indices[offset : offset + neg_lod[n], :] = ( - np.array(neg_ids).astype('int32').reshape(l, 1) - ) - offset += neg_lod[n] - - return match_indices, neg_indices - - -def target_assign( - encoded_box, - gt_label, - match_indices, - neg_indices, - gt_lod, - neg_lod, - mismatch_value, -): - batch_size, num_prior = match_indices.shape - - # init target bbox - trg_box = np.zeros((batch_size, num_prior, 4)).astype('float32') - # init weight for target bbox - trg_box_wt = np.zeros((batch_size, num_prior, 1)).astype('float32') - # init target label - trg_label = np.ones((batch_size, num_prior, 1)).astype('int32') - trg_label = trg_label * mismatch_value - # init weight for target label - trg_label_wt = np.zeros((batch_size, num_prior, 1)).astype('float32') - - gt_offset = 0 - neg_offset = 0 - for i in range(batch_size): - cur_indices = match_indices[i] - col_ids = np.where(cur_indices > -1) - col_val = cur_indices[col_ids] - - # target bbox - for v, c in zip(col_val + gt_offset, col_ids[0].tolist()): - trg_box[i][c][:] = encoded_box[v][c][:] - # weight for target bbox - trg_box_wt[i][col_ids] = 1.0 - - trg_label[i][col_ids] = gt_label[col_val + gt_offset] - trg_label_wt[i][col_ids] = 1.0 - # set target label weight to 1.0 for the negative samples - if neg_indices is not None: - neg_ids = neg_indices[neg_offset : neg_offset + neg_lod[i]] - trg_label_wt[i][neg_ids] = 1.0 - # update offset - gt_offset += gt_lod[i] - neg_offset += neg_lod[i] - - return trg_box, trg_box_wt, trg_label, trg_label_wt - - -class TestTargetAssginFloatType(OpTest): - def 
setUp(self): - self.op_type = "target_assign" - num_prior = 120 - num_class = 21 - gt_lod = [5, 6, 12] - neg_lod = [4, 3, 6] - mismatch_value = 0 - batch_size = len(gt_lod) - num_gt = sum(gt_lod) - - encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32') - gt_label = np.random.randint(num_class, size=(num_gt, 1)).astype( - 'int32' - ) - - match_indices, neg_indices = gen_match_and_neg_indices( - num_prior, gt_lod, neg_lod - ) - - out, out_wt, _, _ = target_assign( - encoded_box, - gt_label, - match_indices, - neg_indices, - gt_lod, - neg_lod, - mismatch_value, - ) - - # assign regression targets - x = encoded_box - self.inputs = { - 'X': (x, [gt_lod]), - 'MatchIndices': match_indices, - } - self.attrs = {'mismatch_value': mismatch_value} - self.outputs = { - 'Out': out, - 'OutWeight': out_wt, - } - - def test_check_output(self): - # NOTE(yjjiang11): This op will be deprecated. - self.check_output(check_dygraph=False) - - -class TestTargetAssginIntType(OpTest): - def setUp(self): - self.op_type = "target_assign" - num_prior = 120 - num_class = 21 - gt_lod = [5, 6, 12] - neg_lod = [4, 3, 6] - mismatch_value = 0 - batch_size = len(gt_lod) - num_gt = sum(gt_lod) - - encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32') - gt_label = np.random.randint(num_class, size=(num_gt, 1)).astype( - 'int32' - ) - - match_indices, neg_indices = gen_match_and_neg_indices( - num_prior, gt_lod, neg_lod - ) - - ( - _, - _, - out, - out_wt, - ) = target_assign( - encoded_box, - gt_label, - match_indices, - neg_indices, - gt_lod, - neg_lod, - mismatch_value, - ) - - # assign classification targets - x = np.reshape(gt_label, (num_gt, 1, 1)) - self.inputs = { - 'X': (x, [gt_lod]), - 'MatchIndices': match_indices, - 'NegIndices': (neg_indices, [neg_lod]), - } - self.attrs = {'mismatch_value': mismatch_value} - self.outputs = { - 'Out': out, - 'OutWeight': out_wt, - } - - def test_check_output(self): - self.check_output(check_dygraph=False) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_tile_op.py b/test/legacy_test/test_tile_op.py index 40dc04b0537707..4a7d94637c6fa5 100644 --- a/test/legacy_test/test_tile_op.py +++ b/test/legacy_test/test_tile_op.py @@ -21,7 +21,8 @@ import paddle from paddle import base -from paddle.base import Program, core, program_guard +from paddle.base import core +from paddle.pir_utils import test_with_pir_api # Situation 1: repeat_times is a list (without tensor) @@ -47,10 +48,16 @@ def init_data(self): self.repeat_times = [2] def test_check_output(self): - self.check_output(check_cinn=self.check_cinn, check_new_ir=True) + self.check_output(check_cinn=self.check_cinn, check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) class TestTileOpRank_ZeroDim1(TestTileOpRank1): @@ -165,7 +172,7 @@ def init_data(self): self.infer_repeat_times = [-1] def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): self.check_grad(['X'], 'Out') @@ -206,7 +213,7 @@ def init_data(self): self.repeat_times = [2] def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad(self): self.check_grad(['X'], 'Out') @@ -235,7 +242,7 @@ def if_enable_cinn(self): self.check_cinn = True def test_check_output(self): - self.check_output(check_cinn=self.check_cinn,
check_new_ir=True) + self.check_output(check_cinn=self.check_cinn, check_pir=True) class TestTileFP16OP(OpTest): @@ -262,10 +269,16 @@ def init_data(self): self.repeat_times = [2, 1, 4] def test_check_output(self): - self.check_output(check_cinn=self.check_cinn, check_new_ir=True) + self.check_output(check_cinn=self.check_cinn, check_pir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) @unittest.skipIf( @@ -294,7 +307,7 @@ def if_enable_cinn(self): def test_check_output(self): place = core.CUDAPlace(0) self.check_output_with_place( - place, check_cinn=self.check_cinn, check_new_ir=True + place, check_cinn=self.check_cinn, check_pir=True ) def init_data(self): @@ -305,7 +318,12 @@ def init_data(self): def test_check_grad(self): place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['X'], 'Out', check_prim=True, check_new_ir=True + place, + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, ) @@ -324,7 +342,7 @@ def if_enable_cinn(self): self.check_cinn = True def test_check_output(self): - self.check_output(check_cinn=self.check_cinn, check_new_ir=True) + self.check_output(check_cinn=self.check_cinn, check_pir=True) # Situation 56: input x is Integer @@ -344,12 +362,15 @@ def if_enable_cinn(self): self.check_cinn = True def test_check_output(self): - self.check_output(check_cinn=self.check_cinn, check_new_ir=True) + self.check_output(check_cinn=self.check_cinn, check_pir=True) class TestTileError(unittest.TestCase): + @test_with_pir_api def test_errors(self): - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x1 = base.create_lod_tensor( np.array([[-1]]), [[1]], base.CPUPlace() ) @@ -363,8 +384,11 @@ def test_errors(self): class TestTileAPIStatic(unittest.TestCase): + @test_with_pir_api def test_api(self): - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): repeat_times = [2, 2] x1 = paddle.static.data(name='x1', shape=[-1, 4], dtype="int32") out = paddle.tile(x1, repeat_times) @@ -490,6 +514,7 @@ def test_dygraph(self): class Testfp16TileOp(unittest.TestCase): + @test_with_pir_api def testfp16(self): input_x = (np.random.random([1, 2, 3])).astype('float16') with paddle.static.program_guard(paddle.static.Program()): diff --git a/test/legacy_test/test_transpose_op.py b/test/legacy_test/test_transpose_op.py index 52f85ef1e0a708..32f071eafb472b 100644 --- a/test/legacy_test/test_transpose_op.py +++ b/test/legacy_test/test_transpose_op.py @@ -49,14 +49,14 @@ def init_op_type(self): self.use_mkldnn = False def test_check_output(self): - self.check_output(no_check_set=['XShape'], check_new_ir=True) + self.check_output(no_check_set=['XShape'], check_pir=True) def test_check_grad(self): self.check_grad( ['X'], 'Out', check_prim=True, - check_new_ir=True, + check_pir=True, check_prim_pir=True, ) @@ -211,7 +211,7 @@ def init_op_type(self): self.use_mkldnn = False def test_check_output(self): - self.check_output(no_check_set=['XShape'], check_new_ir=True) + self.check_output(no_check_set=['XShape'], check_pir=True) base.core.disable_autotune() def test_check_grad(self): @@ -220,7 +220,7 @@ def test_check_grad(self): 'Out', check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -254,7 +254,7 @@ def init_op_type(self): 
self.use_mkldnn = False def test_check_output(self): - self.check_output(no_check_set=['XShape'], check_new_ir=True) + self.check_output(no_check_set=['XShape'], check_pir=True) base.core.disable_autotune() def test_check_grad(self): @@ -263,7 +263,7 @@ def test_check_grad(self): 'Out', check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -304,7 +304,7 @@ def init_op_type(self): self.use_mkldnn = False def test_check_output(self): - self.check_output(no_check_set=['XShape'], check_new_ir=True) + self.check_output(no_check_set=['XShape'], check_pir=True) base.core.disable_autotune() def test_check_grad(self): @@ -313,7 +313,7 @@ def test_check_grad(self): 'Out', check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) @@ -346,7 +346,7 @@ def init_op_type(self): self.use_mkldnn = False def test_check_output(self): - self.check_output(no_check_set=['XShape'], check_new_ir=True) + self.check_output(no_check_set=['XShape'], check_pir=True) def test_check_grad(self): self.check_grad( @@ -354,7 +354,7 @@ def test_check_grad(self): 'Out', check_prim=True, check_prim_pir=True, - check_new_ir=True, + check_pir=True, ) def initTestCase(self): @@ -394,7 +394,7 @@ def init_op_type(self): self.use_mkldnn = False def test_check_output(self): - self.check_output(no_check_set=['XShape'], check_new_ir=True) + self.check_output(no_check_set=['XShape'], check_pir=True) def test_check_grad(self): pass diff --git a/test/legacy_test/test_tree_conv_op.py b/test/legacy_test/test_tree_conv_op.py deleted file mode 100644 index e05ee1a4d4cdf0..00000000000000 --- a/test/legacy_test/test_tree_conv_op.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
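# Editor's note: the reference implementation in test_tree_conv_op.py (deleted
# below) mixes every node in a patch with three position weights, the
# continuous-binary-tree coefficients of tree-based convolution. The formula,
# extracted from the deleted get_output_naive for a patch entry
# (node, idx, num_siblings, depth, max_depth), is:

def tree_conv_eta(idx, num_siblings, depth, max_depth):
    # the top weight decays with depth; the remainder is split left/right
    # according to the node's position among its siblings
    eta_t = float(max_depth - depth) / float(max_depth)
    eta_l = (1.0 - eta_t) * (
        0.5 if num_siblings == 1 else float(idx - 1.0) / float(num_siblings - 1.0)
    )
    eta_r = (1.0 - eta_t) * (1.0 - eta_l)
    return eta_l, eta_r, eta_t  # same order as the deleted code's eta vector


# the three weights always form a convex combination
assert abs(sum(tree_conv_eta(1, 4, 1, 2)) - 1.0) < 1e-12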
- -import unittest - -import numpy as np -from op_test import OpTest - - -def collect_node_patch(og, max_depth): - """ - The naive method to construct patches - :param og: original graph - :param max_depth: the depth of convolution filters - :return: convolution patches - """ - - def gen(node, max_depth): - collected = [(node, 1, 1, 0, max_depth)] - - def recurse_helper(node, depth): - if depth > max_depth: - return - l = len(og[node]) - for idx, c in enumerate(og[node], 1): - if depth + 1 < max_depth: - collected.append((c, idx, l, depth + 1, max_depth)) - recurse_helper(c, depth + 1) - - recurse_helper(node, 0) - return collected - - res = [] - for u in range(1, len(og)): - lis = gen(u, max_depth) - if len(lis) > 0: - res.append(lis) - return res - - -class TestTreeConvOp(OpTest): - def setUp(self): - self.n = 17 - self.fea_size = 3 - self.output_size = 1 - self.max_depth = 2 - self.batch_size = 2 - self.num_filters = 1 - adj_array = [ - 1, - 2, - 1, - 3, - 1, - 4, - 1, - 5, - 2, - 6, - 2, - 7, - 2, - 8, - 4, - 9, - 4, - 10, - 5, - 11, - 6, - 12, - 6, - 13, - 9, - 14, - 9, - 15, - 9, - 16, - 9, - 17, - ] - adj = np.array(adj_array).reshape((1, self.n - 1, 2)).astype('int32') - adj = np.tile(adj, (self.batch_size, 1, 1)) - self.op_type = 'tree_conv' - vectors = np.random.random( - (self.batch_size, self.n, self.fea_size) - ).astype('float64') - self.inputs = { - 'EdgeSet': adj, - 'NodesVector': vectors, - 'Filter': np.random.random( - (self.fea_size, 3, self.output_size, self.num_filters) - ).astype('float64'), - } - self.attrs = {'max_depth': self.max_depth} - vectors = [] - for i in range(self.batch_size): - vector = self.get_output_naive(i) - vectors.append(vector) - self.outputs = { - 'Out': np.array(vectors).astype('float64'), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad( - ['NodesVector', 'Filter'], 'Out', max_relative_error=0.5 - ) - - def get_output_naive(self, batch_id): - og = [[] for i in range(1, self.n + 2)] - st = np.array(self.inputs['EdgeSet'][batch_id]).tolist() - for e in st: - og[e[0]].append(e[1]) - patches = collect_node_patch(og, self.max_depth) - W = np.array(self.inputs['Filter']).astype('float64') - W = np.transpose(W, axes=[1, 0, 2, 3]) - vec = [] - for i, patch in enumerate(patches, 1): - result = np.zeros((1, W.shape[2], W.shape[3])) - for v in patch: - eta_t = float(v[4] - v[3]) / float(v[4]) - eta_l = (1.0 - eta_t) * ( - 0.5 if v[2] == 1 else float(v[1] - 1.0) / float(v[2] - 1.0) - ) - eta_r = (1.0 - eta_t) * (1.0 - eta_l) - x = self.inputs['NodesVector'][batch_id][v[0] - 1] - eta = ( - np.array([eta_l, eta_r, eta_t]) - .reshape((3, 1)) - .astype('float64') - ) - Wconvi = np.tensordot(eta, W, axes=([0], [0])) - x = np.array(x).reshape((1, 1, self.fea_size)) - res = np.tensordot(x, Wconvi, axes=2) - result = result + res - vec.append(result) - vec = np.concatenate(vec, axis=0) - vec = np.concatenate( - [ - vec, - np.zeros( - (self.n - vec.shape[0], W.shape[2], W.shape[3]), - dtype='float64', - ), - ], - axis=0, - ) - return vec - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_tril_triu_op.py b/test/legacy_test/test_tril_triu_op.py index 1c64288dabbe57..d9de52a83999fd 100644 --- a/test/legacy_test/test_tril_triu_op.py +++ b/test/legacy_test/test_tril_triu_op.py @@ -19,7 +19,7 @@ import paddle from paddle import base, tensor from paddle.base import core -from paddle.base.framework import Program, program_guard +from paddle.pir_utils import test_with_pir_api class 
TrilTriuOpDefaultTest(OpTest): @@ -45,10 +45,10 @@ def setUp(self): } def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_pir=True) def test_check_grad_normal(self): - self.check_grad(['X'], 'Out', check_new_ir=True) + self.check_grad(['X'], 'Out', check_pir=True) def init_dtype(self): self.dtype = np.float64 @@ -86,7 +86,7 @@ def initTestCase(self): self.X = np.arange(1, 101, dtype="float32").reshape([10, -1]) def test_check_output(self): - self.check_output_with_place(core.CUDAPlace(0), check_new_ir=True) + self.check_output_with_place(core.CUDAPlace(0), check_pir=True) def test_check_grad_normal(self): self.check_grad_with_place( @@ -94,7 +94,7 @@ def test_check_grad_normal(self): ['X'], 'Out', numeric_grad_delta=0.05, - check_new_ir=True, + check_pir=True, ) @@ -200,14 +200,15 @@ def initTestCase(self): class TestTrilTriuOpAPI(unittest.TestCase): """test case by using API and has -1 dimension""" + @test_with_pir_api def test_api(self): paddle.enable_static() dtypes = ['float16', 'float32'] for dtype in dtypes: - prog = Program() - startup_prog = Program() - with program_guard(prog, startup_prog): + prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(prog, startup_prog): data = np.random.random([1, 9, 9, 4]).astype(dtype) x = paddle.static.data( shape=[1, 9, -1, 4], dtype=dtype, name='x' @@ -221,7 +222,7 @@ def test_api(self): ) exe = base.Executor(place) tril_out, triu_out = exe.run( - base.default_main_program(), + prog, feed={"x": data}, fetch_list=[tril_out, triu_out], ) @@ -243,14 +244,15 @@ def test_api_with_dygraph(self): np.testing.assert_allclose(tril_out, np.tril(data), rtol=1e-05) np.testing.assert_allclose(triu_out, np.triu(data), rtol=1e-05) + @test_with_pir_api def test_base_api(self): paddle.enable_static() dtypes = ['float16', 'float32'] for dtype in dtypes: - prog = Program() - startup_prog = Program() - with program_guard(prog, startup_prog): + prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(prog, startup_prog): data = np.random.random([1, 9, 9, 4]).astype(dtype) x = paddle.static.data( shape=[1, 9, -1, 4], dtype=dtype, name='x' @@ -264,7 +266,7 @@ def test_base_api(self): ) exe = base.Executor(place) triu_out = exe.run( - base.default_main_program(), + prog, feed={"x": data}, fetch_list=[triu_out], ) diff --git a/test/legacy_test/test_uniform_random_op.py b/test/legacy_test/test_uniform_random_op.py index 29011739802f40..1e301f53d7fc2f 100644 --- a/test/legacy_test/test_uniform_random_op.py +++ b/test/legacy_test/test_uniform_random_op.py @@ -69,7 +69,7 @@ def init_attrs(self): self.output_hist = output_hist def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def verify_output(self, outs): hist, prob = self.output_hist(np.array(outs[0])) @@ -101,7 +101,7 @@ def init_attrs(self): self.output_hist = output_hist def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def verify_output(self, outs): hist, prob = self.output_hist(np.array(outs[0])) @@ -121,7 +121,7 @@ def init_attrs(self): self.output_hist = output_hist def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def 
verify_output(self, outs): hist, prob = self.output_hist(np.array(outs[0])) @@ -141,7 +141,7 @@ def init_attrs(self): self.output_hist = output_hist def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def verify_output(self, outs): hist, prob = self.output_hist(np.array(outs[0])) @@ -170,7 +170,7 @@ def init_attrs(self): self.output_hist = output_hist def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=True) + self.check_output_customized(self.verify_output, check_pir=True) def verify_output(self, outs): hist, prob = self.output_hist(np.array(outs[0])) @@ -244,7 +244,7 @@ def init_attrs(self): self.output_hist = output_hist_diag def test_check_output(self): - self.check_output_customized(self.verify_output, check_new_ir=False) + self.check_output_customized(self.verify_output, check_pir=False) class TestUniformRandomOpSelectedRows(unittest.TestCase): diff --git a/test/legacy_test/test_unique.py b/test/legacy_test/test_unique.py index 8fe9dfa9af6353..808cd8227bb7d4 100644 --- a/test/legacy_test/test_unique.py +++ b/test/legacy_test/test_unique.py @@ -19,6 +19,7 @@ import paddle from paddle.base import core +from paddle.pir_utils import test_with_pir_api class TestUniqueOp(OpTest): @@ -413,6 +414,7 @@ def test_dygraph_attr_dtype(self): self.assertTrue((inverse.numpy() == np_inverse).all(), True) self.assertTrue((counts.numpy() == np_counts).all(), True) + @test_with_pir_api def test_static_graph(self): with paddle_static_guard(): with paddle.static.program_guard( diff --git a/test/legacy_test/test_unsqueeze2_op.py b/test/legacy_test/test_unsqueeze2_op.py index 36fa88cb1035ac..10246419fef5be 100755 --- a/test/legacy_test/test_unsqueeze2_op.py +++ b/test/legacy_test/test_unsqueeze2_op.py @@ -18,6 +18,7 @@ from op_test import OpTest import paddle +from paddle.pir_utils import test_with_pir_api paddle.enable_static() @@ -44,11 +45,20 @@ def if_enable_cinn(self): def test_check_output(self): self.check_output( - no_check_set=["XShape"], check_prim=True, check_new_ir=True + no_check_set=["XShape"], + check_prim=True, + check_pir=True, + check_prim_pir=True, ) def test_check_grad(self): - self.check_grad(["X"], "Out", check_prim=True, check_new_ir=True) + self.check_grad( + ["X"], + "Out", + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) def init_test_case(self): self.ori_shape = (3, 40) @@ -137,10 +147,10 @@ def setUp(self): } def test_check_output(self): - self.check_output(no_check_set=["XShape"], check_new_ir=True) + self.check_output(no_check_set=["XShape"], check_pir=True) def test_check_grad(self): - self.check_grad(["X"], "Out", check_new_ir=True) + self.check_grad(["X"], "Out", check_pir=True) def init_test_case(self): self.ori_shape = (20, 5) @@ -198,10 +208,10 @@ def setUp(self): } def test_check_output(self): - self.check_output(no_check_set=["XShape"], check_new_ir=True) + self.check_output(no_check_set=["XShape"], check_pir=True) def test_check_grad(self): - self.check_grad(["X"], "Out", check_new_ir=True) + self.check_grad(["X"], "Out", check_pir=True) def init_test_case(self): self.ori_shape = (20, 5) @@ -248,34 +258,38 @@ def setUp(self): def executed_api(self): self.unsqueeze = paddle.unsqueeze + @test_with_pir_api def test_api(self): - input = np.random.random([3, 2, 5]).astype("float64") - x = paddle.static.data(name='x', shape=[3, 2, 5], dtype="float64") - positive_3_int32 = paddle.tensor.fill_constant([1], 
"int32", 3) - positive_1_int64 = paddle.tensor.fill_constant([1], "int64", 1) - axes_tensor_int32 = paddle.static.data( - name='axes_tensor_int32', shape=[3], dtype="int32" - ) - axes_tensor_int64 = paddle.static.data( - name='axes_tensor_int64', shape=[3], dtype="int64" - ) + with paddle.static.program_guard(paddle.static.Program()): + input = np.random.random([3, 2, 5]).astype("float64") + x = paddle.static.data(name='x', shape=[3, 2, 5], dtype="float64") + positive_3_int32 = paddle.tensor.fill_constant([1], "int32", 3) + positive_1_int64 = paddle.tensor.fill_constant([1], "int64", 1) + axes_tensor_int32 = paddle.static.data( + name='axes_tensor_int32', shape=[3], dtype="int32" + ) + axes_tensor_int64 = paddle.static.data( + name='axes_tensor_int64', shape=[3], dtype="int64" + ) - out_1 = self.unsqueeze(x, axis=[3, 1, 1]) - out_2 = self.unsqueeze(x, axis=[positive_3_int32, positive_1_int64, 1]) - out_3 = self.unsqueeze(x, axis=axes_tensor_int32) - out_4 = self.unsqueeze(x, axis=3) - out_5 = self.unsqueeze(x, axis=axes_tensor_int64) - - exe = paddle.static.Executor(place=paddle.CPUPlace()) - res_1, res_2, res_3, res_4, res_5 = exe.run( - paddle.static.default_main_program(), - feed={ - "x": input, - "axes_tensor_int32": np.array([3, 1, 1]).astype("int32"), - "axes_tensor_int64": np.array([3, 1, 1]).astype("int64"), - }, - fetch_list=[out_1, out_2, out_3, out_4, out_5], - ) + out_1 = self.unsqueeze(x, axis=[3, 1, 1]) + out_2 = self.unsqueeze( + x, axis=[positive_3_int32, positive_1_int64, 1] + ) + out_3 = self.unsqueeze(x, axis=axes_tensor_int32) + out_4 = self.unsqueeze(x, axis=3) + out_5 = self.unsqueeze(x, axis=axes_tensor_int64) + + exe = paddle.static.Executor(place=paddle.CPUPlace()) + res_1, res_2, res_3, res_4, res_5 = exe.run( + paddle.static.default_main_program(), + feed={ + "x": input, + "axes_tensor_int32": np.array([3, 1, 1]).astype("int32"), + "axes_tensor_int64": np.array([3, 1, 1]).astype("int64"), + }, + fetch_list=[out_1, out_2, out_3, out_4, out_5], + ) np.testing.assert_array_equal(res_1, input.reshape([3, 1, 1, 2, 5, 1])) np.testing.assert_array_equal(res_2, input.reshape([3, 1, 1, 2, 5, 1])) @@ -290,6 +304,13 @@ def test_axes_type(): self.assertRaises(TypeError, test_axes_type) + def test_pir_axes_type(): + with paddle.pir_utils.IrGuard(): + x2 = paddle.static.data(name="x2", shape=[2, 25], dtype="int32") + self.unsqueeze(x2, axis=2.1) + + self.assertRaises(ValueError, test_pir_axes_type) + class TestUnsqueezeInplaceAPI(TestUnsqueezeAPI): def executed_api(self): diff --git a/test/legacy_test/test_unsqueeze_op.py b/test/legacy_test/test_unsqueeze_op.py deleted file mode 100755 index 39aec97e23ecd0..00000000000000 --- a/test/legacy_test/test_unsqueeze_op.py +++ /dev/null @@ -1,423 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import gradient_checker -import numpy as np -from decorator_helper import prog_scope -from op_test import OpTest, convert_float_to_uint16 - -import paddle -from paddle import base -from paddle.base import core - -paddle.enable_static() - - -# Correct: General. -class TestUnsqueezeOp(OpTest): - def setUp(self): - self.init_test_case() - self.op_type = "unsqueeze" - self.inputs = {"X": np.random.random(self.ori_shape).astype("float64")} - self.init_attrs() - self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - def init_test_case(self): - self.ori_shape = (3, 40) - self.axes = (1, 2) - self.new_shape = (3, 1, 1, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - -class TestUnsqueezeFP16Op(OpTest): - def setUp(self): - self.init_test_case() - self.op_type = "unsqueeze" - self.inputs = {"X": np.random.random(self.ori_shape).astype("float16")} - self.init_attrs() - self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - def init_test_case(self): - self.ori_shape = (3, 40) - self.axes = (1, 2) - self.new_shape = (3, 1, 1, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - -class TestUnsqueezeBF16Op(OpTest): - def setUp(self): - self.init_test_case() - self.op_type = "unsqueeze" - self.dtype = np.uint16 - x = np.random.random(self.ori_shape).astype("float32") - out = x.reshape(self.new_shape) - self.inputs = {"X": convert_float_to_uint16(x)} - self.init_attrs() - self.outputs = {"Out": convert_float_to_uint16(out)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - def init_test_case(self): - self.ori_shape = (3, 40) - self.axes = (1, 2) - self.new_shape = (3, 1, 1, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - -# Correct: Single input index. -class TestUnsqueezeOp1(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (-1,) - self.new_shape = (20, 5, 1) - - -# Correct: Mixed input axis. -class TestUnsqueezeOp2(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (0, -1) - self.new_shape = (1, 20, 5, 1) - - -# Correct: There is duplicated axis. -class TestUnsqueezeOp3(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = (0, 3, 3) - self.new_shape = (1, 10, 2, 1, 1, 5) - - -# Correct: Reversed axes. 
-class TestUnsqueezeOp4(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = (3, 1, 1) - self.new_shape = (10, 1, 1, 2, 5, 1) - - -# axis is empty, x is ND -class TestUnsqueezeOp5(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = () - self.axes = () - self.new_shape = () - - -# axis is empty, x is 0D -class TestUnsqueezeOp6(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = () - self.new_shape = (10, 2, 5) - - -class TestUnsqueezeOp_ZeroDim1(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = () - self.axes = (-1,) - self.new_shape = 1 - - -class TestUnsqueezeOp_ZeroDim2(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = () - self.axes = (-1, 1) - self.new_shape = (1, 1) - - -class TestUnsqueezeOp_ZeroDim3(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = () - self.axes = (0, 1, 2) - self.new_shape = (1, 1, 1) - - -class API_TestUnsqueeze(unittest.TestCase): - def test_out(self): - paddle.enable_static() - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - data1 = paddle.static.data('data1', shape=[-1, 10], dtype='float64') - result_squeeze = paddle.unsqueeze(data1, axis=[1]) - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - input1 = np.random.random([5, 1, 10]).astype('float64') - input = np.squeeze(input1, axis=1) - (result,) = exe.run( - feed={"data1": input}, fetch_list=[result_squeeze] - ) - np.testing.assert_allclose(input1, result, rtol=1e-05) - - -class TestUnsqueezeOpError(unittest.TestCase): - def test_errors(self): - paddle.enable_static() - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - # The type of axis in split_op should be int or Variable. 
- def test_axes_type(): - x6 = paddle.static.data( - shape=[-1, 10], dtype='float16', name='x3' - ) - paddle.unsqueeze(x6, axis=3.2) - - self.assertRaises(TypeError, test_axes_type) - - -class API_TestUnsqueeze2(unittest.TestCase): - def test_out(self): - paddle.enable_static() - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - data1 = paddle.static.data('data1', shape=[-1, 10], dtype='float64') - data2 = paddle.static.data('data2', shape=[1], dtype='int32') - result_squeeze = paddle.unsqueeze(data1, axis=data2) - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - input1 = np.random.random([5, 1, 10]).astype('float64') - input2 = np.array([1]).astype('int32') - input = np.squeeze(input1, axis=1) - (result1,) = exe.run( - feed={"data1": input, "data2": input2}, - fetch_list=[result_squeeze], - ) - np.testing.assert_allclose(input1, result1, rtol=1e-05) - - -class API_TestUnsqueeze3(unittest.TestCase): - def test_out(self): - paddle.enable_static() - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - data1 = paddle.static.data('data1', shape=[-1, 10], dtype='float64') - data2 = paddle.static.data('data2', shape=[1], dtype='int32') - result_squeeze = paddle.unsqueeze(data1, axis=[data2, 3]) - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - input1 = np.random.random([5, 1, 10, 1]).astype('float64') - input2 = np.array([1]).astype('int32') - input = np.squeeze(input1) - (result1,) = exe.run( - feed={"data1": input, "data2": input2}, - fetch_list=[result_squeeze], - ) - np.testing.assert_array_equal(input1, result1) - self.assertEqual(input1.shape, result1.shape) - - -class API_TestDyUnsqueeze(unittest.TestCase): - def test_out(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int32") - input1 = np.expand_dims(input_1, axis=1) - input = paddle.to_tensor(input_1) - output = paddle.unsqueeze(input, axis=[1]) - out_np = output.numpy() - np.testing.assert_array_equal(input1, out_np) - self.assertEqual(input1.shape, out_np.shape) - - -class API_TestDyUnsqueeze2(unittest.TestCase): - def test_out(self): - paddle.disable_static() - input1 = np.random.random([5, 10]).astype("int32") - out1 = np.expand_dims(input1, axis=1) - input = paddle.to_tensor(input1) - output = paddle.unsqueeze(input, axis=1) - out_np = output.numpy() - np.testing.assert_array_equal(out1, out_np) - self.assertEqual(out1.shape, out_np.shape) - - -class API_TestDyUnsqueezeAxisTensor(unittest.TestCase): - def test_out(self): - paddle.disable_static() - input1 = np.random.random([5, 10]).astype("int32") - out1 = np.expand_dims(input1, axis=1) - out1 = np.expand_dims(out1, axis=2) - input = paddle.to_tensor(input1) - output = paddle.unsqueeze(input, axis=paddle.to_tensor([1, 2])) - out_np = output.numpy() - np.testing.assert_array_equal(out1, out_np) - self.assertEqual(out1.shape, out_np.shape) - - -class API_TestDyUnsqueezeAxisTensorList(unittest.TestCase): - def test_out(self): - paddle.disable_static() - input1 = np.random.random([5, 10]).astype("int32") - # Actually, expand_dims supports tuple since version 1.18.0 - out1 = np.expand_dims(input1, axis=1) - out1 = np.expand_dims(out1, axis=2) - input = paddle.to_tensor(input1) - output = paddle.unsqueeze( - paddle.to_tensor(input1), - axis=[paddle.to_tensor([1]), paddle.to_tensor([2])], - ) - out_np = output.numpy() - np.testing.assert_array_equal(out1, out_np) - self.assertEqual(out1.shape, out_np.shape) - - -class 
API_TestDygraphUnSqueeze(unittest.TestCase): - def setUp(self): - self.executed_api() - - def executed_api(self): - self.unsqueeze = paddle.unsqueeze - - def test_out(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int32") - input = paddle.to_tensor(input_1) - output = self.unsqueeze(input, axis=[1]) - out_np = output.numpy() - expected_out = np.expand_dims(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_out_int8(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int8") - input = paddle.to_tensor(input_1) - output = self.unsqueeze(input, axis=[1]) - out_np = output.numpy() - expected_out = np.expand_dims(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_out_uint8(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("uint8") - input = paddle.to_tensor(input_1) - output = self.unsqueeze(input, axis=1) - out_np = output.numpy() - expected_out = np.expand_dims(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_axis_not_list(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int32") - input = paddle.to_tensor(input_1) - output = self.unsqueeze(input, axis=1) - out_np = output.numpy() - expected_out = np.expand_dims(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_dimension_not_1(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int32") - input = paddle.to_tensor(input_1) - output = self.unsqueeze(input, axis=(1, 2)) - out_np = output.numpy() - expected_out = np.expand_dims(input_1, axis=(1, 2)) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - -class API_TestDygraphUnSqueezeInplace(API_TestDygraphUnSqueeze): - def executed_api(self): - self.unsqueeze = paddle.unsqueeze_ - - -class TestUnsqueezeDoubleGradCheck(unittest.TestCase): - def unsqueeze_wrapper(self, x): - return paddle.unsqueeze(x[0], [0, 2]) - - @prog_scope() - def func(self, place): - # the shape of input variable should be clearly specified, not include -1. - eps = 0.005 - dtype = np.float32 - - data = paddle.static.data('data', [2, 3, 4], dtype) - data.persistable = True - out = paddle.unsqueeze(data, [0, 2]) - data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) - - gradient_checker.double_grad_check( - [data], out, x_init=[data_arr], place=place, eps=eps - ) - gradient_checker.double_grad_check_for_dygraph( - self.unsqueeze_wrapper, [data], out, x_init=[data_arr], place=place - ) - - def test_grad(self): - paddle.enable_static() - places = [base.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(base.CUDAPlace(0)) - for p in places: - self.func(p) - - -class TestUnsqueezeTripleGradCheck(unittest.TestCase): - def unsqueeze_wrapper(self, x): - return paddle.unsqueeze(x[0], [0, 2]) - - @prog_scope() - def func(self, place): - # the shape of input variable should be clearly specified, not include -1.
- eps = 0.005 - dtype = np.float32 - - data = paddle.static.data('data', [2, 3, 4], dtype) - data.persistable = True - out = paddle.unsqueeze(data, [0, 2]) - data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) - - gradient_checker.triple_grad_check( - [data], out, x_init=[data_arr], place=place, eps=eps - ) - gradient_checker.triple_grad_check_for_dygraph( - self.unsqueeze_wrapper, [data], out, x_init=[data_arr], place=place - ) - - def test_grad(self): - paddle.enable_static() - places = [base.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(base.CUDAPlace(0)) - for p in places: - self.func(p) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_var_base.py b/test/legacy_test/test_var_base.py index 748ac4ca608ab8..6b388e2e7e4b1e 100644 --- a/test/legacy_test/test_var_base.py +++ b/test/legacy_test/test_var_base.py @@ -87,6 +87,10 @@ def check_with_place(place): self.assertEqual(y.place.__repr__(), "Place(gpu:0)") y = x.cuda(blocking=True) self.assertEqual(y.place.__repr__(), "Place(gpu:0)") + y = x.cuda(device_id=0, blocking=True) + self.assertEqual(y.place.__repr__(), "Place(gpu:0)") + y = x.cuda(device_id=0, blocking=False) + self.assertEqual(y.place.__repr__(), "Place(gpu:0)") with self.assertRaises(ValueError): y = x.cuda("test") diff --git a/test/legacy_test/test_where_op.py b/test/legacy_test/test_where_op.py index 3685a59b981347..89328610e92722 100644 --- a/test/legacy_test/test_where_op.py +++ b/test/legacy_test/test_where_op.py @@ -19,8 +19,10 @@ import paddle from paddle import base +from paddle.autograd.ir_backward import grad from paddle.base import Program, core, program_guard from paddle.base.backward import append_backward +from paddle.pir_utils import test_with_pir_api class TestWhereOp(OpTest): @@ -33,11 +35,11 @@ def setUp(self): self.outputs = {'Out': np.where(self.cond, self.x, self.y)} def test_check_output(self): - self.check_output(check_cinn=self.check_cinn, check_new_ir=True) + self.check_output(check_cinn=self.check_cinn, check_pir=True) def test_check_grad(self): self.check_grad( - ['X', 'Y'], 'Out', check_cinn=self.check_cinn, check_new_ir=True + ['X', 'Y'], 'Out', check_cinn=self.check_cinn, check_pir=True ) def init_config(self): @@ -85,7 +87,7 @@ def setUp(self): def test_check_output(self): place = core.CUDAPlace(0) self.check_output_with_place( - place, check_cinn=self.check_cinn, check_new_ir=True + place, check_cinn=self.check_cinn, check_pir=True ) def test_check_grad(self): @@ -96,7 +98,7 @@ def test_check_grad(self): 'Out', numeric_grad_delta=0.05, check_cinn=self.check_cinn, - check_new_ir=True, + check_pir=True, ) def init_config(self): @@ -132,7 +134,9 @@ def ref_y_backward(self, dout): def test_api(self, use_cuda=False): for x_stop_gradient in [False, True]: for y_stop_gradient in [False, True]: - with base.program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): cond = paddle.static.data( name='cond', shape=[-1] + self.shape, dtype='bool' ) @@ -165,7 +169,7 @@ def test_api(self, use_cuda=False): if y_stop_gradient is False: fetch_list.append(y.grad_name) out = exe.run( - base.default_main_program(), + paddle.static.default_main_program(), feed={'cond': self.cond, 'x': self.x, 'y': self.y}, fetch_list=fetch_list, ) @@ -183,13 +187,66 @@ def test_api(self, use_cuda=False): out[2], self.ref_y_backward(out[1]) ) + def test_pir_api(self, use_cuda=False): + for x_stop_gradient in [False, True]: + for y_stop_gradient in [False, 
True]: + with paddle.pir_utils.IrGuard(), paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + cond = paddle.static.data( + name='cond', shape=self.shape, dtype='bool' + ) + x = paddle.static.data( + name='x', shape=self.shape, dtype='float32' + ) + y = paddle.static.data( + name='y', shape=self.shape, dtype='float32' + ) + x.stop_gradient = x_stop_gradient + y.stop_gradient = y_stop_gradient + result = paddle.where(cond, x, y) + result.stop_gradient = False + loss = paddle.mean(result) + [x_grad, y_grad] = grad(loss, (x, y)) + default_main_program = paddle.static.default_main_program() + fetch_list = [result] + if x_stop_gradient is False: + fetch_list.append(x_grad) + if y_stop_gradient is False: + fetch_list.append(y_grad) + for use_cuda in [False, True]: + if use_cuda and (not base.core.is_compiled_with_cuda()): + break + place = ( + base.CUDAPlace(0) if use_cuda else base.CPUPlace() + ) + exe = base.Executor(place) + + out = exe.run( + default_main_program, + feed={'cond': self.cond, 'x': self.x, 'y': self.y}, + fetch_list=fetch_list, + ) + np.testing.assert_array_equal(out[0], self.out) + if x_stop_gradient is False: + np.testing.assert_array_equal( + out[1], self.ref_x_backward(out[1]) + ) + if y.stop_gradient is False: + np.testing.assert_array_equal( + out[2], self.ref_y_backward(out[2]) + ) + elif y.stop_gradient is False: + np.testing.assert_array_equal( + out[1], self.ref_y_backward(out[1]) + ) + + @test_with_pir_api def test_api_broadcast(self, use_cuda=False): - main_program = Program() - with base.program_guard(main_program): + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program): x = paddle.static.data(name='x', shape=[-1, 4, 1], dtype='float32') - x.desc.set_need_check_feed(False) y = paddle.static.data(name='y', shape=[-1, 4, 2], dtype='float32') - y.desc.set_need_check_feed(False) x_i = np.array([[0.9383, 0.1983, 3.2, 1.2]]).astype('float32') y_i = np.array([[1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0]]).astype( 'float32' @@ -201,7 +258,7 @@ def test_api_broadcast(self, use_cuda=False): place = base.CUDAPlace(0) if use_cuda else base.CPUPlace() exe = base.Executor(place) out = exe.run( - base.default_main_program(), + paddle.static.default_main_program(), feed={'x': x_i, 'y': y_i}, fetch_list=[result], ) @@ -209,15 +266,14 @@ def test_api_broadcast(self, use_cuda=False): out[0], np.where((x_i > 1), x_i, y_i) ) + @test_with_pir_api def test_scalar(self): - paddle.enable_static() - main_program = Program() - with base.program_guard(main_program): - cond_shape = [2, 4] + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program): + cond_shape = [4] cond = paddle.static.data( - name='cond', shape=[-1] + cond_shape, dtype='bool' + name='cond', shape=cond_shape, dtype='bool' ) - cond.desc.set_need_check_feed(False) x_data = 1.0 y_data = 2.0 cond_data = np.array([False, False, True, True]).astype('bool') @@ -228,7 +284,7 @@ def test_scalar(self): place = base.CUDAPlace(0) if use_cuda else base.CPUPlace() exe = base.Executor(place) out = exe.run( - base.default_main_program(), + paddle.static.default_main_program(), feed={'cond': cond_data}, fetch_list=[result], ) @@ -237,20 +293,13 @@ def test_scalar(self): def __test_where_with_broadcast_static(self, cond_shape, x_shape, y_shape): paddle.enable_static() - main_program = Program() - with base.program_guard(main_program): + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program): cond = 
paddle.static.data( - name='cond', shape=[-1] + cond_shape, dtype='bool' - ) - x = paddle.static.data( - name='x', shape=[-1] + x_shape, dtype='float32' + name='cond', shape=cond_shape, dtype='bool' ) - y = paddle.static.data( - name='y', shape=[-1] + y_shape, dtype='float32' - ) - x.desc.set_need_check_feed(False) - y.desc.set_need_check_feed(False) - cond.desc.set_need_check_feed(False) + x = paddle.static.data(name='x', shape=x_shape, dtype='float32') + y = paddle.static.data(name='y', shape=y_shape, dtype='float32') cond_data_tmp = np.random.random(size=cond_shape).astype('float32') cond_data = cond_data_tmp < 0.3 x_data = np.random.random(size=x_shape).astype('float32') @@ -262,55 +311,63 @@ def __test_where_with_broadcast_static(self, cond_shape, x_shape, y_shape): place = base.CUDAPlace(0) if use_cuda else base.CPUPlace() exe = base.Executor(place) out = exe.run( - base.default_main_program(), + paddle.static.default_main_program(), feed={'cond': cond_data, 'x': x_data, 'y': y_data}, fetch_list=[result], ) expect = np.where(cond_data, x_data, y_data) np.testing.assert_array_equal(out[0], expect) + @test_with_pir_api def test_static_api_broadcast_1(self): cond_shape = [2, 4] a_shape = [2, 2, 4] b_shape = [2, 2, 4] self.__test_where_with_broadcast_static(cond_shape, a_shape, b_shape) + @test_with_pir_api def test_static_api_broadcast_2(self): cond_shape = [2, 1] a_shape = [2, 2, 4] b_shape = [2, 2, 4] self.__test_where_with_broadcast_static(cond_shape, a_shape, b_shape) + @test_with_pir_api def test_static_api_broadcast_3(self): cond_shape = [2, 2, 1] a_shape = [2, 2, 4] b_shape = [2, 2, 4] self.__test_where_with_broadcast_static(cond_shape, a_shape, b_shape) + @test_with_pir_api def test_static_api_broadcast_4(self): cond_shape = [2, 1, 4] a_shape = [2, 2, 4] b_shape = [2, 2, 4] self.__test_where_with_broadcast_static(cond_shape, a_shape, b_shape) + @test_with_pir_api def test_static_api_broadcast_5(self): cond_shape = [3, 2, 2, 4] a_shape = [2, 2, 4] b_shape = [2, 2, 4] self.__test_where_with_broadcast_static(cond_shape, a_shape, b_shape) + @test_with_pir_api def test_static_api_broadcast_6(self): cond_shape = [2, 2, 4] a_shape = [2, 2, 1] b_shape = [2, 2, 1] self.__test_where_with_broadcast_static(cond_shape, a_shape, b_shape) + @test_with_pir_api def test_static_api_broadcast_7(self): cond_shape = [2, 2, 4] a_shape = [2, 1, 4] b_shape = [2, 1, 4] self.__test_where_with_broadcast_static(cond_shape, a_shape, b_shape) + @test_with_pir_api def test_static_api_broadcast_8(self): cond_shape = [3, 2, 2, 4] a_shape = [2, 2, 1] @@ -433,7 +490,9 @@ def test_where_condition(self): class TestWhereOpError(unittest.TestCase): def test_errors(self): - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x_i = np.array([0.9383, 0.1983, 3.2, 1.2]).astype('float64') y_i = np.array([1.0, 1.0, 1.0, 1.0]).astype('float64') cond_i = np.array([False, False, True, True]).astype('bool') @@ -443,6 +502,12 @@ def test_Variable(): self.assertRaises(TypeError, test_Variable) + def test_OpResult(): + with paddle.pir_utils.IrGuard(): + paddle.where(cond_i, x_i, y_i) + + self.assertRaises(ValueError, test_OpResult) + def test_type(): x = paddle.static.data(name='x', shape=[-1, 4], dtype='bool') x.desc.set_need_check_feed(False) diff --git a/test/mkldnn/test_activation_mkldnn_op.py b/test/mkldnn/test_activation_mkldnn_op.py index e6ef8388f771d1..d37cea47450c70 100644 --- a/test/mkldnn/test_activation_mkldnn_op.py +++ 
b/test/mkldnn/test_activation_mkldnn_op.py @@ -482,6 +482,14 @@ def setUp(self): self.outputs = {'Out': out} self.attrs = {"use_mkldnn": True} + def test_check_output(self): + self.check_output(check_pir=True) + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out', check_pir=True) + class TestMKLDNNRound_ZeroDim(TestActivation_ZeroDim): def setUp(self): @@ -494,6 +502,14 @@ def setUp(self): self.outputs = {'Out': out} self.attrs = {"use_mkldnn": True} + def test_check_output(self): + self.check_output(check_pir=True) + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out', check_pir=True) + class TestMKLDNNSigmoidDim4(TestSigmoid): def setUp(self): diff --git a/test/mkldnn/test_reduce_bf16_mkldnn_op.py b/test/mkldnn/test_reduce_bf16_mkldnn_op.py index 187ce4cde47393..1d0e0e596dcb89 100644 --- a/test/mkldnn/test_reduce_bf16_mkldnn_op.py +++ b/test/mkldnn/test_reduce_bf16_mkldnn_op.py @@ -40,7 +40,7 @@ def setUp(self): self.attrs = {'use_mkldnn': self.use_mkldnn} def test_check_output(self): - self.check_output(check_dygraph=False, check_new_ir=False) + self.check_output(check_dygraph=False, check_pir=False) def calculate_grads(self): tmp_tensor = np.zeros(self.x_fp32.shape).astype("float32") @@ -84,7 +84,7 @@ def test_check_grad(self): check_dygraph=False, user_defined_grads=[self.grad_X], user_defined_grad_outputs=[convert_float_to_uint16(self.grad_Out)], - check_new_ir=False, + check_pir=False, ) diff --git a/test/mkldnn/test_reduce_mkldnn_op.py b/test/mkldnn/test_reduce_mkldnn_op.py index 3dce2c72e55687..d22556f67630c0 100644 --- a/test/mkldnn/test_reduce_mkldnn_op.py +++ b/test/mkldnn/test_reduce_mkldnn_op.py @@ -29,12 +29,12 @@ def setUp(self): self.attrs = {'use_mkldnn': self.use_mkldnn} def test_check_output(self): - self.check_output(check_dygraph=False, check_new_ir=False) + self.check_output(check_dygraph=False, check_pir=False) class TestReduceDefaultWithGradOneDNNOp(TestReduceSumDefaultOneDNNOp): def test_check_grad(self): - self.check_grad(['X'], 'Out', check_dygraph=False, check_new_ir=False) + self.check_grad(['X'], 'Out', check_dygraph=False, check_pir=False) class TestReduceSum4DOneDNNOp(TestReduceDefaultWithGradOneDNNOp): diff --git a/test/prim/model/bert.py b/test/prim/model/bert.py index f7cf05f7ca2434..fe54de520f88f5 100644 --- a/test/prim/model/bert.py +++ b/test/prim/model/bert.py @@ -251,7 +251,7 @@ def __init__(self, config: BertConfig, to_static, enable_cinn): if enable_cinn: build_strategy.build_cinn_pass = True self.encoder = paddle.jit.to_static( - self.encoder, None, build_strategy + self.encoder, None, build_strategy, full_graph=True ) self.pooler = BertPooler(config) # self.apply(self.init_weights) diff --git a/test/prim/model/test_prim_simplenet_cinn.py b/test/prim/model/test_prim_simplenet_cinn.py index 6482e849560e01..06b5085ae77295 100644 --- a/test/prim/model/test_prim_simplenet_cinn.py +++ b/test/prim/model/test_prim_simplenet_cinn.py @@ -26,7 +26,9 @@ def apply_to_static(net, use_cinn): build_strategy = paddle.static.BuildStrategy() build_strategy.build_cinn_pass = use_cinn - return paddle.jit.to_static(net, build_strategy=build_strategy) + return paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) class PrimeNet(paddle.nn.Layer): diff --git a/test/prim/model/test_resnet_cinn.py b/test/prim/model/test_resnet_cinn.py index ef932603f8a58a..7734f9da609097 100644 --- a/test/prim/model/test_resnet_cinn.py +++ 
b/test/prim/model/test_resnet_cinn.py @@ -185,7 +185,9 @@ def train(to_static, enable_prim, enable_cinn): build_strategy = paddle.static.BuildStrategy() if enable_cinn: build_strategy.build_cinn_pass = True - resnet = paddle.jit.to_static(resnet, build_strategy=build_strategy) + resnet = paddle.jit.to_static( + resnet, build_strategy=build_strategy, full_graph=True + ) optimizer = optimizer_setting(parameter_list=resnet.parameters()) train_losses = run(resnet, data_loader, optimizer, 'train') diff --git a/test/prim/model/test_resnet_prim.py b/test/prim/model/test_resnet_prim.py index de81f2b78b650f..e3e2d859fa4b61 100644 --- a/test/prim/model/test_resnet_prim.py +++ b/test/prim/model/test_resnet_prim.py @@ -186,7 +186,9 @@ def train(to_static, enable_prim, enable_cinn): build_strategy = paddle.static.BuildStrategy() if enable_cinn: build_strategy.build_cinn_pass = True - resnet = paddle.jit.to_static(resnet, build_strategy=build_strategy) + resnet = paddle.jit.to_static( + resnet, build_strategy=build_strategy, full_graph=True + ) optimizer = optimizer_setting(parameter_list=resnet.parameters()) train_losses = run(resnet, data_loader, optimizer, 'train') diff --git a/test/prim/model/test_resnet_prim_cinn.py b/test/prim/model/test_resnet_prim_cinn.py index 933da8fcf105c6..5ebf0684259cc9 100644 --- a/test/prim/model/test_resnet_prim_cinn.py +++ b/test/prim/model/test_resnet_prim_cinn.py @@ -186,7 +186,9 @@ def train(to_static, enable_prim, enable_cinn): build_strategy = paddle.static.BuildStrategy() if enable_cinn: build_strategy.build_cinn_pass = True - resnet = paddle.jit.to_static(resnet, build_strategy=build_strategy) + resnet = paddle.jit.to_static( + resnet, build_strategy=build_strategy, full_graph=True + ) optimizer = optimizer_setting(parameter_list=resnet.parameters()) train_losses = run(resnet, data_loader, optimizer, 'train') diff --git a/test/prim/pir_prim/CMakeLists.txt b/test/prim/pir_prim/CMakeLists.txt index c31e7254ff60c9..049f4b915dc457 100644 --- a/test/prim/pir_prim/CMakeLists.txt +++ b/test/prim/pir_prim/CMakeLists.txt @@ -1,6 +1,6 @@ set(TEST_PRIM_PURE_NEW_IR_CASES test_prim_program test_prim_simpnet test_prim_custom_vjp test_prim_jit - test_pir_prim_flags) + test_pir_prim_flags test_sink_decomp) foreach(target ${TEST_PRIM_PURE_NEW_IR_CASES}) py_test_modules(${target} MODULES ${target} ENVS GLOG_v=1 diff --git a/test/prim/pir_prim/test_decompose_op.py b/test/prim/pir_prim/test_decompose_op.py new file mode 100644 index 00000000000000..ea1eac9a769c15 --- /dev/null +++ b/test/prim/pir_prim/test_decompose_op.py @@ -0,0 +1,274 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
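+# This test builds a static graph with add/multiply/mean/layer_norm plus its +# backward, translates it to PIR, decomposes the chosen forward and backward +# ops into primitive ops, and checks that the decomposed program produces the +# same results as the undecomposed reference run.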
+ + +import unittest + +import numpy as np + +import paddle +from paddle import pir +from paddle.base import core +from paddle.decomposition import decomp + +paddle.enable_static() + + +def check_param_mappings(param_mappings): + for values in param_mappings.values(): + if len(values) != 1: + raise ValueError("currently only support one-to-one param_mappings") + + +def get_new_ir_grad_var_to_var_map(param_mappings, old_ir_grad_var_to_var_map): + new_ir_grad_var_to_var_map = {} + for grad_var, var in old_ir_grad_var_to_var_map.items(): + if grad_var in param_mappings.keys(): + new_grad_var = param_mappings[grad_var][0] + new_var = param_mappings[var][0] + new_ir_grad_var_to_var_map[new_grad_var] = new_var + return new_ir_grad_var_to_var_map + + +def get_fwd_op(bwd_op, grad_var_to_var_map): + bwd_op_input_names = bwd_op.get_input_names() + for idx, input_name in enumerate(bwd_op_input_names): + if input_name == "out_grad": + out_grad = bwd_op.operand(idx).source() + out = grad_var_to_var_map[out_grad] + fwd_op = out.get_defining_op() + return fwd_op + + return None + + +def get_pir_program_and_param_map(): + shape = [2, 3] + mp = paddle.static.Program() + with paddle.static.program_guard(mp): + # construct graph + x = paddle.static.data('x', shape, dtype='float32') + x.stop_gradient = False + y = paddle.static.data('y', shape, dtype='float32') + y.stop_gradient = False + z = paddle.static.data('z', shape, dtype='float32') + z.stop_gradient = False + tmp1 = paddle.add(x, y) + tmp2 = paddle.multiply(tmp1, z) + tmp3 = paddle.mean(tmp2, axis=-1, keepdim=True) + scale = paddle.tensor.fill_constant( + shape=tmp3.shape[1:], + dtype=tmp3.dtype, + value=1.0, + ) + scale.stop_gradient = False + out = paddle.nn.functional.layer_norm( + tmp3, tmp3.shape[1:], scale, None, 1e-5 + ) + # construct backward graph + gradients = paddle.static.gradients(out, [x, y, z]) + + newir_program, param_mappings = pir.translate_to_new_ir_with_param_map( + mp.desc + ) + check_param_mappings(param_mappings) + + return newir_program, param_mappings + + +class TestDecomposeOp(unittest.TestCase): + def setUp(self): + np.random.seed(2023) + self.shape_x = [2, 3] + self.x = np.random.random(self.shape_x).astype("float32") + self.shape_y = [2, 3] + self.y = np.random.random(self.shape_y).astype("float32") + self.shape_z = [2, 3] + self.z = np.random.random(self.shape_z).astype("float32") + + def net(self, flag=None): + ( + newir_program, + param_mappings, + ) = get_pir_program_and_param_map() + + newir_ops = newir_program.global_block().ops + global_outputs = [newir_ops[7].result(0)] + global_grads = [ + newir_ops[-1].result(0), + newir_ops[-1].result(1), + newir_ops[-2].result(1), + ] + + with paddle.pir_utils.IrGuard(), paddle.pir.core.program_guard( + newir_program + ): + if flag == "decompose": + core._set_prim_forward_enabled(True) + core._set_prim_backward_enabled(True) + + # get the old_ir_grad_var_to_var map + old_ir_grad_var_to_var_map = { + 'layer_norm_1.tmp_2@GRAD': 'layer_norm_1.tmp_2', + 'mean_1.tmp_0@GRAD': 'mean_1.tmp_0', + "fill_constant_3.tmp_0@GRAD": "fill_constant_3.tmp_0", + 'elementwise_mul_1@GRAD': 'elementwise_mul_1', + 'elementwise_add_1@GRAD': 'elementwise_add_1', + 'z@GRAD': 'z', + 'x@GRAD': 'x', + 'y@GRAD': 'y', + } + + grad_var_to_var_map = get_new_ir_grad_var_to_var_map( + param_mappings, old_ir_grad_var_to_var_map + ) + + # get global outputs and grads info; when a decomposed op produces a global output or grad, update the corresponding global list + ( +
fwd_leaf_ops, + fwd_leaf_ops_output_indexes, + ) = decomp.get_leaf_ops( + newir_program.global_block(), global_outputs + ) # without update during execution + ( + bwd_leaf_ops, + bwd_leaf_ops_output_indexes, + ) = decomp.get_leaf_ops( + newir_program.global_block(), global_grads + ) + + decompose_bwd_ops_names = [ + "pd_op.layer_norm_grad", + "pd_op.mean_grad", + "pd_op.add_grad", + "pd_op.multiply_grad", + ] + for bwd_op in newir_ops: + if ( + flag == "decompose" + and bwd_op.name() in decompose_bwd_ops_names + ): + fwd_op = get_fwd_op(bwd_op, grad_var_to_var_map) + assert fwd_op is not None, "fwd_op is None" + fwd_inputs = tuple( + x.source() for x in fwd_op.operands() + ) + fwd_outputs = tuple(fwd_op.results()) + + # if bwd_op has custom_vjp rule, then decompose bwd_op firstly and decompose fwd_op secondly + if core.has_custom_vjp(fwd_op): + bwd_leaf_op_index = ( + bwd_leaf_ops.index(bwd_op) + if bwd_op in bwd_leaf_ops + else None + ) + new_grads = decomp.decompose_bwd_op( + newir_program.global_block(), + bwd_op, + grad_var_to_var_map, + fwd_outputs, + fwd_inputs, + ) + if bwd_leaf_op_index is not None: + decomp.replace_graph_outputs( + global_grads, + new_grads, + bwd_leaf_op_index, + bwd_leaf_ops_output_indexes, + ) + + fwd_leaf_op_index = ( + fwd_leaf_ops.index(fwd_op) + if fwd_op in fwd_leaf_ops + else None + ) + new_fwd_outputs = decomp.decompose_fwd_op( + newir_program.global_block(), + fwd_op, + grad_var_to_var_map, + ) + if fwd_leaf_op_index is not None: + decomp.replace_graph_outputs( + global_outputs, + new_fwd_outputs, + fwd_leaf_op_index, + fwd_leaf_ops_output_indexes, + ) + + # if bwd_op has no custom_vjp rule, then decompose fwd_op into a set of primitive ops firstly and decompose bwd_op secondly + else: + fwd_leaf_op_index = ( + fwd_leaf_ops.index(fwd_op) + if fwd_op in fwd_leaf_ops + else None + ) + new_fwd_outputs = decomp.decompose_fwd_op( + newir_program.global_block(), + fwd_op, + grad_var_to_var_map, + ) + if fwd_leaf_op_index is not None: + decomp.replace_graph_outputs( + global_outputs, + new_fwd_outputs, + fwd_leaf_op_index, + fwd_leaf_ops_output_indexes, + ) + + bwd_leaf_op_index = ( + bwd_leaf_ops.index(bwd_op) + if bwd_op in bwd_leaf_ops + else None + ) + new_grads = decomp.decompose_bwd_op( + newir_program.global_block(), + bwd_op, + grad_var_to_var_map, + new_fwd_outputs, + fwd_inputs, + ) + if bwd_leaf_op_index is not None: + decomp.replace_graph_outputs( + global_grads, + new_grads, + bwd_leaf_op_index, + bwd_leaf_ops_output_indexes, + ) + + # execution + exe = paddle.static.Executor() + outs = exe.run( + newir_program, + feed={'x': self.x, 'y': self.y, 'z': self.z}, + fetch_list=[ + global_outputs[0], + global_grads[0], + global_grads[1], + global_grads[2], + ], + ) + core._set_prim_backward_enabled(False) + core._set_prim_forward_enabled(False) + + return outs + + def test_decompose_layer_norm_op(self): + res_ref = self.net() + res = self.net("decompose") + for ref, actual in zip(res_ref, res): + np.testing.assert_allclose(ref, actual, atol=1e-4) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/prim/pir_prim/test_pir_prim_flags.py b/test/prim/pir_prim/test_pir_prim_flags.py index 4bee4da74a4d12..d523aa33f1e3b1 100644 --- a/test/prim/pir_prim/test_pir_prim_flags.py +++ b/test/prim/pir_prim/test_pir_prim_flags.py @@ -102,7 +102,7 @@ def train(self): x = paddle.randn([2, 4]) x.stop_gradient = False net = PrimeNet() - net = paddle.jit.to_static(net) + net.forward = paddle.jit.to_static(full_graph=True)(net.forward) out = net(x) loss = 
paddle.mean(out) loss.backward() diff --git a/test/prim/pir_prim/test_prim_jit.py b/test/prim/pir_prim/test_prim_jit.py index 72958eff9a1d7b..0d9f7c4118783b 100644 --- a/test/prim/pir_prim/test_prim_jit.py +++ b/test/prim/pir_prim/test_prim_jit.py @@ -30,7 +30,7 @@ def func(x): return out # ==== dygraph computation ==== - static_func = paddle.jit.to_static(func) + static_func = paddle.jit.to_static(func, full_graph=True) x = paddle.randn((8, 16, 64)) x.stop_gradient = False ref_out = func(x) * 2 diff --git a/test/prim/pir_prim/test_sink_decomp.py b/test/prim/pir_prim/test_sink_decomp.py new file mode 100644 index 00000000000000..d1a14987123ee9 --- /dev/null +++ b/test/prim/pir_prim/test_sink_decomp.py @@ -0,0 +1,113 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle.autograd.ir_backward import grad +from paddle.base import core +from paddle.decomposition import decompose + +paddle.enable_static() + + +class TestPrimMode(unittest.TestCase): + def setUp(self): + np.random.seed(2023) + self.shape_x = [8, 16, 32, 64] + self.shape_y = [8, 16, 32, 64] + self.x = np.random.random(self.shape_x).astype("float32") + self.y = np.random.random(self.shape_y).astype("float32") + self.prog = None + + def base_net(self, flag=None): + if flag == "forward": + core._set_prim_forward_enabled(True) + elif flag == "backward": + core._set_prim_backward_enabled(True) + elif flag == "all": + core._set_prim_all_enabled(True) + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program): + x = paddle.static.data('x', self.shape_x, dtype='float32') + y = paddle.static.data('y', self.shape_y, dtype='float32') + x.stop_gradient = False + y.stop_gradient = False + divide_out = paddle.divide(x, y) + sum_out = paddle.mean(divide_out, axis=0) + [new_out] = decompose(main_program, [sum_out]) + gradients = grad(new_out, (x, y)) + + exe = paddle.static.Executor() + [fwd, dx, dy] = exe.run( + feed={'x': self.x, 'y': self.y}, fetch_list=[new_out, gradients] + ) + + whole_ops = [op.name() for op in main_program.global_block().ops] + self.prog = main_program + if flag == "forward": + core._set_prim_forward_enabled(False) + assert ( + 'pd_op.mean' not in whole_ops + and 'pd_op.divide_grad' in whole_ops + ) + elif flag == "backward": + core._set_prim_backward_enabled(False) + assert ( + 'pd_op.mean' in whole_ops + and 'pd_op.divide_grad' not in whole_ops + ) + elif flag == "all": + core._set_prim_all_enabled(False) + assert ( + 'pd_op.mean' not in whole_ops + and 'pd_op.divide_grad' not in whole_ops + ) + else: + assert ( + 'pd_op.mean' in whole_ops and 'pd_op.divide_grad' in whole_ops + ) + return fwd, dx, dy + + def test_prim_forward(self): + res_ref = self.base_net() + res = self.base_net("forward") + for ref, actual in zip(res_ref, res): + np.testing.assert_equal(ref, actual) + + def test_prim_backward(self): + res_ref = self.base_net() + res = self.base_net("backward") 
+ for ref, actual in zip(res_ref, res): + np.testing.assert_allclose(ref, actual, rtol=1e-6) + + def test_prim_all(self): + res_ref = self.base_net() + res = self.base_net("all") + for ref, actual in zip(res_ref, res): + np.testing.assert_allclose(ref, actual, rtol=1e-6) + + def test_has_decomp(self): + _ = self.base_net() + for op in self.prog.global_block().ops: + if op.name() == "pd_op.divide": + self.assertEqual(core.has_decomp(op), False) + if op.name() == "pd_op.mean": + self.assertEqual(core.has_decomp(op), True) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/prim/prim/flags/test_prim_flags.py b/test/prim/prim/flags/test_prim_flags.py index c1164a5e626e47..9f6c84577697c1 100644 --- a/test/prim/prim/flags/test_prim_flags.py +++ b/test/prim/prim/flags/test_prim_flags.py @@ -153,7 +153,7 @@ def train(self): x = paddle.randn([2, 4]) x.stop_gradient = False net = PrimeNet() - net = paddle.jit.to_static(net) + net = paddle.jit.to_static(net, full_graph=True) out = net(x) loss = paddle.mean(out) diff --git a/test/prim/prim/flags/test_prim_flags_case.py b/test/prim/prim/flags/test_prim_flags_case.py index 126c15de81fe2f..a5657326838211 100644 --- a/test/prim/prim/flags/test_prim_flags_case.py +++ b/test/prim/prim/flags/test_prim_flags_case.py @@ -23,7 +23,9 @@ def apply_to_static(net, use_cinn): build_strategy = paddle.static.BuildStrategy() build_strategy.build_cinn_pass = use_cinn - return paddle.jit.to_static(net, build_strategy=build_strategy) + return paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) class PrimeNet(paddle.nn.Layer): diff --git a/test/prim/process/test_check_inputs.py b/test/prim/process/test_check_inputs.py index 631da96cc8b237..b844f52ea81d80 100644 --- a/test/prim/process/test_check_inputs.py +++ b/test/prim/process/test_check_inputs.py @@ -43,7 +43,7 @@ def test_error_input(self): np_data = np.random.random([3, 4]).astype("float32") tensor_data = paddle.to_tensor(np_data) shape = paddle.to_tensor([2, 3, 4]) - net = paddle.jit.to_static(fn) + net = paddle.jit.to_static(fn, full_graph=True) with self.assertRaises(NotImplementedError): _ = net(tensor_data, shape).numpy() core._set_prim_all_enabled(False) diff --git a/test/prim/test_comp_custom_vjp.py b/test/prim/test_comp_custom_vjp.py index fb62fe80202a40..40638bc579cf94 100644 --- a/test/prim/test_comp_custom_vjp.py +++ b/test/prim/test_comp_custom_vjp.py @@ -70,7 +70,7 @@ def test_enable_prim_fwd(self): self.ops_fwd_enable_bwd_disable, tuple( op.type - for op in paddle.jit.to_static(self.f) + for op in paddle.jit.to_static(full_graph=True)(self.f) .get_concrete_program()[1] ._train_program.block(0) .ops @@ -86,7 +86,7 @@ def test_enable_prim_bwd(self): self.ops_fwd_disable_bwd_enable, tuple( op.type - for op in paddle.jit.to_static(self.f) + for op in paddle.jit.to_static(full_graph=True)(self.f) .get_concrete_program()[1] ._train_program.block(0) .ops @@ -101,7 +101,7 @@ def test_enable_prim_all(self): self.ops_all_enable, tuple( op.type - for op in paddle.jit.to_static(self.f) + for op in paddle.jit.to_static(full_graph=True)(self.f) .get_concrete_program()[1] ._train_program.block(0) .ops diff --git a/test/prim/test_comp_dispensable.py b/test/prim/test_comp_dispensable.py index be76ce92ce7f03..9c7d10b645d5e4 100644 --- a/test/prim/test_comp_dispensable.py +++ b/test/prim/test_comp_dispensable.py @@ -25,11 +25,10 @@ def tearDown(self): paddle.base.core._set_prim_all_enabled(False) def test_dispensable(self): - @paddle.jit.to_static def f(x): return paddle.split(x, 
num_or_sections=2) - f = paddle.jit.to_static(f) + f = paddle.jit.to_static(full_graph=True)(f) x = paddle.rand((8,)) x.stop_gradient = False diff --git a/test/quantization/test_weight_only_linear.py b/test/quantization/test_weight_only_linear.py index 3e40f4d64d36ac..dcda3d3e4c72ef 100644 --- a/test/quantization/test_weight_only_linear.py +++ b/test/quantization/test_weight_only_linear.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import os import re import struct @@ -333,5 +334,39 @@ def config(self): self.static = True +@unittest.skipIf( + not core.is_compiled_with_cuda() + or get_cuda_version() < 11020 + or paddle.device.cuda.get_device_capability()[0] < 8, + "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", +) +class WeightOnlyLinearBackwardAndWeightDequantizeTestCase(unittest.TestCase): + def test_weightonly_linear_backward(self): + x = paddle.rand(shape=(128, 4096), dtype='float16') + x.stop_gradient = False + quant_x = copy.deepcopy(x) + quant_x.stop_gradient = False + weight = paddle.rand(shape=(4096, 12288), dtype='float16') + + quant_weight, quant_scale = Q.weight_quantize( + x=weight, algo='weight_only_int8' + ) + dequant_weight = Q.weight_dequantize(quant_weight, quant_scale) + np.testing.assert_allclose(weight, dequant_weight, rtol=1e-2, atol=1e-2) + + quant_out = Q.weight_only_linear( + x=quant_x, + weight=quant_weight, + weight_scale=quant_scale, + weight_dtype="int8", + ) + out = paddle.matmul(x=x, y=weight) + np.testing.assert_allclose(quant_out, out, rtol=1e-3, atol=1e-3) + + quant_out.backward() + out.backward() + np.testing.assert_allclose(quant_x.grad, x.grad, rtol=1e-3, atol=1e-3) + + if __name__ == '__main__': unittest.main() diff --git a/test/rnn/rnn_numpy.py b/test/rnn/rnn_numpy.py index f303b460cdb9d2..b925670910f5fd 100644 --- a/test/rnn/rnn_numpy.py +++ b/test/rnn/rnn_numpy.py @@ -38,12 +38,14 @@ def __init__( self, input_size, hidden_size, + weight=True, bias=True, nonlinearity="RNN_TANH", dtype="float64", ): self.input_size = input_size self.hidden_size = hidden_size + self.weight = weight self.bias = bias if nonlinearity == 'RNN_TANH': self.nonlinearity = np.tanh @@ -52,12 +54,16 @@ def __init__( self.parameters = {} std = 1.0 / math.sqrt(hidden_size) - self.weight_ih = np.random.uniform( - -std, std, (hidden_size, input_size) - ).astype(dtype) - self.weight_hh = np.random.uniform( - -std, std, (hidden_size, hidden_size) - ).astype(dtype) + if weight: + self.weight_ih = np.random.uniform( + -std, std, (hidden_size, input_size) + ).astype(dtype) + self.weight_hh = np.random.uniform( + -std, std, (hidden_size, hidden_size) + ).astype(dtype) + else: + self.weight_ih = np.ones((hidden_size, input_size)).astype(dtype) + self.weight_hh = np.ones((hidden_size, hidden_size)).astype(dtype) self.parameters['weight_ih'] = self.weight_ih self.parameters['weight_hh'] = self.weight_hh if bias: @@ -67,11 +73,11 @@ def __init__( self.bias_hh = np.random.uniform(-std, std, (hidden_size,)).astype( dtype ) - self.parameters['bias_ih'] = self.bias_ih - self.parameters['bias_hh'] = self.bias_hh else: - self.bias_ih = None - self.bias_hh = None + self.bias_ih = np.zeros(hidden_size).astype(dtype) + self.bias_hh = np.zeros(hidden_size).astype(dtype) + self.parameters['bias_ih'] = self.bias_ih + self.parameters['bias_hh'] = self.bias_hh def init_state(self, inputs, batch_dim_index=0): batch_size = inputs.shape[batch_dim_index] @@ -92,18 +98,29 @@ def forward(self, inputs, hx=None): 
class GRUCell(LayerMixin): - def __init__(self, input_size, hidden_size, bias=True, dtype="float64"): + def __init__( + self, input_size, hidden_size, weight=True, bias=True, dtype="float64" + ): self.input_size = input_size self.hidden_size = hidden_size + self.weight = weight self.bias = bias self.parameters = {} std = 1.0 / math.sqrt(hidden_size) - self.weight_ih = np.random.uniform( - -std, std, (3 * hidden_size, input_size) - ).astype(dtype) - self.weight_hh = np.random.uniform( - -std, std, (3 * hidden_size, hidden_size) - ).astype(dtype) + if weight: + self.weight_ih = np.random.uniform( + -std, std, (3 * hidden_size, input_size) + ).astype(dtype) + self.weight_hh = np.random.uniform( + -std, std, (3 * hidden_size, hidden_size) + ).astype(dtype) + else: + self.weight_ih = np.ones((3 * hidden_size, input_size)).astype( + dtype + ) + self.weight_hh = np.ones((3 * hidden_size, hidden_size)).astype( + dtype + ) self.parameters['weight_ih'] = self.weight_ih self.parameters['weight_hh'] = self.weight_hh if bias: @@ -113,11 +130,11 @@ def __init__(self, input_size, hidden_size, bias=True, dtype="float64"): self.bias_hh = np.random.uniform( -std, std, (3 * hidden_size) ).astype(dtype) - self.parameters['bias_ih'] = self.bias_ih - self.parameters['bias_hh'] = self.bias_hh else: - self.bias_ih = None - self.bias_hh = None + self.bias_ih = np.zeros(3 * hidden_size).astype(dtype) + self.bias_hh = np.zeros(3 * hidden_size).astype(dtype) + self.parameters['bias_ih'] = self.bias_ih + self.parameters['bias_hh'] = self.bias_hh def init_state(self, inputs, batch_dim_index=0): batch_size = inputs.shape[batch_dim_index] @@ -144,18 +161,29 @@ def forward(self, inputs, hx=None): class LSTMCell(LayerMixin): - def __init__(self, input_size, hidden_size, bias=True, dtype="float64"): + def __init__( + self, input_size, hidden_size, weight=True, bias=True, dtype="float64" + ): self.input_size = input_size self.hidden_size = hidden_size + self.weight = weight self.bias = bias self.parameters = {} std = 1.0 / math.sqrt(hidden_size) - self.weight_ih = np.random.uniform( - -std, std, (4 * hidden_size, input_size) - ).astype(dtype) - self.weight_hh = np.random.uniform( - -std, std, (4 * hidden_size, hidden_size) - ).astype(dtype) + if weight: + self.weight_ih = np.random.uniform( + -std, std, (4 * hidden_size, input_size) + ).astype(dtype) + self.weight_hh = np.random.uniform( + -std, std, (4 * hidden_size, hidden_size) + ).astype(dtype) + else: + self.weight_ih = np.ones((4 * hidden_size, input_size)).astype( + dtype + ) + self.weight_hh = np.ones((4 * hidden_size, hidden_size)).astype( + dtype + ) self.parameters['weight_ih'] = self.weight_ih self.parameters['weight_hh'] = self.weight_hh if bias: @@ -165,11 +193,11 @@ def __init__(self, input_size, hidden_size, bias=True, dtype="float64"): self.bias_hh = np.random.uniform( -std, std, (4 * hidden_size) ).astype(dtype) - self.parameters['bias_ih'] = self.bias_ih - self.parameters['bias_hh'] = self.bias_hh else: - self.bias_ih = None - self.bias_hh = None + self.bias_ih = np.zeros(4 * hidden_size).astype(dtype) + self.bias_hh = np.zeros(4 * hidden_size).astype(dtype) + self.parameters['bias_ih'] = self.bias_ih + self.parameters['bias_hh'] = self.bias_hh def init_state(self, inputs, batch_dim_index=0): batch_size = inputs.shape[batch_dim_index] diff --git a/test/rnn/test_rnn_cells.py b/test/rnn/test_rnn_cells.py index 4bb6f49963f843..4b055fcf45d73a 100644 --- a/test/rnn/test_rnn_cells.py +++ b/test/rnn/test_rnn_cells.py @@ -24,8 +24,9 @@ class 
TestSimpleRNNCell(unittest.TestCase): - def __init__(self, bias=True, place="cpu"): + def __init__(self, weight=True, bias=True, place="cpu"): super().__init__(methodName="runTest") + self.weight = weight self.bias = bias self.place = ( paddle.CPUPlace() if place == "cpu" else paddle.CUDAPlace(0) @@ -33,9 +34,14 @@ def __init__(self, bias=True, place="cpu"): def setUp(self): paddle.disable_static(self.place) - rnn1 = SimpleRNNCell(16, 32, bias=self.bias) + rnn1 = SimpleRNNCell(16, 32, weight=self.weight, bias=self.bias) rnn2 = paddle.nn.SimpleRNNCell( - 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias + 16, + 32, + weight_ih_attr=self.weight, + weight_hh_attr=self.weight, + bias_ih_attr=self.bias, + bias_hh_attr=self.bias, ) convert_params_for_cell(rnn1, rnn2) @@ -76,8 +82,9 @@ def runTest(self): class TestGRUCell(unittest.TestCase): - def __init__(self, bias=True, place="cpu"): + def __init__(self, weight=True, bias=True, place="cpu"): super().__init__(methodName="runTest") + self.weight = weight self.bias = bias self.place = ( paddle.CPUPlace() if place == "cpu" else paddle.CUDAPlace(0) @@ -85,9 +92,14 @@ def __init__(self, bias=True, place="cpu"): def setUp(self): paddle.disable_static(self.place) - rnn1 = GRUCell(16, 32, bias=self.bias) + rnn1 = GRUCell(16, 32, weight=self.weight, bias=self.bias) rnn2 = paddle.nn.GRUCell( - 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias + 16, + 32, + weight_ih_attr=self.weight, + weight_hh_attr=self.weight, + bias_ih_attr=self.bias, + bias_hh_attr=self.bias, ) convert_params_for_cell(rnn1, rnn2) @@ -128,17 +140,23 @@ def runTest(self): class TestLSTMCell(unittest.TestCase): - def __init__(self, bias=True, place="cpu"): + def __init__(self, weight=True, bias=True, place="cpu"): super().__init__(methodName="runTest") + self.weight = weight self.bias = bias self.place = ( paddle.CPUPlace() if place == "cpu" else paddle.CUDAPlace(0) ) def setUp(self): - rnn1 = LSTMCell(16, 32, bias=self.bias) + rnn1 = LSTMCell(16, 32, weight=self.weight, bias=self.bias) rnn2 = paddle.nn.LSTMCell( - 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias + 16, + 32, + weight_ih_attr=self.weight, + weight_hh_attr=self.weight, + bias_ih_attr=self.bias, + bias_hh_attr=self.bias, ) convert_params_for_cell(rnn1, rnn2) @@ -187,8 +205,13 @@ def runTest(self): def load_tests(loader, tests, pattern): suite = unittest.TestSuite() devices = ["cpu", "gpu"] if paddle.base.is_compiled_with_cuda() else ["cpu"] - for bias in [True, False]: - for device in devices: - for test_class in [TestSimpleRNNCell, TestGRUCell, TestLSTMCell]: - suite.addTest(test_class(bias, device)) + for weight in [True, False]: + for bias in [True, False]: + for device in devices: + for test_class in [ + TestSimpleRNNCell, + TestGRUCell, + TestLSTMCell, + ]: + suite.addTest(test_class(weight, bias, device)) return suite diff --git a/test/sequence/test_sequence_topk_avg_pooling.py b/test/sequence/test_sequence_topk_avg_pooling.py deleted file mode 100644 index 470b3029ab9eda..00000000000000 --- a/test/sequence/test_sequence_topk_avg_pooling.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest -from copy import deepcopy - -import numpy as np -from op_test import OpTest - - -class TestSequenceTopkAvgPoolingOp(OpTest): - def setUp(self): - self.init_op_type() - self.set_data() - self.compute() - - def init_op_type(self): - self.op_type = "sequence_topk_avg_pooling" - - def set_data(self): - topks = [1, 3, 5] - channel_num = 3 - dim = 10 - row = [30, 45] - col = [25, 36] - self.init_data(topks, channel_num, row, col, dim) - self.init_data(topks, channel_num, row, col, dim) - - def init_data(self, topks, channel_num, row, col, dim=10): - self.attrs = {"topks": topks, "channel_num": channel_num} - feature = [row[i] * col[i] for i in range(len(row))] - numel = sum(feature) * channel_num - x_data = np.arange(numel).astype('float32') - x_lod = [[x * channel_num for x in feature]] - row_data = np.random.random((sum(row), dim)).astype('float32') - col_data = np.random.random((sum(col), dim)).astype('float32') - self.inputs = { - 'X': (x_data, x_lod), - 'ROW': (row_data, [row]), - 'COLUMN': (col_data, [col]), - } - - def calc_gradient(self, pos_data, topks, channel_num, row, col): - max_k = topks[-1] - pos_data = pos_data.flatten() - in_numel = sum([row[i] * col[i] for i in range(len(row))]) * channel_num - out_numel = sum(row) * len(topks) * channel_num - gradient = np.zeros(shape=(in_numel), dtype="float32") - dout_val = 1.0 / out_numel - pos_offset, in_offset = 0, 0 - for bs_idx in range(len(row)): # batch - row_size = row[bs_idx] - col_size = col[bs_idx] - for ch in range(channel_num): # channel - for row_idx in range(row_size): # row - in_idx = in_offset + row_idx * col_size - pos_idx = pos_offset + row_idx * max_k - for k_idx in range(len(topks)): - for k in range(topks[k_idx]): - if pos_data[pos_idx + k] != -1: - gradient[in_idx + pos_data[pos_idx + k]] += ( - dout_val / topks[k_idx] - ) - in_offset += row_size * col_size - pos_offset += row_size * max_k - return gradient - - def compute(self): - topks = self.attrs['topks'] - max_k = topks[-1] - x_data, x_lod = self.inputs['X'] - row_data, row_lod = self.inputs['ROW'] - col_data, col_lod = self.inputs['COLUMN'] - channel_num = self.attrs['channel_num'] - out = np.zeros((0, len(topks) * channel_num), dtype=x_data.dtype) - pos = np.zeros((0,), dtype='int32') - out_lod = deepcopy(row_lod) - - offset = 0 - for idx in range(len(x_lod[0])): - x_len = x_lod[0][idx] - self.assertTrue( - x_len == channel_num * row_lod[0][idx] * col_lod[0][idx], - f"x_len: {x_len} can't mod channel_num: {channel_num}", - ) - out_tmp = np.zeros((0,), dtype=x_data.dtype) - pos_tmp = np.zeros((0,), dtype='int32') - for ch in range(channel_num): - for r_id in range(row_lod[0][idx]): - x_sub = x_data[offset : (offset + col_lod[0][idx])] - topk_val, topk_pos = self.get_topk(x_sub, max_k) - sum_data = self.topk_sum(topk_val, topk_pos, max_k) - new_feature = np.array( - [sum_data[topk] / topk for topk in topks] - ) - out_tmp = np.hstack((out_tmp, new_feature)) - pos_tmp = np.hstack((pos_tmp, topk_pos)) - - offset += col_lod[0][idx] - - out_tmp = out_tmp.reshape([channel_num, -1, len(topks)]).transpose( - 1, 0, 2 - ) - pos_tmp = 
pos_tmp.reshape([channel_num, -1, max_k]).transpose( - 1, 0, 2 - ) - out = np.vstack( - (out, out_tmp.reshape([-1, len(topks) * channel_num])) - ) - pos = np.hstack((pos, pos_tmp.flatten())) - - self.outputs = {'Out': (out.astype('float32'), out_lod), 'pos': pos} - self.gradient = self.calc_gradient( - pos, topks, channel_num, row_lod[0], col_lod[0] - ) - - def get_topk(self, x, topk): - real_topk = topk if topk < len(x) else len(x) - topk_pos = np.array(x).argsort()[-topk:][::-1] - topk_val = np.array(x)[topk_pos] - if real_topk < topk: - topk_pos = np.hstack((topk_pos, np.full((topk - real_topk,), -1))) - topk_val = np.hstack((topk_val, np.full((topk - real_topk,), 0.0))) - - return topk_val, topk_pos - - def topk_sum(self, x, pos, max_k): - sum_data = [0.0] * (max_k + 1) - for i in range(1, max_k + 1): - if pos[i - 1] == -1: - sum_data[i] = sum_data[i - 1] - else: - sum_data[i] = sum_data[i - 1] + x[i - 1] - return sum_data - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out', user_defined_grads=[self.gradient]) - - -class TestSequenceTopkAvgPoolingOpCase1(TestSequenceTopkAvgPoolingOp): - def set_data(self): - topks = [2, 3] - channel_num = 5 - dim = 10 - row = [36] - col = [48] - self.init_data(topks, channel_num, row, col, dim) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/sot/extract_errors.py b/test/sot/extract_errors.py new file mode 100644 index 00000000000000..b9d9e505724ef0 --- /dev/null +++ b/test/sot/extract_errors.py @@ -0,0 +1,30 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import sys + +runtime_error_msg = sys.stdin.read() + +pattern = r'File "?(.*?)"?, line (\d+),.*\n(.*?)\n(.*?)$' +for match in re.finditer(pattern, runtime_error_msg, re.MULTILINE): + file = match.group(1) + if file.startswith("./"): + file = f"tests/{file[2:]}" + line = match.group(2) + error_info = match.group(4) + if "AssertionError" not in error_info: + # error_info = match.group(3) + '\n' + match.group(4) + output = f"::error file={file},line={line}::Error" + print(output) diff --git a/test/sot/test_01_basic.py b/test/sot/test_01_basic.py new file mode 100644 index 00000000000000..8a03ea9fd3ae5a --- /dev/null +++ b/test/sot/test_01_basic.py @@ -0,0 +1,55 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
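+# Minimal SOT executor checks: `foo` exercises plain LOAD_FAST/BINARY_ADD on an +# int and a Tensor; `numpy_add` runs under strict_mode_guard(0) since its numpy +# round-trip is expected to fall back outside the captured graph.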
+ +import unittest + +from test_case_base import TestCaseBase, strict_mode_guard + +import paddle + + +def foo(x: int, y: paddle.Tensor): + return x + y + + +class TestExecutor(TestCaseBase): + def test_simple(self): + self.assert_results(foo, 1, paddle.to_tensor(2)) + + +def numpy_add(x, y): + out = paddle.to_tensor(x.numpy() + y.numpy()) + return out + + +class TestNumpyAdd(TestCaseBase): + @strict_mode_guard(0) + def test_numpy_add(self): + x = paddle.to_tensor([2]) + y = paddle.to_tensor([3]) + self.assert_results(numpy_add, x, y) + + +if __name__ == "__main__": + unittest.main() + + +# Instructions: +# LOAD_FAST +# BINARY_ADD +# RETURN_VALUE + +# Variables: +# ConstantVariable +# TensorVariable diff --git a/test/sot/test_02_store_inplace.py b/test/sot/test_02_store_inplace.py new file mode 100644 index 00000000000000..3c9b4df4602a05 --- /dev/null +++ b/test/sot/test_02_store_inplace.py @@ -0,0 +1,47 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from test_case_base import TestCaseBase + +import paddle + + +def foo(x: int, y: paddle.Tensor): + x = x + 1 + y = y + 1 + x += y + return x + + +class TestStoreInplace(TestCaseBase): + def test_simple(self): + self.assert_results(foo, 1, paddle.to_tensor(2)) + + +if __name__ == "__main__": + unittest.main() + + +# Instructions: +# LOAD_FAST +# BINARY_ADD +# STORE_FAST (new) +# INPLACE_ADD (new) +# RETURN_VALUE + +# Variables: +# ConstantVariable +# TensorVariable diff --git a/test/sot/test_03_tuple.py b/test/sot/test_03_tuple.py new file mode 100644 index 00000000000000..797d54384714d0 --- /dev/null +++ b/test/sot/test_03_tuple.py @@ -0,0 +1,91 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
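+# Covers tuple construction, integer/slice subscripting, and the count()/index() +# methods; cases decorated with @check_no_breakgraph are expected to compile +# without any graph break.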
+ +# New Supported Instructions: +# BUILD_TUPLE +# BINARY_SUBSCR + +from __future__ import annotations + +import unittest + +from test_case_base import TestCaseBase + +import paddle +from paddle.jit.sot.psdb import check_no_breakgraph + + +@check_no_breakgraph +def build_tuple(x: int, y: paddle.Tensor): + x = (x, y) + return x[1] + 1 + + +@check_no_breakgraph +def build_tuple_with_slice_subscript(x: int, y: paddle.Tensor): + z = (x, y, 3, 4) + return z[0:5:1] + + +@check_no_breakgraph +def build_tuple_with_int_subscript(x: int, y: paddle.Tensor): + z = (x, y) + return z[0] + + +@check_no_breakgraph +def tuple_count_int(x: int, y: paddle.Tensor): + z = (x, x, 2, 1) + return z.count(x) + + +def tuple_count_tensor(x: paddle.Tensor, y: tuple[paddle.Tensor]): + return y.count(x) + + +@check_no_breakgraph +def tuple_index_int(x: int, y: paddle.Tensor): + z = (x, y, x, y, y) + return z.index(x) + + +def tuple_index_tensor(x: paddle.Tensor, y: tuple[paddle.Tensor]): + return y.index(x) + + +class TestBuildTuple(TestCaseBase): + def test_build_tuple(self): + self.assert_results(build_tuple, 1, paddle.to_tensor(2)) + self.assert_results( + build_tuple_with_slice_subscript, 1, paddle.to_tensor(2) + ) + self.assert_results( + build_tuple_with_int_subscript, 1, paddle.to_tensor(2) + ) + + +class TestTupleMethods(TestCaseBase): + def test_tuple_methods_int(self): + self.assert_results(tuple_count_int, 1, paddle.to_tensor(2)) + self.assert_results(tuple_index_int, 1, paddle.to_tensor(2)) + + def test_tuple_methods_tensor(self): + a = paddle.to_tensor(1) + b = paddle.to_tensor(2) + self.assert_results(tuple_count_tensor, a, (a, b, a, b)) + self.assert_results(tuple_index_tensor, b, (b, b, b, a)) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_04_list.py b/test/sot/test_04_list.py new file mode 100644 index 00000000000000..d8b0823a279c21 --- /dev/null +++ b/test/sot/test_04_list.py @@ -0,0 +1,327 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
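+# Covers list construction and its mutating methods (append/extend/insert/pop/ +# remove/reverse/sort/...); mutations are checked with +# assert_results_with_side_effects so in-place effects are compared as well.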
+
+# New Supported Instructions:
+# BUILD_LIST (new)
+# BINARY_SUBSCR
+# DELETE_SUBSCR
+
+from __future__ import annotations
+
+import unittest
+
+from test_case_base import TestCaseBase
+
+import paddle
+from paddle.jit.sot.psdb import check_no_breakgraph
+
+
+@check_no_breakgraph
+def list_getitem_int(x: int, y: paddle.Tensor):
+    x = [x, y]
+    return x[0] + 1
+
+
+@check_no_breakgraph
+def list_getitem_tensor(x: int, y: paddle.Tensor):
+    x = [x, y]
+    return x[1] + 1
+
+
+@check_no_breakgraph
+def list_setitem_int(x: int, y: paddle.Tensor):
+    z = [x, y]
+    z[0] = 3
+    return z
+
+
+def list_setitem_tensor(x: int, y: paddle.Tensor):
+    z = [x, y]
+    z[1] = paddle.to_tensor(3)
+    return z
+
+
+@check_no_breakgraph
+def list_delitem_int(x: int, y: paddle.Tensor):
+    z = [x, y]
+    del z[0]
+    return z
+
+
+@check_no_breakgraph
+def list_delitem_tensor(x: int, y: paddle.Tensor):
+    z = [x, y]
+    del z[1]
+    return z
+
+
+@check_no_breakgraph
+def list_construct_from_list(x: int, y: paddle.Tensor):
+    z = [x, y]
+    return z
+
+
+@check_no_breakgraph
+def list_append_int(x: int, y: paddle.Tensor):
+    z = [x, y]
+    z.append(3)
+    return z
+
+
+@check_no_breakgraph
+def list_append_tensor(x: int, y: paddle.Tensor):
+    z = [x, y]
+    z.append(y)
+    return z
+
+
+@check_no_breakgraph
+def list_clear(x: int, y: paddle.Tensor):
+    z = [x, y]
+    z.clear()
+    return z
+
+
+@check_no_breakgraph
+def list_copy(x: int, y: paddle.Tensor):
+    z = [x, y]
+    a = z.copy()
+    z[0] = 3
+    z[1] = y + 1
+    return (a, z)
+
+
+@check_no_breakgraph
+def list_count_int(x: int, y: paddle.Tensor):
+    z = [x, x, 2, 3, 1]
+    return z.count(x)
+
+
+def list_count_tensor(x: paddle.Tensor, y: list[paddle.Tensor]):
+    return y.count(x)
+
+
+@check_no_breakgraph
+def list_extend(x: int, y: paddle.Tensor):
+    z = [x, y]
+    a = [y, x]
+    b = (x, y)
+    z.extend(a)
+    z.extend(b)
+    return z
+
+
+@check_no_breakgraph
+def list_index_int(x: int, y: paddle.Tensor):
+    z = [x, x, 1, 2]
+    return z.index(x)
+
+
+def list_index_tensor(x: paddle.Tensor, y: list[paddle.Tensor]):
+    return y.index(x)
+
+
+@check_no_breakgraph
+def list_insert(x: int, y: paddle.Tensor):
+    z = [x, y]
+    z.insert(0, x)
+    z.insert(3, y)
+    return z
+
+
+@check_no_breakgraph
+def list_pop(x: int, y: paddle.Tensor):
+    z = [x, y]
+    a = z.pop()
+    b = z.pop()
+    return (z, a, b)
+
+
+@check_no_breakgraph
+def list_remove(x: int, y: paddle.Tensor):
+    z = [x, x, y, y]
+    z.remove(x)
+    z.remove(y)
+    return z
+
+
+@check_no_breakgraph
+def list_reverse(x: int, y: paddle.Tensor):
+    z = [x, x, y, y]
+    z.reverse()
+    return z
+
+
+@check_no_breakgraph
+def list_default_sort(x: int, y: paddle.Tensor):
+    z = [x + 2, x, x + 1]
+    z.sort()
+    return z
+
+
+@check_no_breakgraph
+def list_key_sort(x: int, y: paddle.Tensor):
+    z = [x + 2, x, x + 1]
+    z.sort(key=lambda x: x)
+    return z
+
+
+@check_no_breakgraph
+def list_reverse_sort(x: int, y: paddle.Tensor):
+    z = [x + 2, x, x + 1]
+    z.sort(reverse=True)
+    return z
+
+
+@check_no_breakgraph
+def list_tensor_sort(x: int, y: paddle.Tensor):
+    z = [y + 2, y, y + 1]
+    z.sort()
+    return z
+
+
+@check_no_breakgraph
+def list_max(x: paddle.Tensor | int, y: paddle.Tensor | int):
+    z = [x, x, y]
+    return max(z)
+
+
+@check_no_breakgraph
+def list_tensor_max_api(x: paddle.Tensor):
+    return x.max()
+
+
+@check_no_breakgraph
+def list_min(x: paddle.Tensor | int, y: paddle.Tensor | int):
+    z = [x, x, y]
+    return min(z)
+
+
+@check_no_breakgraph
+def list_tensor_min_api(x: paddle.Tensor):
+    return x.min()
+
+
+@check_no_breakgraph
+def list_no_arguments():
+    l1 = list()  # noqa: C408
+    l1.append(1)
+    l2 = list()  # noqa: C408
+    l2.append(2)
+    return l1[0] + l2[0]
+
+
+class TestListBasic(TestCaseBase):
+    def test_list_basic(self):
+        self.assert_results(list_getitem_int, 1, paddle.to_tensor(2))
+        self.assert_results(list_getitem_tensor, 1, paddle.to_tensor(2))
+        self.assert_results_with_side_effects(
+            list_setitem_int, 1, paddle.to_tensor(2)
+        )
+
+
+class TestListMethods(TestCaseBase):
+    def test_list_setitem(self):
+        self.assert_results_with_side_effects(
+            list_setitem_tensor, 1, paddle.to_tensor(2)
+        )
+
+    def test_list_count_and_index(self):
+        self.assert_results(list_count_int, 1, paddle.to_tensor(2))
+        self.assert_results(list_index_int, 1, paddle.to_tensor(2))
+        a = paddle.to_tensor(1)
+        b = paddle.to_tensor(2)
+        self.assert_results(list_count_tensor, a, [a, b, a, b, a, b])
+        self.assert_results(list_index_tensor, b, [a, b, a, b, a, b])
+
+    def test_list_delitem(self):
+        self.assert_results_with_side_effects(
+            list_delitem_int, 1, paddle.to_tensor(2)
+        )
+        self.assert_results_with_side_effects(
+            list_delitem_tensor, 1, paddle.to_tensor(2)
+        )
+
+    def test_list_append(self):
+        self.assert_results_with_side_effects(
+            list_append_int, 1, paddle.to_tensor(2)
+        )
+        self.assert_results_with_side_effects(
+            list_append_tensor, 1, paddle.to_tensor(2)
+        )
+
+    def test_list_clear(self):
+        self.assert_results_with_side_effects(
+            list_clear, 1, paddle.to_tensor(2)
+        )
+
+    def test_list_copy(self):
+        self.assert_results_with_side_effects(list_copy, 1, paddle.to_tensor(2))
+
+    def test_list_extend(self):
+        self.assert_results_with_side_effects(
+            list_extend, 1, paddle.to_tensor(2)
+        )
+
+    def test_list_insert(self):
+        self.assert_results_with_side_effects(
+            list_insert, 1, paddle.to_tensor(2)
+        )
+
+    def test_list_pop(self):
+        self.assert_results_with_side_effects(list_pop, 1, paddle.to_tensor(2))
+
+    def test_list_remove(self):
+        self.assert_results_with_side_effects(
+            list_remove, 1, paddle.to_tensor(2)
+        )
+
+    def test_list_reverse(self):
+        self.assert_results_with_side_effects(
+            list_reverse, 1, paddle.to_tensor(2)
+        )
+        self.assert_results_with_side_effects(
+            list_reverse, 1, paddle.to_tensor(2)
+        )
+
+    def test_list_sort(self):
+        self.assert_results_with_side_effects(
+            list_default_sort, 1, paddle.to_tensor(2)
+        )
+        # TODO: Not currently supported
+        # self.assert_results_with_side_effects(
+        #     list_tensor_sort, 1, paddle.to_tensor(2)
+        # )
+        # self.assert_results_with_side_effects(
+        #     list_key_sort, 1, paddle.to_tensor(2)
+        # )
+        # self.assert_results_with_side_effects(
+        #     list_reverse_sort, 1, paddle.to_tensor(2)
+        # )
+
+    def test_list_construct_from_list(self):
+        self.assert_results(list_construct_from_list, 1, paddle.to_tensor(2))
+
+    def test_list_max_min(self):
+        self.assert_results(list_max, 1, 2)
+        self.assert_results(list_min, 1, 2)
+        self.assert_results(list_tensor_max_api, paddle.to_tensor([1, 2, 3]))
+        self.assert_results(list_tensor_min_api, paddle.to_tensor([1, 2, 3]))
+
+    def test_list_noargs(self):
+        self.assert_results(list_no_arguments)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/sot/test_05_dict.py b/test/sot/test_05_dict.py
new file mode 100644
index 00000000000000..7014a717467984
--- /dev/null
+++ b/test/sot/test_05_dict.py
@@ -0,0 +1,264 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# New Supported Instructions: +# BUILD_MAP (new) +# BUILD_CONST_KEY_MAP (new) + +import unittest + +from test_case_base import TestCaseBase + +import paddle +from paddle.jit.sot.psdb import check_no_breakgraph + + +@check_no_breakgraph +def build_map(x: int, y: paddle.Tensor): + z = {x: y} + return z[x] + 1 + + +@check_no_breakgraph +def build_const_key_map(x: int, y: paddle.Tensor): + z = {1: y, 2: y + 1} + return z[x] + 1 + + +@check_no_breakgraph +def dict_get_item(x: int, y: paddle.Tensor): + z = {1: x, 2: y + 1} + return (z.get(1), z.get(2)) + + +@check_no_breakgraph +def dict_get_item_default(x: int, y: paddle.Tensor): + z = {1: x, 2: y + 1} + return (z.get(3, 2), z.get(4, y)) + + +@check_no_breakgraph +def dict_set_item_int(x: int, y: paddle.Tensor): + z = {1: x, 2: y + 1} + z[1] = x * 2 + return z[1] + + +@check_no_breakgraph +def dict_set_item_tensor(x: int, y: paddle.Tensor): + z = {1: x, 2: y + 1} + z[2] = y + return z[1] + + +@check_no_breakgraph +def dict_update_item1(x: int, y: paddle.Tensor): + z = {1: x, 2: y + 1} + z.update({1: x * 2, 2: y, 3: y + 2}) + return z + + +@check_no_breakgraph +def dict_update_item2(x: int, y: paddle.Tensor): + z = {1: x, 2: y + 1} + z.update({1: x * 2, 2: y, 3: z[2] + 2}) + return z + + +@check_no_breakgraph +def dict_del_item_int(x: int, y: paddle.Tensor): + z = {1: x, 2: y + 1} + del z[1] + return z + + +@check_no_breakgraph +def dict_del_item_tensor(x: int, y: paddle.Tensor): + z = {1: x, 2: y + 1} + del z[2] + return z + + +@check_no_breakgraph +def dict_clear(x: int, y: paddle.Tensor): + z = {1: x, 2: y + 1} + z.clear() + return z + + +@check_no_breakgraph +def dict_copy(x: int, y: paddle.Tensor): + z = {1: x, 2: y + 1} + z2 = z.copy() + z[1] = 2 + return z2 + + +@check_no_breakgraph +def dict_setdefault_int(x: int, y: paddle.Tensor): + z = {1: x, 2: y + 1} + a = z.setdefault(4) + b = z.setdefault(1, 2) + c = z.setdefault(3, 4) + return (z, a, b, c) + + +@check_no_breakgraph +def dict_pop(x: int, y: paddle.Tensor): + z = {1: x, 2: y + 1, 3: y} + a = z.pop(1) + b = z.pop(2, 3) + c = z.pop(4, 3) + d = z.pop(5, y) + return (z, a, b, c, d) + + +@check_no_breakgraph +def dict_popitem(x: int, y: paddle.Tensor): + z = {1: x, 2: y + 1, 3: y} + a = z.popitem() + return (z, a) + + +@check_no_breakgraph +def dict_construct_from_dict(): + x = {1: 2, 3: 4} + d = dict(x) + return d + + +@check_no_breakgraph +def dict_construct_from_list(): + x = [[1, 2], [3, 4]] + d = dict(x) + return d + + +@check_no_breakgraph +def dict_construct_from_tuple(): + x = ((1, 2), (3, 4)) + d = dict(x) + return d + + +@check_no_breakgraph +def dict_construct_from_comprehension(): + z = {1: 2, 3: 4} + d = {k: v + 1 for k, v in z.items()} + return d + + +@check_no_breakgraph +def dict_no_arguments(): + d1 = dict() # noqa: C408 + d1.update({1: 2}) + d2 = dict() # noqa: C408 + d2.update({3: 4}) + return d1[1] + d2[3] + + +@check_no_breakgraph +def dict_test_fromkeys(x): + d = dict.fromkeys(x) + return d + + +@check_no_breakgraph +def dict_test_fromkeys_defalut(x, y): + d = dict.fromkeys(x, y) + return d + + +class TestBuildDict(TestCaseBase): + def 
test_build_map(self): + self.assert_results(build_map, 1, paddle.to_tensor(2)) + + def test_build_const_key_map(self): + self.assert_results(build_const_key_map, 1, paddle.to_tensor(2)) + + +class TestDictMethods(TestCaseBase): + def test_dict_get_item(self): + self.assert_results(dict_get_item, 1, paddle.to_tensor(2)) + self.assert_results(dict_get_item_default, 1, paddle.to_tensor(2)) + + def test_dict_set_item(self): + self.assert_results_with_side_effects( + dict_set_item_int, 1, paddle.to_tensor(2) + ) + self.assert_results_with_side_effects( + dict_set_item_tensor, 1, paddle.to_tensor(2) + ) + + def test_dict_copy(self): + self.assert_results_with_side_effects(dict_copy, 1, paddle.to_tensor(2)) + + def test_dict_update(self): + self.assert_results_with_side_effects( + dict_update_item1, 1, paddle.to_tensor(2) + ) + self.assert_results_with_side_effects( + dict_update_item2, 1, paddle.to_tensor(2) + ) + + def test_dict_setdefault(self): + self.assert_results_with_side_effects( + dict_setdefault_int, 1, paddle.to_tensor(2) + ) + + def test_dict_del_item(self): + self.assert_results_with_side_effects( + dict_del_item_int, 1, paddle.to_tensor(2) + ) + self.assert_results_with_side_effects( + dict_del_item_tensor, 1, paddle.to_tensor(2) + ) + + def test_dict_clear(self): + self.assert_results_with_side_effects( + dict_clear, 1, paddle.to_tensor(2) + ) + + def test_dict_pop(self): + self.assert_results_with_side_effects(dict_pop, 1, paddle.to_tensor(2)) + + def test_dict_popitem(self): + self.assert_results_with_side_effects( + dict_popitem, 1, paddle.to_tensor(2) + ) + + def test_construct(self): + self.assert_results(dict_construct_from_dict) + self.assert_results(dict_construct_from_list) + self.assert_results(dict_construct_from_tuple) + self.assert_results(dict_construct_from_comprehension) + + def test_dict_noargs(self): + self.assert_results(dict_no_arguments) + + def test_dict_fromkeys(self): + self.assert_results(dict_test_fromkeys, (1, 2, 3, 4)) + self.assert_results(dict_test_fromkeys, [1, 2, 3, 4]) + self.assert_results(dict_test_fromkeys_defalut, (1, 2, 3, 4), 1) + self.assert_results( + dict_test_fromkeys_defalut, (1, 2, 3, 4), paddle.to_tensor(1) + ) + self.assert_results(dict_test_fromkeys_defalut, [1, 2, 3, 4], 1) + self.assert_results( + dict_test_fromkeys_defalut, [1, 2, 3, 4], paddle.to_tensor(1) + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_06_call_function.py b/test/sot/test_06_call_function.py new file mode 100644 index 00000000000000..4358afe6ca985f --- /dev/null +++ b/test/sot/test_06_call_function.py @@ -0,0 +1,153 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
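+
+# The cases below check that calls into user-defined Python functions are
+# inlined correctly, covering positional, default, *args/**kwargs, and
+# keyword-only parameters. As a plain-Python reference point for one case
+# defined below: fn_with_default_value_and_varargs_kwargs(1, 2, 3, a=4)
+# binds x=1, y=2, args=(3,), kwargs={'a': 4}, so it returns
+# 1 + 2 + 3 + 4 == 10; the translated run must produce the same value.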
+ +import unittest + +from test_case_base import TestCaseBase + +import paddle + + +def add(x, y): + return x + y + + +def sub(x, y): + return x - y + + +def foo_1(x: paddle.Tensor): + m = x + 1 + y = add(m * 3, m * 2) + return y + + +def foo_2(x: paddle.Tensor): + m = x + 1 + y = sub(m * 3, m * 2) + return y + + +def foo_3(x: paddle.Tensor): + m = x + 1 + y = sub(m * 3, m * 2) + y = sub(y, y) + y = sub(y, y) + return y + + +def nest_2(x): + return x + 1 + + +def nest_1(x): + return (x - 1) * 2 + + +def foo_4(x: paddle.Tensor): + m = x + 1 + m = nest_1(m) + return m + + +def fn_with_varargs_and_kwargs(x, *args, **kwargs): + return ( + x + + args[0] + + args[1] + - args[2] + + kwargs['a'] * kwargs['b'] / kwargs['c'] + ) + + +def foo_5(x: paddle.Tensor): + m = x + 1 + m = fn_with_varargs_and_kwargs( + m, x + 1, x + 2, x + 3, a=x + 4, b=x + 5, c=x + 6 + ) + return m + + +def fn_with_default_value(x, y=1, z=2): + return x + y + z + + +def foo_6(x: paddle.Tensor): + m = x + 1 + m = fn_with_default_value(m, m + 10) + m = fn_with_default_value(m + 42) + return m + + +def fn_with_default_value_and_varargs_kwargs(x, y=1, *args, **kwargs): + return x + y + args[0] + kwargs['a'] + + +def foo_7(x: paddle.Tensor): + m = x + 1 + m = fn_with_default_value_and_varargs_kwargs(m, m + 1, m + 2, a=m + 3) + return m + + +def fn_with_default_value_and_varargs_kwargs_kwonly_1( + x, y=1, *args, z, **kwargs +): + return x + y + args[0] + kwargs['a'] + z + + +def fn_with_default_value_and_varargs_kwargs_kwonly_2( + x, y=1, *args, z=10, **kwargs +): + return x + y + args[0] + kwargs['a'] + z + + +def foo_8(x: paddle.Tensor): + m = x + 1 + m = fn_with_default_value_and_varargs_kwargs_kwonly_1( + m, m + 1, m + 2, a=m + 3, z=m + 4 + ) + m = fn_with_default_value_and_varargs_kwargs_kwonly_2( + m, m + 1, m + 2, a=m + 3 + ) + return m + + +class TestCall(TestCaseBase): + def test_call1(self): + self.assert_results(foo_1, paddle.to_tensor(2)) + + def test_call2(self): + self.assert_results(foo_2, paddle.to_tensor(3)) + + def test_call3(self): + self.assert_results(foo_3, paddle.to_tensor(4)) + + def test_call4(self): + self.assert_results(foo_4, paddle.to_tensor(5)) + + def test_call5(self): + self.assert_results(foo_5, paddle.to_tensor(6)) + + def test_call6(self): + self.assert_results(foo_6, paddle.to_tensor(7)) + + def test_call7(self): + self.assert_results(foo_7, paddle.to_tensor(8)) + + def test_call8(self): + self.assert_results(foo_8, paddle.to_tensor(9)) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_07_unpack.py b/test/sot/test_07_unpack.py new file mode 100644 index 00000000000000..f04a185294b6f5 --- /dev/null +++ b/test/sot/test_07_unpack.py @@ -0,0 +1,70 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
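+
+# Both plain and starred unpacking are covered below. In plain Python the
+# starred form compiles to UNPACK_EX rather than UNPACK_SEQUENCE; for example
+# `a, *b, c = [1, 2, 3, 4]` leaves a == 1, b == [2, 3], c == 4, and the
+# translated bytecode has to reproduce exactly that split.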
+ +# New Supported Instructions: +# UNPACK_SEQUENCE (new) + +from __future__ import annotations + +import unittest + +from test_case_base import TestCaseBase + +import paddle + + +def unpack_tuple(x: tuple[int, paddle.Tensor]): + y, z = x + return z + 1 + + +def unpack_tensor(x: paddle.Tensor): + a, b = x + return (a, b) + + +def unpack_ex_tuple(x: tuple[int, int, paddle.Tensor]): + *y, z = x + return z + 1 + + +def unpack_ex_tensor(x: paddle.Tensor): + a, b, *c = x + return (a, b) + + +def unpack_ex_tensor_2(x: paddle.Tensor): + a, *b, c, d = x + return (a, c) + + +class TestUnpack(TestCaseBase): + def test_unpack_tuple(self): + self.assert_results(unpack_tuple, (1, paddle.to_tensor(2))) + + def test_unpack_tensor(self): + self.assert_results(unpack_tensor, paddle.to_tensor([2, 3])) + + def test_unpack_ex_tuple(self): + self.assert_results(unpack_ex_tuple, (1, 1, paddle.to_tensor(2))) + + def test_unpack_ex_tensor(self): + self.assert_results(unpack_ex_tensor, paddle.to_tensor([2, 3, 3, 3])) + + def test_unpack_ex_tensor_2(self): + self.assert_results(unpack_ex_tensor_2, paddle.to_tensor([2, 3, 3, 3])) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_08_rot.py b/test/sot/test_08_rot.py new file mode 100644 index 00000000000000..2d9146e3ff3baf --- /dev/null +++ b/test/sot/test_08_rot.py @@ -0,0 +1,97 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
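+
+# Swap-style assignments such as `b, a = a, b` compile to the stack-rotation
+# instructions ROT_TWO / ROT_THREE / ROT_FOUR on CPython 3.8-3.10 (CPython
+# 3.11 replaces them with SWAP). Each function below returns a different
+# rotated name, so a wrong rotation order cannot cancel out. For example:
+#
+#     import dis
+#     def swap(a, b):
+#         b, a = a, b
+#     dis.dis(swap)  # shows ROT_TWO between the loads and stores on 3.8-3.10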
+
+from __future__ import annotations
+
+import unittest
+
+from test_case_base import TestCaseBase
+
+import paddle
+
+
+def rot_two_return_a(a: paddle.Tensor, b: paddle.Tensor):
+    b, a = a, b
+    return a + 1
+
+
+def rot_two_return_b(a: paddle.Tensor, b: paddle.Tensor):
+    b, a = a, b
+    return b + 2
+
+
+def rot_three_return_a(a: paddle.Tensor, b: paddle.Tensor, c: paddle.Tensor):
+    a, b, c = c, b, a
+    return a + 1
+
+
+def rot_three_return_b(a: paddle.Tensor, b: paddle.Tensor, c: paddle.Tensor):
+    a, b, c = c, b, a
+    return b + 1
+
+
+def rot_three_return_c(a: paddle.Tensor, b: paddle.Tensor, c: paddle.Tensor):
+    a, b, c = c, b, a
+    return c + 1
+
+
+def rot_four_return_a(
+    a: paddle.Tensor, b: paddle.Tensor, c: paddle.Tensor, d: paddle.Tensor
+):
+    a, b, c, d = d, c, b, a
+    return a + 1
+
+
+def rot_four_return_b(
+    a: paddle.Tensor, b: paddle.Tensor, c: paddle.Tensor, d: paddle.Tensor
+):
+    a, b, c, d = d, c, b, a
+    return b + 1
+
+
+def rot_four_return_c(
+    a: paddle.Tensor, b: paddle.Tensor, c: paddle.Tensor, d: paddle.Tensor
+):
+    a, b, c, d = d, c, b, a
+    return c + 1
+
+
+def rot_four_return_d(
+    a: paddle.Tensor, b: paddle.Tensor, c: paddle.Tensor, d: paddle.Tensor
+):
+    a, b, c, d = d, c, b, a
+    return d + 1
+
+
+class TestExecutor(TestCaseBase):
+    def test_simple(self):
+        a = paddle.to_tensor(1)
+        b = paddle.to_tensor(2)
+        c = paddle.to_tensor(3)
+        d = paddle.to_tensor(4)
+        self.assert_results(rot_two_return_a, a, b)
+        self.assert_results(rot_two_return_b, a, b)
+
+        self.assert_results(rot_three_return_a, a, b, c)
+        self.assert_results(rot_three_return_b, a, b, c)
+        self.assert_results(rot_three_return_c, a, b, c)
+
+        self.assert_results(rot_four_return_a, a, b, c, d)
+        self.assert_results(rot_four_return_b, a, b, c, d)
+        self.assert_results(rot_four_return_c, a, b, c, d)
+        self.assert_results(rot_four_return_d, a, b, c, d)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/sot/test_09_f_string.py b/test/sot/test_09_f_string.py
new file mode 100644
index 00000000000000..c2a3b8144605bf
--- /dev/null
+++ b/test/sot/test_09_f_string.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# FORMAT_VALUE (new)
+# BUILD_STRING (new)
+from __future__ import annotations
+
+import unittest
+
+from test_case_base import TestCaseBase
+
+import paddle
+from paddle.jit.sot.psdb import assert_true
+
+
+def foo(x: paddle.Tensor):
+    whitespace = 123
+    hello_world = f"Hello {whitespace} World"
+    z = assert_true(hello_world == "Hello 123 World")
+    x = x + 1
+    return x
+
+
+class TestFString(TestCaseBase):
+    def test_fstring(self):
+        self.assert_results(foo, paddle.to_tensor(1))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/sot/test_10_build_unpack.py b/test/sot/test_10_build_unpack.py
new file mode 100644
index 00000000000000..0b35c469018632
--- /dev/null
+++ b/test/sot/test_10_build_unpack.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# BUILD_TUPLE_UNPACK (new) +# BUILD_LIST_UNPACK (new) +# BUILD_TUPLE_UNPACK_WITH_CALL (new) +# CALL_FUNCTION_EX (new) +# BUILD_MAP_UNPACK (new) +# LIST_EXTEND (new) +# LIST_TO_TUPLE (new) +# DICT_UPDATE (new) +# DICT_MERGE (new) + +from __future__ import annotations + +import unittest + +from test_case_base import TestCaseBase + +import paddle + + +def build_tuple_unpack(x: tuple[paddle.Tensor], y: tuple[paddle.Tensor]): + z = (*x, *y) + + return z[0] + 1 + + +def build_list_unpack(x: list[paddle.Tensor], y: list[paddle.Tensor]): + z = [*x, *y] + return z[0] + 1 + + +def build_tuple_unpack_with_call( + x: tuple[paddle.Tensor], y: tuple[paddle.Tensor] +): + z = build_tuple_unpack_with_call_inner(*x, *y) + return z[0] + 1 + + +def build_tuple_unpack_with_call_inner( + a: paddle.Tensor, b: paddle.Tensor, c: paddle.Tensor, d: paddle.Tensor +): + z = (a, b, c, d) + return z + + +def build_map_unpack(x: dict[str, paddle.Tensor], y: dict[str, paddle.Tensor]): + z = {**x, **y} + return z["a"] + 1 + + +def build_map_unpack_with_call_inner( + a: paddle.Tensor, b: paddle.Tensor, c: paddle.Tensor, d: paddle.Tensor +): + z = {"a": a, "b": b, "c": c, "d": d} + return z + + +def build_map_unpack_with_call( + x: dict[str, paddle.Tensor], y: dict[str, paddle.Tensor] +): + z = build_map_unpack_with_call_inner(**x, **y) + return z["a"] + 1 + + +class TestExecutor(TestCaseBase): + def test_simple(self): + a = paddle.to_tensor(1) + b = paddle.to_tensor(2) + c = paddle.to_tensor(3) + d = paddle.to_tensor(4) + + self.assert_results(build_tuple_unpack, (a, b), (c, d)) + self.assert_results(build_list_unpack, [a, b], [c, d]) + self.assert_results(build_tuple_unpack_with_call, (a, b), (c, d)) + self.assert_results( + build_map_unpack, {"a": a, "b": b}, {"c": c, "d": d} + ) + self.assert_results( + build_map_unpack_with_call, {"a": a, "b": b}, {"c": c, "d": d} + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_11_jumps.py b/test/sot/test_11_jumps.py new file mode 100644 index 00000000000000..80fa1f4a4eb02b --- /dev/null +++ b/test/sot/test_11_jumps.py @@ -0,0 +1,118 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
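+
+# Conditional-jump coverage. With plain bools the branch can be decided at
+# trace time (hence @check_no_breakgraph below); with a Tensor condition the
+# runtime value is unknown, so the translator has to break the graph to read
+# it. As a semantic reminder, `x and (y + 1)` (JUMP_IF_FALSE_OR_POP on
+# CPython <= 3.10) evaluates to x when x is falsy and to y + 1 otherwise.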
+ +from __future__ import annotations + +import unittest + +from test_case_base import TestCaseBase + +import paddle +from paddle.jit.sot.psdb import check_no_breakgraph + + +@check_no_breakgraph +def pop_jump_if_false(x: bool, y: paddle.Tensor): + if x: + y += 1 + else: + y -= 1 + return y + + +@check_no_breakgraph +def pop_jump_if_true(x: bool, y: bool, z: paddle.Tensor): + return (x or y) and z + + +@check_no_breakgraph +def jump_if_false_or_pop(x: bool, y: paddle.Tensor): + return x and (y + 1) + + +@check_no_breakgraph +def jump_if_true_or_pop(x: bool, y: paddle.Tensor): + return x or (y + 1) + + +@check_no_breakgraph +def jump_absolute(x: int, y: paddle.Tensor): + while x > 0: + y += 1 + x -= 1 + return y + + +@check_no_breakgraph +def pop_jump_if_none(x: bool, y: paddle.Tensor): + if x is not None: + y += 1 + else: + y -= 1 + return y + + +@check_no_breakgraph +def pop_jump_if_not_none(x: bool, y: paddle.Tensor): + if x is None: + y += 1 + else: + y -= 1 + return y + + +a = paddle.to_tensor(1) +b = paddle.to_tensor(2) +c = paddle.to_tensor(3) +d = paddle.to_tensor(4) + +true_tensor = paddle.to_tensor(True) +false_tensor = paddle.to_tensor(False) + + +class TestExecutor(TestCaseBase): + def test_simple(self): + self.assert_results(jump_absolute, 5, a) + + self.assert_results(pop_jump_if_false, True, a) + self.assert_results(pop_jump_if_false, False, a) + self.assert_results(jump_if_false_or_pop, True, a) + self.assert_results(jump_if_false_or_pop, False, a) + self.assert_results(jump_if_true_or_pop, True, a) + self.assert_results(jump_if_true_or_pop, False, a) + self.assert_results(pop_jump_if_true, True, False, a) + self.assert_results(pop_jump_if_true, False, False, a) + + self.assert_results(pop_jump_if_none, None, a) + self.assert_results(pop_jump_if_none, True, a) + self.assert_results(pop_jump_if_not_none, None, a) + self.assert_results(pop_jump_if_not_none, True, a) + + def test_breakgraph(self): + self.assert_results(pop_jump_if_false, true_tensor, a) + self.assert_results(jump_if_false_or_pop, true_tensor, a) + self.assert_results(jump_if_true_or_pop, false_tensor, a) + self.assert_results(pop_jump_if_true, true_tensor, false_tensor, a) + self.assert_results(jump_absolute, 5, a) + self.assert_results(pop_jump_if_false, false_tensor, a) + self.assert_results(jump_if_false_or_pop, false_tensor, a) + self.assert_results(jump_if_true_or_pop, false_tensor, a) + self.assert_results(pop_jump_if_true, true_tensor, false_tensor, a) + + self.assert_results(pop_jump_if_none, true_tensor, a) + self.assert_results(pop_jump_if_not_none, true_tensor, a) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_12_for_loop.py b/test/sot/test_12_for_loop.py new file mode 100644 index 00000000000000..63e3fedace4bfd --- /dev/null +++ b/test/sot/test_12_for_loop.py @@ -0,0 +1,298 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
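+
+# A `for` loop desugars to GET_ITER followed by repeated FOR_ITER, roughly:
+#
+#     it = iter(seq)
+#     while True:
+#         try:
+#             i = next(it)
+#         except StopIteration:
+#             break
+#         ...  # loop body
+#
+# Generators (see gener() below) cannot be replayed once consumed, which is
+# why those cases compare a symbolic_translate run against a fresh eager run
+# by hand instead of going through assert_results.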
+ +# GET_ITER (new) +# FOR_ITER (new) + +from __future__ import annotations + +import unittest + +from test_case_base import TestCaseBase, strict_mode_guard + +import paddle +from paddle.jit import sot +from paddle.jit.sot import symbolic_translate +from paddle.jit.sot.opcode_translator.executor.executor_cache import ( + OpcodeExecutorCache, +) + + +def gener(): + yield 1 + yield 2 + yield 3 + + +def for_list_1(x: paddle.Tensor): + for i in [1, 2, 3]: + x += i + + if x > 2: + x += 1 + else: + x -= 1 + return x + + +def for_list_2(x: paddle.Tensor): + for i in [1, 2, 3]: + x += i + + if i > 2: + x += 1 + else: + x -= 1 + return x + + +def for_dict(x: paddle.Tensor): + map = {1: 2, 3: 4} + for k in map.keys(): + x += k + + for v in map.values(): + x += v + + for k, v in map.items(): + x += k + x += v + + return x + + +def for_iter(x, it): + for item in it: + x += item + return x + + +def for_for_fallback(x, it): + for i in [1, 2, 3]: + for item in it: + x += item + return x + + +def for_break(x: paddle.Tensor, it): + for i in [1, 2, 3]: + x += i + if i == 2: + break + for i in it: + x += i + if i == 2: + break + return x + + +def for_continue(x: paddle.Tensor, it): + for i in [1, 2, 3]: + if i == 2: + continue + x += i + + for i in it: + if i == 2: + continue + x += i + return x + + +def for_enumerate_var_with_nested_range(x_array): + x = paddle.tensor.fill_constant([1], 'int32', 0) + x_array = paddle.to_tensor(x_array) + for i, num in enumerate(x_array): + for idx in range(num): + x = x + num + return x + + +def for_create_tmp_in_loop(x, it): + s = x + for i in it: + tmp = i + s += tmp + return s, tmp + + +def for_without_zero_iter(self_res_dict, output): + res_dict = {"logits": output} + for res_key in list(self_res_dict): + res_dict[res_key] = self_res_dict.pop(res_key) + return res_dict + + +@sot.psdb.check_no_fallback +def for_reconstruct_range_iter(): + for i in range(3): + sot.psdb.breakgraph() + + +global_var_name = None + + +def for_tmp_var_with_same_name_as_global_var(): + total = 0 + for i in range(3): + global_var_name = i + 3 + sot.psdb.breakgraph() + total += global_var_name + return total + + +def for_layer_list(layer_list, x): + for net in layer_list: + x = net(x) + return x + + +class TestForLoop(TestCaseBase): + def test_list(self): + a = paddle.to_tensor(1) + self.assert_results(for_list_1, a) + + def test_list_with_fallback(self): + a = paddle.to_tensor(1) + self.assert_results(for_list_2, a) + + def test_dict(self): + a = paddle.to_tensor(1) + self.assert_results(for_dict, a) + + def test_fallback(self): + a = paddle.to_tensor(1) + + sym_output = symbolic_translate(for_iter)(a, gener()) + paddle_output = for_iter(a, gener()) + self.assert_nest_match(sym_output, paddle_output) + + def test_for_for_fallback(self): + a = paddle.to_tensor(1) + + sym_output = symbolic_translate(for_iter)(a, gener()) + paddle_output = for_iter(a, gener()) + self.assert_nest_match(sym_output, paddle_output) + + def test_for_break(self): + a = paddle.to_tensor(1) + sym_output = symbolic_translate(for_break)(a, gener()) + paddle_output = for_break(a, gener()) + self.assert_nest_match(sym_output, paddle_output) + + def test_for_continue(self): + a = paddle.to_tensor(1) + sym_output = symbolic_translate(for_continue)(a, gener()) + paddle_output = for_continue(a, gener()) + self.assert_nest_match(sym_output, paddle_output) + + # TODO(zmh): support range for tensor + # def test_resume_stack(self): + # a = [1, 2, 3] + # self.assert_results(for_enumerate_var_with_nested_range, a) + + def 
test_create_var_in_loop(self): + x = paddle.to_tensor(1, dtype="float32") + a = [1, 2, 3] + self.assert_results(for_create_tmp_in_loop, x, a) + + sym_output = symbolic_translate(for_create_tmp_in_loop)(x, iter(a)) + paddle_output = for_create_tmp_in_loop(x, iter(a)) + self.assert_nest_match(sym_output, paddle_output) + + def test_create_var_in_loop_with_same_name_as_global(self): + self.assert_results(for_tmp_var_with_same_name_as_global_var) + + def test_for_without_zero_iter(self): + self_res_dict = {} + output = paddle.to_tensor(2) + self.assert_results(for_without_zero_iter, self_res_dict, output) + + def test_reconstruct_range_iter(self): + self.assert_results(for_reconstruct_range_iter) + + def test_layer_list(self): + layers = paddle.nn.LayerList() + for i in range(5): + layers.append(paddle.nn.Linear(5, 5)) + x = paddle.rand([5], dtype="float32") + self.assert_results(for_layer_list, layers, x) + + +def run_list_comp(x): + out = [s.chunk(2, axis=1) for s in x] + return out + + +class TestListComp(TestCaseBase): + def test_list_comp(self): + x = [paddle.randn([1, 4]), paddle.randn([1, 4])] + self.assert_results(run_list_comp, x) + + +def for_enumerate_cache(func_list, x): + out = None + for idx, func in enumerate(func_list): + out = func(x[idx]) + return out + + +class TestEnumerateCache(TestCaseBase): + def test_run(self): + func_list = [ + paddle.nn.Linear(10, 10), + ] + x = [ + paddle.randn([5, 10]), + ] + + out = symbolic_translate(for_enumerate_cache)(func_list, x) + out = symbolic_translate(for_enumerate_cache)(func_list, x) + self.assert_nest_match(OpcodeExecutorCache().translate_count, 1) + + +# after_loop_fn need zzz, and zzz is created as UndefinedVar when generating loop body +# do not set zzz as UndefinedVar again +def undefined_var_case_0(): + for i in [1, 2]: + sot.psdb.breakgraph() + zzz = i + + zzz = zzz + 1 + return zzz + + +# after_loop_fn need create zzz as UndefinedVar +def undefined_var_case_1(): + for i in [1, 2]: + sot.psdb.breakgraph() + aaa = i + + for i in [1, 3]: + zzz = i + zzz = zzz + 1 + return zzz + + +class TestUndefinedVarInRiskyCodes(TestCaseBase): + def test_undefined_var_case_0(self): + self.assert_results(undefined_var_case_0) + + def test_undefined_var_case_1(self): + self.assert_results(undefined_var_case_1) + + +if __name__ == "__main__": + with strict_mode_guard(0): + unittest.main() diff --git a/test/sot/test_13_make_function.py b/test/sot/test_13_make_function.py new file mode 100644 index 00000000000000..9784d7ffad385f --- /dev/null +++ b/test/sot/test_13_make_function.py @@ -0,0 +1,39 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
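+
+# A nested `def` with default values compiles to MAKE_FUNCTION with a
+# defaults tuple, and the keyword call compiles to CALL_FUNCTION_KW on
+# CPython <= 3.10. Working through make_fn below in plain Python:
+# fn(1) == 1 + 2 + 3 + 4 == 10 and fn(2, c=5) == 2 + 2 + 5 + 4 == 13,
+# so make_fn(x) must equal x + 23.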
+ +# MAKE_FUNCTION +# CALL_FUNCTION_KW +from __future__ import annotations + +import unittest + +from test_case_base import TestCaseBase + +import paddle + + +def make_fn(x: paddle.Tensor): + def fn(a, b=2, c=3, d=4): + return a + b + c + d + + return fn(1) + fn(2, c=5) + x + + +class TestExecutor(TestCaseBase): + def test_simple(self): + self.assert_results(make_fn, paddle.to_tensor(1)) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_14_operators.py b/test/sot/test_14_operators.py new file mode 100644 index 00000000000000..fc403ae3ef665f --- /dev/null +++ b/test/sot/test_14_operators.py @@ -0,0 +1,387 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +import unittest + +from test_case_base import TestCaseBase + +import paddle + + +def unary_positive(x: int): + y = +x + return y + + +def unary_negative(x: paddle.Tensor): + y = -x + return y + + +def unary_not(x: paddle.Tensor): + y = not x + return y + + +def unary_invert(x: paddle.Tensor): + y = ~x + return y + + +def binary_power(x: paddle.Tensor, y: paddle.Tensor): + z = x**y + return z + + +def binary_multiply(x: paddle.Tensor, y: paddle.Tensor): + z = x * y + return z + + +def binary_matrix_multiply(x: paddle.Tensor, y: paddle.Tensor): + z = x @ y + return z + + +def binary_floor_divide(x: paddle.Tensor, y: paddle.Tensor): + z = x // y + return z + + +def binary_true_divide(x: paddle.Tensor, y: paddle.Tensor): + z = x / y + return z + + +def binary_modulo(x: paddle.Tensor, y: paddle.Tensor): + z = x % y + return z + + +def binary_add(x: paddle.Tensor, y: paddle.Tensor): + z = x + y + return z + + +def binary_subtract(x: paddle.Tensor, y: paddle.Tensor): + z = x - y + return z + + +def binary_lshift(x: int, y: int): + z = x << y + return z + + +def binary_rshift(x: int, y: int): + z = x >> y + return z + + +def binary_and(x: paddle.Tensor, y: paddle.Tensor): + z = x & y + return z + + +def binary_or(x: paddle.Tensor, y: paddle.Tensor): + z = x | y + return z + + +def binary_xor(x: paddle.Tensor, y: paddle.Tensor): + z = x ^ y + return z + + +def inplace_power(x: paddle.Tensor, y: paddle.Tensor): + x **= y + return x + + +def inplace_multiply(x: paddle.Tensor, y: paddle.Tensor): + x *= y + return x + + +def inplace_matrix_multiply(x: paddle.Tensor, y: paddle.Tensor): + x @= y + return x + + +def inplace_floor_divide(x: paddle.Tensor, y: paddle.Tensor): + x //= y + return x + + +def inplace_true_divide(x: paddle.Tensor, y: paddle.Tensor): + x /= y + return x + + +def inplace_modulo(x: paddle.Tensor, y: paddle.Tensor): + x %= y + return x + + +def inplace_add(x: paddle.Tensor, y: paddle.Tensor): + x += y + return x + + +def inplace_subtract(x: paddle.Tensor, y: paddle.Tensor): + x -= y + return x + + +def inplace_lshift(x: paddle.Tensor, y: int): + x <<= y + return x + + +def inplace_rshift(x: paddle.Tensor, y: int): + x >>= y + return x + + +def inplace_and(x: paddle.Tensor, y: paddle.Tensor): + x &= y + return x + + +def 
inplace_or(x: paddle.Tensor, y: paddle.Tensor): + x |= y + return x + + +def inplace_xor(x: paddle.Tensor, y: paddle.Tensor): + x ^= y + return x + + +def list_getitem(x: int, y: paddle.Tensor): + z = [x, y] + return operator.getitem(z, 1) + 1 + + +def list_getitem_slice(x: int, y: paddle.Tensor): + z = [x, y] + return operator.getitem(z, slice(0, 2)) + + +def list_setitem_int(x: int, y: paddle.Tensor): + z = [x, y] + operator.setitem(z, 0, 3) + return z + + +def list_setitem_tensor(x: int, y: paddle.Tensor): + z = [x, y] + operator.setitem(z, 1, paddle.to_tensor(3)) + return z + + +def list_delitem_int(x: int, y: paddle.Tensor): + z = [x, y] + operator.delitem(z, 0) + return z + + +def list_delitem_tensor(x: int, y: paddle.Tensor): + z = [x, y] + operator.delitem(z, 1) + return z + + +def dict_getitem_int(x: int, y: paddle.Tensor): + z = {1: y, 2: y + 1} + return operator.getitem(z, 1) + + +def dict_getitem_tensor(x: int, y: paddle.Tensor): + z = {1: y, 2: y + 1} + return operator.getitem(z, 2) + + +def dict_setitem_int(x: int, y: paddle.Tensor): + z = {'x': x, 'y': y} + operator.setitem(z, 'x', 2) + return z + + +def dict_setitem_tensor(x: int, y: paddle.Tensor): + z = {'x': x, 'y': y} + operator.setitem(z, 'y', paddle.to_tensor(3)) + return z + + +def dict_delitem_int(x: int, y: paddle.Tensor): + z = {1: x, 2: y + 1} + operator.delitem(z, 1) + return z + + +def dict_delitem_tensor(x: int, y: paddle.Tensor): + z = {1: x, 2: y + 1} + operator.delitem(z, 2) + return z + + +def tuple_getitem_int(x: int, y: paddle.Tensor): + x = (x, y) + return operator.getitem(x, 0) + + +def tuple_getitem_tensor(x: int, y: paddle.Tensor): + x = (x, y) + return operator.getitem(x, 1) + + +def tuple_getitem_slice(x: int, y: paddle.Tensor): + x = (x, y, 1) + return operator.getitem(x, slice(0, 2)) + + +def operator_add(x: int, y: paddle.Tensor): + return operator.add(x, y) + + +def operator_mul(x: int, y: paddle.Tensor): + return operator.mul(x, y) + + +def operator_truth(y: paddle.Tensor): + return operator.truth(y) + + +def operator_is_(x: paddle.Tensor, y: paddle.Tensor): + return (operator.is_(x, x), operator.is_(x, y)) + + +def operator_in_(x: int, y: list): + return x in y + + +def operator_not_in_(x: int, y: list): + return x not in y + + +def operator_is_not(x: paddle.Tensor, y: paddle.Tensor): + return (operator.is_not(x, x), operator.is_not(x, y)) + + +def operator_pos(y: int): + return operator.pos(+y) + + +class TestExecutor(TestCaseBase): + def test_simple(self): + a = paddle.to_tensor(1) + b = paddle.to_tensor(True) + c = paddle.to_tensor(3) + d = paddle.to_tensor(4) + e = paddle.to_tensor([[1, 2], [3, 4], [5, 6]], dtype='float32') + f = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype='float32') + g = paddle.to_tensor(False) + + self.assert_results(unary_positive, 1) + self.assert_results(unary_negative, a) + self.assert_results(unary_not, b) + self.assert_results(unary_invert, b) + + self.assert_results(binary_power, c, d) + self.assert_results(binary_multiply, c, d) + self.assert_results(binary_matrix_multiply, e, f) + self.assert_results(binary_floor_divide, c, d) + self.assert_results(binary_true_divide, c, d) + self.assert_results(binary_modulo, c, d) + self.assert_results(binary_add, c, d) + self.assert_results(binary_subtract, c, d) + self.assert_results(binary_lshift, 10, 2) + self.assert_results(binary_rshift, 10, 1) + self.assert_results(binary_and, b, g) + self.assert_results(binary_or, b, g) + self.assert_results(binary_xor, b, g) + + self.assert_results(inplace_power, c, d) + 
self.assert_results(inplace_multiply, c, d)
+        self.assert_results(inplace_matrix_multiply, e, f)
+        self.assert_results(inplace_floor_divide, c, d)
+        self.assert_results(inplace_true_divide, c, d)
+        self.assert_results(inplace_modulo, c, d)
+        self.assert_results(inplace_add, c, d)
+        self.assert_results(inplace_subtract, c, d)
+        self.assert_results(inplace_lshift, 10, 2)
+        self.assert_results(inplace_rshift, 10, 1)
+        self.assert_results(inplace_and, b, g)
+        self.assert_results(inplace_or, b, g)
+        self.assert_results(inplace_xor, b, g)
+
+    def test_operator_simple(self):
+        self.assert_results(operator_add, 1, paddle.to_tensor(2))
+        self.assert_results(operator_mul, 1, paddle.to_tensor(2))
+        self.assert_results(operator_truth, paddle.to_tensor(2))
+        self.assert_results(
+            operator_is_, paddle.to_tensor(2), paddle.to_tensor(3)
+        )
+        self.assert_results(
+            operator_is_not, paddle.to_tensor(2), paddle.to_tensor(3)
+        )
+        self.assert_results(operator_pos, 1)
+        self.assert_results(operator_in_, 12, [1, 2, 12])
+        self.assert_results(operator_in_, 12, [1, 2, 3])
+        self.assert_results(operator_not_in_, 12, [1, 2, 12])
+        self.assert_results(operator_not_in_, 12, [1, 2, 3])
+
+    def test_operator_list(self):
+        self.assert_results(list_getitem, 1, paddle.to_tensor(2))
+        self.assert_results(list_getitem_slice, 1, paddle.to_tensor(2))
+        self.assert_results(list_setitem_int, 1, paddle.to_tensor(2))
+        self.assert_results_with_side_effects(
+            list_setitem_tensor, 1, paddle.to_tensor(2)
+        )
+        self.assert_results(list_delitem_int, 1, paddle.to_tensor(2))
+        self.assert_results(list_delitem_tensor, 1, paddle.to_tensor(2))
+
+    def test_operator_dict(self):
+        self.assert_results(dict_getitem_int, 1, paddle.to_tensor(2))
+        self.assert_results(dict_getitem_tensor, 1, paddle.to_tensor(2))
+        self.assert_results(dict_setitem_int, 1, paddle.to_tensor(2))
+        self.assert_results_with_side_effects(
+            dict_setitem_tensor, 1, paddle.to_tensor(2)
+        )
+        self.assert_results(dict_delitem_int, 1, paddle.to_tensor(2))
+        self.assert_results(dict_delitem_tensor, 1, paddle.to_tensor(2))
+
+    def test_operator_tuple(self):
+        self.assert_results(tuple_getitem_int, 1, paddle.to_tensor(2))
+        self.assert_results(tuple_getitem_tensor, 1, paddle.to_tensor(2))
+        self.assert_results(tuple_getitem_slice, 1, paddle.to_tensor(2))
+
+
+def run_not_eq(x: paddle.Tensor, y: int):
+    out = paddle.reshape(x, [1, -1]) != y
+    out = out.astype('float32')
+    return out
+
+
+class TestNotEq(TestCaseBase):
+    def test_not_eq(self):
+        x = paddle.to_tensor([2])
+        y = 3
+        self.assert_results(run_not_eq, x, y)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/sot/test_15_slice.py b/test/sot/test_15_slice.py
new file mode 100644
index 00000000000000..b2ee00526f25b7
--- /dev/null
+++ b/test/sot/test_15_slice.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
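+
+# Subscripting with a slice literal compiles to BUILD_SLICE followed by
+# BINARY_SUBSCR; `x[1:5:2]` is equivalent to `x[slice(1, 5, 2)]`, the same
+# form the operator.getitem/slice() cases in test_14 already relied on.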
+ +# BUILD_SLICE (new) + +from __future__ import annotations + +import unittest + +from test_case_base import TestCaseBase + +import paddle +from paddle.jit.sot.psdb import check_no_breakgraph + + +def build_list_slice(x: list, y: paddle.Tensor): + x[2:4] = [0, 1] + return x[0] + y + + +def build_list_slice_with_step(x: list, y: paddle.Tensor): + x[1:5:2] = [0, 1] + return x[0] + y + + +def build_tuple_slice(x: list, y: paddle.Tensor): + x[2:4] = (0, 1) + return x[0] + y + + +def build_tuple_slice_with_step(x: list, y: paddle.Tensor): + x[1:5:2] = (0, 1) + return x[0] + y + + +def tensor_subscript_ellipsis(x: paddle.Tensor, y: paddle.Tensor): + return x[...] + y[...] + + +@check_no_breakgraph +def tensor_subscript_tensor(x: paddle.Tensor): + d0, d1 = paddle.shape(x) + return x[: d0 // 2, d1 // 2 : d1] + + +class TestSlice(TestCaseBase): + def test_simple(self): + x = list(range(10)) + y = paddle.arange(10) + self.assert_results_with_side_effects(build_list_slice, x, y) + self.assert_results_with_side_effects(build_list_slice_with_step, x, y) + self.assert_results_with_side_effects(build_tuple_slice, x, y) + self.assert_results_with_side_effects(build_tuple_slice_with_step, x, y) + + +class MyLayer(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.linears = paddle.nn.LayerList( + [paddle.nn.Linear(10, 10) for i in range(10)] + ) + + def forward(self, x): + for i, l in enumerate(self.linears): + x = self.linears[i // 2](x) + l(x) + return x + + +def layer_list_slice(layer, x): + out = layer(x) + return out + + +class TestLayerList(TestCaseBase): + def test_layer_list_slice(self): + layer = MyLayer() + x = paddle.randn([5, 10]) + self.assert_results(layer_list_slice, layer, x) + + +def tensor_slice(x: paddle.Tensor): + return x[1, 1, 1] + 1 + + +class TestTensorSlice(TestCaseBase): + def test_tensor_slice(self): + x = paddle.randn([4, 3, 10]) + self.assert_results(tensor_slice, x) + + +class TestTensorEllipsis(TestCaseBase): + def test_tensor_subscript_ellipsis(self): + x = paddle.rand((10,)) + y = paddle.rand((10, 10)) + self.assert_results(tensor_subscript_ellipsis, x, y) + + +class TestTensorSubscriptTensor(TestCaseBase): + def test_tensor_subscript_tensor(self): + x = paddle.rand((10, 10)) + self.assert_results(tensor_subscript_tensor, x) + + +class LayerListNet(paddle.nn.Layer): + def __init__(self) -> None: + super().__init__() + self.layer_list = paddle.nn.LayerList( + [paddle.nn.Linear(5, 5), paddle.nn.Linear(5, 5)] + ) + + def forward(self, x): + out = self.layer_list[0](x) + for layer in self.layer_list[1:]: + out = layer(out) + return out + + +class TestLayerListSlice(TestCaseBase): + def test_layer_list_slice(self): + x = paddle.randn([2, 5]) + net = LayerListNet() + self.assert_results(layer_list_slice, net, x) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_16_paddle_api.py b/test/sot/test_16_paddle_api.py new file mode 100644 index 00000000000000..9f6e05fa48b2fc --- /dev/null +++ b/test/sot/test_16_paddle_api.py @@ -0,0 +1,60 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from test_case_base import TestCaseBase + +import paddle +from paddle.nn.functional import relu + + +def paddle_api_method_call(x: paddle.Tensor): + m = x + 2 + m = paddle.nn.functional.relu(m) + return m + + +def paddle_api_function_call(x: paddle.Tensor): + m = x + 2 + m = relu(m) + return m + + +def paddle_api_function_call_concat( + x: paddle.Tensor, y: paddle.Tensor, axis: int +): + return paddle.concat([x, y], axis=axis) + + +class TestPaddleApiCall(TestCaseBase): + def test_paddle_api_method_call(self): + self.assert_results(paddle_api_method_call, paddle.to_tensor(2.0)) + self.assert_results(paddle_api_method_call, paddle.to_tensor(-5.0)) + self.assert_results(paddle_api_method_call, paddle.to_tensor(0.0)) + + def test_paddle_api_function_call(self): + self.assert_results(paddle_api_function_call, paddle.to_tensor(2.0)) + self.assert_results(paddle_api_function_call, paddle.to_tensor(-5.0)) + self.assert_results(paddle_api_function_call, paddle.to_tensor(0.0)) + + def test_paddle_api_function_call_concat(self): + a = paddle.to_tensor([[1, 2], [3, 4]]) + b = paddle.to_tensor([[5, 6], [7, 8]]) + self.assert_results(paddle_api_function_call_concat, a, b, 0) + self.assert_results(paddle_api_function_call_concat, a, b, 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_17_paddle_layer.py b/test/sot/test_17_paddle_layer.py new file mode 100644 index 00000000000000..58b7dfb9fa301d --- /dev/null +++ b/test/sot/test_17_paddle_layer.py @@ -0,0 +1,94 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
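+
+# Calling a paddle.nn.Layer instance dispatches through Layer.__call__ to
+# forward() (plus any registered hooks), so `net(x)` and `net.forward(x)`
+# should trace the same; the cases below pass both the layer itself and its
+# bound forward method into the translated region.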
+ +import unittest + +from test_case_base import TestCaseBase + +import paddle + + +class SimpleNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.linear1 = paddle.nn.Linear(10, 1) + + def forward(self, x): + out1 = self.linear1(x) + return out1 + + +class SimpleNet_bound(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.linear1 = paddle.nn.Linear(10, 1) + + def add(self, x): + return x + 1 + + def forward(self, x): + x = self.add(x) + out1 = self.linear1(x) + return out1 + + +def net_call(x: paddle.Tensor, net): + return net(x) + + +def net_call_passed_by_user(x: paddle.Tensor, net_forward): + return net_forward(x) + + +class SimpleNetWithSequenital(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.seq = paddle.nn.Sequential( + paddle.nn.Linear(10, 10), + paddle.nn.Linear(10, 10), + paddle.nn.Linear(10, 1), + ) + + def forward(self, x): + out1 = self.seq(x) + return out1 + + +class TestLayer(TestCaseBase): + def test_layer(self): + x = paddle.rand((10,)) + y = paddle.rand((10, 10)) + net = SimpleNet() + self.assert_results(net_call, x, net) + self.assert_results(net_call, y, net) + self.assert_results(net_call_passed_by_user, x, net.forward) + + def test_layer_with_sequential(self): + x = paddle.rand((10,)) + y = paddle.rand((10, 10)) + net = SimpleNetWithSequenital() + self.assert_results(net_call, x, net) + self.assert_results(net_call, y, net) + self.assert_results(net_call_passed_by_user, x, net.forward) + + def test_bound(self): + x = paddle.rand((10,)) + y = paddle.rand((10, 10)) + net = SimpleNet_bound() + self.assert_results(net_call, x, net) + self.assert_results(net_call, y, net) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_18_tensor_method.py b/test/sot/test_18_tensor_method.py new file mode 100644 index 00000000000000..2591db1f748d93 --- /dev/null +++ b/test/sot/test_18_tensor_method.py @@ -0,0 +1,90 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
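+
+# Tensor methods are just another call path into the same ops: a.add(b)
+# matches a + b, f.pow(2) matches f ** 2, and so on. The long method chain
+# in tensor_method_call_2 below exercises many such ops inside one traced
+# function.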
+ +import unittest + +from test_case_base import TestCaseBase + +import paddle + + +def tensor_method_call_1(x: paddle.Tensor): + y = x + 1 + return y.mean() + + +def tensor_method_call_2(a: paddle.Tensor, b: paddle.Tensor): + c = a.add(b) + d = c.multiply(a) + e = d.subtract(b) + f = e.divide(a) + g = f.pow(2) + f.abs().sqrt() + h = (g.abs() + 1).log() - (g / g.max()).exp() + i = h.sin() + h.cos() + return i + + +def tensor_method_passed_by_user(a: paddle.Tensor, func: paddle.Tensor): + return func(a) + + +def tensor_method_property(a: paddle.Tensor, b: paddle.Tensor): + return ( + a.name, + str(a.place), + a.persistable, + a.dtype, + a.type, + a.is_tensor(), + a.clear_gradient(), + a @ b.T + len(a.shape) + b.size + a.ndim + a.dim() + a.rank(), + ) + + +def middle_tensor_name(a: paddle.Tensor, b: paddle.Tensor): + c = a + b + return c.name + + +class TestTensorMethod(TestCaseBase): + def test_tensor_method_1(self): + x = paddle.rand([10]) + y = paddle.rand([2, 4, 6]) + self.assert_results(tensor_method_call_1, x) + self.assert_results(tensor_method_call_1, y) + + def test_tensor_method_2(self): + x = paddle.rand([42]) + y = paddle.rand([42]) + self.assert_results(tensor_method_call_2, x, y) + + def test_tensor_method_passed_by_user(self): + x = paddle.rand([42]) + y = paddle.rand([42]) + self.assert_results(tensor_method_passed_by_user, x, y.add) + + def test_tensor_method_property(self): + x = paddle.rand([42, 24], dtype='float64') + y = paddle.rand([42, 24], dtype='float32') + self.assert_results(tensor_method_property, x, y) + + @unittest.skip("TODO: dynamic tensor name is different") + def test_middle_tensor_name(self): + x = paddle.rand([42, 24]) + y = paddle.rand([42, 24]) + self.assert_results(middle_tensor_name, x, y) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_19_closure.py b/test/sot/test_19_closure.py new file mode 100644 index 00000000000000..6191141e07f390 --- /dev/null +++ b/test/sot/test_19_closure.py @@ -0,0 +1,260 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
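+
+# Closure coverage: captured names live in cells and are accessed with
+# LOAD_DEREF / STORE_DEREF (see the instruction list at the end of this
+# file). In plain Python the cell is observable directly; for create_closure
+# defined below:
+#
+#     f = create_closure()
+#     assert f.__closure__[0].cell_contents == 1
+#     assert f() == 2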
+ +import inspect +import unittest + +from test_case_base import TestCaseBase, strict_mode_guard + +import paddle + + +def foo(x: int, y: paddle.Tensor): + z = 3 + + def local(a, b=5): + return a + x + z + b + y + + return local(4) + z + + +def foo2(y: paddle.Tensor, x=1): + """ + Test strip default value + """ + z = 3 + + def local(a, b=5): + return a + x + z + b + y + + return local(4) + + +def foo3(y: paddle.Tensor, x=1): + """ + Test Closure Band Default + """ + z = 3 + + def local(a, b=5): + nonlocal z + z = 4 + return a + x + z + b + y + + return local(4) + + +global_z = 3 + + +def test_global(y: paddle.Tensor): + """ + Test Global variable + """ + + def local(a, b=5): + global global_z + global_z += 1 + return a + global_z + b + y + + return local(1) + + +def multi(c): + return c + 2 + + +def wrapper_function(func): + a = 2 + + def inner(): + return func(a) + + return inner + + +wrapped_multi = wrapper_function(multi) + + +def foo5(y: paddle.Tensor): + """ + Test incoming closures + """ + a = wrapped_multi() + return a + + +def outwrapper(func): + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + + return wrapper + + +def foo6(y: paddle.Tensor): + """ + Test Decorator + """ + + @outwrapper + def load_1(a, b=5): + return a + b + + return load_1(1) + + +import numpy as np + + +def numpy_sum(m): + """ + Test loop call + + Example: a->b->c->a + """ + a = np.array([1, 2, 3]) + tmp = np.sum(a) + return m + 1 + + +def lambda_closure(x, m): + """ + lambda closure. + """ + + def break_graph_closure(): + print("yes") + return x + m + + return break_graph_closure() + + +# motivated by python builtin decorator +def kwargs_wrapper(func): + sig = inspect.signature(func) + + def inner(*args, **kwargs): + return func(*args, **kwargs) + + inner.__signature__ = sig + return inner + + +@kwargs_wrapper +def func7(a, b): + return a + b + + +def foo7(): + return func7(3, 5) + + +def create_closure(): + x = 1 + + def closure(): + return x + 1 + + return closure + + +class TestExecutor(TestCaseBase): + def test_closure(self): + self.assert_results(foo, 1, paddle.to_tensor(2)) + self.assert_results(foo2, paddle.to_tensor(2)) + self.assert_results(foo3, paddle.to_tensor(2)) + self.assert_results_with_global_check( + test_global, ["global_z"], paddle.to_tensor(2) + ) + self.assert_results(foo5, paddle.to_tensor(2)) + self.assert_results(foo6, paddle.to_tensor(2)) + self.assert_results(numpy_sum, paddle.to_tensor(1)) + with strict_mode_guard(0): + self.assert_results( + lambda_closure, paddle.to_tensor(2), paddle.to_tensor(1) + ) + + +class TestExecutor2(TestCaseBase): + def test_closure(self): + self.assert_results(foo7) + + +# Side Effect. +def test_slice_in_for_loop(x, iter_num=3): + x = paddle.to_tensor(x) + a = [] + # Use `paddle.full` so that static analysis can analyze the type of iter_num is Tensor + iter_num = paddle.full( + shape=[1], fill_value=iter_num, dtype="int32" + ) # TODO(liym27): Delete it if the type of parameter iter_num can be resolved + + for i in range(iter_num): + a.append(x) + + for i in range(iter_num): + a[i] = x + out = a[2] + return out + + +class TestExecutor3(TestCaseBase): + def test_closure(self): + tx = paddle.to_tensor([1.0, 2.0, 3.0]) + # need side effect of list. 
+        # self.assert_results(test_slice_in_for_loop, tx)
+
+
+def non_local_test(t: paddle.Tensor):
+    a = 1
+
+    def func1():
+        nonlocal a
+        t = a
+        a = 2
+        return t
+
+    def func2():
+        nonlocal a
+        a = 1
+        return a
+
+    t += func1()  # add 2
+    t += func2()  # add 1
+    t += a  # add 1
+    return t
+
+
+class TestExecutor4(TestCaseBase):
+    def test_closure(self):
+        tx = paddle.to_tensor([1.0])
+        self.assert_results(non_local_test, tx)
+
+
+class TestCreateClosure(TestCaseBase):
+    def test_create_closure(self):
+        closure = create_closure()
+        self.assert_results(closure)
+
+
+if __name__ == "__main__":
+    unittest.main()
+
+# Instructions:
+# LOAD_CLOSURE
+# LOAD_DEREF
+# LOAD_CLASSDEREF
+# STORE_DEREF
+# DELETE_DEREF
+# STORE_GLOBAL
diff --git a/test/sot/test_20_string.py b/test/sot/test_20_string.py
new file mode 100644
index 00000000000000..5e628b795afdde
--- /dev/null
+++ b/test/sot/test_20_string.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import unittest
+
+from test_case_base import TestCaseBase
+
+import paddle
+from paddle.jit.sot.psdb import assert_true, check_no_breakgraph
+
+
+def string_format(x: paddle.Tensor):
+    whitespace = 123
+    hello_world = f"Hello {whitespace} World"
+    z = assert_true(hello_world == "Hello 123 World")
+    hello_world2 = f"Hello {whitespace}{whitespace} World"
+    z = assert_true(hello_world2 == "Hello 123123 World")
+    hello_world_lower = "Hello World".lower()
+    z = assert_true(hello_world_lower == "hello world")
+    return x + 1
+
+
+def string_lower(x: paddle.Tensor):
+    hello_world_lower = "Hello World".lower()
+    z = assert_true(hello_world_lower == "hello world")
+    return x + 1
+
+
+@check_no_breakgraph
+def str_startswith():
+    s = "Hello World"
+    a1 = s.startswith("Hello")
+    a2 = s.startswith("World")
+    a3 = s.startswith("Hello World")
+    a4 = s.startswith("Hello World!")
+    a5 = s.startswith("Hello", 5)
+    a6 = s.startswith("Hello", 1, 4)
+    a7 = s.startswith("Hello", 0, 11)
+    return (a1, a2, a3, a4, a5, a6, a7)
+
+
+@check_no_breakgraph
+def str_endswith():
+    s = "Hello World"
+    a1 = s.endswith("Hello")
+    a2 = s.endswith("World")
+    a3 = s.endswith("Hello World")
+    a4 = s.endswith("Hello World!")
+    a5 = s.endswith("Hello", 5)
+    a6 = s.endswith("Hello", 0, 4)
+    a7 = s.endswith("Hello", 1, 11)
+    return (a1, a2, a3, a4, a5, a6, a7)
+
+
+class TestExecutor(TestCaseBase):
+    def test_string_format(self):
+        self.assert_results(string_format, paddle.to_tensor(1))
+
+    def test_string_lower(self):
+        self.assert_results(string_lower, paddle.to_tensor(1))
+
+    def test_str_startswith(self):
+        self.assert_results(str_startswith)
+
+    def test_str_endswith(self):
+        self.assert_results(str_endswith)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/sot/test_21_global.py b/test/sot/test_21_global.py
new file mode 100644
index 00000000000000..131f9c7e367f90
--- /dev/null
+++ b/test/sot/test_21_global.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2023
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +from test_case_base import TestCaseBase + +import paddle +from paddle.jit import sot + +global_x = 1 +global_y = paddle.to_tensor(2) +global_z = None +global_del_val = 1 +global_dict = {} +global_list = [1, 2] +global_inline = 0 + + +def global_func_int(): + global global_x + global_x = global_x + 1 + return global_x + + +def global_func_int_add(): + global global_x + global_x = global_x + global_x + return global_x + global_x + + +def global_func_tensor_int_add(tensor_y: paddle.Tensor): + global global_x + global_x += 1 + return global_x + tensor_y + + +def global_multiple_update(): + global global_x + global_x = 999 + global_x = 888 + global_x = 777 + return global_x - 1 + + +def global_func_tensor(): + global global_y + global_y = global_y + global_y + return global_y + + +def global_func_tensor_add(): + global global_y + global_y = global_y + global_y + return global_y + global_y + + +def global_func(): + global global_x + global global_y + global global_z + + global_z = global_x + global_y + return global_z + + +def global_del_global(): + global global_del_val + + del global_del_val + + +def global_func_dict(): + global global_dict + global_dict["key"] = "value" + global_dict.update({"test_key1": "test_value2"}) + return global_dict + + +def global_func_control1(): + global global_dict + if "key" in global_dict: + del global_dict["key"] + return global_dict + + +def global_func_control2(): + global global_list + for i in range(len(global_list)): + global_list[i] = global_list[i] + 1 + return global_list + + +def global_func_inline_inner_1(): + global global_inline + global_func_inline_inner_2() + global_inline += 1 + + +def global_func_inline_inner_2(): + global global_inline + global_inline += 1 + + +def global_func_inline(): + global_func_inline_inner_1() + global global_inline + return global_inline + + +class TestGlobal(TestCaseBase): + def test_global_func_int(self): + global global_x + self.assert_results_with_global_check(global_func_int, ["global_x"]) + global_x += 1 + self.assert_results_with_global_check(global_func_int, ["global_x"]) + self.assert_results_with_global_check(global_func_int_add, ["global_x"]) + + def test_global_multiple_update(self): + self.assert_results_with_global_check( + global_multiple_update, ["global_x"] + ) + + def test_global_func_tensor_int_add(self): + self.assert_results_with_global_check( + global_func_tensor_int_add, ["global_x"], paddle.to_tensor(1) + ) + + def test_global_func_tensor(self): + self.assert_results_with_global_check(global_func_tensor, ["global_y"]) + self.assert_results_with_global_check( + global_func_tensor_add, ["global_y"] + ) + + def test_global_func(self): + self.assert_results_with_global_check(global_func, ["global_z"]) + self.assertIn("global_del_val", global_del_global.__globals__) + sot.symbolic_translate(global_del_global)() + self.assertNotIn("global_del_val", 
global_del_global.__globals__) + + def test_global_func_dict(self): + self.assert_results_with_global_check(global_func_dict, ["global_dict"]) + self.assert_results_with_global_check( + global_func_control1, ["global_dict"] + ) + + def test_global_func_list(self): + self.assert_results_with_global_check( + global_func_control2, ["global_list"] + ) + + def test_global_func_inline(self): + global global_inline + global_inline = 0 + sot.symbolic_translate(global_func_inline)() + self.assertEqual(global_inline, 2) + sot.symbolic_translate(global_func_inline)() + self.assertEqual(global_inline, 4) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_analysis_inputs.py b/test/sot/test_analysis_inputs.py new file mode 100644 index 00000000000000..20b32c2225324f --- /dev/null +++ b/test/sot/test_analysis_inputs.py @@ -0,0 +1,249 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import inspect +import sys +import unittest + +import paddle +from paddle.jit.sot.opcode_translator.instruction_utils import ( + analysis_inputs, + calc_offset_from_bytecode_offset, + get_instructions, +) + + +def assert_inputs_equals(instruction_offset: int, expected_inputs: set[str]): + current_frame = inspect.currentframe() + assert current_frame is not None + test_frame = current_frame.f_back + assert test_frame is not None + + instructions = get_instructions(test_frame.f_code) + current_instr_idx = calc_offset_from_bytecode_offset( + test_frame.f_lasti + 2, instructions + ) + actual_inputs = analysis_inputs( + instructions, current_instr_idx + instruction_offset + ) + assert ( + set(actual_inputs) == expected_inputs + ), f"actual_inputs: {actual_inputs}, expected_inputs: {expected_inputs}" + + +def case1(x): + m = x + 1 + n = x + 2 + assert_inputs_equals(0, {"x", "n"}) + y = x + 2 + assert_inputs_equals(0, {"n"}) + return n + + +def case2(x): + x = x + 1 + assert_inputs_equals(0, {"x"}) + y = x + 3 + z = x + y + assert_inputs_equals(0, {"x"}) + x += 1 + m = x + 1 + n = x + m + assert_inputs_equals(0, set()) + return 1 + + +def case3(x): + y = x + 1 + + assert_inputs_equals(0, {"x"}) + if x: + z = 1 + else: + z = 2 + return z + + +def case4(x): + y = x + 1 + + assert_inputs_equals(0, {"x", "y"}) + if x: + z = y + else: + z = x + return z + + +def case5(x): + y = x + 1 + z = x + 2 + + assert_inputs_equals(0, {"z"}) + if z: + a = 1 + else: + b = 2 + return z + + +def case6(x): + y = x + 1 + z = x + 2 + + assert_inputs_equals(0, {"a", "z"}) + if z: + a = 1 + else: + a += 1 + return z + + +def case7(x): + y = x + 1 + z = x + 2 + + assert_inputs_equals(0, {"a", "z"}) + if not z: + a += 1 # noqa: F821 + else: + a = 1 + return z + + +def breakgraph_api(x): + return x + + +def normal_api(x): + return x + + +def case8(x): + x = normal_api(x) + assert_inputs_equals(0, {"x"}) + for i in range(10): + x += 1 + if i > 5: + continue + x += 10086 + x += i + return x + + +case9_offset = -9 if sys.version_info >= 
(3, 11) else -7 + + +def case9(x): + x = breakgraph_api(x) + assert_inputs_equals( + case9_offset, set() + ) # analysis when call breakgraph api (CALL_FUNCTION) + for i in range(10): + x += 1 + if i > 5: + continue + x += 10086 + x += i + return x + + +def case10(x): + assert_inputs_equals(0, {"x", "y"}) + # if x == 0, y will be read before assignment + for i in range(x): + y = i + z = y + + return y + 1 + + +def case11(x): + y = x + 1 + z = x + 2 + + assert_inputs_equals(0, {"a", "y", "z"}) + if z: + if not y: + a += 1 # noqa: F821 + else: + a = 2 + else: + if y: + a = 1 + else: + a += 1 + return z + + +def case12(x): + y = x + 1 + z = x + 2 + + assert_inputs_equals(0, {"a", "y", "z"}) + if z: + if y: + a = 2 + else: + a += 2 + else: + if y: + a += 1 + else: + a = 1 + return z + + +class TestAnalysisInputs(unittest.TestCase): + def test_case1(self): + case1(paddle.to_tensor([1])) + + def test_case2(self): + case2(paddle.to_tensor([2])) + + def test_case3(self): + case3(paddle.to_tensor([3])) + + def test_case4(self): + case4(paddle.to_tensor([4])) + + def test_case5(self): + case5(paddle.to_tensor([5])) + + def test_case6(self): + case6(paddle.to_tensor([6])) + + def test_case7(self): + case7(paddle.to_tensor([7])) + + def test_case8(self): + case8(paddle.to_tensor([8])) + + def test_case9(self): + case9(paddle.to_tensor([9])) + + def test_case10(self): + case10(paddle.to_tensor([10])) + + def test_case11(self): + case11(paddle.to_tensor([11])) + + def test_case12(self): + case12(paddle.to_tensor([12])) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_break_graph.py b/test/sot/test_break_graph.py new file mode 100644 index 00000000000000..cc1aca51caec30 --- /dev/null +++ b/test/sot/test_break_graph.py @@ -0,0 +1,168 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
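The `analysis_inputs` pass exercised above computes, for a given bytecode offset, which locals the remaining code still reads before writing them. A toy, straight-line-only rendition of that idea, assuming nothing beyond the standard library (the real pass also follows branches, as cases 3-12 show, and uses the project's own instruction utilities):

import dis
import types

def toy_analysis_inputs(code: types.CodeType, start_offset: int) -> set:
    # Locals read before being (re)assigned from `start_offset` onward.
    # Straight-line only: branches are not followed, unlike the real pass.
    reads, writes = set(), set()
    for instr in dis.get_instructions(code):
        if instr.offset < start_offset:
            continue
        if instr.opname == "LOAD_FAST" and instr.argval not in writes:
            reads.add(instr.argval)
        elif instr.opname == "STORE_FAST":
            writes.add(instr.argval)
    return reads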
+ +import unittest + +import numpy as np +from test_case_base import TestCaseBase + +import paddle +from paddle.jit.sot.utils.paddle_api_config import add_break_graph_apis + + +def ifelse_func(x, y): + if x > 0: + y = y + 1 + else: + y = y + 2 + return y + + +class TestIfElse(TestCaseBase): + def test_simple(self): + x = paddle.to_tensor([1.0]) + y = paddle.to_tensor([2.0]) + self.assert_results(ifelse_func, x, y) + + +def multi_output(x: paddle.Tensor): + m = x + 1 + if x > 0: + return m + else: + return 2 * m + + +class TestExecutor(TestCaseBase): + def test_simple(self): + x = paddle.to_tensor(2) + self.assert_results(multi_output, x) + x = paddle.to_tensor(-2) + self.assert_results(multi_output, x) + + +def print_break_graph(x, y): + z = x + y + print(x, z) + out = y * z * 2 + return out + + +class TestPrint(TestCaseBase): + def test_simple(self): + x = paddle.to_tensor(2) + y = paddle.to_tensor(3) + self.assert_results(print_break_graph, x, y) + + +def to_tensor_break_graph(x, y): + z = x + y + out = y * paddle.to_tensor(2) * z + return out + + +class TestToTensor(TestCaseBase): + def test_simple(self): + add_break_graph_apis([paddle.to_tensor]) + x = paddle.to_tensor(2) + y = paddle.to_tensor(3) + self.assert_results(to_tensor_break_graph, x, y) + + +def tensor_clear_gradient(x): + x = paddle.to_tensor(x) + x.clear_gradient() + return x + + +class TestBreakGraphInResumeFn(TestCaseBase): + def test_simple(self): + x = paddle.to_tensor(2) + self.assert_results(tensor_clear_gradient, x) + + +def inner_fn(a, b, c, d): + return a + b * c - d + + +def multi_stack_args(a, b, c): + out = inner_fn(a, b, c, paddle.to_tensor(4)) + return out + + +class TestMultiStackArgs(TestCaseBase): + def test_simple(self): + a = paddle.to_tensor(1) + b = paddle.to_tensor(2) + c = paddle.to_tensor(3) + self.assert_results(multi_stack_args, a, b, c) + + +def break_graph_in_call_method(x): + out = paddle.nn.functional.relu(paddle.to_tensor([4.0])) + return x + out + + +def numpy_break_graph(): + a = paddle.to_tensor([1, 2]) + b = np.sum(a.numpy()) + print(b) + return b + + +class TestBreakGraphInCallMethod(TestCaseBase): + def test_simple(self): + x = paddle.to_tensor([1.0]) + break_graph_in_call_method(x) + x = paddle.to_tensor([2.0]) + break_graph_in_call_method(x) + + x = paddle.to_tensor([3.0]) + self.assert_results(break_graph_in_call_method, x) + + def test_numpy(self): + self.assert_results(numpy_break_graph) + + +def test_break_graph_repeat(x): + out = paddle.to_tensor( + paddle.to_tensor(paddle.to_tensor(paddle.to_tensor([1.0]))) + ) + return x + out + + +class TestBreakGraphRepeat(TestCaseBase): + def test_simple(self): + x = paddle.to_tensor([1.0]) + test_break_graph_repeat(x) + x = paddle.to_tensor([2.0]) + test_break_graph_repeat(x) + + x = paddle.to_tensor([3.0]) + self.assert_results(test_break_graph_repeat, x) + + +def break_graph_resume_pass_null(x, y): + return paddle.add(x, y[0:50] if y is not None else None) + + +class TestBreakGraphResumePassNull(TestCaseBase): + def test_break_graph_resume_pass_null(self): + x = paddle.rand([50, 50], dtype=paddle.float32) + y = paddle.rand([100, 50], dtype=paddle.float32) + self.assert_results(break_graph_resume_pass_null, x, y) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_builtin_dispatch.py b/test/sot/test_builtin_dispatch.py new file mode 100644 index 00000000000000..e4a1ee5fb29993 --- /dev/null +++ b/test/sot/test_builtin_dispatch.py @@ -0,0 +1,329 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import math
+import operator
+import unittest
+import weakref
+
+from test_case_base import (
+    TestCaseBase,
+    test_instruction_translator_cache_context,
+)
+
+import paddle
+from paddle.jit.sot.psdb import check_no_breakgraph
+
+
+def dispatch_len(x: paddle.Tensor):
+    return len(x.shape)
+
+
+def dispatch_tensor_len(x: paddle.Tensor):
+    return len(x)
+
+
+def dispatch_reversed(x: paddle.Tensor | int, y: paddle.Tensor | int):
+    return list(reversed([x + 1, y - 1, x * 10, y + 1000]))
+
+
+def dispatch_bool(x: paddle.Tensor):
+    return operator.truth(x.shape) and bool(x.shape)
+
+
+def dispatch_ceil(x: paddle.Tensor | float):
+    return math.ceil(x) + 1
+
+
+def dispatch_floor(x: paddle.Tensor | float):
+    return math.floor(x) + 1
+
+
+def test_sum_tuple(x: paddle.Tensor | int, y: paddle.Tensor | int):
+    return sum((x, y))
+
+
+def test_sum_tuple2(
+    x: paddle.Tensor | int | list[int] | list[paddle.Tensor],
+    y: paddle.Tensor | int | list[int] | list[paddle.Tensor],
+):
+    return sum((x, y), x)
+
+
+def test_sum_tuple3(x):
+    return sum((), x)
+
+
+def test_sum_list(x: paddle.Tensor | int, y: paddle.Tensor | int):
+    return sum([x, y])
+
+
+def test_sum_list2(
+    x: paddle.Tensor | int | list[int] | list[paddle.Tensor],
+    y: paddle.Tensor | int | list[int] | list[paddle.Tensor],
+):
+    return sum([x, y], x)
+
+
+def test_sum_list3(x):
+    return sum([], x)
+
+
+def test_tensor_sum(x: paddle.Tensor):
+    return sum(x)
+
+
+def test_tensor_sum_api(x: paddle.Tensor):
+    return x.sum()
+
+
+def test_pow(x: paddle.Tensor | int, y: paddle.Tensor | int):
+    return pow(x, y)
+
+
+def test_pow2(x: paddle.Tensor | int, y: paddle.Tensor | int):
+    return pow(x, y, 1)
+
+
+def test_tensor_pow_api(x: paddle.Tensor, y: paddle.Tensor | int):
+    return x.pow(y)
+
+
+def test_math_pow(x: int, y: int):
+    return math.pow(x, y)
+
+
+def test_chr(x: int | paddle.Tensor):
+    return chr(x)
+
+
+def test_ord(x: str):
+    return ord(x)
+
+
+@check_no_breakgraph
+def test_sqrt(x: int):
+    return math.sqrt(x)
+
+
+class TestBuiltinDispatch(TestCaseBase):
+    def test_dispatch_len(self):
+        self.assert_results(dispatch_len, paddle.to_tensor([1, 2, 3]))
+
+    def test_dispatch_bool(self):
+        self.assert_results(dispatch_bool, paddle.to_tensor([1, 2, 3]))
+
+    def test_dispatch_tensor_len(self):
+        with test_instruction_translator_cache_context() as ctx:
+            self.assert_results(
+                dispatch_tensor_len, paddle.to_tensor([1, 2, 3])
+            )
+            self.assertEqual(ctx.translate_count, 1)
+            self.assert_results(
+                dispatch_tensor_len, paddle.to_tensor([4, 5, 6])
+            )
+            self.assertEqual(ctx.translate_count, 1)
+
+    def test_dispatch_list_reversed(self):
+        self.assert_results(dispatch_reversed, paddle.to_tensor(1), 2)
+        self.assert_results(dispatch_reversed, 2, paddle.to_tensor(1))
+
+    def test_dispatch_tensor_reversed(self):
+        self.assert_results(
+            dispatch_reversed,
+            paddle.to_tensor([1, 2]),
+            paddle.to_tensor([3, 4]),
+        )
+
+    def test_not_dispatch_tensor_ceil(self):
+        # ceil should break graph, since it returns an int rather than a tensor
+        self.assert_results(dispatch_ceil, paddle.to_tensor(1.2))
+
+    def test_dispatch_float_ceil(self):
+        self.assert_results(dispatch_ceil, 1.2)
+
+    def test_not_dispatch_tensor_floor(self):
+        # floor should break graph, since it returns an int rather than a tensor
+        self.assert_results(dispatch_floor, paddle.to_tensor(1.2))
+
+    def test_dispatch_float_floor(self):
+        self.assert_results(dispatch_floor, 1.2)
+
+    def test_dispatch_sum(self):
+        self.assert_results(test_sum_tuple, 1, 1)
+        self.assert_results(test_sum_tuple, paddle.to_tensor(1), 1)
+        self.assert_results(
+            test_sum_tuple, paddle.to_tensor(1), paddle.to_tensor(1)
+        )
+        self.assert_results(
+            test_sum_tuple, paddle.to_tensor([1, 2]), paddle.to_tensor(1)
+        )
+        self.assert_results(
+            test_sum_tuple, paddle.to_tensor([1, 2]), paddle.to_tensor([1, 3])
+        )
+        self.assert_results(test_sum_tuple2, 1, 1)
+        self.assert_results(test_sum_tuple2, [1, 2], [3, 4])
+        self.assert_results(test_sum_tuple2, paddle.to_tensor(1), 1)
+        self.assert_results(
+            test_sum_tuple2, paddle.to_tensor(1), paddle.to_tensor(1)
+        )
+        self.assert_results(
+            test_sum_tuple2,
+            [paddle.to_tensor(1), paddle.to_tensor(2)],
+            [paddle.to_tensor(3), paddle.to_tensor(4)],
+        )
+        self.assert_results(
+            test_sum_tuple2, paddle.to_tensor([1, 2]), paddle.to_tensor(1)
+        )
+        self.assert_results(
+            test_sum_tuple2, paddle.to_tensor([1, 2]), paddle.to_tensor([1, 3])
+        )
+        self.assert_results(test_sum_tuple3, 1)
+        self.assert_results(test_sum_tuple3, paddle.to_tensor(1))
+        self.assert_results(test_sum_list, 1, 1)
+        self.assert_results(test_sum_list, paddle.to_tensor(1), 1)
+        self.assert_results(
+            test_sum_list, paddle.to_tensor(1), paddle.to_tensor(1)
+        )
+        self.assert_results(
+            test_sum_list, paddle.to_tensor([1, 2]), paddle.to_tensor(1)
+        )
+        self.assert_results(
+            test_sum_list, paddle.to_tensor([1, 2]), paddle.to_tensor([1, 3])
+        )
+        self.assert_results(test_sum_list2, 1, 1)
+        self.assert_results(test_sum_list2, [1, 2], [3, 4])
+        self.assert_results(test_sum_list2, paddle.to_tensor(1), 1)
+        self.assert_results(
+            test_sum_list2, paddle.to_tensor(1), paddle.to_tensor(1)
+        )
+        self.assert_results(
+            test_sum_list2,
+            [paddle.to_tensor(1), paddle.to_tensor(2)],
+            [paddle.to_tensor(3), paddle.to_tensor(4)],
+        )
+        self.assert_results(
+            test_sum_list2, paddle.to_tensor([1, 2]), paddle.to_tensor(1)
+        )
+        self.assert_results(
+            test_sum_list2, paddle.to_tensor([1, 2]), paddle.to_tensor([1, 3])
+        )
+        self.assert_results(test_sum_list3, 1)
+        self.assert_results(test_sum_list3, paddle.to_tensor(1))
+        self.assert_results(test_tensor_sum, paddle.to_tensor([1, 2]))
+        self.assert_results(test_tensor_sum, paddle.to_tensor((1, 2)))
+        self.assert_results(test_tensor_sum_api, paddle.to_tensor([1, 2]))
+        self.assert_results(test_tensor_sum_api, paddle.to_tensor((1, 2)))
+
+    def test_dispatch_pow(self):
+        self.assert_results(test_pow, 2, 3)
+        self.assert_results(test_pow, paddle.to_tensor(2), 3)
+        self.assert_results(test_pow, paddle.to_tensor(2), paddle.to_tensor(3))
+        self.assert_results(test_pow2, 2, 3)
+        self.assert_results(test_math_pow, 2, 3)
+        self.assert_results(test_tensor_pow_api, paddle.to_tensor(2), 3)
+        self.assert_results(
+            test_tensor_pow_api, paddle.to_tensor(2), paddle.to_tensor(3)
+        )
+
+    def test_dispatch_chr(self):
+        self.assert_results(test_chr, 65)
+        self.assert_results(test_chr, 0x41)
+        self.assert_results(test_chr, paddle.to_tensor(65))
+        self.assert_results(test_chr, paddle.to_tensor(0x41))
+
+    def test_dispatch_ord(self):
+        self.assert_results(test_ord, "a")
+
+    def test_dispatch_sqrt(self):
+        self.assert_results(test_sqrt, 9)
+
+
+def run_getattr(x: paddle.Tensor):
+    attr = 'dtype'
+    out = getattr(x, attr)
+    return out
+
+
+class TestGetattr(TestCaseBase):
+    def test_getattr(self):
+        x = paddle.to_tensor(4)
+        self.assert_results(run_getattr, x)
+
+
+def tensor_hasattr(x: paddle.Tensor):
+    return (
+        hasattr(x, "dtype"),
+        hasattr(x, "stop_gradient"),
+        hasattr(x, "abs"),
+        hasattr(x, "non_tensor_attr"),
+    )
+
+
+class ObjectHasattr:
+    def __init__(self):
+        self.attr1 = 1
+        self.attr2 = "2"
+        self.attr3 = [3]
+
+
+def object_hasattr(x: ObjectHasattr):
+    return (
+        hasattr(x, "attr1"),
+        hasattr(x, "attr2"),
+        hasattr(x, "attr3"),
+        hasattr(x, "non_obj_attr"),
+    )
+
+
+def layer_hasattr(layer: paddle.nn.Layer):
+    return (
+        hasattr(layer, "parameters"),
+        hasattr(layer, "sublayers"),
+        hasattr(layer, "non_layer_attr"),
+    )
+
+
+class TestHasattr(TestCaseBase):
+    def test_tensor_hasattr(self):
+        x = paddle.to_tensor(4)
+        self.assert_results(tensor_hasattr, x)
+
+    def test_object_hasattr(self):
+        x = ObjectHasattr()
+        self.assert_results(object_hasattr, x)
+
+    def test_layer_hasattr(self):
+        x = paddle.nn.Layer()
+        self.assert_results(layer_hasattr, x)
+
+
+class WeakrefableObject:
+    ...
+
+
+def weakref_breakgraph(obj):
+    return weakref.ref(obj)
+
+
+class TestWeakref(TestCaseBase):
+    def test_weakref_breakgraph(self):
+        obj = WeakrefableObject()
+        self.assert_results(weakref_breakgraph, obj)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/sot/test_call_object.py b/test/sot/test_call_object.py
new file mode 100644
index 00000000000000..486f3591f43269
--- /dev/null
+++ b/test/sot/test_call_object.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import unittest + +from test_case_base import TestCaseBase + +import paddle + +patched = lambda self, x: x * self.a + +patched2 = lambda self, x: x * self.a + 3 + + +class A: + def __init__(self, a): + self.a = a + + def __call__(self, x): + return self.add(x) + + def add(self, x): + return x + self.a + + multi = patched + + +class B: + def __init__(self, a): + self.a = A(a) + + def __call__(self, x, func): + return getattr(self.a, func)(x) + + def self_call(self, x, func): + return getattr(self.a, func)(self.a, x) + + +def foo_1(a, x): + return a(x) + + +def foo_2(a, x): + return a.multi(x) + + +def foo_3(b, x): + return b(x, "multi") + + +def foo_4(b, x): + return b(x, "add") + + +def foo_5(b, x): + return b.self_call(x, "multi") + + +class TestExecutor(TestCaseBase): + def test_simple(self): + c = B(13) + c.a.multi = patched2 + self.assert_results(foo_1, A(13), paddle.to_tensor(2)) + self.assert_results(foo_2, A(13), paddle.to_tensor(2)) + self.assert_results(foo_3, B(13), paddle.to_tensor(2)) + self.assert_results(foo_4, B(13), paddle.to_tensor(2)) + self.assert_results(foo_5, c, paddle.to_tensor(2)) + self.assert_results(foo_4, c, paddle.to_tensor(2)) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_case_base.py b/test/sot/test_case_base.py new file mode 100644 index 00000000000000..03ce3c98227e8a --- /dev/null +++ b/test/sot/test_case_base.py @@ -0,0 +1,158 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
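At the core of the harness below, `assert_results` runs a function twice — once eagerly and once under `symbolic_translate` — and compares the outputs. A stripped-down sketch of that contract for plain tensor outputs (the real class below also handles nested containers, side effects, and globals):

import numpy as np
import paddle
from paddle.jit.sot import symbolic_translate

def check_results(func, *inputs):
    # translated execution must reproduce eager execution
    sym_out = symbolic_translate(func)(*inputs)
    eager_out = func(*inputs)
    np.testing.assert_allclose(sym_out.numpy(), eager_out.numpy())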
+ +from __future__ import annotations + +import contextlib +import copy +import inspect +import os +import types +import unittest + +import numpy as np + +import paddle +from paddle.jit.sot import symbolic_translate +from paddle.jit.sot.opcode_translator.executor.executor_cache import ( + OpcodeExecutorCache, +) + + +@contextlib.contextmanager +def test_instruction_translator_cache_context(): + cache = OpcodeExecutorCache() + cache.clear() + yield cache + cache.clear() + + +def github_action_error_msg(msg: str): + if 'GITHUB_ACTIONS' in os.environ: + frame = inspect.currentframe() + if frame is not None: + # find the first frame that is in the test folder + while frame.f_back is not None: + filename = frame.f_code.co_filename + if filename.startswith("./"): + filename = f"tests/{filename[2:]}" + lineno = frame.f_lineno + output = f"\n::error file={filename},line={lineno}::{msg}" + return output + frame = frame.f_back + return None + + +class TestCaseBase(unittest.TestCase): + def assertIs(self, x, y, msg=None): + super().assertIs(x, y, msg=msg) + if msg is None: + msg = f"Assert Is, x is {x}, y is {y}" + msg = github_action_error_msg(msg) + if msg is not None: + print(msg) + + def assertEqual(self, x, y, msg=None): + super().assertEqual(x, y, msg=msg) + if msg is None: + msg = f"Assert Equal, x is {x}, y is {y}" + msg = github_action_error_msg(msg) + if msg is not None: + print(msg) + + def assert_nest_match(self, x, y): + cls_x = type(x) + cls_y = type(y) + msg = f"type mismatch, x is {cls_x}, y is {cls_y}" + self.assertIs(cls_x, cls_y, msg=msg) + + container_types = (tuple, list, dict, set) + if cls_x in container_types: + msg = f"length mismatch, x is {len(x)}, y is {len(y)}" + self.assertEqual( + len(x), + len(y), + msg=msg, + ) + if cls_x in (tuple, list): + for x_item, y_item in zip(x, y): + self.assert_nest_match(x_item, y_item) + elif cls_x is dict: + for x_key, y_key in zip(x.keys(), y.keys()): + self.assert_nest_match(x_key, y_key) + self.assert_nest_match(x[x_key], y[y_key]) + elif cls_x is set: + # TODO: Nested set is not supported yet + self.assertEqual(x, y) + elif cls_x in (np.ndarray, paddle.Tensor): + # TODO: support assert_allclose github error log + np.testing.assert_allclose(x, y) + else: + self.assertEqual(x, y) + + def assert_results(self, func, *inputs): + sym_output = symbolic_translate(func)(*inputs) + paddle_output = func(*inputs) + self.assert_nest_match(sym_output, paddle_output) + + def assert_results_with_side_effects(self, func, *inputs): + sym_inputs = copy.deepcopy(inputs) + sym_output = symbolic_translate(func)(*sym_inputs) + paddle_inputs = copy.deepcopy(inputs) + paddle_output = func(*paddle_inputs) + self.assert_nest_match(sym_inputs, paddle_inputs) + self.assert_nest_match(sym_output, paddle_output) + + def assert_results_with_global_check( + self, func, global_keys: list[str], *inputs + ): + def copy_fn(fn): + return types.FunctionType( + code=fn.__code__, + globals=copy.copy(fn.__globals__), + name=fn.__name__, + argdefs=fn.__defaults__, + closure=fn.__closure__, + ) + + sym_copied_fn = copy_fn(func) + sym_fn = symbolic_translate(sym_copied_fn) + paddle_fn = copy_fn(func) + sym_output = sym_fn(*inputs) + paddle_output = paddle_fn(*inputs) + for key in global_keys: + self.assert_nest_match( + sym_copied_fn.__globals__[key], paddle_fn.__globals__[key] + ) + self.assert_nest_match(sym_output, paddle_output) + + +@contextlib.contextmanager +def strict_mode_guard(value): + if "STRICT_MODE" not in os.environ: + os.environ["STRICT_MODE"] = "0" + old_value = 
os.environ["STRICT_MODE"] + os.environ["STRICT_MODE"] = str(value) + yield + os.environ["STRICT_MODE"] = old_value + + +@contextlib.contextmanager +def cost_model_guard(value): + if "COST_MODEL" not in os.environ: + os.environ["COST_MODEL"] = "True" + old_value = os.environ["COST_MODEL"] + os.environ["COST_MODEL"] = str(value) + yield + os.environ["COST_MODEL"] = old_value diff --git a/test/sot/test_code_status.py b/test/sot/test_code_status.py new file mode 100644 index 00000000000000..9fec5712c2293a --- /dev/null +++ b/test/sot/test_code_status.py @@ -0,0 +1,154 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from test_case_base import TestCaseBase, strict_mode_guard + +import paddle +from paddle.jit import sot +from paddle.jit.sot.opcode_translator.skip_files import skip_function +from paddle.jit.sot.utils.code_status import CodeState, CodeStatus + + +class SimpleNet1(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.layers = paddle.nn.LayerList( + [paddle.nn.Linear(10, 10) for _ in range(30)] + ) + + def forward(self, x): + for i in range(len(self.layers)): + sot.psdb.breakgraph() + x = self.layers[i](x) + x = self.layers[i](x) + x = self.layers[i](x) + x = self.layers[i](x) + return x + + +class SimpleNet2(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.layers = paddle.nn.LayerList( + [paddle.nn.Linear(10, 10) for _ in range(30)] + ) + + def forward(self, x): + sot.psdb.fallback() + for i in range(len(self.layers)): + x = self.layers[i](x) + x = self.layers[i](x) + x = self.layers[i](x) + x = self.layers[i](x) + return x + + +def run_net(net, x): + for i in range(20): + x = net(x) + return x + + +class TestCodeInfo(TestCaseBase): + def test_case_1(self): + CodeStatus().clear() + net = SimpleNet1() + inp = paddle.rand((10, 10)) + self.assert_results(run_net, net, inp) + code_map = CodeStatus().code_map + states = [] + for k, v in code_map.items(): + if k.co_name.startswith("#") or k.co_name.startswith("$"): + states.append(v) + elif k in CodeStatus().WITH_GRAPH_API: + assert v.state == CodeState.WITH_GRAPH + else: + assert v.state == CodeState.WITHOUT_GRAPH + # run_net, forward, loop body, resumed part2 in loop body + assert len([v for v in states if v.state == CodeState.WITH_GRAPH]) == 4 + # resumed part1 in loop body + assert ( + len([v for v in states if v.state == CodeState.WITHOUT_GRAPH]) == 1 + ) + + def test_case_2(self): + with strict_mode_guard(0): + CodeStatus().clear() + net = SimpleNet2() + inp = paddle.rand((10, 10)) + self.assert_results(run_net, net, inp) + code_map = CodeStatus().code_map + states = [] + for k, v in code_map.items(): + if k.co_name.startswith("#") or k.co_name.startswith("$"): + states.append(v) + elif k in CodeStatus().WITH_GRAPH_API: + assert v.state == CodeState.WITH_GRAPH + else: + assert v.state == CodeState.WITHOUT_GRAPH + # no graph found because fallback (paddle api will not enter simulate) + assert ( + len([v for v in 
states if v.state == CodeState.WITH_GRAPH]) == 0 + ) + + +def no_skip_func_0(x): + return x + 1 + + +def skipped_func_0(): + pass + + +def skipped_func_1(x): + return x + 1 + + +def skipped_func_2(x): + return no_skip_func_0(x) + + +def call_skipped_func_0(x): + for i in range(15): + skipped_func_0() + x = skipped_func_1(x) + x = skipped_func_2(x) + return x + + +skip_function(skipped_func_0) +skip_function(skipped_func_1) +skip_function(skipped_func_2) +skip_function(call_skipped_func_0) + + +class TestDisableSkippedFrame(TestCaseBase): + def test_case_0(self): + CodeStatus().clear() + x = paddle.to_tensor([1]) + self.assert_results(call_skipped_func_0, x) + code_map = CodeStatus().code_map + assert ( + code_map[skipped_func_0.__code__].state == CodeState.WITHOUT_GRAPH + ) + assert ( + code_map[skipped_func_1.__code__].state == CodeState.WITHOUT_GRAPH + ) + assert code_map[skipped_func_2.__code__].state == CodeState.WITH_GRAPH + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_constant_graph.py b/test/sot/test_constant_graph.py new file mode 100644 index 00000000000000..970f9f49024131 --- /dev/null +++ b/test/sot/test_constant_graph.py @@ -0,0 +1,54 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# New Supported Instructions: +# BUILD_MAP (new) +# BUILD_CONST_KEY_MAP (new) + +import unittest + +from test_case_base import TestCaseBase + +import paddle + + +def func_1(format_str, tensor): + str = format_str.format(xx=12) + a = "{xx} = 12".format + ttt = f"{10} = 12" + a(xx=12) + tensor = tensor + 1 + return str, tensor + + +def func_2(format_str, tensor): + str = format_str % 10 + tensor = tensor + 1 + return str, tensor + + +class TestConstantGraph(TestCaseBase): + def test_case_1(self): + x = "{xx} is xx" + tensor = paddle.to_tensor(1) + self.assert_results(func_1, x, tensor) + + def test_case_2(self): + x = "%s is xx" + tensor = paddle.to_tensor(1) + self.assert_results(func_2, x, tensor) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_cost_model.py b/test/sot/test_cost_model.py new file mode 100644 index 00000000000000..07899a03efbfd6 --- /dev/null +++ b/test/sot/test_cost_model.py @@ -0,0 +1,114 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import time +import unittest + +from test_case_base import TestCaseBase, cost_model_guard + +import paddle +from paddle.jit.sot import psdb, symbolic_translate +from paddle.jit.sot.utils import StepInfoManager, StepState + + +def dyn_fast(x, net, iter_): + for i in iter_: + x = net(x) + return x + + +def sot_fast_with_single_graph(x, net): + if not psdb.in_sot(): + time.sleep(0.1) + return x + 1 + + +def sot_fast_with_multi_graph(x, net): + if not psdb.in_sot(): + time.sleep(0.1) + x = x + 1 + psdb.breakgraph() + x = x + 2 + return x + + +class Net(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.linear = paddle.nn.Linear(10, 10) + + def forward(self, x): + if not psdb.in_sot(): + time.sleep(0.1) + x = x / 3 + x = x + 5 + x = self.linear(x) + return x + + +class TestCostModel(TestCaseBase): + @cost_model_guard("True") + def test_dyn_fast(self): + x = paddle.rand([10]) + net = paddle.nn.Linear(10, 10) + sot_fn = symbolic_translate(dyn_fast) + for i in range(60): + sot_fn(x, net, iter(range(10))) + + state = StepInfoManager().step_record[dyn_fast.__code__].state + assert state == StepState.RUN_DYN + + @cost_model_guard("True") + def test_sot_fast_with_multi_graph(self): + x = paddle.rand([10]) + net = paddle.nn.Linear(10, 10) + sot_fn = symbolic_translate(sot_fast_with_multi_graph) + for i in range(30): + sot_fn(x, net) + + state = ( + StepInfoManager() + .step_record[sot_fast_with_multi_graph.__code__] + .state + ) + assert state == StepState.RUN_SOT + + @cost_model_guard("True") + def test_sot_fast_with_single_graph(self): + x = paddle.rand([10]) + net = paddle.nn.Linear(10, 10) + for i in range(30): + symbolic_translate(sot_fast_with_single_graph)(x, net) + + state = ( + StepInfoManager() + .step_record[sot_fast_with_single_graph.__code__] + .state + ) + assert state == StepState.RUN_SOT + + @cost_model_guard("True") + def test_net(self): + x = paddle.rand([10]) + net = Net() + net = paddle.jit.to_static(net, enable_fallback=True) + for i in range(30): + x = net(x) + + state = StepInfoManager().step_record[Net.forward.__code__].state + assert state == StepState.RUN_SOT + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_delete_fast.py b/test/sot/test_delete_fast.py new file mode 100644 index 00000000000000..9dca7d4ea1b14c --- /dev/null +++ b/test/sot/test_delete_fast.py @@ -0,0 +1,38 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest + +from test_case_base import TestCaseBase + +import paddle + + +def test_delete_fast(a): + a = a + 2 + t = a * 3 + del t + return a + + +class TestExecutor(TestCaseBase): + def test_simple(self): + a = paddle.to_tensor(1) + self.assert_results(test_delete_fast, a) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_dup_top.py b/test/sot/test_dup_top.py new file mode 100644 index 00000000000000..5cb28a2dc6ceac --- /dev/null +++ b/test/sot/test_dup_top.py @@ -0,0 +1,49 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +from test_case_base import TestCaseBase + +import paddle + + +def func_dup_top_1(): + return True == True != False + + +def func_dup_top_2(x): + y = x + 1 + return True == True != False + + +def func_dup_top_two(x: list[paddle.Tensor]): + x[0] += x[1] + return x + + +class TestDupTop(TestCaseBase): + def test_dup_top(self): + self.assert_results(func_dup_top_1) + self.assert_results(func_dup_top_2, paddle.to_tensor(1.0)) + # TODO: fix this after we support side effect + # self.assert_results( + # func_dup_top_two, [paddle.to_tensor(1.0), paddle.to_tensor(2.0)] + # ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_enumerate.py b/test/sot/test_enumerate.py new file mode 100644 index 00000000000000..f81a451da55c99 --- /dev/null +++ b/test/sot/test_enumerate.py @@ -0,0 +1,116 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
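For reference, `True == True != False` in test_dup_top.py above is a chained comparison, evaluated as `(True == True) and (True != False)` with the middle operand computed only once; CPython 3.10 and earlier duplicate it on the stack with DUP_TOP (3.11+ uses COPY/SWAP instead), which is the instruction the test targets. The duplication can be inspected with the standard library alone:

import dis

# On CPython <= 3.10 the listing shows DUP_TOP and ROT_THREE for the
# shared middle operand of the chained comparison.
dis.dis(compile("True == True != False", "<chained>", "eval"))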
+ +import unittest + +from test_case_base import TestCaseBase, strict_mode_guard + +import paddle + + +def test_enumerate_1(x: int, y: int): + for id, val in enumerate(range(x)): + if id % 2 == 0: + y += val + return y + + +def test_enumerate_2(x: list): + return list(enumerate(x)) + + +def test_enumerate_3(x: list): + return tuple(enumerate(x)) + + +def test_enumerate_4(x: paddle.Tensor): + sum = 0 + for idx, val in enumerate(x): + sum += val + return sum + + +# TODO(zmh): support range for tensor +def test_enumerate_5(x: paddle.Tensor): + sum = 0 + + for idx, val in enumerate(x): + for i in range(val): + sum += val + return sum + + +def test_enumerate_6(x: paddle.Tensor): + sum = 0 + + for idx, val in enumerate(x): + for i in range(idx): + sum += val + return sum + + +def test_enumerate_7(x: paddle.Tensor): + sum = 0 + x = x.flatten() + for idx, val in enumerate(x): + sum += val + return sum + + +# TODO(zmh): support -1 +def test_enumerate_8(x: paddle.Tensor): + sum = 0 + x = paddle.nonzero(x, as_tuple=False) + for idx, val in enumerate(x): + sum += val + return sum + + +def test_enumerate_10(layer_list, x): + sum = 0 + for idx, layer in enumerate(layer_list): + sum += layer(x) + return sum + + +class TestExecutor(TestCaseBase): + def test_cases(self): + x = 8 + y = 5 + ty = paddle.randn((10, 10)) + layer_list = paddle.nn.LayerList( + [paddle.nn.Linear(10, 10) for _ in range(3)] + ) + + self.assert_results(test_enumerate_1, x, y) + self.assert_results(test_enumerate_2, [2, 4, 6, 8, 10]) + self.assert_results(test_enumerate_3, [2, 4, 6, 8, 10]) + + self.assert_results(test_enumerate_4, ty) + # TODO(zmh): support range for tensor + + with strict_mode_guard(0): + self.assert_results(test_enumerate_5, paddle.to_tensor([1, 2, 3])) + self.assert_results(test_enumerate_6, paddle.to_tensor([1, 2, 3])) + self.assert_results(test_enumerate_7, ty) + # TODO(zmh): support -1 + + with strict_mode_guard(0): + self.assert_results(test_enumerate_8, ty) + + self.assert_results(test_enumerate_10, layer_list, paddle.randn((10,))) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_error_handling.py b/test/sot/test_error_handling.py new file mode 100644 index 00000000000000..c74436f0d44f4f --- /dev/null +++ b/test/sot/test_error_handling.py @@ -0,0 +1,39 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
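The enumerate cases above also iterate a Tensor directly, which walks its first axis one slice at a time; a small self-contained check of that equivalence, assuming only the public `symbolic_translate` entry point:

import paddle
from paddle.jit.sot import symbolic_translate

def row_total(x: paddle.Tensor):
    # enumerate over a Tensor yields rows (slices along axis 0)
    total = 0
    for _, row in enumerate(x):
        total += row.sum()
    return total

x = paddle.ones([3, 2])
assert float(symbolic_translate(row_total)(x)) == float(row_total(x)) == 6.0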
+ +import unittest + +from test_case_base import TestCaseBase, strict_mode_guard + +from paddle.jit import sot + + +def fn_with_try_except(): + sot.psdb.breakgraph() + sot.psdb.fallback() + try: + raise ValueError("ValueError") + except ValueError: + print("catch ValueError") + return True + + +class TestErrorHandling(TestCaseBase): + @strict_mode_guard(0) + def test_fn_with_try_except(self): + self.assert_results(fn_with_try_except) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_exception.py b/test/sot/test_exception.py new file mode 100644 index 00000000000000..26e0f55044379d --- /dev/null +++ b/test/sot/test_exception.py @@ -0,0 +1,94 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import re +import unittest + +import paddle +from paddle.jit.sot import symbolic_translate + + +def case1(x): + return n # noqa: F821 + + +def case2(x): + x = x + 1 + return x @ x + + +def case3(x): + y = x.undefined_attr + return y + + +def case4_inner(x): + y = x * 2 + print() + y = y + 1 + return y.undefined_attr + + +def case4(x): + return case4_inner(x) + + +def case5_inner3(x): + x += 1 + print(x) + z = x + 1 + return z + + +def case5_inner2(x): + x += 1 + z = case5_inner3(1 / 0) + return z + 1 + + +def case5_inner1(x): + return case5_inner2(x) + + +def case5(x): + y = case5_inner3(x) + return case5_inner1(y) + 1 + + +class TestException(unittest.TestCase): + def catch_error(self, func, inputs, error_lines: int | list[int]): + if isinstance(error_lines, int): + error_lines = [error_lines] + try: + symbolic_translate(func)(inputs) + except Exception as e: + match_results = re.compile(r'File ".*", line (\d+)').findall(str(e)) + match_results = list(map(int, match_results)) + assert ( + match_results == error_lines + ), f"{match_results} is not equal {error_lines}" + + def test_all_case(self): + self.catch_error(case1, paddle.rand([2, 1]), 25) + # TODO: support runtime error, such as x[111], x@x + # self.catch_error(case2, paddle.rand([2, 1]), 30) + self.catch_error(case3, paddle.rand([2, 1]), 34) + self.catch_error(case4, paddle.rand([2, 1]), 42) + self.catch_error(case5, paddle.rand([3, 1]), [68, 63, 58]) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_execution_base.py b/test/sot/test_execution_base.py new file mode 100644 index 00000000000000..8c16b89ec4cf18 --- /dev/null +++ b/test/sot/test_execution_base.py @@ -0,0 +1,62 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from test_case_base import TestCaseBase + +import paddle +from paddle.jit.sot import symbolic_translate +from paddle.static import BuildStrategy + + +def func(x, y): + ret = 2 * x + ret = paddle.nn.functional.relu(ret) + ret = ret + y + return ret + + +def simple(x): + ret = 2 * x + return ret + + +class TestExecutor(TestCaseBase): + def test_simple(self): + x = paddle.to_tensor([1.0]) + y = paddle.to_tensor([2.0]) + self.assert_results(simple, x) + self.assert_results(simple, y) + + +def foo(x): + out = x + 1 + out = out * 2 + out = paddle.nn.functional.relu(out) + return out + + +class TestBackend(TestCaseBase): + def test_backend(self): + x = paddle.randn([2, 3]) + dy_out = foo(x) + sot_out = symbolic_translate( + foo, build_strategy=BuildStrategy(), backend='CINN' + )(x) + self.assert_nest_match(dy_out, sot_out) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_guard_outputs.py b/test/sot/test_guard_outputs.py new file mode 100644 index 00000000000000..c717eb8190e5fc --- /dev/null +++ b/test/sot/test_guard_outputs.py @@ -0,0 +1,78 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
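The guard tests that follow pin down when the translator may reuse cached code: non-tensor arguments are baked into the guard, so changing any of them forces a re-translation, while a fresh tensor of the same kind is a cache hit. A condensed sketch of that contract, assuming it runs next to the harness so `test_case_base` is importable:

import paddle
from test_case_base import test_instruction_translator_cache_context
from paddle.jit.sot import symbolic_translate

def mix(x: paddle.Tensor, flag: int):
    return x + flag

with test_instruction_translator_cache_context() as ctx:
    symbolic_translate(mix)(paddle.to_tensor(1), 3)
    symbolic_translate(mix)(paddle.to_tensor(2), 3)  # same int: cache hit
    assert ctx.translate_count == 1
    symbolic_translate(mix)(paddle.to_tensor(1), 4)  # int changed: retranslate
    assert ctx.translate_count == 2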
+ +from __future__ import annotations + +import unittest + +from test_case_base import ( + TestCaseBase, + test_instruction_translator_cache_context, +) + +import paddle + + +def non_operator_related_fn(x: int, y: int): + return x + y + + +def partial_non_operator_related_fn(x: paddle.Tensor, y: paddle.Tensor, z: int): + a = x + y + return [a, z + z] + + +def guard_inputs(x: int, y: int, z: int): + return x + y + z + + +class TestGuardOutputs(TestCaseBase): + def test_non_operator_related_fn(self): + with test_instruction_translator_cache_context() as ctx: + self.assert_results(non_operator_related_fn, 1, 2) + self.assertEqual(ctx.translate_count, 1) + self.assert_results(non_operator_related_fn, 3, 4) + self.assertEqual(ctx.translate_count, 2) + + def test_partial_non_operator_related_fn(self): + with test_instruction_translator_cache_context() as ctx: + self.assert_results( + partial_non_operator_related_fn, + paddle.to_tensor(1), + paddle.to_tensor(2), + 3, + ) + self.assertEqual(ctx.translate_count, 1) + self.assert_results( + partial_non_operator_related_fn, + paddle.to_tensor(4), + paddle.to_tensor(5), + 6, + ) + self.assertEqual(ctx.translate_count, 2) + + def test_guard_inputs(self): + with test_instruction_translator_cache_context() as ctx: + self.assert_results(guard_inputs, 1, 2, 3) + self.assertEqual(ctx.translate_count, 1) + self.assert_results(guard_inputs, 0, 2, 3) + self.assertEqual(ctx.translate_count, 2) + self.assert_results(guard_inputs, 1, 0, 3) + self.assertEqual(ctx.translate_count, 3) + self.assert_results(guard_inputs, 1, 2, 0) + self.assertEqual(ctx.translate_count, 4) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_guard_user_defined_fn.py b/test/sot/test_guard_user_defined_fn.py new file mode 100644 index 00000000000000..193164b06f58d6 --- /dev/null +++ b/test/sot/test_guard_user_defined_fn.py @@ -0,0 +1,88 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest + +from test_case_base import ( + TestCaseBase, + test_instruction_translator_cache_context, +) + +import paddle + + +def test_guard_fn(fn, inp): + if fn is None: + return 0 + else: + return fn(inp) + + +class TestGuardOutputs(TestCaseBase): + def test_non_operator_related_fn(self): + with test_instruction_translator_cache_context() as ctx: + self.assert_results( + test_guard_fn, + paddle.nn.functional.relu, + paddle.to_tensor([1.0, -1.0]), + ) + self.assertEqual(ctx.translate_count, 1) + self.assert_results( + test_guard_fn, + paddle.nn.functional.gelu, + paddle.to_tensor([1.0, -1.0]), + ) + self.assertEqual(ctx.translate_count, 2) + self.assert_results( + test_guard_fn, + paddle.nn.functional.relu, + paddle.to_tensor([-1.0, -1.0]), + ) + self.assertEqual(ctx.translate_count, 2) + self.assert_results( + test_guard_fn, None, paddle.to_tensor([-1.0, -1.0]) + ) + self.assertEqual(ctx.translate_count, 3) + + deleted_cnt = 0 + + class Callable: + def __call__(self, var): + return paddle.nn.functional.relu(var) + + def __del__(self): + nonlocal deleted_cnt + deleted_cnt += 1 + + fn1 = Callable() + fn2 = Callable() + with test_instruction_translator_cache_context() as ctx: + self.assert_results( + test_guard_fn, fn1, paddle.to_tensor([1.0, -1.0]) + ) + self.assertEqual(ctx.translate_count, 1) + self.assert_results( + test_guard_fn, fn2, paddle.to_tensor([1.0, -1.0]) + ) + self.assertEqual(ctx.translate_count, 2) + self.assert_results( + test_guard_fn, fn2, paddle.to_tensor([1.0, -1.0]) + ) + self.assertEqual(ctx.translate_count, 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_inplace_api.py b/test/sot/test_inplace_api.py new file mode 100644 index 00000000000000..767368e9fe7dd4 --- /dev/null +++ b/test/sot/test_inplace_api.py @@ -0,0 +1,147 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from test_case_base import TestCaseBase + +import paddle +from paddle.jit.sot import symbolic_translate + + +def simple(x, y): + x[0] = 3.0 + z = [y] + y[1] = 5.0 + return x[0] + x[1] + z[0][1] + y[0] + y[1] + + +def inplace_in_if(x, y, z): + if z: + x[0] = 3.0 + z = [y] + y[1] = 5.0 + ret = x[0] + x[1] + z[0][1] + y[0] + y[1] + return ret + else: + return None + + +def inplace_in_if_fallback(x, y, z): + if z > 0: + x[0] = 3.0 + z = [y] + y[1] = 5.0 + ret = x[0] + x[1] + z[0][1] + y[0] + y[1] + return ret + else: + return None + + +def inplace_in_loop(x, y): + ret = 0 + for i in range(10): + x[0] = 1 + z = [y] + y[1] = 2 * i + 1 + ret += x[0] + x[1] + z[0][1] + y[0] + y[1] + return ret + + +def inplace_in_loop_fallback(x, y, it): + ret = 0 + for i in it: + x[0] = 1 + z = [y] + y[1] = 2 * i + 1 + ret += x[0] + x[1] + z[0][1] + y[0] + y[1] + return ret + + +def inplace_case_0(x): + x[:] = 1.0 + return x + + +def inplace_case_1(x): + x[0][0, 0::2] = 1.0 + return x + + +def inplace_case_2(x): + t = x[0] + t[:, 0::2] = t[:, 0::2] * 0 + t[:, 1::2] = t[:, 1::2] + 2 + return x + + +class TestExecutor(TestCaseBase): + def test_case(self): + self.assert_results(inplace_case_0, paddle.randn((1, 4))) + self.assert_results(inplace_case_1, [paddle.randn((1, 4))]) + self.assert_results(inplace_case_2, [paddle.randn((1, 4))]) + + def test_backward(self): + @symbolic_translate + def func(x): + m = x * 2 + n = x * 3 + y = m + y[:] = n + return y + + x = paddle.ones((1, 4)) * 4 + x.stop_gradient = False + y = func(x) + y.sum().backward() + assert (x.grad.numpy() == 3).all() + + def test_simple(self): + self.assert_results( + simple, paddle.to_tensor([1.0, 2.0]), paddle.to_tensor([3.0, 4.0]) + ) + + def test_if(self): + self.assert_results( + inplace_in_if, + paddle.to_tensor([1.0, 2.0]), + paddle.to_tensor([3.0, 4.0]), + True, + ) + self.assert_results( + inplace_in_if_fallback, + paddle.to_tensor([1.0, 2.0]), + paddle.to_tensor([3.0, 4.0]), + paddle.to_tensor(1), + ) + + def test_loop(self): + self.assert_results( + inplace_in_loop, + paddle.to_tensor([1.0, 2.0]), + paddle.to_tensor([3.0, 4.0]), + ) + + a = range(10) + sym_output = symbolic_translate(inplace_in_loop_fallback)( + paddle.to_tensor([1.0, 2.0]), paddle.to_tensor([3.0, 4.0]), iter(a) + ) + paddle_output = inplace_in_loop_fallback( + paddle.to_tensor([1.0, 2.0]), paddle.to_tensor([3.0, 4.0]), iter(a) + ) + self.assert_nest_match(sym_output, paddle_output) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_instruction_translator_cache.py b/test/sot/test_instruction_translator_cache.py new file mode 100644 index 00000000000000..6ee1b33ebbc15f --- /dev/null +++ b/test/sot/test_instruction_translator_cache.py @@ -0,0 +1,165 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import inspect +import random +import types +import unittest +from unittest.mock import patch + +from test_case_base import ( + TestCaseBase, + test_instruction_translator_cache_context, +) + +from paddle.jit.sot.opcode_translator.custom_code import CustomCode +from paddle.jit.sot.opcode_translator.executor.executor_cache import ( + OpcodeExecutorCache, +) + + +def fake_frames() -> ( + tuple[ + types.FrameType, + types.FrameType, + types.FrameType, + types.FrameType, + types.FrameType, + ] +): + def fake_inner_fn_1(): + frame = inspect.currentframe() + assert frame is not None + return frame + + def fake_inner_fn_2(): + frame = inspect.currentframe() + assert frame is not None + return frame + + def fake_inner_fn_3(): + frame = inspect.currentframe() + assert frame is not None + return frame + + def fake_inner_fn_4(): + frame = inspect.currentframe() + assert frame is not None + return frame + + def fake_inner_fn_5(): + frame = inspect.currentframe() + assert frame is not None + return frame + + return ( + fake_inner_fn_1(), + fake_inner_fn_2(), + fake_inner_fn_3(), + fake_inner_fn_4(), + fake_inner_fn_5(), + ) + + +( + FRAME_1, + FRAME_2, + FRAME_3, + FRAME_4, + FRAME_5, +) = fake_frames() + + +def mock_start_translate(frame: types.FrameType, **kwargs): + translate_map = { + FRAME_1: (CustomCode(FRAME_2.f_code, False), lambda frame: True), + FRAME_3: ( + CustomCode(FRAME_4.f_code, False), + lambda frame: False, + ), # Always re-compile + FRAME_5: (CustomCode(None, False), lambda frame: True), + } + return translate_map[frame] + + +class TestOpcodeExecutorCache(unittest.TestCase): + def reset(self): + global translate_count + translate_count = 0 + OpcodeExecutorCache().clear() + + @patch( + "paddle.jit.sot.opcode_translator.executor.executor_cache.start_translate", + mock_start_translate, + ) + def test_cache_hit(self): + with test_instruction_translator_cache_context() as ctx: + translated_code_1 = OpcodeExecutorCache()(FRAME_1) + assert translated_code_1 is not None + self.assertEqual(translated_code_1.code, FRAME_2.f_code) + self.assertEqual(ctx.translate_count, 1) + # cache hit + translated_code_2 = OpcodeExecutorCache()(FRAME_1) + assert translated_code_2 is not None + self.assertEqual(translated_code_2.code, FRAME_2.f_code) + self.assertEqual(ctx.translate_count, 1) + + @patch( + "paddle.jit.sot.opcode_translator.executor.executor_cache.start_translate", + mock_start_translate, + ) + def test_cache_miss_due_to_unknown_code(self): + with test_instruction_translator_cache_context() as ctx: + translated_code_1 = OpcodeExecutorCache()(FRAME_1) + assert translated_code_1 is not None + self.assertEqual(translated_code_1.code, FRAME_2.f_code) + self.assertEqual(ctx.translate_count, 1) + # cache miss + translated_code_2 = OpcodeExecutorCache()(FRAME_3) + assert translated_code_2 is not None + self.assertEqual(translated_code_2.code, FRAME_4.f_code) + self.assertEqual(ctx.translate_count, 2) + + @patch( + "paddle.jit.sot.opcode_translator.executor.executor_cache.start_translate", + mock_start_translate, + ) + def test_cache_miss_due_to_check_failed(self): + with test_instruction_translator_cache_context() as ctx: + translated_code_1 = OpcodeExecutorCache()(FRAME_3) + assert translated_code_1 is not None + self.assertEqual(translated_code_1.code, FRAME_4.f_code) + self.assertEqual(ctx.translate_count, 1) + # cache miss + translated_code_2 = OpcodeExecutorCache()(FRAME_3) + assert translated_code_2 is not None + self.assertEqual(translated_code_2.code, 
FRAME_4.f_code) + self.assertEqual(ctx.translate_count, 2) + + +def foo(x): + return x + 1 + + +class TestCacheExceedLimit(TestCaseBase): + def test_cache_exceed_limit(self): + for _ in range(30): + input = random.random() + self.assert_results(foo, input) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/sot/test_map.py b/test/sot/test_map.py new file mode 100644 index 00000000000000..812ab36673be42 --- /dev/null +++ b/test/sot/test_map.py @@ -0,0 +1,124 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest +from typing import Iterable + +from test_case_base import TestCaseBase, strict_mode_guard + +from paddle.jit import sot +from paddle.jit.sot.psdb import check_no_breakgraph + + +def double_num(num: float | int): + return num * 2 + + +def double_num_with_breakgraph(num: float | int): + sot.psdb.breakgraph() + return num * 2 + + +@check_no_breakgraph +def test_map_list(x: list): + return list(map(double_num, x)) + + +@check_no_breakgraph +def test_map_list_comprehension(x: list): + return [i for i in map(double_num, x)] # noqa: C416 + + +@check_no_breakgraph +def test_map_tuple(x: tuple): + return tuple(map(double_num, x)) + + +@check_no_breakgraph +def test_map_tuple_comprehension(x: tuple): + return [i for i in map(double_num, x)] # noqa: C416 + + +@check_no_breakgraph +def test_map_range(x: Iterable): + return list(map(double_num, x)) + + +@check_no_breakgraph +def test_map_range_comprehension(x: Iterable): + return [i for i in map(double_num, x)] # noqa: C416 + + +def add_dict_prefix(key: str): + return f"dict_{key}" + + +@check_no_breakgraph +def test_map_dict(x: dict): + return list(map(add_dict_prefix, x)) + + +@check_no_breakgraph +def test_map_dict_comprehension(x: dict): + return [i for i in map(add_dict_prefix, x)] # noqa: C416 + + +def test_map_list_with_breakgraph(x: list): + return list(map(double_num_with_breakgraph, x)) + + +@check_no_breakgraph +def test_map_unpack(x: list): + a, b, c, d = map(double_num, x) + return a, b, c, d + + +@check_no_breakgraph +def test_map_for_loop(x: list): + res = 0 + for i in map(double_num, x): + res += i + return res + + +class TestMap(TestCaseBase): + def test_map(self): + self.assert_results(test_map_list, [1, 2, 3, 4]) + self.assert_results(test_map_tuple, (1, 2, 3, 4)) + self.assert_results(test_map_range, range(5)) + self.assert_results(test_map_dict, {"a": 1, "b": 2, "c": 3}) + + def test_map_comprehension(self): + self.assert_results(test_map_list_comprehension, [1, 2, 3, 4]) + self.assert_results(test_map_tuple_comprehension, (1, 2, 3, 4)) + self.assert_results(test_map_range_comprehension, range(5)) + self.assert_results( + test_map_dict_comprehension, {"a": 1, "b": 2, "c": 3} + ) + + def test_map_with_breakgraph(self): + with strict_mode_guard(0): + self.assert_results(test_map_list_with_breakgraph, [1, 2, 3, 4]) + + def test_map_unpack(self): + self.assert_results(test_map_unpack, [1, 2, 3, 4]) + 
+ def test_map_for_loop(self): + self.assert_results(test_map_for_loop, [7, 8, 9, 10]) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_multiple_args.py b/test/sot/test_multiple_args.py new file mode 100644 index 00000000000000..7d5bf6b59205c7 --- /dev/null +++ b/test/sot/test_multiple_args.py @@ -0,0 +1,35 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from test_case_base import TestCaseBase + +import paddle + + +def foo(x, y): + ret = x + y + return ret + + +class TestMultipleArgs(TestCaseBase): + def test_multiple_args(self): + x = paddle.to_tensor([1.0]) + y = paddle.to_tensor([2.0]) + self.assert_results(foo, x, y) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_mutable_data.py b/test/sot/test_mutable_data.py new file mode 100644 index 00000000000000..2cedee2d8529fd --- /dev/null +++ b/test/sot/test_mutable_data.py @@ -0,0 +1,354 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from paddle.jit.sot.opcode_translator.executor.mutable_data import ( + MutableData, + MutableDictLikeData, + MutableListLikeData, +) + + +class VariableBase: + def __init__(self): + ... 
+ + +class ConstVariable(VariableBase): + def __init__(self, value): + self.value = value + + def __repr__(self): + return f"ConstVariable({self.value})" + + def __eq__(self, other): + if not isinstance(other, ConstVariable): + return False + return self.value == other.value + + +class DictVariable(VariableBase): + def __init__(self, data): + self.data = data + self.proxy = MutableDictLikeData(data, DictVariable.proxy_getter) + + @staticmethod + def proxy_getter(proxy, key): + if key not in proxy.original_data: + return MutableData.Empty() + return ConstVariable(proxy.original_data[key]) + + def getitem(self, key): + res = self.proxy.get(key) + if isinstance(res, MutableData.Empty): + raise KeyError(f"Key {key} not found") + return res + + def setitem(self, key, value): + self.proxy.set(key, value) + + def delitem(self, key): + self.proxy.delete(key) + + +class ListVariable(VariableBase): + def __init__(self, data): + self.data = data + self.proxy = MutableListLikeData(data, ListVariable.proxy_getter) + + @staticmethod + def proxy_getter(proxy, key): + if key < 0 or key >= len(proxy.original_data): + return MutableData.Empty() + return ConstVariable(proxy.original_data[key]) + + def getitem(self, key): + if isinstance(key, int): + res = self.proxy.get(key) + if isinstance(res, MutableData.Empty): + raise IndexError(f"Index {key} out of range") + return res + elif isinstance(key, slice): + return self.proxy.get_all()[key] + else: + raise TypeError(f"Invalid key type {type(key)}") + + def __getitem__(self, key): + return self.getitem(key) + + def setitem(self, key, value): + if isinstance(key, int): + self.proxy.set(key, value) + elif isinstance(key, slice): + start, end, step = key.indices(self.proxy.length) + indices = list(range(start, end, step)) + if step == 1: + # replace a continuous range + for i, idx in enumerate(indices): + self.proxy.delete(idx - i) + for i, item in enumerate(value): + self.proxy.insert(start + i, item) + else: + # replace some elements + if len(indices) != len(value): + raise ValueError( + f"Attempt to replace {len(indices)} items with {len(value)}" + ) + for i, idx in enumerate(indices): + self.proxy.set(idx, value[i]) + + def delitem(self, key): + self.proxy.delete(key) + + def insert(self, index, value): + self.proxy.insert(index, value) + + def append(self, value): + self.proxy.insert(self.proxy.length, value) + + def extend(self, value): + for item in value: + self.append(item) + + def pop(self, index=-1): + res = self.getitem(index) + self.delitem(index) + return res + + def clear(self): + for i in range(self.proxy.length): + self.delitem(0) + + def remove(self, value): + for i in range(self.proxy.length): + if self.getitem(i) == value: + self.delitem(i) + return + raise ValueError(f"Value {value} not found") + + def sort(self, key=None, reverse=False): + if key is None: + key = lambda x: x + permutation = list(range(self.proxy.length)) + permutation.sort( + key=lambda x: key(self.getitem(x).value), reverse=reverse + ) + self.proxy.permutate(permutation) + + def reverse(self): + permutation = list(range(self.proxy.length)) + permutation.reverse() + self.proxy.permutate(permutation) + + +class TestMutableDictLikeVariable(unittest.TestCase): + def test_getitem(self): + data = {"a": 1, "b": 2} + var = DictVariable(data) + self.assertEqual(var.getitem("a"), ConstVariable(1)) + self.assertEqual(var.getitem("b"), ConstVariable(2)) + + def test_setitem(self): + data = {"a": 1, "b": 2} + var = DictVariable(data) + var.setitem("a", ConstVariable(3)) + 
self.assertEqual(var.getitem("a"), ConstVariable(3)) + var.setitem("c", ConstVariable(4)) + self.assertEqual(var.getitem("c"), ConstVariable(4)) + + def test_delitem(self): + data = {"a": 1, "b": 2} + var = DictVariable(data) + var.delitem("a") + with self.assertRaises(KeyError): + var.getitem("a") + + def test_keys(self): + data = {"a": 1, "b": 2} + var = DictVariable(data) + self.assertEqual(list(var.proxy.get_all().keys()), ["a", "b"]) + + +class TestMutableListLikeVariable(unittest.TestCase): + def test_getitem(self): + data = [1, 2, 3] + var = ListVariable(data) + self.assertEqual(var.getitem(0), ConstVariable(1)) + self.assertEqual(var.getitem(1), ConstVariable(2)) + self.assertEqual(var.getitem(2), ConstVariable(3)) + + def test_getitem_slice_1(self): + data = [1, 2, 3, 4, 5, 6, 7] + var = ListVariable(data) + self.assertEqual( + var.getitem(slice(0, 3)), + [ConstVariable(1), ConstVariable(2), ConstVariable(3)], + ) + self.assertEqual( + var.getitem(slice(4, 1, -1)), + [ConstVariable(5), ConstVariable(4), ConstVariable(3)], + ) + self.assertEqual( + var.getitem(slice(1, 5, 2)), + [ConstVariable(2), ConstVariable(4)], + ) + + def test_getitem_slice_2(self): + data = [1, 2, 3, 4, 5, 6, 7] + var = ListVariable(data) + self.assertEqual( + var[0:3], + [ConstVariable(1), ConstVariable(2), ConstVariable(3)], + ) + self.assertEqual( + var[4:1:-1], + [ConstVariable(5), ConstVariable(4), ConstVariable(3)], + ) + self.assertEqual( + var[1:5:2], + [ConstVariable(2), ConstVariable(4)], + ) + + def test_setitem(self): + data = [1, 2, 3] + var = ListVariable(data) + var.setitem(0, ConstVariable(4)) + self.assertEqual(var.getitem(0), ConstVariable(4)) + var.append(ConstVariable(5)) + self.assertEqual(var.getitem(3), ConstVariable(5)) + + def test_setitem_slice_1(self): + data = [1, 2, 3, 4, 5, 6, 7] + var = ListVariable(data) + var.setitem(slice(0, 3), [ConstVariable(4), ConstVariable(5)]) + self.assertEqual( + [var.getitem(i) for i in range(var.proxy.length)], + [ConstVariable(n) for n in [4, 5, 4, 5, 6, 7]], + ) + var.setitem( + slice(4, 1, -1), + [ConstVariable(8), ConstVariable(9), ConstVariable(10)], + ) + self.assertEqual( + [var.getitem(i) for i in range(var.proxy.length)], + [ConstVariable(n) for n in [4, 5, 10, 9, 8, 7]], + ) + + def test_setitem_slice_2(self): + data = [1, 2, 3, 4, 5, 6, 7] + var = ListVariable(data) + var.setitem(slice(2, 5, 2), [ConstVariable(8), ConstVariable(9)]) + self.assertEqual( + [var.getitem(i) for i in range(var.proxy.length)], + [ConstVariable(n) for n in [1, 2, 8, 4, 9, 6, 7]], + ) + + def test_delitem(self): + data = [1, 2, 3] + var = ListVariable(data) + var.delitem(0) + with self.assertRaises(IndexError): + var.getitem(2) + var.pop() + with self.assertRaises(IndexError): + var.getitem(1) + + def test_insert(self): + data = [1, 2, 3] + var = ListVariable(data) + var.insert(0, ConstVariable(4)) + self.assertEqual( + [var.getitem(i) for i in range(var.proxy.length)], + [ConstVariable(n) for n in [4, 1, 2, 3]], + ) + var.insert(2, ConstVariable(5)) + self.assertEqual( + [var.getitem(i) for i in range(var.proxy.length)], + [ConstVariable(n) for n in [4, 1, 5, 2, 3]], + ) + + def test_append(self): + data = [1, 2, 3] + var = ListVariable(data) + var.append(ConstVariable(4)) + self.assertEqual(var.getitem(3), ConstVariable(4)) + + def test_extend(self): + data = [1, 2, 3] + var = ListVariable(data) + var.extend([ConstVariable(4), ConstVariable(5)]) + self.assertEqual(var.getitem(3), ConstVariable(4)) + self.assertEqual(var.getitem(4), ConstVariable(5)) + + def 
test_pop(self): + data = [1, 2, 3] + var = ListVariable(data) + self.assertEqual(var.pop(), ConstVariable(3)) + self.assertEqual(var.pop(0), ConstVariable(1)) + + def test_clear(self): + data = [1, 2, 3] + var = ListVariable(data) + var.clear() + self.assertEqual(var.proxy.length, 0) + + def test_remove(self): + data = [1, 2, 3] + var = ListVariable(data) + var.remove(ConstVariable(2)) + self.assertEqual(var.getitem(0), ConstVariable(1)) + self.assertEqual(var.getitem(1), ConstVariable(3)) + with self.assertRaises(ValueError): + var.remove(ConstVariable(2)) + + def test_sort(self): + data = [2, 3, 0, 4, 1, 5] + var = ListVariable(data) + var.sort() + self.assertEqual( + [var.getitem(i) for i in range(var.proxy.length)], + [ConstVariable(n) for n in [0, 1, 2, 3, 4, 5]], + ) + + def test_sort_with_key(self): + data = [-1, -4, 2, 0, 5, -3] + var = ListVariable(data) + var.sort(key=lambda x: x**2) + self.assertEqual( + [var.getitem(i) for i in range(var.proxy.length)], + [ConstVariable(n) for n in [0, -1, 2, -3, -4, 5]], + ) + + def test_sort_reverse(self): + data = [2, 3, 0, 4, 1, 5] + var = ListVariable(data) + var.sort(reverse=True) + self.assertEqual( + [var.getitem(i) for i in range(var.proxy.length)], + [ConstVariable(n) for n in [5, 4, 3, 2, 1, 0]], + ) + + def test_reverse(self): + data = [2, 3, 0, 4, 1, 5] + var = ListVariable(data) + var.reverse() + self.assertEqual( + [var.getitem(i) for i in range(var.proxy.length)], + [ConstVariable(n) for n in [5, 1, 4, 0, 3, 2]], + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_numpy.py b/test/sot/test_numpy.py new file mode 100644 index 00000000000000..3600d4df7cc455 --- /dev/null +++ b/test/sot/test_numpy.py @@ -0,0 +1,44 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from test_case_base import TestCaseBase, strict_mode_guard + +import paddle + + +def foo(x, y): + ret = x + y + return ret + + +class TestNumpy(TestCaseBase): + def test_tensor_add_numpy_number(self): + x = paddle.to_tensor([1.0]) + y = np.int64(2) + self.assert_results(foo, x, y) + self.assert_results(foo, y, x) + + @strict_mode_guard(0) + def test_tensor_add_numpy_array(self): + x = paddle.to_tensor([1.0]) + y = np.array(2.0) + self.assert_results(foo, x, y) + self.assert_results(foo, y, x) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_numpy_var_if.py b/test/sot/test_numpy_var_if.py new file mode 100644 index 00000000000000..9d7c4a7048e251 --- /dev/null +++ b/test/sot/test_numpy_var_if.py @@ -0,0 +1,53 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +import numpy as np +from test_case_base import TestCaseBase + +import paddle +from paddle.jit.sot.psdb import check_no_breakgraph, check_no_fallback + +os.environ['MIN_GRAPH_SIZE'] = '-1' + + +@check_no_breakgraph +@check_no_fallback +def forward(x, y): + if x == 0: + return y + 2 + else: + return y * 2 + + +@check_no_breakgraph +@check_no_fallback +def forward2(x, y): + if x == x: # numpy == numpy + return y + 2 + else: + return y * 2 + + +class TestJumpWithNumpy(TestCaseBase): + def test_jump(self): + self.assert_results(forward, np.array([1]), paddle.to_tensor(2)) + self.assert_results(forward, np.array([0]), paddle.to_tensor(2)) + self.assert_results(forward2, np.array([0]), paddle.to_tensor(2)) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_output_restoration.py b/test/sot/test_output_restoration.py new file mode 100644 index 00000000000000..9c2cf268e9087b --- /dev/null +++ b/test/sot/test_output_restoration.py @@ -0,0 +1,95 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest + +from test_case_base import TestCaseBase + +import paddle + + +def output_identity(x): + return x + + +def output_const(): + return 42 + + +def output_list(x: paddle.Tensor, y: paddle.Tensor, z: int): + a = x + 1 + b = z + 1 + l = [1, a, b, y] + return l + + +def output_dict(x: paddle.Tensor, y: paddle.Tensor, z: int): + a = x + 1 + b = z + 1 + l = {1: a, b: y} + return l + + +def output_dict_const_key(x: paddle.Tensor, y: paddle.Tensor, z: int): + a = x + 1 + b = z + 1 + l = {1: a, 2: y} + return l + + +def output_nest_struct(x: paddle.Tensor, y: paddle.Tensor, z: int): + a = x + y + z + b = z + 1 + l = [1 + 1, (z, a), [b]] + return l + + +class TestOutputRestoration(TestCaseBase): + def test_output_identity(self): + self.assert_results(output_identity, 1) + self.assert_results(output_identity, 2) + self.assert_results(output_identity, paddle.to_tensor(1)) + + def test_output_const(self): + self.assert_results(output_const) + + def test_output_list(self): + a = paddle.to_tensor(1) + b = paddle.to_tensor(2) + + self.assert_results(output_list, a, b, 3) + + def test_output_dict(self): + a = paddle.to_tensor(1) + b = paddle.to_tensor(2) + + self.assert_results(output_dict, a, b, 3) + + def test_output_dict_const_key(self): + a = paddle.to_tensor(2) + b = paddle.to_tensor(3) + + self.assert_results(output_dict_const_key, a, b, 4) + + def test_output_nest_struct(self): + a = paddle.to_tensor(1) + b = paddle.to_tensor(2) + + self.assert_results(output_nest_struct, a, b, 3) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_range.py b/test/sot/test_range.py new file mode 100644 index 00000000000000..3a7e85fb0951de --- /dev/null +++ b/test/sot/test_range.py @@ -0,0 +1,92 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from test_case_base import TestCaseBase + +import paddle + + +def test_range_1(stop: int): + return range(stop) + + +def test_range_2(start: int, stop: int): + return range(start, stop) + + +def test_range_3(start: int, stop: int, step: int): + return range(start, stop, step) + + +def test_range_4(stop: int, index: int): + return range(stop)[index] + + +def test_range_5(stop: int): + return list(range(stop)) + + +def test_range_6(stop: int, index: int): + return list(range(stop))[index] + + +def test_range_7(index: int, tensor: paddle.Tensor): + return list(range(len(tensor.shape)))[index] + + +def test_range_8(stop: int): + sum = 0 + for i in range(stop): + sum += i + return sum + + +def test_range_9(stop: int, tensor: paddle.Tensor): + for i in range(stop): + tensor += i + return tensor + + +def test_range_10(stop: int, tensor: paddle.Tensor): + for i in range(stop): + for j in range(stop + 1): + tensor += j + return tensor + + +class TestExecutor(TestCaseBase): + def test_cases(self): + start = 3 + stop = 10 + step = 2 + index = 1 + tensor = paddle.randn((10, 10)) + + self.assert_results(test_range_1, stop) + self.assert_results(test_range_2, start, stop) + self.assert_results(test_range_3, start, stop, step) + self.assert_results(test_range_4, stop, index) + self.assert_results(test_range_5, stop) + self.assert_results(test_range_6, stop, index) + self.assert_results(test_range_7, index, tensor) + self.assert_results(test_range_8, stop) + + self.assert_results(test_range_9, stop, paddle.randn((10,))) + self.assert_results(test_range_10, stop, paddle.randn((10,))) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_resnet.py b/test/sot/test_resnet.py new file mode 100644 index 00000000000000..cc9a47252c559e --- /dev/null +++ b/test/sot/test_resnet.py @@ -0,0 +1,59 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from test_case_base import ( + TestCaseBase, + test_instruction_translator_cache_context, +) + +import paddle +from paddle.vision.models.resnet import resnet18 + + +def resnet_call(x: paddle.Tensor, net: paddle.nn.Layer): + return net(x) + + +class TestResNet(TestCaseBase): + def test_resnet_eval(self): + x = paddle.rand((10, 3, 224, 224)) + net = resnet18(pretrained=False) + net.eval() + with test_instruction_translator_cache_context() as ctx: + self.assert_results(resnet_call, x, net) + self.assertEqual(ctx.translate_count, 1) + self.assert_results(resnet_call, x, net) # cache hit + self.assertEqual(ctx.translate_count, 1) + net.train() + self.assert_results(resnet_call, x, net) # cache miss + self.assertEqual(ctx.translate_count, 2) + + def test_resnet_train(self): + x = paddle.rand((10, 3, 224, 224)) + net = resnet18(pretrained=False) + net.train() + with test_instruction_translator_cache_context() as ctx: + self.assert_results(resnet_call, x, net) + self.assertEqual(ctx.translate_count, 1) + self.assert_results(resnet_call, x, net) # cache hit + self.assertEqual(ctx.translate_count, 1) + net.eval() + self.assert_results(resnet_call, x, net) # cache miss + self.assertEqual(ctx.translate_count, 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_resnet50_backward.py b/test/sot/test_resnet50_backward.py new file mode 100644 index 00000000000000..bd5aac0025e802 --- /dev/null +++ b/test/sot/test_resnet50_backward.py @@ -0,0 +1,107 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import os
+
+os.environ["FLAGS_cudnn_deterministic"] = "True"
+
+import random
+import unittest
+
+import numpy as np
+from numpy.testing import assert_array_equal
+
+import paddle
+from paddle.jit.sot import symbolic_translate
+from paddle.jit.sot.utils.utils import execute_time
+from paddle.vision import resnet50
+
+
+def resnet_call(net: paddle.nn.Layer, x: paddle.Tensor):
+    return net(x)
+
+
+def run_dygraph_optimizer(inp):
+    """dygraph train + SGD optimizer"""
+    paddle.seed(2021)
+    np.random.seed(2021)
+    random.seed(2021)
+    net = resnet50()
+    optimizer = paddle.optimizer.SGD(
+        learning_rate=0.03, parameters=net.parameters()
+    )
+    for i in range(5):
+        optimizer.clear_grad()
+        loss = execute_time(net)(inp)
+        loss.backward()
+        optimizer.step()
+    return loss
+
+
+def run_symbolic_optimizer(inp):
+    """symbolic translate (SOT) train + SGD optimizer"""
+    paddle.seed(2021)
+    np.random.seed(2021)
+    random.seed(2021)
+    net = resnet50()
+    net_wrapper = symbolic_translate(resnet_call)
+    optimizer = paddle.optimizer.SGD(
+        learning_rate=0.03, parameters=net.parameters()
+    )
+    for i in range(5):
+        optimizer.clear_grad()
+        loss = execute_time(net_wrapper)(net, inp)
+        loss.backward()
+        optimizer.step()
+    return loss
+
+
+def run_to_static_optimizer(inp):
+    """to_static train + SGD optimizer"""
+    paddle.seed(2021)
+    np.random.seed(2021)
+    random.seed(2021)
+    net = resnet50()
+    net = paddle.jit.to_static(net, enable_fallback=False)
+    optimizer = paddle.optimizer.SGD(
+        learning_rate=0.03, parameters=net.parameters()
+    )
+    for i in range(5):
+        optimizer.clear_grad()
+        loss = execute_time(net)(inp)
+        loss.backward()
+        optimizer.step()
+    return loss
+
+
+class TestBackward(unittest.TestCase):
+    def test(self):
+        # TODO(xiongkun) add cache to speedup !
+        paddle.seed(2021)
+        np.random.seed(2021)
+        random.seed(2021)
+        inp = paddle.rand((3, 3, 255, 255))
+        print("Start Run SymbolicTranslate:")
+        out2 = run_symbolic_optimizer(inp)[0].numpy()
+        print("Start Run Dygraph:")
+        out1 = run_dygraph_optimizer(inp)[0].numpy()
+        print("Start Run To Static:")
+        out3 = run_to_static_optimizer(inp)[0].numpy()
+        assert_array_equal(
+            out1, out2, "Not Equal in dygraph and symbolic translate", True
+        )
+        assert_array_equal(
+            out1, out3, "Not Equal in dygraph and to_static", True
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/sot/test_segment_linear.py b/test/sot/test_segment_linear.py
new file mode 100644
index 00000000000000..ee3b7d70f8d365
--- /dev/null
+++ b/test/sot/test_segment_linear.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from test_case_base import TestCaseBase
+
+import paddle
+from paddle import nn
+from paddle.jit import sot
+
+
+class Head(nn.Layer):
+    def __init__(self):
+        super().__init__()
+        self.head = nn.Linear(10, 150)
+
+    def forward(self, x, patch_embed_size):
+        masks = self.head(x)
+        # [b, (h w), c] -> [b, c, h, w]
+        h, w = patch_embed_size[0], patch_embed_size[1]
+        masks = masks.reshape((1, h, w, paddle.shape(masks)[-1]))
+        masks = masks.transpose((0, 3, 1, 2))
+        return masks
+
+
+class SimpleNet(nn.Layer):
+    def __init__(self):
+        super().__init__()
+        self.tmp = nn.Linear(1, 1024 * 10)
+        self.tmp2 = nn.Linear(1, 1 * 10 * 32 * 32)
+        self.head = Head()
+
+    def getshape(self, x):
+        x = self.tmp2(x.mean().reshape([1])).reshape([1, 10, 32, 32])
+        x = paddle.shape(x)
+        return x
+
+    def forward(self, x):
+        shape = self.getshape(x)
+        feat = self.tmp(x.mean().reshape([1])).reshape([1, 1024, 10])
+        logits = self.head(feat, shape[2:])
+        return logits
+
+
+class TestExecutor(TestCaseBase):
+    def test_simple(self):
+        sot.skip_function(SimpleNet.forward)
+        x = paddle.randn((1, 8, 8))
+        net = SimpleNet()
+        net = paddle.jit.to_static(
+            net
+        )  # does not take effect; we need to fetch the SOT PR in Paddle.
+        loss = net(x)
+        loss = loss.sum()
+        loss.backward()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/sot/test_side_effects.py b/test/sot/test_side_effects.py
new file mode 100644
index 00000000000000..46bed6e8d3c4e3
--- /dev/null
+++ b/test/sot/test_side_effects.py
@@ -0,0 +1,333 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from __future__ import annotations + +import unittest + +from test_case_base import TestCaseBase, strict_mode_guard + +import paddle +from paddle.jit import sot +from paddle.jit.sot import symbolic_translate +from paddle.jit.sot.utils import InnerError + + +def dict_setitem(x): + x[0] = 1 + return x[0] + + +def dict_delitem(x): + del x[0] + return x + + +def dict_delitem_getitem(a): + b = a[0] + del a[0] + b[0] = 1 + return a, b + + +def dict_nested_1(x): + x[0][0] = 42 + x[1][0] = x[0][0] + x[0][1] + x[2] = {1: 2} + return x + + +def dict_nested_2(x): + a = x[0] + b = x[1] + del a[0] + a[1] = b[0] + a[2] = b[1] + x[1][0] = 42 + del a[1] + return a, b + + +def list_append_int(tensor_x, list_a): + tensor_x = tensor_x + 1 + list_a.append(12) + return tensor_x, list_a + + +def list_append_tensor(tensor_x, list_a): + tensor_x = tensor_x + 1 + list_a.append(tensor_x) + return tensor_x, list_a + + +def list_delitem(list_a): + del list_a[0] + return list_a[0] + + +def list_extend(list_a): + list_a.extend([1, 2, 3]) + return list_a[0] + + +def list_nested(list_a): + inner_list = [] + inner_list.append(list_a) + inner_list[-1].append(12) + return 12 + + +def list_insert(list_a): + list_a.insert(0, 1) + return list_a[0] + + +def list_remove(list_a): + list_a.remove(1) + return list_a[0] + + +def list_pop(list_a): + list_a.pop(0) + list_a.pop() + list_a.pop(1) + return list_a[0] + + +def list_clear(list_a): + list_a.clear() + return list_a + + +def list_sort(list_a): + list_a.sort() + return list_a + + +def list_reverse(list_a): + list_a.reverse() + return list_a + + +def slice_in_for_loop(x, iter_num=3): + x = paddle.to_tensor(x) + a = [] + + iter_num = paddle.full(shape=[1], fill_value=iter_num, dtype="int32") + + for i in range(iter_num): + a.append(x) + + for i in range(iter_num): + a[i] = x + out = a[2] + return out + + +# TODO: Object SideEffect +class CustomObject: + def __init__(self): + self.x = 2 + self.y = paddle.to_tensor(1) + + def object_attr_set2(self, x): + self.outputs = [] + self.outputs.append(x) + return self.outputs + + +@sot.psdb.check_no_breakgraph +def object_attr_set(cus_obj, t): + """object side effect.""" + t = t + 1 + cus_obj.x = t + return t, cus_obj.x + + +def object_attr_breakgraph(cus_obj, t): + t = t + 1 + sot.psdb.breakgraph() + cus_obj.x = t + sot.psdb.breakgraph() + return t, cus_obj.x + + +@sot.psdb.check_no_breakgraph +def object_attr_tensor_del(cus_obj): + del cus_obj.y + + +@sot.psdb.check_no_breakgraph +def object_attr_int_del(cus_obj): + del cus_obj.x + + +def slice_list_after_change(l): + l.reverse() + sum = 0 + for i, v in zip(range(2), l[2:]): + sum += v + return sum + + +class TestDictSideEffect(TestCaseBase): + def test_dict_setitem(self): + self.assert_results_with_side_effects( + dict_setitem, {0: paddle.to_tensor(0)} + ) + self.assert_results_with_side_effects( + dict_setitem, {0: paddle.to_tensor(1)} + ) + + def test_dict_delitem(self): + self.assert_results_with_side_effects( + dict_delitem, {0: paddle.to_tensor(0), 1: paddle.to_tensor(1)} + ) + self.assert_results_with_side_effects( + dict_delitem, {0: paddle.to_tensor(1), 2: paddle.to_tensor(2)} + ) + + def test_dict_delitem_getitem(self): + self.assert_results_with_side_effects( + dict_delitem_getitem, {0: {0: 1, 1: 2}} + ) + + def test_dict_nested_1(self): + self.assert_results_with_side_effects( + dict_nested_1, {0: {0: 1, 1: 2}, 1: {0: 1, 1: 2}} + ) + self.assert_results_with_side_effects( + dict_nested_1, {0: {0: 123, 1: 2}, 1: {0: 1, 1: 2}} + ) + + def test_dict_nested_2(self): + 
self.assert_results_with_side_effects( + dict_nested_2, {0: {0: 1, 1: 2}, 1: {0: 1, 1: 2}} + ) + self.assert_results_with_side_effects( + dict_nested_2, {0: {0: 123, 1: 2}, 1: {0: 1, 1: 2}} + ) + + +class TestListSideEffect(TestCaseBase): + def test_list_append(self): + self.assert_results_with_side_effects( + list_append_int, paddle.to_tensor(1), [1, 2, 3] + ) + self.assert_results_with_side_effects( + list_append_tensor, paddle.to_tensor(2), [1, 2, 3] + ) + + def test_list_delitem(self): + self.assert_results_with_side_effects(list_delitem, [1, 2, 3]) + + def test_list_extend(self): + self.assert_results_with_side_effects( + list_extend, [1, 2, 3, 4, 5, 6, 7, 8, 9] + ) + + def test_list_insert(self): + self.assert_results_with_side_effects(list_insert, [1, 2, 3]) + self.assert_results_with_side_effects( + list_insert, [-1, 2, -3, 4, -5, 6, -7, 8, -9] + ) + + def test_list_remove(self): + self.assert_results_with_side_effects(list_remove, [1, 1, 1]) + self.assert_results_with_side_effects(list_remove, [0, 1, 2]) + with self.assertRaises(InnerError): + symbolic_translate(list_remove)([0, 2, 4]) + + def test_list_pop(self): + self.assert_results_with_side_effects(list_pop, [1, 2, 3, 4, 5]) + self.assert_results_with_side_effects( + list_pop, [-1, 2, -3, 4, -5, 6, -7, 8, -9] + ) + + def test_list_clear(self): + self.assert_results_with_side_effects(list_clear, [1, 2, 3, 4, 5]) + self.assert_results_with_side_effects( + list_clear, [-1, 2, -3, 4, -5, 6, -7, 8, -9] + ) + + def test_list_sort(self): + self.assert_results_with_side_effects(list_sort, [2, 1, 7, 3, 4, 6]) + self.assert_results_with_side_effects( + list_sort, [-1, 2, -3, 4, -5, 6, -7, 8, -9] + ) + + def test_list_reverse(self): + self.assert_results_with_side_effects(list_reverse, [1, 2, 3, 4, 5]) + self.assert_results_with_side_effects( + list_reverse, [-1, 2, -3, 4, -5, 6, -7, 8, -9] + ) + + def test_slice_in_for_loop(self): + x = 2 + with strict_mode_guard(0): + self.assert_results_with_side_effects(slice_in_for_loop, x) + + def test_list_nested(self): + self.assert_results_with_side_effects(list_nested, [1, 2, 3]) + + +class TestSliceAfterChange(TestCaseBase): + def test_slice_list_after_change(self): + self.assert_results_with_side_effects( + slice_list_after_change, [1, 2, 3, 4] + ) + self.assert_results_with_side_effects( + slice_list_after_change, [7, 8, 9, 10] + ) + + +class TestAttrSideEffect(TestCaseBase): + def attr_check(self, func, attr_keys: list[str], cls, *inputs): + cus_obj1 = cls() + cus_obj2 = cls() + sym_output = symbolic_translate(func)(cus_obj1, *inputs) + paddle_output = func(cus_obj2, *inputs) + for key in attr_keys: + self.assert_nest_match( + getattr(cus_obj1, key, f"__MISS_KEY__{key}"), + getattr(cus_obj2, key, f"__MISS_KEY__{key}"), + ) + self.assert_nest_match(sym_output, paddle_output) + + def test_attr_set(self): + self.attr_check(object_attr_set, ["x"], CustomObject, 5) + self.attr_check( + CustomObject.object_attr_set2, ["outputs"], CustomObject, 6 + ) + self.attr_check( + CustomObject.object_attr_set2, + ["outputs"], + CustomObject, + paddle.to_tensor(5), + ) + self.attr_check( + object_attr_set, ["x"], CustomObject, paddle.to_tensor(5) + ) + + def test_attr_del(self): + self.attr_check(object_attr_tensor_del, ["y"], CustomObject) + self.attr_check(object_attr_int_del, ["x"], CustomObject) + + def test_attr_set_breakgraph(self): + self.attr_check(object_attr_breakgraph, ["x"], CustomObject, 100) + self.attr_check(object_attr_breakgraph, ["x"], CustomObject, 1000) + + +if __name__ == "__main__": + 
unittest.main() diff --git a/test/sot/test_simulate_initialize.py b/test/sot/test_simulate_initialize.py new file mode 100644 index 00000000000000..495e06ac1dbda2 --- /dev/null +++ b/test/sot/test_simulate_initialize.py @@ -0,0 +1,51 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from test_case_base import TestCaseBase + +import paddle +from paddle import nn +from paddle.jit.sot import symbolic_translate + + +class A: + def __init__(self, vals): + vals.append(1) + + +def foo(x, y): + out = nn.Softmax()(paddle.to_tensor([x, y], dtype="float32")) + return out + + +def bar(x): + a = A(x) + t = paddle.to_tensor(x) + return t.mean() + + +class TestInit(TestCaseBase): + def test_init_paddle_layer(self): + self.assert_results(foo, 1, 2) + + def test_init_python_object(self): + sot_output = symbolic_translate(bar)([1.0, 2.0]) + dyn_output = bar([1.0, 2.0]) + self.assert_nest_match(sot_output, dyn_output) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_sir_rollback.py b/test/sot/test_sir_rollback.py new file mode 100644 index 00000000000000..ddb7792651e4d1 --- /dev/null +++ b/test/sot/test_sir_rollback.py @@ -0,0 +1,88 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from __future__ import annotations
+
+import inspect
+import operator
+import unittest
+
+from test_case_base import TestCaseBase
+
+import paddle
+from paddle.jit.sot.opcode_translator.executor.function_graph import (
+    FunctionGraph,
+)
+from paddle.jit.sot.opcode_translator.executor.tracker import (
+    DanglingTracker,
+    LocalTracker,
+)
+from paddle.jit.sot.opcode_translator.executor.variables import (
+    BuiltinVariable,
+    VariableFactory,
+)
+
+
+def compute(x, y):
+    ret = BuiltinVariable(operator.add, x.graph, DanglingTracker())(x, y)
+    return BuiltinVariable(operator.mul, x.graph, DanglingTracker())(ret, x)
+
+
+def try_add(x, y):
+    return BuiltinVariable(operator.add, x.graph, DanglingTracker())(x, y)
+
+
+class TestRollback(TestCaseBase):
+    def test_rollback(self):
+        frame = inspect.currentframe()
+        graph = FunctionGraph(frame)
+        a = paddle.to_tensor(1.0)
+        b = paddle.to_tensor(2.0)
+        a = VariableFactory().from_value(a, graph, LocalTracker("a"))
+        b = VariableFactory().from_value(b, graph, LocalTracker("b"))
+        out = compute(a, b)
+        original_length = len(graph.sir_ctx.TOS.statements)
+        memo = graph.save_memo()
+        try_add(out, out)
+
+        assert len(graph.sir_ctx.TOS.statements) != len(
+            memo.stmt_ir.statements
+        ), "After add, the number of statements in the IR must change."
+        graph.restore_memo(memo)
+
+        assert len(graph.sir_ctx.TOS.statements) == original_length
+
+
+def fn_with_side_effects_inner(x, y):
+    x[0] += 10
+    x[1] += 20
+    x[2] -= 10
+    print(y)  # print will cause breakgraph
+
+
+def fn_with_side_effects(x, y):
+    x[0] += 1
+    fn_with_side_effects_inner(x, y)
+    return x[0] + y
+
+
+class TestSideEffectRollback(TestCaseBase):
+    def test_side_effect_rollback(self):
+        self.assert_results_with_side_effects(
+            fn_with_side_effects, [1, 2, 3], paddle.to_tensor(42)
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/sot/test_stack.py b/test/sot/test_stack.py
new file mode 100644
index 00000000000000..e29610b2c837cf
--- /dev/null
+++ b/test/sot/test_stack.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import unittest + +from paddle.jit.sot.opcode_translator.executor.variable_stack import ( + VariableStack, +) + + +class TestVariableStack(unittest.TestCase): + def test_basic(self): + stack = VariableStack([1, 2, 3]) + self.assertEqual(str(stack), "[1, 2, 3]") + self.assertEqual(len(stack), 3) + self.assertEqual(str(stack.copy()), str(stack)) + + def test_peek(self): + stack = VariableStack([1, 2, 3]) + self.assertEqual(stack.peek(), 3) + self.assertEqual(stack.top, 3) + self.assertEqual(stack.peek(1), 3) + stack.peek[1] = 4 + stack.peek[2] = 3 + self.assertEqual(stack.peek[1], 4) + self.assertEqual(stack.peek[:1], [4]) + self.assertEqual(stack.peek[:2], [3, 4]) + stack.top = 5 + self.assertEqual(stack.peek[:2], [3, 5]) + + def test_push_pop(self): + stack = VariableStack() + stack.push(1) + stack.push(2) + self.assertEqual(stack.pop(), 2) + self.assertEqual(stack.pop(), 1) + + def test_pop_n(self): + stack = VariableStack([1, 2, 3, 4]) + self.assertEqual(stack.pop_n(2), [3, 4]) + self.assertEqual(stack.pop_n(2), [1, 2]) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_str_format.py b/test/sot/test_str_format.py new file mode 100644 index 00000000000000..34bbd6e31f3dde --- /dev/null +++ b/test/sot/test_str_format.py @@ -0,0 +1,37 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +from test_case_base import TestCaseBase + + +# copy from python library _distutils_hack/__init__.py +def find_spec(self, fullname, path, target=None): + method_name = 'spec_for_{fullname}'.format( + **{'self': self, 'fullname': fullname} + ) + method = getattr(self, method_name, lambda: None) + return method() + + +class TestExecutor(TestCaseBase): + def test_simple(self): + self.assert_results(find_spec, "self", "fullname", "path", None) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_tensor_dtype_in_guard.py b/test/sot/test_tensor_dtype_in_guard.py new file mode 100644 index 00000000000000..d5d001b7038d0d --- /dev/null +++ b/test/sot/test_tensor_dtype_in_guard.py @@ -0,0 +1,76 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from test_case_base import ( + TestCaseBase, + test_instruction_translator_cache_context, +) + +import paddle +from paddle.jit import sot + + +def foo(x, y): + if x.dtype == paddle.float32: + out = x + y + else: + out = x - y + return out + + +@sot.skip_function +def dtype_in_guard(x, y): + with paddle.amp.auto_cast(level='O2'): + for i in range(10): + z = foo(x, y) + x = z + return x + + +def bar(x, y): + if x == paddle.float32: + return y + 1 + else: + return y - 1 + + +@sot.skip_function +def dtype_as_input(x, y): + with paddle.amp.auto_cast(level='O2'): + for i in range(10): + z = bar(x, y) + y = z + return y + + +class TestDtypeInGuard(TestCaseBase): + def test_dtype_in_guard(self): + with test_instruction_translator_cache_context() as ctx: + x = paddle.to_tensor([2], dtype="float32") + y = paddle.to_tensor([3], dtype="float32") + self.assert_results(dtype_in_guard, x, y) + self.assertEqual(ctx.translate_count, 1) + + def test_input_dtype_in_guard(self): + with test_instruction_translator_cache_context() as ctx: + x = paddle.float32 + y = paddle.to_tensor([3], dtype="float32") + self.assert_results(dtype_as_input, x, y) + self.assertEqual(ctx.translate_count, 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_tensor_slice.py b/test/sot/test_tensor_slice.py new file mode 100644 index 00000000000000..32c52759da4387 --- /dev/null +++ b/test/sot/test_tensor_slice.py @@ -0,0 +1,33 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from test_case_base import TestCaseBase + +import paddle + + +def foo(x: paddle.Tensor): + return x[:, 0] + + +class TestExecutor(TestCaseBase): + def test_tensor_slice(self): + x = paddle.randn((10, 10)) + self.assert_results(foo, x) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/sot/test_trace_list_arg.py b/test/sot/test_trace_list_arg.py new file mode 100644 index 00000000000000..8a82406a11f754 --- /dev/null +++ b/test/sot/test_trace_list_arg.py @@ -0,0 +1,63 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest + +from test_case_base import ( + TestCaseBase, + test_instruction_translator_cache_context, +) + +import paddle + + +def foo(x: list[paddle.Tensor], y: list[paddle.Tensor]): + return x[0] + y[0] + + +def bar(x: list[paddle.Tensor], y: int, z: int): + return x[y + z] + 1 + + +class TestTraceListArg(TestCaseBase): + def test_foo(self): + a = paddle.to_tensor(1) + b = paddle.to_tensor(2) + c = paddle.to_tensor([3, 4]) + + with test_instruction_translator_cache_context() as cache: + self.assert_results(foo, [a], [b]) + self.assertEqual(cache.translate_count, 1) + self.assert_results(foo, [b], [a]) # Cache hit + self.assertEqual(cache.translate_count, 1) + self.assert_results(foo, [a], [c]) # Cache miss + self.assertEqual(cache.translate_count, 2) + + def test_bar(self): + a = [paddle.to_tensor(1), paddle.to_tensor(2), paddle.to_tensor(3)] + b = [paddle.to_tensor([2, 3]), paddle.to_tensor(4), paddle.to_tensor(5)] + + with test_instruction_translator_cache_context() as cache: + self.assert_results(bar, a, 1, 1) + self.assertEqual(cache.translate_count, 1) + self.assert_results(bar, a, 2, 0) # Cache miss + self.assertEqual(cache.translate_count, 2) + self.assert_results(bar, b, 1, 1) # Cache hit + self.assertEqual(cache.translate_count, 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/white_list/check_op_sequence_batch_1_input_white_list.py b/test/white_list/check_op_sequence_batch_1_input_white_list.py index f98fcbd586e655..2506c557e6e63b 100644 --- a/test/white_list/check_op_sequence_batch_1_input_white_list.py +++ b/test/white_list/check_op_sequence_batch_1_input_white_list.py @@ -30,7 +30,6 @@ 'sequence_scatter', 'sequence_slice', 'sequence_softmax', - 'sequence_topk_avg_pooling', 'sequence_unpad', ] diff --git a/test/white_list/check_op_sequence_instance_0_input_white_list.py b/test/white_list/check_op_sequence_instance_0_input_white_list.py index 5b222c56e8dde6..b4f9d16317e16c 100644 --- a/test/white_list/check_op_sequence_instance_0_input_white_list.py +++ b/test/white_list/check_op_sequence_instance_0_input_white_list.py @@ -36,7 +36,6 @@ 'sequence_scatter', 'sequence_slice', 'sequence_softmax', - 'sequence_topk_avg_pooling', 'sequence_unpad', ] diff --git a/test/white_list/check_shape_white_list.py b/test/white_list/check_shape_white_list.py index db5a7108672778..144505f3d75d94 100644 --- a/test/white_list/check_shape_white_list.py +++ b/test/white_list/check_shape_white_list.py @@ -18,13 +18,9 @@ 'conv2d_transpose', 'depthwise_conv2d_transpose', 'grid_sampler', - 'lstmp', - 'margin_rank_loss', 'matmul', 'scatter', 'soft_relu', - 'squared_l2_distance', - 'tree_conv', 'cvm', 'cudnn_lstm', 'rnn', diff --git a/test/white_list/compile_vs_runtime_white_list.py b/test/white_list/compile_vs_runtime_white_list.py index a00c1a720aa5cb..0c74eb327a853b 100644 --- a/test/white_list/compile_vs_runtime_white_list.py +++ b/test/white_list/compile_vs_runtime_white_list.py @@ -29,7 +29,6 @@ 'gru', 'rpn_target_assign', 'retinanet_target_assign', - 'filter_by_instag', 'im2sequence', 'generate_proposal_labels', 'detection_map', diff --git a/test/white_list/new_ir_op_test_white_list b/test/white_list/new_ir_op_test_white_list index ddd7abe19becd4..dea0398f9d5fac 100644 --- a/test/white_list/new_ir_op_test_white_list +++ b/test/white_list/new_ir_op_test_white_list @@ -61,6 +61,7 @@ test_diag_v2 test_digamma_op test_dist_op test_dot_op +test_dpsgd_op test_edit_distance_op test_eigh_op test_eigh_op_static_build @@ -161,7 +162,6 @@ 
test_prior_box_op test_psroi_pool_op test_put_along_axis_op test_range -test_real_imag_op_with_stride test_reduce_op test_reduce_op_static_build test_reshape_op @@ -185,7 +185,6 @@ test_solve_op test_spectral_norm_op test_spectral_op test_squared_l2_norm_op -test_squeeze_op_with_stride test_svd_op test_take_along_axis_op test_temporal_shift_op @@ -197,13 +196,11 @@ test_tril_indices_op test_trilinear_interp_v2_op test_triu_indices_op test_trunc_op -test_unbind_op_with_stride test_unfold_op test_unique_consecutive_op test_unpool3d_op test_unpool_op test_unsqueeze2_op -test_unsqueeze_op_with_stride test_update_loss_scaling_op test_update_loss_scaling_op_static_build test_viterbi_decode_op diff --git a/test/white_list/no_grad_set_white_list.py b/test/white_list/no_grad_set_white_list.py index 33960cf4c64d3a..36210a8175025f 100644 --- a/test/white_list/no_grad_set_white_list.py +++ b/test/white_list/no_grad_set_white_list.py @@ -26,7 +26,6 @@ 'conv2d_transpose', 'conv3d', 'conv3d_transpose', - 'cos_sim', 'cross_entropy', 'cross_entropy2', 'data_norm', @@ -44,7 +43,6 @@ 'elementwise_pow', 'elementwise_fmin', 'elementwise_fmax', - 'filter_by_instag', 'fused_elemwise_activation', 'fused_emb_seq_pool', 'fused_embedding_seq_pool', @@ -59,8 +57,6 @@ 'lookup_table', 'lookup_table_v2', 'lstm', - 'lstmp', - 'margin_rank_loss', 'matmul', 'matmul_v2', 'mul', diff --git a/test/white_list/op_accuracy_white_list.py b/test/white_list/op_accuracy_white_list.py index 49b501e765b541..d2520739339eba 100644 --- a/test/white_list/op_accuracy_white_list.py +++ b/test/white_list/op_accuracy_white_list.py @@ -21,8 +21,6 @@ 'conv2d_transpose', 'conv3d', 'conv3d_transpose', - 'conv_shift', - 'cos_sim', 'cudnn_lstm', 'cvm', 'data_norm', @@ -41,7 +39,6 @@ 'log_loss', 'logit', 'lrn', - 'margin_rank_loss', 'match_matrix_tensor', 'matmul', 'max_pool2d_with_index', @@ -62,13 +59,11 @@ 'sequence_pool', 'sequence_reverse', 'sequence_slice', - 'sequence_topk_avg_pooling', 'shuffle_channel', 'sigmoid', 'smooth_l1_loss', 'softmax', 'spectral_norm', - 'squared_l2_distance', 'squared_l2_norm', 'tanh', 'mish', diff --git a/test/white_list/op_threshold_white_list.py b/test/white_list/op_threshold_white_list.py index fa151bfb072573..c5eb0862cb7176 100644 --- a/test/white_list/op_threshold_white_list.py +++ b/test/white_list/op_threshold_white_list.py @@ -27,7 +27,6 @@ 'gru_unit', 'kldiv_loss', 'lstm', - 'lstmp', 'max_pool2d_with_index', 'max_pool3d_with_index', 'norm', diff --git a/test/xpu/test_fill_op_xpu.py b/test/xpu/test_fill_op_xpu.py deleted file mode 100644 index 8adb6fc08b998c..00000000000000 --- a/test/xpu/test_fill_op_xpu.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -from get_test_cover_info import ( - XPUOpTestWrapper, - create_test_class, - get_xpu_op_support_types, -) -from op import Operator -from op_test_xpu import XPUOpTest - -import paddle -from paddle.base import core - -paddle.enable_static() - - -class XPUTestFillOp(XPUOpTestWrapper): - def __init__(self): - self.op_name = 'fill' - self.use_dynamic_create_class = False - - class TestFillOp1(XPUOpTest): - def setUp(self): - self.op_type = "fill" - val = np.random.random(size=[100, 200]) - self.inputs = {} - self.attrs = { - 'value': val.flatten().tolist(), - 'shape': [100, 200], - 'dtype': int(core.VarDesc.VarType.FP64), - 'force_cpu': False, - } - self.outputs = {'Out': val.astype('float64')} - - def test_check_output(self): - self.check_output_with_place(paddle.XPUPlace(0)) - - class TestFillOp2(XPUOpTest): - def setUp(self): - self.op_type = "fill" - val = np.random.random(size=[100, 200]) - self.inputs = {} - self.attrs = { - 'value': val.flatten().tolist(), - 'shape': [100, 200], - 'dtype': int(core.VarDesc.VarType.FP64), - 'force_cpu': True, - } - self.outputs = {'Out': val.astype('float64')} - - def test_check_output(self): - self.check_output() - - class TestFillOp3(unittest.TestCase): - def check_with_place(self, place, f_cpu): - scope = core.Scope() - # create Out Variable - out = scope.var('Out').get_tensor() - - # create and run fill_op operator - val = np.random.random(size=[300, 200]) - fill_op = Operator( - "fill", - value=val.flatten(), - shape=[300, 200], - dtype=int(core.VarDesc.VarType.FP32), - force_cpu=f_cpu, - Out='Out', - ) - fill_op.run(scope, place) - - # get result from Out - result_array = np.array(out) - full_array = np.array(val, 'float32') - - np.testing.assert_array_equal(result_array, full_array) - - def test_fill_op(self): - places = [core.CPUPlace()] - if core.is_compiled_with_xpu(): - places.append(core.XPUPlace(0)) - - for place in places: - self.check_with_place(place, True) - self.check_with_place(place, False) - - -support_types = get_xpu_op_support_types('fill') -for stype in support_types: - create_test_class(globals(), XPUTestFillOp, stype) - -if __name__ == '__main__': - unittest.main() diff --git a/test/xpu/test_flatten_op_xpu.py b/test/xpu/test_flatten_op_xpu.py deleted file mode 100644 index 7673ec9ba3d6d2..00000000000000 --- a/test/xpu/test_flatten_op_xpu.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -from get_test_cover_info import ( - XPUOpTestWrapper, - create_test_class, - get_xpu_op_support_types, -) -from op_test_xpu import XPUOpTest - -import paddle - -paddle.enable_static() - - -class XPUTestFlattenOp(XPUOpTestWrapper): - def __init__(self): - self.op_name = 'flatten' - self.use_dynamic_create_class = False - - class TestFlattenOp(XPUOpTest): - def setUp(self): - self.op_type = "flatten" - self.use_xpu = True - self.place = paddle.XPUPlace(0) - self.init_test_case() - self.dtype = self.in_type - self.inputs = { - "X": np.random.random(self.in_shape).astype(self.dtype) - } - self.init_attrs() - self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X"], "Out") - - def init_test_case(self): - self.in_shape = (3, 2, 2, 10) - self.axis = 1 - self.new_shape = (3, 40) - - def init_attrs(self): - self.attrs = {"axis": self.axis} - - class TestFlattenOp1(TestFlattenOp): - def init_test_case(self): - self.in_shape = (3, 2, 2, 10) - self.axis = 0 - self.new_shape = (1, 120) - - class TestFlattenOpWithDefaultAxis(TestFlattenOp): - def init_test_case(self): - self.in_shape = (10, 2, 2, 3) - self.new_shape = (10, 12) - - def init_attrs(self): - self.attrs = {} - - class TestFlattenOpSixDims(TestFlattenOp): - def init_test_case(self): - self.in_shape = (3, 2, 3, 2, 4, 4) - self.axis = 4 - self.new_shape = (36, 16) - - -support_types = get_xpu_op_support_types('flatten') -for stype in support_types: - create_test_class(globals(), XPUTestFlattenOp, stype) - -if __name__ == "__main__": - unittest.main() diff --git a/test/xpu/test_one_hot_op_xpu.py b/test/xpu/test_one_hot_op_xpu.py deleted file mode 100644 index 9536a8202919b7..00000000000000 --- a/test/xpu/test_one_hot_op_xpu.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -from get_test_cover_info import ( - XPUOpTestWrapper, - create_test_class, - get_xpu_op_support_types, -) -from op_test_xpu import XPUOpTest - -import paddle -from paddle.base import core - -paddle.enable_static() - - -class XPUTestOneHotOP(XPUOpTestWrapper): - def __init__(self): - self.op_name = 'one_hot' - self.use_dynamic_create_class = False - - class TestXPUOneHotOP(XPUOpTest): - def setUp(self): - self.place = paddle.XPUPlace(0) - self.init_dtype() - self.op_type = 'one_hot' - - self.set_data() - self.set_input() - - def set_data(self): - self.depth = 10 - self.depth_np = np.array(10).astype('int32') - self.x_lod = [[4, 1, 3, 3]] - self.x = [ - np.random.randint(0, self.depth - 1) - for i in range(sum(self.x_lod[0])) - ] - self.x = ( - np.array(self.x) - .astype(self.dtype) - .reshape([sum(self.x_lod[0]), 1]) - ) - - self.out = np.zeros( - shape=(np.prod(self.x.shape[:-1]), self.depth) - ).astype('float32') - for i in range(np.prod(self.x.shape)): - self.out[i, self.x[i]] = 1.0 - - self.outputs = {'Out': (self.out, self.x_lod)} - - def set_input(self): - self.inputs = { - 'X': (self.x, self.x_lod), - 'depth_tensor': self.depth_np, - } - self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} - - def test_check_output(self): - self.check_output(check_dygraph=False) - - def init_dtype(self): - self.dtype = self.in_type - - class TestXPUOneHotOP_attr(TestXPUOneHotOP): - def set_input(self): - self.inputs = {'X': (self.x, self.x_lod)} - self.attrs = { - 'dtype': int(core.VarDesc.VarType.FP32), - 'depth': self.depth, - } - - class TestXPUOneHotOP_default_dtype(TestXPUOneHotOP): - def set_input(self): - self.inputs = { - 'X': (self.x, self.x_lod), - 'depth_tensor': self.depth_np, - } - self.attrs = {} - - class TestXPUOneHotOP_default_dtype_attr(TestXPUOneHotOP): - def set_input(self): - self.inputs = {'X': (self.x, self.x_lod)} - self.attrs = {'depth': self.depth} - - class TestXPUOneHotOP_out_of_range(TestXPUOneHotOP): - def set_data(self): - self.depth = 10 - self.x_lod = [[4, 1, 3, 3]] - self.x = [ - np.random.choice([-1, self.depth]) - for i in range(sum(self.x_lod[0])) - ] - self.x = ( - np.array(self.x) - .astype(self.dtype) - .reshape([sum(self.x_lod[0]), 1]) - ) - - self.out = np.zeros( - shape=(np.prod(self.x.shape[:-1]), self.depth) - ).astype('float32') - - self.outputs = {'Out': (self.out, self.x_lod)} - - def set_input(self): - self.inputs = {'X': (self.x, self.x_lod)} - self.attrs = {'depth': self.depth, 'allow_out_of_range': True} - - -support_types = get_xpu_op_support_types('one_hot') -for stype in support_types: - create_test_class(globals(), XPUTestOneHotOP, stype) - -if __name__ == "__main__": - unittest.main() diff --git a/test/xpu/test_squeeze_op_xpu.py b/test/xpu/test_squeeze_op_xpu.py deleted file mode 100644 index c5b9efce7a7708..00000000000000 --- a/test/xpu/test_squeeze_op_xpu.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -from get_test_cover_info import ( - XPUOpTestWrapper, - create_test_class, - get_xpu_op_support_types, -) -from op_test_xpu import XPUOpTest - -import paddle -from paddle import base -from paddle.base import Program, program_guard - -paddle.enable_static() - - -class XPUTestSqueezeOp(XPUOpTestWrapper): - def __init__(self): - self.op_name = "squeeze" - self.use_dynamic_create_class = False - - # Correct: General. - class TestSqueezeOp(XPUOpTest): - def setUp(self): - self.op_type = "squeeze" - self.__class__.op_type = "squeeze" - self.use_mkldnn = False - self.init_dtype() - self.init_test_case() - self.inputs = { - "X": np.random.random(self.ori_shape).astype(self.dtype) - } - self.init_attrs() - self.outputs = { - "Out": self.inputs["X"].reshape(self.new_shape), - } - - def init_dtype(self): - self.dtype = self.in_type - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - place = paddle.XPUPlace(0) - if self.dtype == np.bool_: - return - else: - self.check_grad_with_place(place, ['X'], 'Out') - - def init_test_case(self): - self.ori_shape = (1, 3, 1, 40) - self.axes = (0, 2) - self.new_shape = (3, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - # Correct: There is mins axis. - class TestSqueezeOp1(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (1, 3, 1, 40) - self.axes = (0, -2) - self.new_shape = (3, 40) - - # Correct: No axes input. - class TestSqueezeOp2(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (1, 20, 1, 5) - self.axes = () - self.new_shape = (20, 5) - - # Correct: Just part of axes be squeezed. - class TestSqueezeOp3(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (6, 1, 5, 1, 4, 1) - self.axes = (1, -1) - self.new_shape = (6, 5, 1, 4) - - # Correct: The demension of axis is not of size 1 remains unchanged. - class TestSqueezeOp4(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (6, 1, 5, 1, 4, 1) - self.axes = (1, 2) - self.new_shape = (6, 5, 1, 4, 1) - - -class TestSqueezeOpError(unittest.TestCase): - def test_errors(self): - paddle.enable_static() - with program_guard(Program(), Program()): - # The input type of softmax_op must be Variable. - x1 = base.create_lod_tensor( - np.array([[-1]]), [[1]], paddle.XPUPlace(0) - ) - self.assertRaises(TypeError, paddle.squeeze, x1) - # The input axes of squeeze must be list. - x2 = paddle.static.data(name='x2', shape=[4], dtype="int32") - self.assertRaises(TypeError, paddle.squeeze, x2, axes=0) - # The input dtype of squeeze not support float16. - x3 = paddle.static.data(name='x3', shape=[4], dtype="float16") - self.assertRaises(TypeError, paddle.squeeze, x3, axes=0) - - -support_types = get_xpu_op_support_types("squeeze") -for stype in support_types: - create_test_class(globals(), XPUTestSqueezeOp, stype) - -if __name__ == "__main__": - unittest.main() diff --git a/test/xpu/test_unsqueeze_op_xpu.py b/test/xpu/test_unsqueeze_op_xpu.py deleted file mode 100644 index 333633031bdfd3..00000000000000 --- a/test/xpu/test_unsqueeze_op_xpu.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from get_test_cover_info import ( - XPUOpTestWrapper, - create_test_class, - get_xpu_op_support_types, -) -from op_test_xpu import XPUOpTest - -import paddle - -paddle.enable_static() - - -# Correct: General. -class XPUTestUnsqueezeOp(XPUOpTestWrapper): - def __init__(self): - self.op_name = "unsqueeze" - self.use_dynamic_create_class = False - - class TestUnsqueezeOp(XPUOpTest): - def setUp(self): - self.op_type = "unsqueeze" - self.__class__.op_type = "unsqueeze" - self.use_mkldnn = False - self.init_test_case() - self.inputs = { - "X": np.random.random(self.ori_shape).astype(self.dtype) - } - self.init_attrs() - self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} - - def init_dtype(self): - self.dtype = self.in_type - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - place = paddle.XPUPlace(0) - if self.dtype == np.bool_: - return - else: - self.check_grad_with_place(place, ['X'], 'Out') - - def init_test_case(self): - self.ori_shape = (3, 40) - self.axes = (1, 2) - self.new_shape = (3, 1, 1, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - # Correct: Single input index. - class TestUnsqueezeOp1(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (-1,) - self.new_shape = (20, 5, 1) - - # Correct: Mixed input axis. - class TestUnsqueezeOp2(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (0, -1) - self.new_shape = (1, 20, 5, 1) - - # Correct: There is duplicated axis. - class TestUnsqueezeOp3(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = (0, 3, 3) - self.new_shape = (1, 10, 2, 1, 1, 5) - - # Correct: Reversed axes. 
-    class TestUnsqueezeOp4(TestUnsqueezeOp):
-        def init_test_case(self):
-            self.ori_shape = (10, 2, 5)
-            self.axes = (3, 1, 1)
-            self.new_shape = (10, 1, 1, 2, 5, 1)
-
-
-support_types = get_xpu_op_support_types("unsqueeze")
-for stype in support_types:
-    create_test_class(globals(), XPUTestUnsqueezeOp, stype)
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/third_party/cccl b/third_party/cccl
new file mode 160000
index 00000000000000..1f6e4bcae0fbf1
--- /dev/null
+++ b/third_party/cccl
@@ -0,0 +1 @@
+Subproject commit 1f6e4bcae0fbf1bbed87f88544d8d2161c490fc1
diff --git a/third_party/mkldnn b/third_party/mkldnn
index 64f6bcbcbab628..01204edbda1c2a 160000
--- a/third_party/mkldnn
+++ b/third_party/mkldnn
@@ -1 +1 @@
-Subproject commit 64f6bcbcbab628e96f33a62c3e975f8535a7bde4
+Subproject commit 01204edbda1c2a4ff0cccd40476ed6bd2fb62d56
diff --git a/third_party/openblas b/third_party/openblas
index 394a9fbafe9010..5f36f18148603f 160000
--- a/third_party/openblas
+++ b/third_party/openblas
@@ -1 +1 @@
-Subproject commit 394a9fbafe9010b76a2615c562204277a956eb52
+Subproject commit 5f36f18148603facb6c3540e673610d6b24cbfbb
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index 37d75207cfb846..72ce2af178057a 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -443,6 +443,19 @@ if [ "${ALL_OPTEST_BAN_DYGRAPH_MESSAGE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; th
     check_approval 1 phlrain fuyinno4 QingshuChen lanxianghit
 fi
 
+ALL_CHANGE_YAML_FILES=`git diff --numstat upstream/$BRANCH | awk '{print $3}' | grep ".yaml"`
+BAN_COMP_MESSAGE=""
+for CHANGE_FILE in ${ALL_CHANGE_YAML_FILES}; do
+    ALL_ITEM_BAN_COMP=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CHANGE_FILE} | grep "composite" || true`
+    if [ "${ALL_ITEM_BAN_COMP}" != "" ]; then
+        BAN_COMP_MESSAGE="${BAN_COMP_MESSAGE} ${CHANGE_FILE} : \n${ALL_ITEM_BAN_COMP} \n"
+    fi
+done
+if [ "${BAN_COMP_MESSAGE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="If you need to change the 'composite' key, you must have one RD (Charles-hit(wanghao), cyber-pioneer(chenzhuo), cxxly(chenxiaoxu)) review and approve. \nThe code that does not meet the specification is as follows:\n${BAN_COMP_MESSAGE}\n"
+    check_approval 1 Charles-hit cyber-pioneer cxxly
+fi
+
 NEW_OP_ADDED=`git diff --name-only --diff-filter=A upstream/$BRANCH |grep -oE ".+_op..*" || true`
 if [ "${NEW_OP_ADDED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     GET_KERNEL_TYPE_FUNC_CNT=`git diff -U0 --diff-filter=A upstream/$BRANCH |grep "+" |grep -czoE "GetExpectedKernelType[(][^(){}]+[)][^{]+[{][^}]+[}]" || true`
diff --git a/tools/codestyle/clang-tidy.py b/tools/codestyle/clang-tidy.py
index ef1a5c76e1e432..d8f87d1a630d70 100644
--- a/tools/codestyle/clang-tidy.py
+++ b/tools/codestyle/clang-tidy.py
@@ -396,9 +396,9 @@ def main():
     # Load the database and extract all files.
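+    # A compilation database may record the same translation unit several
+    # times, so the entries are gathered into a set (below) to keep
+    # clang-tidy from checking one file repeatedly.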
database = json.load(open(os.path.join(build_path, db_path))) database = skip_check_file(database, build_path) - files = [ + files = { make_absolute(entry['file'], entry['directory']) for entry in database - ] + } max_task = args.j if max_task == 0: diff --git a/tools/dockerfile/Dockerfile.ubuntu20 b/tools/dockerfile/Dockerfile.ubuntu20 index 2b7967d16245eb..3fbbaf8f707af5 100644 --- a/tools/dockerfile/Dockerfile.ubuntu20 +++ b/tools/dockerfile/Dockerfile.ubuntu20 @@ -61,7 +61,8 @@ RUN apt-get update && \ apt-get install -y python3.7 python3.7-dev python3.7-distutils\ python3.8 python3.8-dev python3.8-distutils \ python3.9 python3.9-dev python3.9-distutils \ - python3.10 python3.10-dev python3.10-distutils && \ + python3.10 python3.10-dev python3.10-distutils \ + python3.11 python3.11-dev python3.11-distutils && \ apt-get install python-is-python3 && \ rm /usr/bin/python && ln -s /usr/bin/python3.9 /usr/bin/python && \ rm /usr/bin/python3 && ln -s /usr/bin/python3.9 /usr/bin/python3 @@ -72,7 +73,8 @@ WORKDIR /home/setuptools-50.3.2 RUN python3.9 setup.py build && python3.9 setup.py install && \ python3.8 setup.py build && python3.8 setup.py install && \ python3.7 setup.py build && python3.7 setup.py install && \ - python3.10 setup.py build && python3.10 setup.py install + python3.10 setup.py build && python3.10 setup.py install && \ + python3.11 setup.py build && python3.11 setup.py install WORKDIR /home RUN wget https://files.pythonhosted.org/packages/28/af/2c76c8aa46ccdf7578b83d97a11a2d1858794d4be4a1610ade0d30182e8b/pip-20.0.1.tar.gz && tar -zxvf pip-20.0.1.tar.gz @@ -81,7 +83,8 @@ WORKDIR pip-20.0.1 RUN python3.9 setup.py install && \ python3.8 setup.py install && \ python3.7 setup.py install && \ - python3.10 setup.py install + python3.10 setup.py install && \ + python3.11 setup.py install WORKDIR /home RUN rm setuptools-50.3.2.zip pip-20.0.1.tar.gz && \ @@ -115,7 +118,8 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 RUN python3.7 -m pip --no-cache-dir install --upgrade pip && \ python3.8 -m pip --no-cache-dir install --upgrade pip && \ python3.9 -m pip --no-cache-dir install --upgrade pip && \ - python3.10 -m pip --no-cache-dir install --upgrade pip + python3.10 -m pip --no-cache-dir install --upgrade pip && \ + python3.11 -m pip --no-cache-dir install --upgrade pip RUN rm -f /usr/local/bin/pip && ln -s /usr/local/bin/pip3.9 /usr/local/bin/pip && \ rm -f /usr/local/bin/pip3 && ln -s /usr/local/bin/pip3.9 /usr/local/bin/pip3 @@ -127,13 +131,16 @@ RUN pip3.7 --no-cache-dir install ipython==5.3.0 && \ pip3.9 --no-cache-dir install ipython==5.3.0 && \ pip3.9 --no-cache-dir install ipykernel==4.6.0 wheel && \ pip3.10 --no-cache-dir install ipython==5.3.0 && \ - pip3.10 --no-cache-dir install ipykernel==4.6.0 wheel + pip3.10 --no-cache-dir install ipykernel==4.6.0 wheel && \ + pip3.11 --no-cache-dir install ipython==5.3.0 && \ + pip3.11 --no-cache-dir install ipykernel==4.6.0 wheel #For docstring checker RUN pip3.7 --no-cache-dir install pytest astroid isort && \ pip3.8 --no-cache-dir install pytest astroid isort && \ pip3.9 --no-cache-dir install pytest astroid isort && \ - pip3.10 --no-cache-dir install pytest astroid isort + pip3.10 --no-cache-dir install pytest astroid isort && \ + pip3.11 --no-cache-dir install pytest astroid isort RUN pip3.7 --no-cache-dir install pre-commit==2.17.0 pylint==2.12.0 && \ pip3.8 --no-cache-dir install pre-commit==2.17.0 pylint==2.12.0 && \ @@ -142,7 +149,8 @@ RUN pip3.7 --no-cache-dir install pre-commit==2.17.0 pylint==2.12.0 && \ pip3.7 --no-cache-dir 
install cpplint==1.6.0 clang-format==13.0.0 && \ pip3.8 --no-cache-dir install cpplint==1.6.0 clang-format==13.0.0 && \ pip3.9 --no-cache-dir install cpplint==1.6.0 clang-format==13.0.0 && \ - pip3.10 --no-cache-dir install cpplint==1.6.0 clang-format==13.0.0 + pip3.10 --no-cache-dir install cpplint==1.6.0 clang-format==13.0.0 && \ + pip3.11 --no-cache-dir install cpplint==1.6.0 clang-format==13.0.0 COPY ./python/requirements.txt /root/ COPY ./python/unittest_py/requirements.txt /home/ @@ -154,7 +162,9 @@ RUN pip3.7 --no-cache-dir install -r /root/requirements.txt && \ pip3.9 --no-cache-dir install -r /root/requirements.txt && \ pip3.9 --no-cache-dir install -r /home/requirements.txt && \ pip3.10 --no-cache-dir install -r /root/requirements.txt && \ - pip3.10 --no-cache-dir install -r /home/requirements.txt + pip3.10 --no-cache-dir install -r /home/requirements.txt && \ + pip3.11 --no-cache-dir install -r /root/requirements.txt && \ + pip3.11 --no-cache-dir install -r /home/requirements.txt # ccache 4.2.0 diff --git a/tools/enforce/count_enforce_by_file.sh b/tools/enforce/count_enforce_by_file.sh index fafc3516904d86..c79d486c62838c 100644 --- a/tools/enforce/count_enforce_by_file.sh +++ b/tools/enforce/count_enforce_by_file.sh @@ -53,7 +53,6 @@ fi FILE_WHITE_LIST="\ box_clip_op.cc \ box_clip_op.h \ - random_crop_op.h \ elementwise_op_function.cu.h \ fused_elemwise_activation_op.cc \ auc_op.cu \ diff --git a/tools/gpups_test.sh b/tools/gpups_test.sh index 31ad58a86456e0..d1cb054771535b 100644 --- a/tools/gpups_test.sh +++ b/tools/gpups_test.sh @@ -29,6 +29,10 @@ function collect_failed_tests() { serial_list="^test_conv2d_op$|\ ^test_conv2d_transpose_op$|\ +^test_dygraph_dataparallel_bf16$|\ +^test_dygraph_sharding_stage1_fp16$|\ +^test_dygraph_sharding_stage2_bf16$|\ +^test_dygraph_sharding_stage3_bf16$|\ ^test_conv3d_op$" parallel_list="^init_phi_test$|\ @@ -47,7 +51,6 @@ parallel_list="^init_phi_test$|\ ^test_conv3d_transpose_op$|\ ^test_conv_bn_fuse_pass_cc$|\ ^test_conv_nn_grad$|\ -^test_conv_shift_op$|\ ^test_conv_transpose_nn_grad$|\ ^test_convert_call$|\ ^test_convert_call_generator$|\ @@ -59,7 +62,6 @@ parallel_list="^init_phi_test$|\ ^test_custom_kernel$|\ ^test_dist_fleet_ps11$|\ ^test_dist_fleet_ps12$|\ -^test_dygraph_sharding_stage2_bf16$|\ ^test_executor_feed_non_tensor$|\ ^test_flash_attention$|\ ^test_fused_adam_op$|\ diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index c22938e27d1505..b1a19e118e7e4b 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -284,7 +284,6 @@ 'test_depthwise_conv_mkldnn_pass', 'test_fleet_metric', 'test_fc_fuse_pass_cc', - 'test_fleet_private_function', 'test_fleet', 'test_executor_check_feed', 'test_py_reader_lod_level_share', @@ -298,7 +297,6 @@ 'test_ps_dispatcher', 'test_analyzer_rnn2', 'test_multi_gru_seq_fuse_pass', - 'test_filter_by_instag_op', 'test_switch', 'test_matmul_transpose_reshape_fuse_pass', 'test_mkldnn_caching', @@ -354,7 +352,6 @@ 'rw_lock_test', 'exception_holder_test', 'enforce_test', - 'test_rnn_memory_helper_op', 'ddim_test', 'test_eager_deletion_padding_rnn', 'test_is_test_pass', @@ -821,7 +818,6 @@ 'test_fit_a_line', 'test_mish_op', 'test_transpose_op', - 'test_mean_iou', 'test_conv3d_transpose_op', 'test_jit_save_load', 'test_unsqueeze2_op', @@ -868,14 +864,12 @@ 'test_distribution', 'test_box_clip_op', 'custom_tensor_test', - 'test_marker_op', 'test_dataloader_early_reset', 'test_gather_nd_op', 'test_tensor_register_hook', 'test_retain_graph', 'test_network_with_dtype', 
'test_basic_api_transformation', - 'test_diag', 'test_lod_array_length_op', 'test_reinforcement_learning', 'test_softmax_op', @@ -894,7 +888,6 @@ 'test_cross_op', 'concat_test', 'test_ast_util', - 'test_proximal_adagrad_op', 'test_pairwise_distance', 'test_imperative_mnist', 'test_beam_search_decoder', @@ -902,7 +895,6 @@ 'test_dygraph_spectral_norm', 'test_scale_mkldnn_op', 'test_load_state_dict_from_old_format', - 'test_margin_rank_loss_op', 'test_lookup_table_v2_op', 'test_mix_precision_all_reduce_fuse', 'test_spp_op', @@ -915,7 +907,6 @@ 'test_transformer', 'test_for_enumerate', 'test_variable_trans_func', - 'test_squared_l2_distance_op', 'test_quantize_transpiler_v2', 'test_im2sequence_op', 'test_reader_reset', @@ -1002,7 +993,6 @@ 'test_unstack_op', 'test_increment', 'strided_memcpy_test', - 'test_target_assign_op', 'test_trt_dynamic_shape_transformer_prune', 'test_box_decoder_and_assign_op', 'test_trt_dynamic_shape', @@ -1050,7 +1040,6 @@ 'test_huber_loss_op', 'test_slice', 'test_label_smooth_functional', - 'test_conv_shift_op', 'test_imperative_optimizer_v2', 'test_len', 'test_imperative_named_members', @@ -1079,7 +1068,6 @@ 'test_deformable_conv_v1_op', 'test_complex_grad_accumulated', 'test_sequence_mask', - 'test_fill_op', 'test_imperative_deepcf', 'test_multiply', 'test_partial_program', @@ -1101,7 +1089,6 @@ 'test_empty_like_op', 'test_imperative_layer_children', 'nccl_op_test', - 'test_tree_conv_op', 'test_share_data_op', 'test_ir_memory_optimize_transformer', 'test_math_op_patch', @@ -1118,7 +1105,6 @@ 'test_prelu_mkldnn_op', 'test_box_coder_op', 'test_atan2_op', - 'test_unsqueeze_op', 'test_profiler', 'test_affine_channel_op', 'test_leaky_relu_grad_grad_functor', @@ -1196,7 +1182,6 @@ 'test_imperative_optimizer', 'test_subtract_op', 'test_conv_transpose_nn_grad', - 'test_sigmoid_focal_loss_op', 'test_cuda_stream_event', 'test_sequence_pad_op', 'test_rnn_cells', @@ -1308,7 +1293,6 @@ 'test_quantization_scale_pass', 'test_segment_ops', 'test_layers', - 'test_isfinite_op', 'test_imperative_qat_channelwise', 'test_eye_op', 'test_imperative_framework', @@ -1324,7 +1308,6 @@ 'test_sequence_expand_as', 'test_full_name_usage', 'test_glu', - 'test_pad2d_op', 'test_read_file', 'test_erf_op', 'test_sequence_unpad_op', @@ -1382,7 +1365,6 @@ 'test_complex_abs', 'test_gradient_accmulator', 'test_instance_norm_op_v2', - 'test_random_crop_op', 'test_mobile_net', 'test_parallel_executor_transformer', 'test_tensor_scalar_type_promotion_dynamic', @@ -1408,7 +1390,6 @@ 'test_imperative_save_load_v2', 'test_lookahead', 'test_moving_average_abs_max_scale_op', - 'test_roi_perspective_transform_op', 'test_tensorrt_engine', 'test_affine_grid_function', 'test_nonzero_api', @@ -1466,7 +1447,6 @@ 'test_generate_proposals_v2_op', 'test_graph', 'test_gelu_op', - 'test_sample_logits_op', 'test_weight_normalization', 'test_activation_bf16_mkldnn_op', 'trt_dynamic_shape_test', @@ -1486,18 +1466,15 @@ 'test_dict', 'test_bilinear_tensor_product_op', 'test_assert', - 'test_smooth_l1_loss_op', 'sequence_padding_test', 'test_analyzer_ernie', 'test_minimum_op', 'test_yolov3_loss_op', 'test_decayed_adagrad_op', 'test_split_mkldnn_op', - 'test_squeeze_op', 'test_save_inference_model', 'test_smooth_l1_loss', 'test_bilateral_slice_op', - 'test_inplace_abn_op', 'test_parallel_executor_seresnext_base_gpu', 'test_parallel_executor_seresnext_with_fuse_all_reduce_gpu', 'test_parallel_executor_seresnext_with_reduce_gpu', @@ -1530,7 +1507,6 @@ 'test_imperative_transformer_sorted_gradient', 'test_bicubic_interp_v2_op', 
'test_rank_attention_op', - 'test_space_to_depth_op', 'test_image_classification', 'test_custom_relu_op_setup', 'test_sgd_op', @@ -1592,7 +1568,6 @@ 'test_trt_conv_quant_dequant_pass', 'test_trt_convert_elementwise', 'test_trt_convert_depthwise_conv2d_transpose', - 'test_trt_convert_flatten', 'test_trt_matmul_quant_dequant', 'test_trt_convert_dropout', 'test_trt_convert_conv2d_transpose', @@ -1715,7 +1690,6 @@ 'test_run_fluid_by_module_or_command_line', 'test_rpn_target_assign_op', 'test_row_conv', - 'test_rnn_memory_helper_op', 'test_reshape_transpose_matmul_mkldnn_fuse_pass', 'test_reshape_bf16_op', 'test_require_version', @@ -1870,7 +1844,6 @@ 'test_fleet', 'test_flags_use_mkldnn', 'test_flags_mkldnn_ops_on_off', - 'test_filter_by_instag_op', 'test_fetch_var', 'test_fetch_handler', 'test_feed_fetch_method', @@ -2147,7 +2120,6 @@ 'test_dgc_optimizer', 'heter_server_test', 'test_custom_conj', - 'test_fleet_private_function', 'test_fake_init_op', 'brpc_service_sparse_sgd_test', 'test_tf32_cudnn', @@ -2322,7 +2294,6 @@ 'test_standalone_controlflow', 'test_standalone_multiply_write', 'test_reshape_op', - 'test_inplace_abn_op', 'test_fused_transformer_encoder_layer', 'test_eager_deletion_while_op', 'test_dataloader_unkeep_order', @@ -2375,7 +2346,6 @@ 'test_atan2_op', 'test_tensor_fill_', 'test_std_layer', - 'test_squeeze_op', 'test_split_op', 'test_sign_op', 'test_set_value_op', @@ -2456,7 +2426,6 @@ 'test_yolov3_loss_op', 'test_where_index', 'test_variance_layer', - 'test_unsqueeze_op', 'test_translated_layer', 'test_tensor_shape', 'test_slice', @@ -2507,7 +2476,6 @@ 'test_complex_abs', 'test_subtract_op', 'test_complex_elementwise_layers', - 'test_marker_op', 'test_typing', 'test_cuda_empty_cache', 'test_randn_op', @@ -2601,11 +2569,9 @@ 'test_logical_op', 'test_imperative_deepcf', 'test_cholesky_op', - 'test_sample_logits_op', 'test_ir_fc_fuse_pass', 'test_fleet_base_single', 'test_multiprocess_dataloader_iterable_dataset_dynamic', - 'test_fill_op', 'test_slice_op', 'test_cond', 'test_ema', @@ -2659,13 +2625,11 @@ 'test_unfold_op', 'test_conv_bn_fuse_pass', 'test_truncated_gaussian_random_op', - 'test_tree_conv_op', 'test_traced_layer_err_msg', 'test_unique_with_counts', 'test_auc_single_pred_op', 'test_instance_norm_op_v2', 'test_softmax_bf16_mkldnn_op', - 'test_mean_iou', 'test_sequence_slice_op', 'test_polygon_box_transform', 'test_sequence_pad_op', @@ -2691,7 +2655,6 @@ 'test_optimizer', 'test_deformable_conv_op', 'test_py_reader_push_pop', - 'test_random_crop_op', 'test_shuffle_channel_op', 'test_center_loss', 'test_temporal_shift_op', @@ -2708,18 +2671,13 @@ 'test_top_k_op', 'test_batch_fc_op', 'test_tensor_scalar_type_promotion_static', - 'test_squared_l2_distance_op', 'test_bicubic_interp_op', 'test_spp_op', - 'test_space_to_depth_op', 'test_callbacks', - 'test_sigmoid_focal_loss_op', - 'test_collect_fpn_proposals_op', 'test_sequence_unpad_op', 'test_conv1d_transpose_layer', 'test_sequence_pool', 'test_conv_elementwise_add_fuse_pass', - 'test_conv_shift_op', 'test_sequence_expand_as', 'test_cos_sim_op', 'test_sequence_concat', @@ -2747,7 +2705,6 @@ 'test_generate_proposals_v2_op', 'test_pad_constant_like', 'test_grid_sample_function', - 'test_pad2d_op', 'test_huber_loss_op', 'test_one_hot_op', 'test_normal', @@ -2771,7 +2728,6 @@ 'test_lookup_table_v2_op', 'test_l1_norm_op', 'test_lstm_op', - 'test_margin_rank_loss_op', 'test_index_sample_op', 'test_imperative_save_load', 'test_imperative_ptb_rnn_sorted_gradient', @@ -2808,7 +2764,6 @@ 'test_device_guard', 'test_rnn_cells_static', 
'test_deformable_psroi_pooling', - 'test_roi_perspective_transform_op', 'test_segment_ops', 'test_cvm_op', 'test_selu_op', @@ -2822,7 +2777,6 @@ 'test_compare_reduce_op', 'test_clip_by_norm_op', 'test_box_coder_op', - 'test_smooth_l1_loss_op', 'test_bilinear_interp_op', 'test_spectral_norm_op', 'test_sum_mkldnn_op', @@ -2871,11 +2825,8 @@ 'test_regularizer', 'test_sequence_reverse', 'test_shape_op', - 'test_diag', 'test_strided_slice_op', 'test_switch_case', - 'test_target_assign_op', - 'test_isfinite_op', 'test_conv_elementwise_add_act_fuse_pass', 'test_unbind_op', 'test_size_op', @@ -2916,7 +2867,6 @@ 'test_dequantize_log_op', 'test_mkldnn_batch_norm_act_fuse_pass', 'test_imperative_skip_op', - 'test_proximal_adagrad_op', 'test_conv2d_transpose_mkldnn_op', 'test_imperative_optimizer', 'test_assign_value_op', diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index ef5bc0ca45cfaa..23044c127625b7 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -102,7 +102,7 @@ def _patch_float_precision(digits): pattern_number = re.compile( r""" (?: - (?<=[\s*\[\(\'\"\:]) # number starts + (?:(?<=[\s*\[\(\'\"\:])|^) # number starts (?: # int/float or complex-real (?: [+-]? diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index a7e7ad08e3ab1d..a8ceadfe90a1ed 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -27,7 +27,6 @@ 'test_lookup_table_op', 'test_lookup_table_bf16_op', 'test_lookup_table_v2_bf16_op', - 'test_pad2d_op', 'test_scatter_op', 'test_sequence_concat', 'test_sequence_conv', @@ -39,8 +38,6 @@ 'test_sequence_scatter_op', 'test_sequence_slice_op', 'test_slice_op', - 'test_space_to_depth_op', - 'test_squared_l2_distance_op', 'test_accuracy_op', 'test_activation_nn_grad', 'test_adadelta_op', @@ -115,7 +112,6 @@ 'test_conv3d_transpose_part2_op', 'test_conv_nn_grad', 'test_conv_transpose_nn_grad', - 'test_conv_shift_op', 'test_cos_sim_op', 'test_create_global_var', 'test_crf_decoding_op', @@ -200,9 +196,7 @@ 'test_fetch_var', 'test_fill_any_like_op', 'test_fill_constant_op', - 'test_fill_op', 'test_fill_zeros_like_op', - 'test_filter_by_instag_op', 'test_flatten2_op', 'test_flatten_contiguous_range_op', 'test_flatten_op', @@ -279,7 +273,6 @@ 'test_infer_no_need_buffer_slots', 'test_inference_model_io', 'test_initializer', - 'test_inplace_abn_op', 'test_inplace_addto_strategy', 'test_inplace_softmax_with_cross_entropy', 'test_input_spec', @@ -289,8 +282,6 @@ 'test_io_save_load', 'test_iou_similarity_op', 'test_ir_memory_optimize_pass', - 'test_is_empty_op', - 'test_isfinite_op', 'test_kldiv_loss_op', 'test_kron_op', 'test_l1_norm_op', @@ -318,12 +309,10 @@ 'test_lrn_op', 'test_lstm_op', 'test_lstmp_op', - 'test_margin_rank_loss_op', 'test_math_op_patch', 'test_matmul_op', 'test_matmul_v2_op', 'test_matrix_nms_op', - 'test_mean_iou', 'test_memory_reuse_exclude_feed_var', 'test_memory_usage', 'test_merge_ids_op', @@ -395,7 +384,6 @@ 'test_program_prune_backward', 'test_program_to_string', 'test_protobuf_descs', - 'test_proximal_adagrad_op', 'test_proximal_gd_op', 'test_prroi_pool_op', 'test_prune', @@ -412,7 +400,6 @@ 'test_queue', 'test_randint_op', 'test_randn_op', - 'test_random_crop_op', 'test_randperm_op', 'test_range', 'test_rank_loss_op', @@ -431,9 +418,7 @@ 'test_reverse_op', 'test_rmsprop_op', 'test_rnn_cell_api', - 'test_rnn_memory_helper_op', 'test_roi_align_op', - 'test_roi_perspective_transform_op', 'test_roi_pool_op', 'test_roll_op', 'test_row_conv', @@ -441,7 +426,6 @@ 
'test_rpn_target_assign_op', 'test_run_program_op', 'test_runtime_and_compiletime_exception', - 'test_sample_logits_op', 'test_save_model_without_var', 'test_scale_op', 'test_scale_mkldnn_op', @@ -459,13 +443,11 @@ 'test_shuffle_batch_op', 'test_shuffle_channel_op', 'test_sigmoid_cross_entropy_with_logits_op', - 'test_sigmoid_focal_loss_op', 'test_sign_op', 'test_similarity_focus_op', 'test_size_op', 'test_share_data_op', 'test_smooth_l1_loss', - 'test_smooth_l1_loss_op', 'test_softmax_with_cross_entropy_op', 'test_spectral_norm_op', 'test_split_ids_op', @@ -481,7 +463,6 @@ 'test_sum_op', 'test_switch', 'test_switch_case', - 'test_target_assign_op', 'test_tdm_child_op', 'test_tdm_sampler_op', 'test_teacher_student_sigmoid_loss_op', @@ -492,7 +473,6 @@ 'test_trace_op', 'test_trainable', 'test_transpose_op', - 'test_tree_conv_op', 'test_tril_triu_op', 'test_trilinear_interp_op', 'test_trilinear_interp_v2_op', @@ -702,7 +682,6 @@ 'test_lamb_op_xpu', 'test_model_cast_to_bf16', 'test_sgd_op_bf16', - 'test_marker_op', 'test_c_embedding_op', 'test_class_center_sample_op', 'test_fill_diagonal_tensor_op', diff --git a/tools/test_runner.py b/tools/test_runner.py index 37d132fbc1535a..49603fd9a3afa5 100644 --- a/tools/test_runner.py +++ b/tools/test_runner.py @@ -40,6 +40,7 @@ def main(): sys.path.append(os.getcwd()) + os.environ["FLAGS_dynamic_static_unified_comm"] = "false" if core.is_compiled_with_cuda() or core.is_compiled_with_rocm(): if os.getenv('FLAGS_enable_gpu_memory_usage_log') is None: os.environ['FLAGS_enable_gpu_memory_usage_log'] = 'true' diff --git a/tools/test_sampcd_processor.py b/tools/test_sampcd_processor.py index 714685062359f5..c6951bc4ec5d59 100644 --- a/tools/test_sampcd_processor.py +++ b/tools/test_sampcd_processor.py @@ -881,53 +881,7 @@ def test_patch_xdoctest(self): [1.94591032, 2.07944156, 2.1972246]]) """, - } - - test_results = get_test_results(doctester, docstrings_to_test) - self.assertEqual(len(test_results), 9) - - tr_0, tr_1, tr_2, tr_3, tr_4, tr_5, tr_6, tr_7, tr_8 = test_results - - self.assertIn('gpu_to_gpu', tr_0.name) - self.assertTrue(tr_0.passed) - - self.assertIn('cpu_to_cpu', tr_1.name) - self.assertTrue(tr_1.passed) - - self.assertIn('gpu_to_cpu', tr_2.name) - self.assertTrue(tr_2.passed) - - self.assertIn('cpu_to_gpu', tr_3.name) - self.assertTrue(tr_3.passed) - - self.assertIn('gpu_to_cpu_array', tr_4.name) - self.assertTrue(tr_4.passed) - - self.assertIn('cpu_to_gpu_array', tr_5.name) - self.assertTrue(tr_5.passed) - - self.assertIn('mass_array', tr_6.name) - self.assertTrue(tr_6.passed) - - self.assertIn('float_array', tr_7.name) - self.assertTrue(tr_7.passed) - - self.assertIn('float_array_diff', tr_8.name) - self.assertTrue(tr_8.passed) - - # reload xdoctest.checker - importlib.reload(xdoctest.checker) - - _clear_environ() - - test_capacity = {'cpu'} - doctester = Xdoctester( - style='freeform', target='codeblock', patch_float_precision=None - ) - doctester.prepare(test_capacity) - - docstrings_to_test = { - 'gpu_to_gpu': """ + 'float_begin': """ placeholder Examples: @@ -937,15 +891,11 @@ def test_patch_xdoctest(self): this is some blabla... - >>> import paddle - >>> paddle.device.set_device('gpu') - >>> a = paddle.to_tensor(.123456789) - >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - [0.123456780]) + >>> print(7.0) + 7. """, - 'cpu_to_cpu': """ + 'float_begin_long': """ placeholder Examples: @@ -955,15 +905,11 @@ def test_patch_xdoctest(self): this is some blabla... 
- >>> import paddle - >>> paddle.device.set_device('cpu') - >>> a = paddle.to_tensor(.123456789) - >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.123456780]) + >>> print(7.0000023) + 7.0000024 """, - 'gpu_to_cpu': """ + 'float_begin_more': """ placeholder Examples: @@ -973,15 +919,11 @@ def test_patch_xdoctest(self): this is some blabla... - >>> import paddle - >>> paddle.device.set_device('gpu') - >>> a = paddle.to_tensor(.123456789) - >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.123456780]) + >>> print(7.0, 5., 6.123456) + 7.0 5.0 6.123457 """, - 'cpu_to_gpu': """ + 'float_begin_more_diff': """ placeholder Examples: @@ -991,14 +933,11 @@ def test_patch_xdoctest(self): this is some blabla... - >>> import paddle - >>> paddle.device.set_device('cpu') - >>> a = paddle.to_tensor(.123456789) - >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - [0.123456780]) + >>> print(7.0, 5., 6.123456) + 7.0 5.0 6.123457 + """, - 'gpu_to_cpu_array': """ + 'float_begin_more_brief': """ placeholder Examples: @@ -1008,16 +947,11 @@ def test_patch_xdoctest(self): this is some blabla... - >>> import paddle - >>> paddle.device.set_device('gpu') - >>> a = paddle.to_tensor([[1.123456789 ,2,3], [2,3,4], [3,4,5]]) - >>> print(a) - Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - [[1.123456780, 2., 3.], - [2., 3., 4.], - [3., 4., 5.]]) + >>> print(7.0, 5., 6.123456) + 7. 5. 6.123457 + """, - 'cpu_to_gpu_array': """ + 'float_begin_fail': """ placeholder Examples: @@ -1027,106 +961,109 @@ def test_patch_xdoctest(self): this is some blabla... - >>> import paddle - >>> paddle.device.set_device('cpu') - >>> a = paddle.to_tensor([[1.123456789,2,3], [2,3,4], [3,4,5]]) - >>> print(a) - Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - [[1.123456780, 2., 3.], - [2., 3., 4.], - [3., 4., 5.]]) + >>> print(7.0100023) + 7.0000024 + """, - 'mass_array': """ - placeholder + } - Examples: + test_results = get_test_results(doctester, docstrings_to_test) + self.assertEqual(len(test_results), 15) - .. code-block:: python - :name: code-example-1 + ( + tr_0, + tr_1, + tr_2, + tr_3, + tr_4, + tr_5, + tr_6, + tr_7, + tr_8, + tr_9, + tr_10, + tr_11, + tr_12, + tr_13, + tr_14, + ) = test_results - this is some blabla... + self.assertIn('gpu_to_gpu', tr_0.name) + self.assertTrue(tr_0.passed) - >>> import paddle - >>> paddle.device.set_device('gpu') - >>> a = paddle.to_tensor( - ... [[1.123456780, 2., -3, .3], - ... [2, 3, +4., 1.2+10.34e-5j], - ... [3, 5.e-3, 1e2, 3e-8]] - ... ) - >>> # Tensor(shape=[3, 4], dtype=complex64, place=Place(gpu:0), stop_gradient=True, - >>> # [[ (1.1234568357467651+0j) , - >>> # (2+0j) , - >>> # (-3+0j) , - >>> # (0.30000001192092896+0j) ], - >>> # [ (2+0j) , - >>> # (3+0j) , - >>> # (4+0j) , - >>> # (1.2000000476837158+0.00010340000153519213j)], - >>> # [ (3+0j) , - >>> # (0.004999999888241291+0j) , - >>> # (100+0j) , - >>> # (2.999999892949745e-08+0j) ]]) - >>> print(a) - Tensor(shape=[3, 4], dtype=complex64, place=Place(AAA), stop_gradient=True, - [[ (1.123456+0j), - (2+0j), - (-3+0j), - (0.3+0j)], - [ (2+0j), - (3+0j), - (4+0j), - (1.2+0.00010340j)], - [ (3+0j), - (0.00499999+0j), - (100+0j), - (2.999999e-08+0j)]]) - """, - 'float_array': """ - placeholder + self.assertIn('cpu_to_cpu', tr_1.name) + self.assertTrue(tr_1.passed) - Examples: + self.assertIn('gpu_to_cpu', tr_2.name) + self.assertTrue(tr_2.passed) - .. 
code-block:: python - :name: code-example-1 + self.assertIn('cpu_to_gpu', tr_3.name) + self.assertTrue(tr_3.passed) - this is some blabla... + self.assertIn('gpu_to_cpu_array', tr_4.name) + self.assertTrue(tr_4.passed) - >>> import paddle - >>> paddle.device.set_device('cpu') - >>> x = [[2, 3, 4], [7, 8, 9]] - >>> x = paddle.to_tensor(x, dtype='float32') - >>> print(paddle.log(x)) - Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - [[0.69314718, 1.09861231, 1.38629436], - [1.94591010, 2.07944155, 2.19722462]]) + self.assertIn('cpu_to_gpu_array', tr_5.name) + self.assertTrue(tr_5.passed) - """, - 'float_array_diff': """ - placeholder + self.assertIn('mass_array', tr_6.name) + self.assertTrue(tr_6.passed) - Examples: + self.assertIn('float_array', tr_7.name) + self.assertTrue(tr_7.passed) - .. code-block:: python - :name: code-example-1 + self.assertIn('float_array_diff', tr_8.name) + self.assertTrue(tr_8.passed) - this is some blabla... + self.assertIn('float_begin', tr_9.name) + self.assertTrue(tr_9.passed) - >>> import paddle - >>> paddle.device.set_device('cpu') - >>> x = [[2, 3, 4], [7, 8, 9]] - >>> x = paddle.to_tensor(x, dtype='float32') - >>> print(paddle.log(x)) - Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - [[0.69314712, 1.09861221, 1.386294], - [1.94591032, 2.07944156, 2.1972246]]) + self.assertIn('float_begin_long', tr_10.name) + self.assertTrue(tr_10.passed) - """, - } + self.assertIn('float_begin_more', tr_11.name) + self.assertTrue(tr_11.passed) + + self.assertIn('float_begin_more_diff', tr_12.name) + self.assertTrue(tr_12.passed) + + self.assertIn('float_begin_more_brief', tr_13.name) + self.assertTrue(tr_13.passed) + + self.assertIn('float_begin_fail', tr_14.name) + self.assertFalse(tr_14.passed) + + # reload xdoctest.checker + importlib.reload(xdoctest.checker) + + _clear_environ() + + test_capacity = {'cpu'} + doctester = Xdoctester( + style='freeform', target='codeblock', patch_float_precision=None + ) + doctester.prepare(test_capacity) test_results = get_test_results(doctester, docstrings_to_test) - self.assertEqual(len(test_results), 9) + self.assertEqual(len(test_results), 15) - tr_0, tr_1, tr_2, tr_3, tr_4, tr_5, tr_6, tr_7, tr_8 = test_results + ( + tr_0, + tr_1, + tr_2, + tr_3, + tr_4, + tr_5, + tr_6, + tr_7, + tr_8, + tr_9, + tr_10, + tr_11, + tr_12, + tr_13, + tr_14, + ) = test_results self.assertIn('gpu_to_gpu', tr_0.name) self.assertFalse(tr_0.passed) @@ -1155,6 +1092,24 @@ def test_patch_xdoctest(self): self.assertIn('float_array_diff', tr_8.name) self.assertFalse(tr_8.passed) + self.assertIn('float_begin', tr_9.name) + self.assertFalse(tr_9.passed) + + self.assertIn('float_begin_long', tr_10.name) + self.assertFalse(tr_10.passed) + + self.assertIn('float_begin_more', tr_11.name) + self.assertFalse(tr_11.passed) + + self.assertIn('float_begin_more_diff', tr_12.name) + self.assertFalse(tr_12.passed) + + self.assertIn('float_begin_more_brief', tr_13.name) + self.assertFalse(tr_13.passed) + + self.assertIn('float_begin_fail', tr_14.name) + self.assertFalse(tr_14.passed) + def test_run_cpu(self): _clear_environ() @@ -1521,62 +1476,6 @@ def test_style_google(self): doctester = Xdoctester(style='google', target='codeblock') doctester.prepare(test_capacity) - docstrings_to_test = { - 'one_plus_one': """ - placeholder - - .. code-block:: python - :name: code-example-0 - - this is some blabla... - - >>> # doctest: +SKIP('skip') - >>> print(1+1) - 2 - - Examples: - - .. 
code-block:: python - :name: code-example-1 - - this is some blabla... - - >>> # doctest: +REQUIRES(env:CPU) - >>> print(1-1) - 0 - - Examples: - - .. code-block:: python - :name: code-example-2 - - >>> print(1+2) - 3 - """, - 'one_minus_one': """ - placeholder - - Examples: - - .. code-block:: python - :name: code-example-1 - - this is some blabla... - - >>> # doctest: +REQUIRES(env:GPU) - >>> print(1-1) - 0 - - Examples: - - .. code-block:: python - :name: code-example-2 - - >>> print(1+1) - 3 - """, - } - test_results = get_test_results(doctester, docstrings_to_test) self.assertEqual(len(test_results), 4) @@ -1849,27 +1748,6 @@ def test_no_code(self): doctester = Xdoctester(style='google', target='codeblock') doctester.prepare(test_capacity) - docstrings_to_test = { - 'one_plus_one': """ - placeholder - - .. code-block:: python - :name: code-example-0 - - this is some blabla... - - >>> # doctest: +SKIP('skip') - >>> print(1+1) - 2 - """, - 'one_minus_one': """ - placeholder - - Examples: - - """, - } - test_results = get_test_results(doctester, docstrings_to_test) self.assertEqual(len(test_results), 0) @@ -1879,27 +1757,6 @@ def test_no_code(self): doctester = Xdoctester(style='freeform', target='docstring') doctester.prepare(test_capacity) - docstrings_to_test = { - 'one_plus_one': """ - placeholder - - .. code-block:: python - :name: code-example-0 - - this is some blabla... - - >>> # doctest: +SKIP('skip') - >>> print(1+1) - 2 - """, - 'one_minus_one': """ - placeholder - - Examples: - - """, - } - test_results = get_test_results(doctester, docstrings_to_test) self.assertEqual(len(test_results), 2) @@ -1925,27 +1782,6 @@ def test_no_code(self): doctester = Xdoctester(style='freeform', target='codeblock') doctester.prepare(test_capacity) - docstrings_to_test = { - 'one_plus_one': """ - placeholder - - .. code-block:: python - :name: code-example-0 - - this is some blabla... 
- - >>> # doctest: +SKIP('skip') - >>> print(1+1) - 2 - """, - 'one_minus_one': """ - placeholder - - Examples: - - """, - } - test_results = get_test_results(doctester, docstrings_to_test) self.assertEqual(len(test_results), 1) diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index e7c05f2768a832..b21910e0ae3663 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -145,7 +145,6 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_tensor_scalar_type_promotion_dynamic$|\ ^test_model$|\ ^test_py_reader_combination$|\ -^test_trt_convert_flatten$|\ ^test_py_reader_push_pop$|\ ^test_parallel_executor_feed_persistable_var$|\ ^test_parallel_executor_inference_feed_partial_data$|\ @@ -217,7 +216,6 @@ long_time_test="^test_gru_op$|\ ^test_gather_op$|\ ^test_gather_nd_op$|\ ^test_sequence_conv$|\ -^test_space_to_depth_op$|\ ^test_activation_nn_grad$|\ ^test_activation_op$|\ ^test_bicubic_interp_v2_op$|\ diff --git a/tools/xpu/pack_paddle_depence.sh b/tools/xpu/pack_paddle_depence.sh index d683d082051bb5..0538bf192695bd 100644 --- a/tools/xpu/pack_paddle_depence.sh +++ b/tools/xpu/pack_paddle_depence.sh @@ -26,13 +26,13 @@ XDNN_DIR_NAME=$4 XCCL_URL=$5 XCCL_DIR_NAME=$6 -wget --no-check-certificate ${XRE_URL} -c -q -O xre.tar.gz +wget --no-check-certificate ${XRE_URL} -q -O xre.tar.gz tar xvf xre.tar.gz -wget --no-check-certificate ${XDNN_URL} -c -q -O xdnn.tar.gz +wget --no-check-certificate ${XDNN_URL} -q -O xdnn.tar.gz tar xvf xdnn.tar.gz -wget --no-check-certificate ${XCCL_URL} -c -q -O xccl.tar.gz +wget --no-check-certificate ${XCCL_URL} -q -O xccl.tar.gz tar xvf xccl.tar.gz mkdir -p xpu/include/xpu
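+
+# Note: the wget calls above drop '-c' (resume partial downloads), so every
+# packaging run fetches each xre/xdnn/xccl tarball from scratch; resuming a
+# leftover partial file of the same name could otherwise yield a corrupt
+# archive.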