Merge branch 'PaddlePaddle:develop' into add_diagonal
bapijun authored Oct 19, 2023
2 parents 0ea1cce + cfd1d0e commit e402e9c
Showing 1,458 changed files with 65,998 additions and 39,009 deletions.
8 changes: 4 additions & 4 deletions .clang-tidy
@@ -20,7 +20,7 @@ bugprone-integer-division,
 bugprone-misplaced-widening-cast,
 -bugprone-move-forwarding-reference,
 -bugprone-multiple-statement-macro,
--bugprone-narrowing-conversions,
+bugprone-narrowing-conversions,
 -bugprone-not-null-terminated-result,
 -bugprone-parent-virtual-call,
 -bugprone-posix-return,
@@ -155,7 +155,7 @@ cppcoreguidelines-avoid-c-arrays,
 -cppcoreguidelines-avoid-goto,
 cppcoreguidelines-c-copy-assignment-signature,
 cppcoreguidelines-explicit-virtual-functions,
--cppcoreguidelines-init-variables,
+cppcoreguidelines-init-variables,
 cppcoreguidelines-narrowing-conversions,
 cppcoreguidelines-no-malloc,
 -cppcoreguidelines-pro-type-const-cast,
@@ -189,12 +189,12 @@ modernize-use-override,
 modernize-use-transparent-functors,
 -modernize-use-uncaught-exceptions,
 performance-faster-string-find,
--performance-for-range-copy,
+performance-for-range-copy,
 -performance-implicit-conversion-in-loop,
 -performance-inefficient-algorithm,
 performance-inefficient-string-concatenation,
 -performance-inefficient-vector-operation,
--performance-move-const-arg,
+performance-move-const-arg,
 -performance-move-constructor-init,
 -performance-no-automatic-move,
 performance-noexcept-move-constructor,
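
For reference, a minimal C++ sketch (hypothetical code, not from the repository) of the patterns the four newly enabled checks diagnose:

    #include <string>
    #include <utility>
    #include <vector>

    void demo(const std::vector<std::string>& names) {
      // bugprone-narrowing-conversions: the 64-bit size is silently truncated.
      int n = names.size();

      // cppcoreguidelines-init-variables: declared but never initialized.
      int counter;

      // performance-for-range-copy: each element is copied; use const auto&.
      for (auto name : names) {
        (void)name;
      }

      // performance-move-const-arg: moving a const object still copies it.
      const std::string src = "id";
      std::string dst = std::move(src);
      (void)n;
      (void)counter;
      (void)dst;
    }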
3 changes: 3 additions & 0 deletions .flake8
@@ -26,6 +26,9 @@ per-file-ignores =
     # These files need tabs for testing.
     test/dygraph_to_static/test_error.py:E101,W191
 
+    # Ignore comparisons with True in SOT unit tests
+    test/sot/test_dup_top.py:E712
+
     # temp ignore base directory
     python/paddle/base/*:
         E712,
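
E712 is pycodestyle's "comparison to True/False" rule; a hypothetical sketch of the pattern the new exemption allows in test/sot/test_dup_top.py, where the explicit comparison is presumably written on purpose:

    flag = (1 == 1)

    # flake8 E712: comparison to True should be 'if flag:' or 'if flag is True:'
    if flag == True:  # kept verbatim, e.g. to exercise the compare bytecode in SOT
        print("taken")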
4 changes: 4 additions & 0 deletions .gitmodules
@@ -106,3 +106,7 @@
   path = third_party/jitify
   url = https://github.com/NVIDIA/jitify.git
   ignore = dirty
+[submodule "third_party/cccl"]
+  path = third_party/cccl
+  url = https://github.com/NVIDIA/cccl.git
+  ignore = dirty
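
A checkout made before this commit will not have the new submodule on disk; fetching it is the usual one-liner (a sketch, assuming a default clone):

    git submodule update --init third_party/cccl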
4 changes: 3 additions & 1 deletion cmake/external/brpc.cmake
@@ -13,7 +13,9 @@
 # limitations under the License.
 
 include(ExternalProject)
-set(OPENSSL_USE_STATIC_LIBS ON)
+if(NOT WITH_ARM)
+  set(OPENSSL_USE_STATIC_LIBS ON)
+endif()
 find_package(OpenSSL REQUIRED)
 
 message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY})
31 changes: 31 additions & 0 deletions cmake/external/cccl.cmake
@@ -0,0 +1,31 @@
+include(ExternalProject)
+
+set(CCCL_PATH
+    "${THIRD_PARTY_PATH}/cccl"
+    CACHE STRING "A path setting for external_cccl path.")
+set(CCCL_PREFIX_DIR ${CCCL_PATH})
+set(CCCL_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cccl)
+
+# The latest commit has bugs on Windows, so we pin a fixed commit.
+set(CCCL_TAG 1f6e4bcae0fbf1bbed87f88544d8d2161c490fc1)
+execute_process(COMMAND git --git-dir=${CCCL_SOURCE_DIR}/.git
+                        --work-tree=${CCCL_SOURCE_DIR} checkout ${CCCL_TAG})
+
+set(CCCL_INCLUDE_DIR ${CCCL_SOURCE_DIR})
+message("CCCL_INCLUDE_DIR is ${CCCL_INCLUDE_DIR}")
+include_directories(${CCCL_INCLUDE_DIR})
+
+ExternalProject_Add(
+  extern_cccl
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  SOURCE_DIR ${CCCL_SOURCE_DIR}
+  PREFIX ${CCCL_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  TEST_COMMAND "")
+
+add_library(cccl INTERFACE)
+
+add_dependencies(cccl extern_cccl)
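
CCCL is consumed header-only here, so the ExternalProject has empty configure/build/install steps and merely pins the checkout; a downstream target only needs the dependency edge. A sketch with a hypothetical target name:

    # Hypothetical consumer of the INTERFACE library declared above.
    add_library(my_kernels STATIC my_kernels.cu)
    add_dependencies(my_kernels extern_cccl)  # ensure sources are checked out at CCCL_TAG
    target_link_libraries(my_kernels cccl)    # adds no libraries; headers already come
                                              # from the global include_directories()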
14 changes: 9 additions & 5 deletions cmake/external/openblas.cmake
@@ -19,12 +19,16 @@ set(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
 set(CBLAS_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/openblas)
 set(CBLAS_TAG v0.3.7)
 
-# OpenBLAS support Raptor Lake from v0.3.22
-if(UNIX
-   AND NOT APPLE
-   AND NOT WITH_ROCM
+# Why use v0.3.18? The IDG business line encountered a random openblas error,
+# which is resolved by upgrading openblas.
+# And why compile only when gcc > 8.2? Please refer to
+# https://github.com/spack/spack/issues/19932#issuecomment-733452619
+# v0.3.18 only supports gcc >= 8.3 or gcc >= 7.4
+if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+   AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.2
    AND NOT WITH_XPU)
-  set(CBLAS_TAG v0.3.23)
+  # We only compile with openblas 0.3.18 when gcc >= 8.3
+  set(CBLAS_TAG v0.3.18)
 endif()
 
 if(APPLE AND WITH_ARM)
5 changes: 4 additions & 1 deletion cmake/generic.cmake
@@ -88,7 +88,7 @@
 # To build a unit test binary, which is an executable binary with libpaddle.so
 # automatically linked:
 #
-#   paddle_test(example SHARED)
+#   paddle_test(example SRCS example_test.cc)
 #
 
 # including binary directory for generated headers.
@@ -1345,6 +1345,9 @@ function(math_library TARGET)
   if(WITH_GPU)
     if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
       list(APPEND math_common_deps cub)
+    elseif(${CMAKE_CUDA_COMPILER_VERSION} EQUAL 12.0
+           OR ${CMAKE_CUDA_COMPILER_VERSION} GREATER 12.0)
+      list(APPEND math_common_deps cccl)
     else()
       list(APPEND math_common_deps)
     endif()
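
The EQUAL 12.0 OR GREATER 12.0 pair (here and in cmake/third_party.cmake below) behaves as a greater-or-equal test; with CMake >= 3.7 it could arguably be written more tightly as:

    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0)
      list(APPEND math_common_deps cccl)
    endif()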
12 changes: 12 additions & 0 deletions cmake/third_party.cmake
@@ -247,6 +247,14 @@ if(NOT DEFINED WITH_MKLDNN)
   endif()
 endif()
 
+if(WIN32)
+  if(MSVC)
+    if(MSVC_VERSION LESS 1920)
+      set(WITH_MKLDNN OFF)
+    endif()
+  endif()
+endif()
+
 if(WIN32
    OR APPLE
    OR NOT WITH_GPU
@@ -375,6 +383,10 @@ if(WITH_GPU)
   if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
     include(external/cub) # download cub
     list(APPEND third_party_deps extern_cub)
+  elseif(${CMAKE_CUDA_COMPILER_VERSION} EQUAL 12.0
+         OR ${CMAKE_CUDA_COMPILER_VERSION} GREATER 12.0)
+    include(external/cccl)
+    list(APPEND third_party_deps extern_cccl)
   endif()
   set(URL
       "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz"
83 changes: 83 additions & 0 deletions paddle/cinn/ast_gen_ius/ast_gen.cc
@@ -19,6 +19,7 @@
#include "paddle/cinn/ir/operation.h"
#include "paddle/cinn/ir/tensor.h"
#include "paddle/cinn/lang/compute.h"
#include "paddle/cinn/optim/replace_var_with_expr.h"

namespace cinn {
namespace ast_gen_ius {
@@ -84,11 +85,75 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
     tensor_group->MarkShareMemBuffer(tensor, init_tensor);
     tensor_group->CtrlDepend(tensor, init_tensor);
     Expr init_body = ir::Store::Make(init_tensor, init_value, axis_exprs);
+    // create schedule block itervars, i0,i1...
+    std::vector<ir::Var> block_vars;
+    std::vector<ir::Expr> iter_values;
+    // reduce body and reduce init schedule block should have different objects
+    // for same axis so we re-create objects
+    std::vector<Var> axis_vars = common::GenDefaultAxis(axis_len);
+    for (int i = 0; i < shape.size(); ++i) {
+      block_vars.push_back(Var(Expr(0),
+                               shape[i],
+                               cinn::UniqName("i" + std::to_string(i)),
+                               /*is_reduce = */ false));
+      optim::ReplaceVarWithExpr(&init_body, axis[i], block_vars[i]);
+      axis_vars[i]->is_reduce_axis = false;
+      if (shape[i] == Expr(1)) {
+        iter_values.push_back(Expr(0));
+      } else {
+        iter_values.push_back(axis_vars[i]);
+      }
+    }
+    init_body = ir::ScheduleBlockRealize::Make(
+        iter_values,
+        ir::ScheduleBlock::Make(
+            block_vars, {}, {}, reduce_init_name, init_body));
 
     // For the remaining reduce axis, make reduce body
     const std::vector<ir::Var>& reduce_axis = tensor->reduce_axis;
     ir::Expr reduce_body =
         ConvertReduceBody(tensor->body(), tensor, axis_exprs);
+    // create schedule block itervars, i0,i1...
+    std::vector<ir::Var> reduce_block_vars;
+    std::vector<ir::Expr> reduce_iter_values;
+    // reduce body and reduce init schedule block should have different objects
+    // for same axis so we re-create objects
+    std::vector<Var> reduce_axis_vars = common::GenDefaultAxis(axis_len);
+    for (int i = 0; i < shape.size(); ++i) {
+      reduce_block_vars.push_back(Var(Expr(0),
+                                      shape[i],
+                                      cinn::UniqName("i" + std::to_string(i)),
+                                      /*is_reduce = */ false));
+      reduce_axis_vars[i]->is_reduce_axis = false;
+      if (shape[i] == Expr(1)) {
+        reduce_iter_values.push_back(Expr(0));
+      } else {
+        reduce_iter_values.push_back(axis_vars[i]);
+      }
+    }
+    for (int i = 0; i < reduce_axis.size(); ++i) {
+      int count = shape.size() + i;
+      reduce_block_vars.push_back(
+          Var(reduce_axis[i]->lower_bound,
+              reduce_axis[i]->upper_bound,
+              cinn::UniqName("i" + std::to_string(count)),
+              /*is_reduce = */ true));
+      ir::Var reduce_axis_var = reduce_axis[i];
+      reduce_axis_var->is_reduce_axis = true;
+      reduce_iter_values.push_back(reduce_axis_var);
+    }
+    for (int i = 0; i < axis.size(); ++i) {
+      optim::ReplaceVarWithExpr(&reduce_body, axis[i], reduce_block_vars[i]);
+    }
+    for (int i = axis.size(); i < reduce_block_vars.size(); ++i) {
+      optim::ReplaceVarWithExpr(
+          &reduce_body, reduce_axis[i - axis.size()], reduce_block_vars[i]);
+    }
+
+    reduce_body = ir::ScheduleBlockRealize::Make(
+        reduce_iter_values,
+        ir::ScheduleBlock::Make(
+            reduce_block_vars, {}, {}, tensor->name, reduce_body));
     for (int i = static_cast<int>(reduce_axis.size()) - 1; i >= 0; --i) {
       reduce_body = ir::For::Make(reduce_axis[i],
                                   reduce_axis[i]->lower_bound,
@@ -114,6 +179,24 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
     return body;
   } else {
     ir::Expr body = ir::Store::Make(tensor, tensor->body(), axis_exprs);
+    // create schedule block itervars, i0,i1...
+    std::vector<ir::Var> block_vars;
+    std::vector<ir::Expr> iter_values;
+    std::vector<Var> axis_vars = common::GenDefaultAxis(axis_len);
+    for (int i = 0; i < shape.size(); ++i) {
+      block_vars.push_back(Var(
+          Expr(0), shape[i], cinn::UniqName("i" + std::to_string(i)), false));
+      optim::ReplaceVarWithExpr(&body, axis[i], block_vars[i]);
+      axis_vars[i]->is_reduce_axis = false;
+      if (shape[i] == Expr(1)) {
+        iter_values.push_back(Expr(0));
+      } else {
+        iter_values.push_back(axis_vars[i]);
+      }
+    }
+    body = ir::ScheduleBlockRealize::Make(
+        iter_values,
+        ir::ScheduleBlock::Make(block_vars, {}, {}, tensor->name, body));
     for (int i = static_cast<int>(axis_len) - 1; i >= 0; --i) {
       ir::Var loop_var = axis[i];
       ir::Expr loop_extent = shape[i];
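
For intuition, a hedged sketch of what this branch now emits for a simple non-reduction compute (hypothetical 16x32 tensor B = A + 1, CINN's textual IR paraphrased): each store is wrapped in a ScheduleBlockRealize whose block itervars bind to the loop axes, or to 0 wherever an axis extent is 1.

    for (i, 0, 16) {
      for (j, 0, 32) {
        ScheduleBlock(B) {
          i0, i1 = axis.bind(i, j)
          B[i0, i1] = A[i0, i1] + 1.0f
        }
      }
    }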
(The remaining changed files are not shown here.)
