Combination of multiple paddle::memory::allocate operations into one for ops (#49126)

* First attempt at cudaLaunchCooperativeKernel

* Fix bugs

* Fully replace the lar CUDA kernel

* Fix bugs

* Fix code according to comments

* Fix code according to review comments

* Add some function overloads

* Relocate the power operation

* Add bf16 support for index_select-related ops

* Revert bf16 type change

* Add changes for more ops

* Fix code-writing bugs
JamesLim-sy authored Feb 1, 2023
1 parent af67309 commit bdae548
Showing 3 changed files with 39 additions and 60 deletions.
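All three files make the same change: several small paddle::memory::Alloc calls with identical lifetimes are folded into a single allocation, which is then carved into sub-arrays with pointer arithmetic. A minimal standalone sketch of the pattern, written against the plain CUDA runtime API rather than Paddle's allocator (the FusedScratch type and AllocFusedScratch helper are illustrative names, not Paddle APIs):

```cpp
#include <cuda_runtime.h>
#include <cstddef>

// Three equally sized int arrays that previously needed three device
// allocations now share a single buffer. `FusedScratch` is hypothetical.
struct FusedScratch {
  void* raw = nullptr;       // owning pointer for the fused buffer
  int* x_strides = nullptr;  // [max_dim]
  int* y_strides = nullptr;  // [max_dim]
  int* out_dims = nullptr;   // [max_dim]
};

inline cudaError_t AllocFusedScratch(size_t max_dim, FusedScratch* s) {
  const size_t bytes = max_dim * sizeof(int);
  // Before: three cudaMalloc calls of `bytes` each.
  // After: one cudaMalloc of 3 * bytes, partitioned by offsets.
  cudaError_t err = cudaMalloc(&s->raw, 3 * bytes);
  if (err != cudaSuccess) return err;
  int* base = static_cast<int*>(s->raw);
  s->x_strides = base;                // first max_dim ints
  s->y_strides = base + max_dim;      // next max_dim ints
  s->out_dims = base + 2 * max_dim;   // last max_dim ints
  return cudaSuccess;
}
```

Because every sub-array here has the same element type and lifetime, the split is a pure pointer offset with no alignment concerns.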
63 changes: 26 additions & 37 deletions paddle/phi/kernels/funcs/elementwise_grad_base.h
@@ -1530,37 +1530,31 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
   ComputeBroadcastKernelSize(
       y_dims_array, out_dims_array, &y_blocks, &y_threads, max_dim);

-  auto x_strides_array_tmp = paddle::memory::Alloc(
+  // One part buffer for x_strides_array, rest for y_strides_array and
+  // out_dims_array.
+  size_t tmp_total_bytes = bytes * 3;
+  auto tmp_buffer = paddle::memory::Alloc(
       ctx.GetPlace(),
-      bytes,
+      tmp_total_bytes,
       phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  int *x_strides_array_gpu =
-      reinterpret_cast<int *>(x_strides_array_tmp->ptr());
+  int *x_strides_array_gpu = reinterpret_cast<int *>(tmp_buffer->ptr());
+  int *y_strides_array_gpu =
+      reinterpret_cast<int *>(x_strides_array_gpu + max_dim);
+  int *out_dims_array_gpu =
+      reinterpret_cast<int *>(y_strides_array_gpu + max_dim);

   paddle::memory::Copy(gplace,
                        x_strides_array_gpu,
                        cplace,
                        x_strides_array.data(),
                        bytes,
                        ctx.stream());

-  auto y_strides_array_tmp = paddle::memory::Alloc(
-      ctx.GetPlace(),
-      bytes,
-      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  int *y_strides_array_gpu =
-      reinterpret_cast<int *>(y_strides_array_tmp->ptr());
   paddle::memory::Copy(gplace,
                        y_strides_array_gpu,
                        cplace,
                        y_strides_array.data(),
                        bytes,
                        ctx.stream());
-
-  auto out_dims_array_tmp = paddle::memory::Alloc(
-      ctx.GetPlace(),
-      bytes,
-      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  int *out_dims_array_gpu = reinterpret_cast<int *>(out_dims_array_tmp->ptr());
   paddle::memory::Copy(
       gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream());
@@ -1569,24 +1563,21 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
   int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads);
   int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads);
   if (dx) {
-    auto x_strides_order_tmp = paddle::memory::Alloc(
+    size_t dx_total_bytes = bytes * 2;
+    auto dx_tmp_buffer = paddle::memory::Alloc(
         ctx.GetPlace(),
-        bytes,
+        dx_total_bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *x_strides_order_gpu =
-        reinterpret_cast<int *>(x_strides_order_tmp->ptr());
+    int *x_strides_order_gpu = reinterpret_cast<int *>(dx_tmp_buffer->ptr());
+    int *x_dims_order_gpu =
+        reinterpret_cast<int *>(x_strides_order_gpu + max_dim);

     paddle::memory::Copy(gplace,
                          x_strides_order_gpu,
                          cplace,
                          x_strides_order.data(),
                          bytes,
                          ctx.stream());
-    auto x_dims_order_tmp = paddle::memory::Alloc(
-        ctx.GetPlace(),
-        bytes,
-        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *x_dims_order_gpu = reinterpret_cast<int *>(x_dims_order_tmp->ptr());
     paddle::memory::Copy(gplace,
                          x_dims_order_gpu,
                          cplace,
@@ -1610,24 +1601,22 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
         dx_op);
   }
   if (dy) {
-    auto y_strides_order_tmp = paddle::memory::Alloc(
+    // One part buffer for y_strides_order_gpu, the other for y_dims_order_gpu
+    size_t dy_total_bytes = bytes * 2;
+    auto dy_tmp_buffer = paddle::memory::Alloc(
         ctx.GetPlace(),
-        bytes,
+        dy_total_bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *y_strides_order_gpu =
-        reinterpret_cast<int *>(y_strides_order_tmp->ptr());
+    int *y_strides_order_gpu = reinterpret_cast<int *>(dy_tmp_buffer->ptr());
+    int *y_dims_order_gpu =
+        reinterpret_cast<int *>(y_strides_order_gpu + max_dim);

     paddle::memory::Copy(gplace,
                          y_strides_order_gpu,
                          cplace,
                          y_strides_order.data(),
                          bytes,
                          ctx.stream());
-    auto y_dims_order_tmp = paddle::memory::Alloc(
-        ctx.GetPlace(),
-        bytes,
-        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *y_dims_order_gpu = reinterpret_cast<int *>(y_dims_order_tmp->ptr());
     paddle::memory::Copy(gplace,
                          y_dims_order_gpu,
                          cplace,
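In the dx and dy branches above, only the allocation is fused; each logical array still gets its own host-to-device copy. A self-contained sketch of that behavior, again with raw CUDA calls rather than Paddle's allocator (UploadDxMeta is a hypothetical name, not Paddle code):

```cpp
#include <cuda_runtime.h>
#include <vector>

// Hypothetical stand-in for the dx branch: one 2 * bytes allocation backs
// two logically separate int arrays, each uploaded by its own async copy.
void UploadDxMeta(const std::vector<int>& x_strides_order,
                  const std::vector<int>& x_dims_order,
                  cudaStream_t stream) {
  const size_t max_dim = x_strides_order.size();
  const size_t bytes = max_dim * sizeof(int);

  int* dx_tmp_buffer = nullptr;
  cudaMalloc(&dx_tmp_buffer, 2 * bytes);            // single fused allocation
  int* x_strides_order_gpu = dx_tmp_buffer;         // first max_dim ints
  int* x_dims_order_gpu = dx_tmp_buffer + max_dim;  // second max_dim ints

  cudaMemcpyAsync(x_strides_order_gpu, x_strides_order.data(), bytes,
                  cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(x_dims_order_gpu, x_dims_order.data(), bytes,
                  cudaMemcpyHostToDevice, stream);

  // ... launch the broadcast kernels on `stream` here ...

  cudaStreamSynchronize(stream);  // ensure all stream work has finished
  cudaFree(dx_tmp_buffer);        // one free instead of two
}
```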
26 changes: 10 additions & 16 deletions paddle/phi/kernels/funcs/matrix_inverse.cu.cc
@@ -55,32 +55,27 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
     cpu_ptrs[i + batch_size] = a_inv->data<T>() + i * n * n;
   }

-  // Copy the addresses of A and A_inv from host to device.
+  // Copy the addresses of A and A_inv from host to device,
+  // and allocate device memory for info and pivots.
+  int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
+  size_t total_bytes = cpu_ptrs.size() * sizeof(T*) + num_ints * sizeof(int);
   paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
       paddle::memory::Alloc(
           dev_ctx.GetPlace(),
-          cpu_ptrs.size() * sizeof(T*),
+          total_bytes,
           phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   paddle::memory::Copy(dev_ctx.GetPlace(),
                        tmp_gpu_ptrs_data->ptr(),
                        phi::CPUPlace(),
                        static_cast<void*>(cpu_ptrs.data()),
                        cpu_ptrs.size() * sizeof(T*),
                        dev_ctx.stream());
-  T** gpu_inv_ptrs =
-      reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
-
-  // Allocate device memory for info and pivots.
-  int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
-  paddle::memory::allocation::AllocationPtr tmp_gpu_info_data =
-      paddle::memory::Alloc(
-          dev_ctx.GetPlace(),
-          num_ints * sizeof(int),
-          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-  int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
+  T** gpu_inv_pivot_info = reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr());
+  T** gpu_inv_ptrs = gpu_inv_pivot_info + batch_size;
+  int* gpu_info_ptr =
+      reinterpret_cast<int*>(gpu_inv_pivot_info + cpu_ptrs.size());

   auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);

   std::vector<int> info;  // only for singular checking
   info.resize(batch_size);
   // This functions in cuBLAS is intended to be used for matrices of small
@@ -100,8 +95,7 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
   // This function performs the LU factorization of each matrix A by the
   // equation P * A = L * U. L and U are written back to original matrix A,
   // and diagonal elements of L are discarded.
-  int* gpu_pivot_ptr =
-      reinterpret_cast<int*>(tmp_gpu_info_data->ptr()) + batch_size;
+  int* gpu_pivot_ptr = gpu_info_ptr + batch_size;
   blas.BatchedGETRF(n,
                     reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()),
                     gpu_pivot_ptr,
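Unlike the elementwise change, this hunk packs two differently typed segments into one buffer: cpu_ptrs.size() device pointers (the A and A_inv address tables) followed by num_ints ints (GETRF info plus pivots). Placing the wider T* segment first keeps the int segment naturally aligned. A hedged sketch of that layout with raw CUDA calls (AllocPtrAndInfo is an illustrative helper, not a Paddle API):

```cpp
#include <cuda_runtime.h>
#include <cstddef>

// One buffer: num_ptrs pointers of type T*, then num_ints ints. The int
// segment starts right after the pointer segment; since alignof(int) <=
// alignof(T*), no padding is needed between the two.
template <typename T>
cudaError_t AllocPtrAndInfo(size_t num_ptrs, size_t num_ints,
                            T*** ptrs_out, int** ints_out, void** raw_out) {
  const size_t total_bytes = num_ptrs * sizeof(T*) + num_ints * sizeof(int);
  cudaError_t err = cudaMalloc(raw_out, total_bytes);
  if (err != cudaSuccess) return err;
  T** ptr_segment = reinterpret_cast<T**>(*raw_out);
  *ptrs_out = ptr_segment;                                     // [num_ptrs]
  *ints_out = reinterpret_cast<int*>(ptr_segment + num_ptrs);  // [num_ints]
  return cudaSuccess;
}
```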
10 changes: 3 additions & 7 deletions paddle/phi/kernels/funcs/values_vectors_functor.h
@@ -354,12 +354,6 @@ struct MatrixEighFunctor<GPUContext, T> {
         has_vectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;

     ValueType *out_value = dev_ctx.template Alloc<ValueType>(eigen_values);
-    auto info = paddle::memory::Alloc(
-        dev_ctx.GetPlace(),
-        sizeof(int) * batch_size,
-        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-    auto *info_ptr = reinterpret_cast<int *>(info->ptr());
-
     DenseTensor input_trans = phi::TransposeLast2Dim<T>(dev_ctx, input);
     T *input_vector = input_trans.data<T>();
@@ -410,11 +404,13 @@ struct MatrixEighFunctor<GPUContext, T> {
             out_value,
             &workspace_size);
     }
+    size_t total_bytes = sizeof(T) * workspace_size + sizeof(int) * batch_size;
     auto work = paddle::memory::Alloc(
         dev_ctx.GetPlace(),
-        sizeof(T) * workspace_size,
+        total_bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     auto *work_ptr = reinterpret_cast<T *>(work->ptr());
+    auto *info_ptr = reinterpret_cast<int *>(work_ptr + workspace_size);

     for (auto i = 0; i < batch_size; ++i) {
       auto *input_data = input_vector + i * vector_stride;
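Here the fused buffer mixes the cuSOLVER workspace (workspace_size elements of T) with one int status flag per batch entry, and the standalone info allocation removed in the first hunk is folded into the workspace allocation. The int segment lands immediately after the T segment, which is safe for the float, double, and complex element types used here because sizeof(T) >= sizeof(int). A sketch under those assumptions (AllocWorkAndInfo is illustrative, not a Paddle or cuSOLVER API):

```cpp
#include <cuda_runtime.h>
#include <cstddef>

// One buffer: workspace_size elements of T for the solver workspace, then
// batch_size ints for the per-matrix info flags. Assumes alignof(T) >=
// alignof(int), which holds for float, double, and their complex variants.
template <typename T>
cudaError_t AllocWorkAndInfo(size_t workspace_size, size_t batch_size,
                             T** work_out, int** info_out, void** raw_out) {
  const size_t total_bytes =
      sizeof(T) * workspace_size + sizeof(int) * batch_size;
  cudaError_t err = cudaMalloc(raw_out, total_bytes);
  if (err != cudaSuccess) return err;
  T* work = static_cast<T*>(*raw_out);
  *work_out = work;                                           // [workspace_size]
  *info_out = reinterpret_cast<int*>(work + workspace_size);  // [batch_size]
  return cudaSuccess;
}
```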
