diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h
index b9ffb4e3f1237..f577f1781ff09 100644
--- a/paddle/phi/kernels/funcs/elementwise_grad_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h
@@ -1530,37 +1530,31 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
   ComputeBroadcastKernelSize(
       y_dims_array, out_dims_array, &y_blocks, &y_threads, max_dim);
 
-  auto x_strides_array_tmp = paddle::memory::Alloc(
+  // One part buffer for x_strides_array, rest for y_strides_array and
+  // out_dims_array.
+  size_t tmp_total_bytes = bytes * 3;
+  auto tmp_buffer = paddle::memory::Alloc(
       ctx.GetPlace(),
-      bytes,
+      tmp_total_bytes,
       phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  int *x_strides_array_gpu =
-      reinterpret_cast<int *>(x_strides_array_tmp->ptr());
+  int *x_strides_array_gpu = reinterpret_cast<int *>(tmp_buffer->ptr());
+  int *y_strides_array_gpu =
+      reinterpret_cast<int *>(x_strides_array_gpu + max_dim);
+  int *out_dims_array_gpu =
+      reinterpret_cast<int *>(y_strides_array_gpu + max_dim);
+
   paddle::memory::Copy(gplace,
                        x_strides_array_gpu,
                        cplace,
                        x_strides_array.data(),
                        bytes,
                        ctx.stream());
-
-  auto y_strides_array_tmp = paddle::memory::Alloc(
-      ctx.GetPlace(),
-      bytes,
-      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  int *y_strides_array_gpu =
-      reinterpret_cast<int *>(y_strides_array_tmp->ptr());
   paddle::memory::Copy(gplace,
                        y_strides_array_gpu,
                        cplace,
                        y_strides_array.data(),
                        bytes,
                        ctx.stream());
-
-  auto out_dims_array_tmp = paddle::memory::Alloc(
-      ctx.GetPlace(),
-      bytes,
-      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  int *out_dims_array_gpu = reinterpret_cast<int *>(out_dims_array_tmp->ptr());
   paddle::memory::Copy(
       gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream());
 
@@ -1569,24 +1563,21 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
   int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads);
   int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads);
   if (dx) {
-    auto x_strides_order_tmp = paddle::memory::Alloc(
+    size_t dx_total_bytes = bytes * 2;
+    auto dx_tmp_buffer = paddle::memory::Alloc(
         ctx.GetPlace(),
-        bytes,
+        dx_total_bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *x_strides_order_gpu =
-        reinterpret_cast<int *>(x_strides_order_tmp->ptr());
+    int *x_strides_order_gpu = reinterpret_cast<int *>(dx_tmp_buffer->ptr());
+    int *x_dims_order_gpu =
+        reinterpret_cast<int *>(x_strides_order_gpu + max_dim);
+
     paddle::memory::Copy(gplace,
                          x_strides_order_gpu,
                          cplace,
                          x_strides_order.data(),
                          bytes,
                          ctx.stream());
-
-    auto x_dims_order_tmp = paddle::memory::Alloc(
-        ctx.GetPlace(),
-        bytes,
-        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *x_dims_order_gpu = reinterpret_cast<int *>(x_dims_order_tmp->ptr());
     paddle::memory::Copy(gplace,
                          x_dims_order_gpu,
                          cplace,
@@ -1610,24 +1601,22 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
         dx_op);
   }
   if (dy) {
-    auto y_strides_order_tmp = paddle::memory::Alloc(
+    // One part buffer for y_strides_order_gpu, the other for y_dims_order_gpu
+    size_t dy_total_bytes = bytes * 2;
+    auto dy_tmp_buffer = paddle::memory::Alloc(
         ctx.GetPlace(),
-        bytes,
+        dy_total_bytes,
        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *y_strides_order_gpu =
-        reinterpret_cast<int *>(y_strides_order_tmp->ptr());
+    int *y_strides_order_gpu = reinterpret_cast<int *>(dy_tmp_buffer->ptr());
+    int *y_dims_order_gpu =
+        reinterpret_cast<int *>(y_strides_order_gpu + max_dim);
+
     paddle::memory::Copy(gplace,
                          y_strides_order_gpu,
                          cplace,
                          y_strides_order.data(),
                          bytes,
                          ctx.stream());
-
-    auto y_dims_order_tmp = paddle::memory::Alloc(
-        ctx.GetPlace(),
-        bytes,
-        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *y_dims_order_gpu = reinterpret_cast<int *>(y_dims_order_tmp->ptr());
     paddle::memory::Copy(gplace,
                          y_dims_order_gpu,
                          cplace,
diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc
index c43c3c04755f3..3961f82c8fd0f 100644
--- a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc
+++ b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc
@@ -55,11 +55,14 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
     cpu_ptrs[i + batch_size] = a_inv->data<T>() + i * n * n;
   }
 
-  // Copy the addresses of A and A_inv from host to device.
+  // Copy the addresses of A and A_inv from host to device,
+  // and allocate device memory for info and pivots.
+  int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
+  size_t total_bytes = cpu_ptrs.size() * sizeof(T*) + num_ints * sizeof(int);
   paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
       paddle::memory::Alloc(
           dev_ctx.GetPlace(),
-          cpu_ptrs.size() * sizeof(T*),
+          total_bytes,
           phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   paddle::memory::Copy(dev_ctx.GetPlace(),
                        tmp_gpu_ptrs_data->ptr(),
@@ -67,20 +70,12 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
                        static_cast<void*>(cpu_ptrs.data()),
                        cpu_ptrs.size() * sizeof(T*),
                        dev_ctx.stream());
-  T** gpu_inv_ptrs =
-      reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
-
-  // Allocate device memory for info and pivots.
-  int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
-  paddle::memory::allocation::AllocationPtr tmp_gpu_info_data =
-      paddle::memory::Alloc(
-          dev_ctx.GetPlace(),
-          num_ints * sizeof(int),
-          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-  int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
+  T** gpu_inv_pivot_info = reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr());
+  T** gpu_inv_ptrs = gpu_inv_pivot_info + batch_size;
+  int* gpu_info_ptr =
+      reinterpret_cast<int*>(gpu_inv_pivot_info + cpu_ptrs.size());
   auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-
   std::vector<int> info;  // only for singular checking
   info.resize(batch_size);
 
   // This functions in cuBLAS is intended to be used for matrices of small
@@ -100,8 +95,7 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
     // This function performs the LU factorization of each matrix A by the
    // equation P * A = L * U. L and U are written back to original matrix A,
     // and diagonal elements of L are discarded.
-    int* gpu_pivot_ptr =
-        reinterpret_cast<int*>(tmp_gpu_info_data->ptr()) + batch_size;
+    int* gpu_pivot_ptr = gpu_info_ptr + batch_size;
     blas.BatchedGETRF(n,
                       reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()),
                       gpu_pivot_ptr,
diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h
index 63202ca4a484d..d4314307873f4 100644
--- a/paddle/phi/kernels/funcs/values_vectors_functor.h
+++ b/paddle/phi/kernels/funcs/values_vectors_functor.h
@@ -354,12 +354,6 @@ struct MatrixEighFunctor<GPUContext, T> {
         has_vectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
 
     ValueType *out_value = dev_ctx.template Alloc<ValueType>(eigen_values);
 
-    auto info = paddle::memory::Alloc(
-        dev_ctx.GetPlace(),
-        sizeof(int) * batch_size,
-        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-    auto *info_ptr = reinterpret_cast<int *>(info->ptr());
-
     DenseTensor input_trans = phi::TransposeLast2Dim<T>(dev_ctx, input);
     T *input_vector = input_trans.data<T>();
@@ -410,11 +404,13 @@ struct MatrixEighFunctor<GPUContext, T> {
           out_value,
           &workspace_size);
     }
+    size_t total_bytes = sizeof(T) * workspace_size + sizeof(int) * batch_size;
     auto work = paddle::memory::Alloc(
         dev_ctx.GetPlace(),
-        sizeof(T) * workspace_size,
+        total_bytes,
        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     auto *work_ptr = reinterpret_cast<T *>(work->ptr());
+    auto *info_ptr = reinterpret_cast<int *>(work_ptr + workspace_size);
 
     for (auto i = 0; i < batch_size; ++i) {
       auto *input_data = input_vector + i * vector_stride;
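
All three files apply the same pattern: several same-stream paddle::memory::Alloc calls are merged into a single allocation, and the individual arrays are carved out of that buffer with pointer arithmetic, with the most strictly aligned block placed first (T** before int, the T workspace before int) so the carved-out pointers stay aligned. Below is a minimal standalone C++ sketch of that sub-allocation layout; it uses plain host memory and hypothetical sizes rather than the Paddle allocator, purely to illustrate the carving logic.

#include <cstddef>
#include <memory>

// Illustrative arena: one allocation holding `n_ptrs` pointers followed by
// `n_ints` ints, mirroring the tmp_gpu_ptrs_data layout in
// matrix_inverse.cu.cc. The pointer block comes first so the int block that
// follows stays suitably aligned (alignof(void*) >= alignof(int) here).
struct PtrIntArena {
  std::unique_ptr<unsigned char[]> storage;
  void** ptrs = nullptr;  // first sub-buffer
  int* ints = nullptr;    // second sub-buffer

  PtrIntArena(std::size_t n_ptrs, std::size_t n_ints) {
    const std::size_t total_bytes =
        n_ptrs * sizeof(void*) + n_ints * sizeof(int);
    storage = std::make_unique<unsigned char[]>(total_bytes);  // single alloc
    ptrs = reinterpret_cast<void**>(storage.get());
    ints = reinterpret_cast<int*>(ptrs + n_ptrs);  // carve the remainder
  }
};

int main() {
  // Hypothetical sizes standing in for 2 * batch_size pointers and
  // batch_size * (n + 1) ints.
  PtrIntArena arena(/*n_ptrs=*/8, /*n_ints=*/20);
  arena.ptrs[0] = nullptr;
  arena.ints[0] = 0;
  return 0;
}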