Combination of multiple paddle::memory::allocate operations into one for ops (#49126)

* First attempt at cudaLaunchCooperativeKernel

* Fix bugs

* Fully replace the lar CUDA kernel

* Fix bugs

* Fix code according to comments

* Fix code according to review comments

* Add some function overloads

* Relocate the power operation

* Add bf16 support for index_select-related ops

* Revert bf16 type change

* Add changes for more ops

* Fix code-writing bugs
JamesLim-sy authored Feb 1, 2023
1 parent af67309 commit bdae548
Showing 3 changed files with 39 additions and 60 deletions.
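All three files make the same change: several small paddle::memory::Alloc calls with identical lifetimes are folded into a single allocation, which is then carved into sub-arrays with pointer arithmetic. A minimal standalone sketch of the pattern, written against the plain CUDA runtime API rather than Paddle's allocator (the FusedScratch type and AllocFusedScratch helper are illustrative names, not Paddle APIs):

```cpp
#include <cuda_runtime.h>
#include <cstddef>

// Three equally sized int arrays that previously needed three device
// allocations now share a single buffer. `FusedScratch` is hypothetical.
struct FusedScratch {
  void* raw = nullptr;       // owning pointer for the fused buffer
  int* x_strides = nullptr;  // [max_dim]
  int* y_strides = nullptr;  // [max_dim]
  int* out_dims = nullptr;   // [max_dim]
};

inline cudaError_t AllocFusedScratch(size_t max_dim, FusedScratch* s) {
  const size_t bytes = max_dim * sizeof(int);
  // Before: three cudaMalloc calls of `bytes` each.
  // After: one cudaMalloc of 3 * bytes, partitioned by offsets.
  cudaError_t err = cudaMalloc(&s->raw, 3 * bytes);
  if (err != cudaSuccess) return err;
  int* base = static_cast<int*>(s->raw);
  s->x_strides = base;                // first max_dim ints
  s->y_strides = base + max_dim;      // next max_dim ints
  s->out_dims = base + 2 * max_dim;   // last max_dim ints
  return cudaSuccess;
}
```

Because every sub-array here has the same element type and lifetime, the split is a pure pointer offset with no alignment concerns.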
63 changes: 26 additions & 37 deletions paddle/phi/kernels/funcs/elementwise_grad_base.h
@@ -1530,37 +1530,31 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
   ComputeBroadcastKernelSize(
       y_dims_array, out_dims_array, &y_blocks, &y_threads, max_dim);

-  auto x_strides_array_tmp = paddle::memory::Alloc(
+  // One part buffer for x_strides_array, rest for y_strides_array and
+  // out_dims_array.
+  size_t tmp_total_bytes = bytes * 3;
+  auto tmp_buffer = paddle::memory::Alloc(
       ctx.GetPlace(),
-      bytes,
+      tmp_total_bytes,
       phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  int *x_strides_array_gpu =
-      reinterpret_cast<int *>(x_strides_array_tmp->ptr());
+  int *x_strides_array_gpu = reinterpret_cast<int *>(tmp_buffer->ptr());
+  int *y_strides_array_gpu =
+      reinterpret_cast<int *>(x_strides_array_gpu + max_dim);
+  int *out_dims_array_gpu =
+      reinterpret_cast<int *>(y_strides_array_gpu + max_dim);

   paddle::memory::Copy(gplace,
                        x_strides_array_gpu,
                        cplace,
                        x_strides_array.data(),
                        bytes,
                        ctx.stream());

-  auto y_strides_array_tmp = paddle::memory::Alloc(
-      ctx.GetPlace(),
-      bytes,
-      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  int *y_strides_array_gpu =
-      reinterpret_cast<int *>(y_strides_array_tmp->ptr());
   paddle::memory::Copy(gplace,
                        y_strides_array_gpu,
                        cplace,
                        y_strides_array.data(),
                        bytes,
                        ctx.stream());
-
-  auto out_dims_array_tmp = paddle::memory::Alloc(
-      ctx.GetPlace(),
-      bytes,
-      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  int *out_dims_array_gpu = reinterpret_cast<int *>(out_dims_array_tmp->ptr());
   paddle::memory::Copy(
       gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream());
@@ -1569,24 +1563,21 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
   int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads);
   int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads);
   if (dx) {
-    auto x_strides_order_tmp = paddle::memory::Alloc(
+    size_t dx_total_bytes = bytes * 2;
+    auto dx_tmp_buffer = paddle::memory::Alloc(
         ctx.GetPlace(),
-        bytes,
+        dx_total_bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *x_strides_order_gpu =
-        reinterpret_cast<int *>(x_strides_order_tmp->ptr());
+    int *x_strides_order_gpu = reinterpret_cast<int *>(dx_tmp_buffer->ptr());
+    int *x_dims_order_gpu =
+        reinterpret_cast<int *>(x_strides_order_gpu + max_dim);

     paddle::memory::Copy(gplace,
                          x_strides_order_gpu,
                          cplace,
                          x_strides_order.data(),
                          bytes,
                          ctx.stream());
-    auto x_dims_order_tmp = paddle::memory::Alloc(
-        ctx.GetPlace(),
-        bytes,
-        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *x_dims_order_gpu = reinterpret_cast<int *>(x_dims_order_tmp->ptr());
     paddle::memory::Copy(gplace,
                          x_dims_order_gpu,
                          cplace,
@@ -1610,24 +1601,22 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
         dx_op);
   }
   if (dy) {
-    auto y_strides_order_tmp = paddle::memory::Alloc(
+    // One part buffer for y_strides_order_gpu, the other for y_dims_order_gpu
+    size_t dy_total_bytes = bytes * 2;
+    auto dy_tmp_buffer = paddle::memory::Alloc(
         ctx.GetPlace(),
-        bytes,
+        dy_total_bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *y_strides_order_gpu =
-        reinterpret_cast<int *>(y_strides_order_tmp->ptr());
+    int *y_strides_order_gpu = reinterpret_cast<int *>(dy_tmp_buffer->ptr());
+    int *y_dims_order_gpu =
+        reinterpret_cast<int *>(y_strides_order_gpu + max_dim);

     paddle::memory::Copy(gplace,
                          y_strides_order_gpu,
                          cplace,
                          y_strides_order.data(),
                          bytes,
                          ctx.stream());
-    auto y_dims_order_tmp = paddle::memory::Alloc(
-        ctx.GetPlace(),
-        bytes,
-        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *y_dims_order_gpu = reinterpret_cast<int *>(y_dims_order_tmp->ptr());
     paddle::memory::Copy(gplace,
                          y_dims_order_gpu,
                          cplace,
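In the dx and dy branches above, only the allocation is fused; each logical array still gets its own host-to-device copy. A self-contained sketch of that behavior, again with raw CUDA calls rather than Paddle's allocator (UploadDxMeta is a hypothetical name, not Paddle code):

```cpp
#include <cuda_runtime.h>
#include <vector>

// Hypothetical stand-in for the dx branch: one 2 * bytes allocation backs
// two logically separate int arrays, each uploaded by its own async copy.
void UploadDxMeta(const std::vector<int>& x_strides_order,
                  const std::vector<int>& x_dims_order,
                  cudaStream_t stream) {
  const size_t max_dim = x_strides_order.size();
  const size_t bytes = max_dim * sizeof(int);

  int* dx_tmp_buffer = nullptr;
  cudaMalloc(&dx_tmp_buffer, 2 * bytes);            // single fused allocation
  int* x_strides_order_gpu = dx_tmp_buffer;         // first max_dim ints
  int* x_dims_order_gpu = dx_tmp_buffer + max_dim;  // second max_dim ints

  cudaMemcpyAsync(x_strides_order_gpu, x_strides_order.data(), bytes,
                  cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(x_dims_order_gpu, x_dims_order.data(), bytes,
                  cudaMemcpyHostToDevice, stream);

  // ... launch the broadcast kernels on `stream` here ...

  cudaStreamSynchronize(stream);  // ensure all stream work has finished
  cudaFree(dx_tmp_buffer);        // one free instead of two
}
```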
26 changes: 10 additions & 16 deletions paddle/phi/kernels/funcs/matrix_inverse.cu.cc
@@ -55,32 +55,27 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
     cpu_ptrs[i + batch_size] = a_inv->data<T>() + i * n * n;
   }

-  // Copy the addresses of A and A_inv from host to device.
+  // Copy the addresses of A and A_inv from host to device,
+  // and allocate device memory for info and pivots.
+  int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
+  size_t total_bytes = cpu_ptrs.size() * sizeof(T*) + num_ints * sizeof(int);
   paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
       paddle::memory::Alloc(
           dev_ctx.GetPlace(),
-          cpu_ptrs.size() * sizeof(T*),
+          total_bytes,
           phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   paddle::memory::Copy(dev_ctx.GetPlace(),
                        tmp_gpu_ptrs_data->ptr(),
                        phi::CPUPlace(),
                        static_cast<void*>(cpu_ptrs.data()),
                        cpu_ptrs.size() * sizeof(T*),
                        dev_ctx.stream());
-  T** gpu_inv_ptrs =
-      reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
-
-  // Allocate device memory for info and pivots.
-  int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
-  paddle::memory::allocation::AllocationPtr tmp_gpu_info_data =
-      paddle::memory::Alloc(
-          dev_ctx.GetPlace(),
-          num_ints * sizeof(int),
-          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-  int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
+  T** gpu_inv_pivot_info = reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr());
+  T** gpu_inv_ptrs = gpu_inv_pivot_info + batch_size;
+  int* gpu_info_ptr =
+      reinterpret_cast<int*>(gpu_inv_pivot_info + cpu_ptrs.size());

   auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);

   std::vector<int> info;  // only for singular checking
   info.resize(batch_size);
   // This functions in cuBLAS is intended to be used for matrices of small
@@ -100,8 +95,7 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
   // This function performs the LU factorization of each matrix A by the
   // equation P * A = L * U. L and U are written back to original matrix A,
   // and diagonal elements of L are discarded.
-  int* gpu_pivot_ptr =
-      reinterpret_cast<int*>(tmp_gpu_info_data->ptr()) + batch_size;
+  int* gpu_pivot_ptr = gpu_info_ptr + batch_size;
   blas.BatchedGETRF(n,
                     reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()),
                     gpu_pivot_ptr,
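Unlike the elementwise change, this hunk packs two differently typed segments into one buffer: cpu_ptrs.size() device pointers (the A and A_inv address tables) followed by num_ints ints (GETRF info plus pivots). Placing the wider T* segment first keeps the int segment naturally aligned. A hedged sketch of that layout with raw CUDA calls (AllocPtrAndInfo is an illustrative helper, not a Paddle API):

```cpp
#include <cuda_runtime.h>
#include <cstddef>

// One buffer: num_ptrs pointers of type T*, then num_ints ints. The int
// segment starts right after the pointer segment; since alignof(int) <=
// alignof(T*), no padding is needed between the two.
template <typename T>
cudaError_t AllocPtrAndInfo(size_t num_ptrs, size_t num_ints,
                            T*** ptrs_out, int** ints_out, void** raw_out) {
  const size_t total_bytes = num_ptrs * sizeof(T*) + num_ints * sizeof(int);
  cudaError_t err = cudaMalloc(raw_out, total_bytes);
  if (err != cudaSuccess) return err;
  T** ptr_segment = reinterpret_cast<T**>(*raw_out);
  *ptrs_out = ptr_segment;                                     // [num_ptrs]
  *ints_out = reinterpret_cast<int*>(ptr_segment + num_ptrs);  // [num_ints]
  return cudaSuccess;
}
```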
10 changes: 3 additions & 7 deletions paddle/phi/kernels/funcs/values_vectors_functor.h
@@ -354,12 +354,6 @@ struct MatrixEighFunctor<GPUContext, T> {
         has_vectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;

     ValueType *out_value = dev_ctx.template Alloc<ValueType>(eigen_values);
-    auto info = paddle::memory::Alloc(
-        dev_ctx.GetPlace(),
-        sizeof(int) * batch_size,
-        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-    auto *info_ptr = reinterpret_cast<int *>(info->ptr());
-
     DenseTensor input_trans = phi::TransposeLast2Dim<T>(dev_ctx, input);
     T *input_vector = input_trans.data<T>();
@@ -410,11 +404,13 @@ struct MatrixEighFunctor<GPUContext, T> {
             out_value,
             &workspace_size);
     }
+    size_t total_bytes = sizeof(T) * workspace_size + sizeof(int) * batch_size;
     auto work = paddle::memory::Alloc(
         dev_ctx.GetPlace(),
-        sizeof(T) * workspace_size,
+        total_bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     auto *work_ptr = reinterpret_cast<T *>(work->ptr());
+    auto *info_ptr = reinterpret_cast<int *>(work_ptr + workspace_size);

     for (auto i = 0; i < batch_size; ++i) {
       auto *input_data = input_vector + i * vector_stride;
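Here the fused buffer mixes the cuSOLVER workspace (workspace_size elements of T) with one int status flag per batch entry, and the standalone info allocation removed in the first hunk is folded into the workspace allocation. The int segment lands immediately after the T segment, which is safe for the float, double, and complex element types used here because sizeof(T) >= sizeof(int). A sketch under those assumptions (AllocWorkAndInfo is illustrative, not a Paddle or cuSOLVER API):

```cpp
#include <cuda_runtime.h>
#include <cstddef>

// One buffer: workspace_size elements of T for the solver workspace, then
// batch_size ints for the per-matrix info flags. Assumes alignof(T) >=
// alignof(int), which holds for float, double, and their complex variants.
template <typename T>
cudaError_t AllocWorkAndInfo(size_t workspace_size, size_t batch_size,
                             T** work_out, int** info_out, void** raw_out) {
  const size_t total_bytes =
      sizeof(T) * workspace_size + sizeof(int) * batch_size;
  cudaError_t err = cudaMalloc(raw_out, total_bytes);
  if (err != cudaSuccess) return err;
  T* work = static_cast<T*>(*raw_out);
  *work_out = work;                                           // [workspace_size]
  *info_out = reinterpret_cast<int*>(work + workspace_size);  // [batch_size]
  return cudaSuccess;
}
```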
