Unittest: support cross_entropy for communication strategies #1

Closed · wants to merge 1 commit
2 changes: 0 additions & 2 deletions paddle/phi/kernels/CMakeLists.txt
@@ -67,8 +67,6 @@ if(WITH_MUSA)
       "gpu/log_softmax_kernel.cu"
       "gpu/log_softmax_grad_kernel.cu"
       "gpu/weighted_sample_neighbors_kernel.cu"
-      "gpu/cross_entropy_kernel.cu"
-      "gpu/cross_entropy_grad_kernel.cu"
       "gpu/gelu_kernel.cu"
       "gpu/gelu_grad_kernel.cu"
       "gpu/rnn_kernel.cu.cc"
5 changes: 0 additions & 5 deletions paddle/phi/kernels/funcs/CMakeLists.txt
@@ -15,9 +15,4 @@ if(WITH_GPU OR WITH_ROCM OR WITH_MUSA)
        "*.cu")
 endif()
 
-if(WITH_MUSA)
-  list(REMOVE_ITEM func_cu_srcs
-       "softmax.cu")
-endif()
-
 collect_srcs(kernels_srcs SRCS ${func_cc_srcs} ${func_cu_srcs})
62 changes: 30 additions & 32 deletions paddle/phi/kernels/funcs/softmax.cu
@@ -32,6 +32,7 @@ void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()(
     const DeviceContext& context,
     const phi::DenseTensor* X,
     phi::DenseTensor* Y) {
+  printf("ShangShang %s %d SoftmaxCUDNNFunctor\n", __FILE__, __LINE__);
   // ------------------- cudnn descriptors ---------------------
   ScopedTensorDescriptor xDesc;
   ScopedTensorDescriptor yDesc;
@@ -61,20 +62,16 @@ void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()(
       MIOPEN_SOFTMAX_ACCURATE,
       MIOPEN_SOFTMAX_MODE_INSTANCE));
 #elif defined(PADDLE_WITH_MUSA)
-  mudnnTensorDescriptor_t cudnn_x_desc =
-      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  mudnnTensorDescriptor_t cudnn_y_desc =
-      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::mudnnSoftmaxForward(context.cudnn_handle(),
-                                        CudnnDataType<T>::kOne(),
-                                        cudnn_x_desc,
-                                        X->data<T>(),
-                                        CudnnDataType<T>::kZero(),
-                                        cudnn_y_desc,
-                                        context.template Alloc<T>(Y),
-                                        MIOPEN_SOFTMAX_ACCURATE,
-                                        MIOPEN_SOFTMAX_MODE_INSTANCE));
+  auto& cudnn_x_desc =
+      xDesc.descriptor<T>(X->data<T>(), layout, cudnn_tensor_dims);
+  auto& cudnn_y_desc =
+      xDesc.descriptor<T>(context.template Alloc<T>(Y),
+                          layout, cudnn_tensor_dims);
+  const int axis = cudnn_tensor_dims.size() - 1;
+  backends::gpu::ScopedSoftmaxDescriptor softmax_desc;
+  softmax_desc.descriptor(dynload::Softmax::Mode::SOFTMAX,
+                          dynload::Softmax::Algorithm::ACCURATE, axis)
+      .Run(*context.cudnn_handle(), cudnn_y_desc, cudnn_x_desc);
 #else
   cudnnTensorDescriptor_t cudnn_x_desc =
       xDesc.descriptor<T>(layout, cudnn_tensor_dims);
@@ -99,6 +96,7 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
     const phi::DenseTensor* Y,
     const phi::DenseTensor* YGrad,
     phi::DenseTensor* XGrad) {
+  printf("ShangShang %s %d SoftmaxGradCUDNNFunctor\n", __FILE__, __LINE__);
   // ------------------- cudnn descriptors ---------------------
   ScopedTensorDescriptor yDesc;
   ScopedTensorDescriptor dyDesc;
@@ -133,24 +131,24 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
       MIOPEN_SOFTMAX_ACCURATE,
       MIOPEN_SOFTMAX_MODE_INSTANCE));
 #elif defined(PADDLE_WITH_MUSA)
-  mudnnTensorDescriptor_t cudnn_y_desc =
-      yDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  mudnnTensorDescriptor_t cudnn_xgrad_desc =
-      dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  mudnnTensorDescriptor_t cudnn_ygrad_desc =
-      dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::mudnnSoftmaxBackward(context.cudnn_handle(),
-                                         CudnnDataType<T>::kOne(),
-                                         cudnn_y_desc,
-                                         Y->data<T>(),
-                                         cudnn_ygrad_desc,
-                                         YGrad->data<T>(),
-                                         CudnnDataType<T>::kZero(),
-                                         cudnn_xgrad_desc,
-                                         context.template Alloc<T>(XGrad),
-                                         MIOPEN_SOFTMAX_ACCURATE,
-                                         MIOPEN_SOFTMAX_MODE_INSTANCE));
+  //auto& cudnn_y_desc =
+  //    yDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  //auto& cudnn_xgrad_desc =
+  //    dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  //auto& cudnn_ygrad_desc =
+  //    dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  //PADDLE_ENFORCE_GPU_SUCCESS(
+  //    phi::dynload::mudnnSoftmaxBackward(context.cudnn_handle(),
+  //                                       CudnnDataType<T>::kOne(),
+  //                                       cudnn_y_desc,
+  //                                       Y->data<T>(),
+  //                                       cudnn_ygrad_desc,
+  //                                       YGrad->data<T>(),
+  //                                       CudnnDataType<T>::kZero(),
+  //                                       cudnn_xgrad_desc,
+  //                                       context.template Alloc<T>(XGrad),
+  //                                       MIOPEN_SOFTMAX_ACCURATE,
+  //                                       MIOPEN_SOFTMAX_MODE_INSTANCE));
 #else
   cudnnTensorDescriptor_t cudnn_y_desc =
       yDesc.descriptor<T>(layout, cudnn_tensor_dims);
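The rewritten MUSA branch above drives muDNN through Paddle's RAII-style ScopedSoftmaxDescriptor, which binds mode, algorithm, and axis and then runs on the bound tensor descriptors, instead of the raw mudnnSoftmaxForward C call. For intuition, here is a minimal CPU sketch of what the ACCURATE algorithm computes, a max-subtracted softmax over the innermost axis; SoftmaxAccurateRef is an illustrative name, not a Paddle or muDNN API.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Reference max-subtracted ("accurate") softmax over the last axis.
// rows = product of all outer dims, cols = size of the softmax axis.
void SoftmaxAccurateRef(const float* x, float* y, int rows, int cols) {
  for (int r = 0; r < rows; ++r) {
    const float* xr = x + r * cols;
    float* yr = y + r * cols;
    float mx = *std::max_element(xr, xr + cols);  // subtract max for stability
    float sum = 0.f;
    for (int c = 0; c < cols; ++c) {
      yr[c] = std::exp(xr[c] - mx);
      sum += yr[c];
    }
    for (int c = 0; c < cols; ++c) yr[c] /= sum;
  }
}

int main() {
  // The second row would overflow a naive exp() without max subtraction.
  std::vector<float> x = {1.f, 2.f, 3.f, 1000.f, 1001.f, 1002.f};
  std::vector<float> y(x.size());
  SoftmaxAccurateRef(x.data(), y.data(), /*rows=*/2, /*cols=*/3);
  for (float v : y) std::printf("%f\n", v);  // both rows print identical values
}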
44 changes: 18 additions & 26 deletions paddle/phi/kernels/gpu/cross_entropy_kernel.cu
@@ -756,7 +756,9 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx,
 #ifdef PADDLE_WITH_HIP
   miopenTensorDescriptor_t descp = desc.descriptor<T>(layout, tensor_dims);
 #elif defined(PADDLE_WITH_MUSA)
-  mudnnTensorDescriptor_t descp = desc.descriptor<T>(layout, tensor_dims);
+  auto& idesc = desc.descriptor<T>(logits_data, layout, tensor_dims);
+  ScopedTensorDescriptor out_desc;
+  auto& odesc = out_desc.descriptor<T>(softmax_data, layout, tensor_dims);
 #else
   cudnnTensorDescriptor_t descp = desc.descriptor<T>(layout, tensor_dims);
 #endif
@@ -777,18 +779,11 @@
       MIOPEN_SOFTMAX_LOG,
       mode));
 #elif defined(PADDLE_WITH_MUSA)
-  auto mode = axis == rank - 1 ? MUDNN_SOFTMAX_MODE_INSTANCE
-                               : MUDNN_SOFTMAX_MODE_CHANNEL;
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnSoftmaxForward(
-      handle,
-      MUDNN_SOFTMAX_LOG,
-      mode,
-      phi::backends::gpu::CudnnDataType<T>::kOne(),
-      descp,
-      logits_data,
-      phi::backends::gpu::CudnnDataType<T>::kZero(),
-      descp,
-      softmax_data));
+  backends::gpu::ScopedSoftmaxDescriptor softmax_desc;
+  softmax_desc.descriptor(dynload::Softmax::Mode::LOGSOFTMAX,
+                          dynload::Softmax::Algorithm::DIRECT,
+                          axis)
+      .Run(*handle, odesc, idesc);
 #else
   auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
                                : CUDNN_SOFTMAX_MODE_CHANNEL;
@@ -1107,6 +1102,7 @@ void SwitchWarpSoftmaxForward(T* loss,
   int blocks = (batch_size + batches_per_block - 1) / batches_per_block;
   dim3 threads(kWarpSize, warps_per_block, 1);
 
+  printf("ShangShang %s %d SwitchWarpSoftmaxForward %d\n", __FILE__, __LINE__, log2_elements);
   switch (log2_elements) {
     SOFTMAX_WARP_FORWARD_CASE(0, LabelT, T, AccT);
     SOFTMAX_WARP_FORWARD_CASE(1, LabelT, T, AccT);
@@ -1203,7 +1199,7 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx,
 #ifdef PADDLE_WITH_HIP
   miopenTensorDescriptor_t descp = desc.descriptor<T>(layout, tensor_dims);
 #elif defined(PADDLE_WITH_MUSA)
-  mudnnTensorDescriptor_t descp = desc.descriptor<T>(layout, tensor_dims);
+  auto& idesc = desc.descriptor<T>(logits_data, layout, tensor_dims);
 #else
   cudnnTensorDescriptor_t descp = desc.descriptor<T>(layout, tensor_dims);
 #endif
@@ -1224,18 +1220,14 @@
       MIOPEN_SOFTMAX_LOG,
       mode));
 #elif defined(PADDLE_WITH_MUSA)
-  auto mode = axis == rank - 1 ? MUDNN_SOFTMAX_MODE_INSTANCE
-                               : MUDNN_SOFTMAX_MODE_CHANNEL;
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnSoftmaxForward(
-      handle,
-      MUDNN_SOFTMAX_LOG,
-      mode,
-      phi::backends::gpu::CudnnDataType<T>::kOne(),
-      descp,
-      logits_data,
-      phi::backends::gpu::CudnnDataType<T>::kZero(),
-      descp,
-      softmax_data));
+  ScopedTensorDescriptor odesc;
+  auto& odescp = odesc.descriptor<T>(softmax_data, layout, tensor_dims);
+  // auto mode = axis == rank - 1 ? dynload::Softmax::Mode::LOGSOFTMAX
+  //                              : dynload::Softmax::Mode::SOFTMAX;
+  backends::gpu::ScopedSoftmaxDescriptor softmax_desc;
+  softmax_desc.descriptor(dynload::Softmax::Mode::LOGSOFTMAX,
+                          dynload::Softmax::Algorithm::DIRECT,
+                          axis).Run(*handle, odescp, idesc);
 #else
   auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
                                : CUDNN_SOFTMAX_MODE_CHANNEL;
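Both the soft-label and hard-label paths above select Mode::LOGSOFTMAX with Algorithm::DIRECT, so the MUSA kernel produces log-probabilities that the subsequent cross-entropy reduction consumes. A minimal CPU sketch of those semantics, with illustrative helper names that are not Paddle API:

#include <algorithm>
#include <cmath>
#include <vector>

// Reference log-softmax over one row:
// log_softmax(x_i) = (x_i - max) - log(sum_j exp(x_j - max))
std::vector<float> LogSoftmaxRef(const std::vector<float>& x) {
  float mx = *std::max_element(x.begin(), x.end());
  float sum = 0.f;
  for (float v : x) sum += std::exp(v - mx);
  float lse = mx + std::log(sum);  // stable log-sum-exp
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) out[i] = x[i] - lse;
  return out;
}

// Hard-label cross entropy is then just a negative lookup:
//   loss = -log_softmax(logits)[label]
float CrossEntropyHardLabelRef(const std::vector<float>& logits, int label) {
  return -LogSoftmaxRef(logits)[label];
}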
14 changes: 13 additions & 1 deletion paddle/phi/kernels/gpudnn/softmax_gpudnn.h
@@ -1031,6 +1031,7 @@ void SoftmaxForwardCudnnKernel(const GPUContext& dev_ctx,
     const bool log_mode,
     const std::vector<int>& tensor_dims,
     T* out_data) {
+  printf("ShangShang SoftmaxForwardCudnnKernel%s %d \n", __FILE__, __LINE__);
   auto handle = dev_ctx.cudnn_handle();
   GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
 
@@ -1113,6 +1114,7 @@ void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx,
     const bool log_mode,
     const std::vector<int>& tensor_dims,
     T* dx_data) {
+  printf("ShangShang %s %d SoftmaxBackwardCudnnKernel \n", __FILE__, __LINE__);
   auto handle = dev_ctx.cudnn_handle();
   GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
 
@@ -1136,7 +1138,17 @@
       algo,
       mode));
 #elif defined(PADDLE_WITH_MUSA)
-  //
+  auto& desc = scoped_desc.descriptor<T>(out_data, layout, tensor_dims);
+  ScopedTensorDescriptor scoped_dxdesc;
+  auto& dxdesc = scoped_dxdesc.descriptor<T>(dx_data, layout, tensor_dims);
+  ScopedTensorDescriptor scoped_dodesc;
+  auto& dodesc = scoped_dodesc.descriptor<T>(dout_data, layout, tensor_dims);
+  backends::gpu::ScopedSoftmaxDescriptor softmax_desc;
+  auto mode = log_mode ? dynload::Softmax::Mode::LOGSOFTMAX
+                       : dynload::Softmax::Mode::SOFTMAX;
+  auto algo = log_mode ? dynload::Softmax::Algorithm::DIRECT
+                       : dynload::Softmax::Algorithm::ACCURATE;
+  softmax_desc.descriptor(mode, algo, axis).RunBwd(*handle, dxdesc, desc, dodesc);
 #else
   cudnnTensorDescriptor_t desc = scoped_desc.descriptor<T>(layout, tensor_dims);
   auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
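The RunBwd call above dispatches the gradient of either softmax or log-softmax depending on log_mode. For reference, and under the assumption that muDNN follows the standard formulas, here is a one-row CPU sketch; the helper name is illustrative, not a Paddle or muDNN API:

#include <cmath>
#include <vector>

// Softmax backward:     dx_i = y_i * (dy_i - sum_j dy_j * y_j)
// Log-softmax backward: dx_i = dy_i - exp(y_i) * sum_j dy_j
//   (y is the forward output: probabilities, or log-probabilities in log mode)
std::vector<float> SoftmaxBackwardRef(const std::vector<float>& y,
                                      const std::vector<float>& dy,
                                      bool log_mode) {
  std::vector<float> dx(y.size());
  if (log_mode) {
    float dy_sum = 0.f;
    for (float v : dy) dy_sum += v;
    for (size_t i = 0; i < y.size(); ++i)
      dx[i] = dy[i] - std::exp(y[i]) * dy_sum;
  } else {
    float dot = 0.f;
    for (size_t i = 0; i < y.size(); ++i) dot += dy[i] * y[i];
    for (size_t i = 0; i < y.size(); ++i) dx[i] = y[i] * (dy[i] - dot);
  }
  return dx;
}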
@@ -13,7 +13,8 @@
 # limitations under the License.
 
 import unittest
-
+import sys
+sys.path.append("/workspace/Paddle/test")
 from legacy_test.test_parallel_dygraph_dataparallel import TestMultipleGpus
 
 
@@ -14,7 +14,8 @@
 
 import os
 import unittest
-
+import sys
+sys.path.append("/workspace/Paddle/test")
 from legacy_test.test_parallel_dygraph_dataparallel import TestMultipleGpus
 
 