Skip to content

Commit

Permalink
Refactoring PADDLE_ENFORCE_CUDA_SUCCESS, and apply to curand/cudnn/cu…
Browse files Browse the repository at this point in the history
…blas/NCCL,test=develop
  • Loading branch information
zhwesky2010 committed Apr 16, 2020
1 parent cd0e7ba commit 69c4796
Show file tree
Hide file tree
Showing 22 changed files with 226 additions and 343 deletions.
4 changes: 1 addition & 3 deletions paddle/fluid/framework/details/nan_inf_utils_detail.cu
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(

PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
cudaMemcpyHostToDevice, dev_ctx->stream()),
platform::errors::External(
"Async cudaMemcpy op_var info to gpu failed."));
cudaMemcpyHostToDevice, dev_ctx->stream()));
} else { // get
auto iter = op_var2gpu_str.find(op_var);
PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true,
Expand Down
27 changes: 9 additions & 18 deletions paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,9 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(output_ptrs, h_odatas,
d_output_ptrs_.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream),
platform::errors::External(
"CUDA Memcpy failed during split plugin run."));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream));

int outer_rows = outer_rows_ * batchSize;

Expand Down Expand Up @@ -244,12 +241,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);

PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(output_ptrs, h_odatas,
d_output_ptrs.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream),
platform::errors::External(
"CUDA Memcpy failed during split plugin run."));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream));

split_kernel<<<grid, block, 0, stream>>>(
d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
Expand All @@ -263,12 +257,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
half* const* h_odatas = reinterpret_cast<half* const*>(outputs);
half** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);

PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(output_ptrs, h_odatas,
d_output_ptrs.size() * sizeof(half*),
cudaMemcpyHostToDevice, stream),
platform::errors::External(
"CUDA Memcpy failed during split plugin run."));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(half*),
cudaMemcpyHostToDevice, stream));

split_kernel<<<grid, block, 0, stream>>>(
d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
Expand Down
15 changes: 4 additions & 11 deletions paddle/fluid/memory/allocation/cuda_device_context_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,17 +80,13 @@ class CUDADeviceContextAllocator : public Allocator {
: place_(place), default_stream_(default_stream) {
platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreate(&event_, cudaEventDisableTiming),
platform::errors::External(
"Create event failed in CUDADeviceContextAllocator"));
cudaEventCreate(&event_, cudaEventDisableTiming));
}

~CUDADeviceContextAllocator() {
if (event_) {
platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventDestroy(event_),
"Destory event failed in CUDADeviceContextAllocator destroctor");
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
}
}

Expand All @@ -103,12 +99,9 @@ class CUDADeviceContextAllocator : public Allocator {
auto allocation =
new CUDADeviceContextAllocation(memory::Alloc(place_, size));
// Wait for the event on stream
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(event_, default_stream_),
"Failed to record event in CUDADeviceContextAllocator");
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(default_stream_, event_, 0),
"Failed to wait event in CUDADeviceContextAllocator");
cudaStreamWaitEvent(default_stream_, event_, 0));
return allocation;
}

Expand Down
14 changes: 2 additions & 12 deletions paddle/fluid/operators/argsort_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
cu_stream);
}
PADDLE_ENFORCE_CUDA_SUCCESS(
err,
"ArgSortOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate"
"temp_storage_bytes, status:%s.",
temp_storage_bytes, cudaGetErrorString(err));
PADDLE_ENFORCE_CUDA_SUCCESS(err);

Tensor temp_storage;
temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
Expand All @@ -165,12 +160,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
cu_stream);
}

PADDLE_ENFORCE_CUDA_SUCCESS(
err,
"ArgSortOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
"temp_storage_bytes:%d status:%s.",
temp_storage_bytes, cudaGetErrorString(err));
PADDLE_ENFORCE_CUDA_SUCCESS(err);
}

template <typename T, typename IndType>
Expand Down
90 changes: 21 additions & 69 deletions paddle/fluid/operators/fused/fused_bn_activation_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -108,32 +108,21 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;

PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&data_desc_)."));
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&bn_param_desc_)."));
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));

VLOG(3) << "Setting descriptors.";
std::vector<int> dims = {N, C, H, W, D};
std::vector<int> strides = {H * W * D * C, 1, W * D * C, D * C, C};

PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
platform::errors::External(
"The error has happened when calling cudnnSetTensorNdDescriptor."));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));

PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
data_desc_, mode_),
platform::errors::External("The error has happened when calling "
"cudnnDeriveBNTensorDescriptor."));
data_desc_, mode_));

double this_factor = 1. - momentum;
cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
Expand Down Expand Up @@ -166,10 +155,7 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
/*yDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/activation_desc_,
/*sizeInBytes=*/&workspace_size),
platform::errors::External(
"The error has happened when calling "
"cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize."));
/*sizeInBytes=*/&workspace_size));

// -------------- cudnn batchnorm reserve space --------------
PADDLE_ENFORCE_CUDA_SUCCESS(
Expand All @@ -179,10 +165,7 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
/*bnOps=*/bnOps_,
/*activationDesc=*/activation_desc_,
/*xDesc=*/data_desc_,
/*sizeInBytes=*/&reserve_space_size),
platform::errors::External(
"The error has happened when calling "
"cudnnGetBatchNormalizationTrainingExReserveSpaceSize."));
/*sizeInBytes=*/&reserve_space_size));

reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(),
reserve_space_size);
Expand All @@ -204,22 +187,13 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr,
reserve_space_size),
platform::errors::External(
"The error has happened when calling "
"cudnnBatchNormalizationForwardTrainingEx."));
reserve_space_size));

// clean when exit.
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(data_desc_)."));
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(bn_param_desc_)."));
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
}
};

Expand Down Expand Up @@ -298,33 +272,22 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;

PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&data_desc_)."));
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&bn_param_desc_)."));
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
platform::errors::External(
"The error has happened when calling cudnnSetTensorNdDescriptor."));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
data_desc_, mode_),
platform::errors::External("The error has happened when calling "
"cudnnDeriveBNTensorDescriptor."));
data_desc_, mode_));

const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
Expand Down Expand Up @@ -354,10 +317,7 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
/*dxDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/activation_desc_,
/*sizeInBytes=*/&workspace_size),
platform::errors::External(
"The error has happened when calling "
"cudnnGetBatchNormalizationBackwardExWorkspaceSize."));
/*sizeInBytes=*/&workspace_size));

workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
workspace_size);
Expand Down Expand Up @@ -395,21 +355,13 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
/*workspace=*/workspace_ptr,
/*workSpaceSizeInBytes=*/workspace_size,
/*reserveSpace=*/const_cast<T *>(reserve_space->template data<T>()),
/*reserveSpaceSizeInBytes=*/reserve_space_size),
platform::errors::External("The error has happened when calling "
"cudnnBatchNormalizationBackwardEx."));
/*reserveSpaceSizeInBytes=*/reserve_space_size));

// clean when exit.
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(data_desc_)."));
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(bn_param_desc_)."));
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
}
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
cudnnTensorDescriptor_t in_desc;
cudnnTensorDescriptor_t out_desc;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&in_desc),
platform::errors::External("Create cudnn tensor descriptor failed in "
"transpose_flatten_concat_fusion op."));
platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&out_desc),
platform::errors::External("Create cudnn tensor descriptor failed in "
"transpose_flatten_concat_fusion op."));
platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;

auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
Expand Down Expand Up @@ -91,24 +87,15 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
dims_y[i] = 1;
}

PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetTensorNdDescriptor(
in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()),
platform::errors::External("Create cudnn tensorNd descriptor failed "
"in transpose_flatten_concat op."));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetTensorNdDescriptor(
out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()),
platform::errors::External("Create cudnn tensorNd descriptor failed "
"in transpose_flatten_concat op."));

PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnTransformTensor(
handle, CudnnDataType<T>::kOne(), in_desc,
static_cast<const void*>(ins[k]->data<T>()),
CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)),
platform::errors::External("Create cudnn transform tensor failed in "
"transpose_flatten_concat op."));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));

PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor(
handle, CudnnDataType<T>::kOne(), in_desc,
static_cast<const void*>(ins[k]->data<T>()),
CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
if (concat_axis == 0) {
odata += osize;
} else {
Expand All @@ -117,13 +104,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
}
}
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(in_desc),
platform::errors::External(
"Destory cudnn descriptor failed in transpose_flatten_concat op."));
platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(out_desc),
platform::errors::External(
"Destory cudnn descriptor failed in transpose_flatten_concat op."));
platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
}
};

Expand Down
15 changes: 5 additions & 10 deletions paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,10 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
DataLayout::kNCHW, framework::vectorize<int>(output->dims()));

PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSpatialTfSamplerForward(
handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
output_data),
platform::errors::InvalidArgument(
"cudnnSpatialTfSamplerForward in Op(grid_sampler) failed"));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward(
handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
output_data));
}
};

Expand Down Expand Up @@ -122,9 +119,7 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc,
input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc,
output_grad_data, grid_data, CudnnDataType<T>::kZero(),
grid_grad_data),
platform::errors::InvalidArgument(
"cudnnSpatialTfSamplerBackward in Op(grid_sampler) failed"));
grid_grad_data));
}
};

Expand Down
Loading

0 comments on commit 69c4796

Please sign in to comment.