Refactoring PADDLE_ENFORCE_CUDA_SUCCESS, and apply to curand/cudnn/cu…

…blas/NCCL,test=develop
PaddlePaddle · Apr 16, 2020 · 69c4796 · 69c4796
1 parent cd0e7ba
commit 69c4796
Show file tree

Hide file tree

Showing 22 changed files with 226 additions and 343 deletions.
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
@@ -152,9 +152,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
 
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
-                          cudaMemcpyHostToDevice, dev_ctx->stream()),
-          platform::errors::External(
-              "Async cudaMemcpy op_var info to gpu failed."));
+                          cudaMemcpyHostToDevice, dev_ctx->stream()));
     } else {  // get
       auto iter = op_var2gpu_str.find(op_var);
       PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true,

diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -124,12 +124,9 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
   float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
   float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
   float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      cudaMemcpyAsync(output_ptrs, h_odatas,
-                      d_output_ptrs_.size() * sizeof(float*),
-                      cudaMemcpyHostToDevice, stream),
-      platform::errors::External(
-          "CUDA Memcpy failed during split plugin run."));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+      output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*),
+      cudaMemcpyHostToDevice, stream));
 
   int outer_rows = outer_rows_ * batchSize;
 
@@ -244,12 +241,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
     float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
     float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);
 
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaMemcpyAsync(output_ptrs, h_odatas,
-                        d_output_ptrs.size() * sizeof(float*),
-                        cudaMemcpyHostToDevice, stream),
-        platform::errors::External(
-            "CUDA Memcpy failed during split plugin run."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+        output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*),
+        cudaMemcpyHostToDevice, stream));
 
     split_kernel<<<grid, block, 0, stream>>>(
         d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
@@ -263,12 +257,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
     half* const* h_odatas = reinterpret_cast<half* const*>(outputs);
     half** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);
 
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaMemcpyAsync(output_ptrs, h_odatas,
-                        d_output_ptrs.size() * sizeof(half*),
-                        cudaMemcpyHostToDevice, stream),
-        platform::errors::External(
-            "CUDA Memcpy failed during split plugin run."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+        output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(half*),
+        cudaMemcpyHostToDevice, stream));
 
     split_kernel<<<grid, block, 0, stream>>>(
         d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,

diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
@@ -80,17 +80,13 @@ class CUDADeviceContextAllocator : public Allocator {
       : place_(place), default_stream_(default_stream) {
     platform::CUDADeviceGuard guard(place_.device);
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaEventCreate(&event_, cudaEventDisableTiming),
-        platform::errors::External(
-            "Create event failed in CUDADeviceContextAllocator"));
+        cudaEventCreate(&event_, cudaEventDisableTiming));
   }
 
   ~CUDADeviceContextAllocator() {
     if (event_) {
       platform::CUDADeviceGuard guard(place_.device);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaEventDestroy(event_),
-          "Destory event failed in CUDADeviceContextAllocator destroctor");
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
     }
   }
 
@@ -103,12 +99,9 @@ class CUDADeviceContextAllocator : public Allocator {
     auto allocation =
         new CUDADeviceContextAllocation(memory::Alloc(place_, size));
     // Wait for the event on stream
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaEventRecord(event_, default_stream_),
-        "Failed to record event in CUDADeviceContextAllocator");
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaStreamWaitEvent(default_stream_, event_, 0),
-        "Failed to wait event in CUDADeviceContextAllocator");
+        cudaStreamWaitEvent(default_stream_, event_, 0));
     return allocation;
   }
 

diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu
@@ -141,12 +141,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
         num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
         cu_stream);
   }
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      err,
-      "ArgSortOP failed as could not launch "
-      "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate"
-      "temp_storage_bytes, status:%s.",
-      temp_storage_bytes, cudaGetErrorString(err));
+  PADDLE_ENFORCE_CUDA_SUCCESS(err);
 
   Tensor temp_storage;
   temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
@@ -165,12 +160,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
         cu_stream);
   }
 
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      err,
-      "ArgSortOP failed as could not launch "
-      "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
-      "temp_storage_bytes:%d status:%s.",
-      temp_storage_bytes, cudaGetErrorString(err));
+  PADDLE_ENFORCE_CUDA_SUCCESS(err);
 }
 
 template <typename T, typename IndType>

diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu
@@ -108,32 +108,21 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
     cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&data_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&bn_param_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
 
     VLOG(3) << "Setting descriptors.";
     std::vector<int> dims = {N, C, H, W, D};
     std::vector<int> strides = {H * W * D * C, 1, W * D * C, D * C, C};
 
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnSetTensorNdDescriptor(
-            data_desc_, CudnnDataType<T>::type,
-            x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
-        platform::errors::External(
-            "The error has happened when calling cudnnSetTensorNdDescriptor."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+        data_desc_, CudnnDataType<T>::type,
+        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
-                                                         data_desc_, mode_),
-        platform::errors::External("The error has happened when calling "
-                                   "cudnnDeriveBNTensorDescriptor."));
+                                                         data_desc_, mode_));
 
     double this_factor = 1. - momentum;
     cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
@@ -166,10 +155,7 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
                 /*yDesc=*/data_desc_,
                 /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
                 /*activationDesc=*/activation_desc_,
-                /*sizeInBytes=*/&workspace_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize."));
+                /*sizeInBytes=*/&workspace_size));
 
     // -------------- cudnn batchnorm reserve space --------------
     PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -179,10 +165,7 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
             /*bnOps=*/bnOps_,
             /*activationDesc=*/activation_desc_,
             /*xDesc=*/data_desc_,
-            /*sizeInBytes=*/&reserve_space_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnGetBatchNormalizationTrainingExReserveSpaceSize."));
+            /*sizeInBytes=*/&reserve_space_size));
 
     reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(),
                                                     reserve_space_size);
@@ -204,22 +187,13 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
             saved_variance->template mutable_data<BatchNormParamType<T>>(
                 ctx.GetPlace()),
             activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr,
-            reserve_space_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnBatchNormalizationForwardTrainingEx."));
+            reserve_space_size));
 
     // clean when exit.
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(data_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(bn_param_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
   }
 };
 
@@ -298,33 +272,22 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
     cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&data_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&bn_param_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
     if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
       LOG(ERROR) << "Provided epsilon is smaller than "
                  << "CUDNN_BN_MIN_EPSILON. Setting it to "
                  << "CUDNN_BN_MIN_EPSILON instead.";
     }
     epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
 
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnSetTensorNdDescriptor(
-            data_desc_, CudnnDataType<T>::type,
-            x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
-        platform::errors::External(
-            "The error has happened when calling cudnnSetTensorNdDescriptor."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+        data_desc_, CudnnDataType<T>::type,
+        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
-                                                         data_desc_, mode_),
-        platform::errors::External("The error has happened when calling "
-                                   "cudnnDeriveBNTensorDescriptor."));
+                                                         data_desc_, mode_));
 
     const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
     const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
@@ -354,10 +317,7 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
             /*dxDesc=*/data_desc_,
             /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
             /*activationDesc=*/activation_desc_,
-            /*sizeInBytes=*/&workspace_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnGetBatchNormalizationBackwardExWorkspaceSize."));
+            /*sizeInBytes=*/&workspace_size));
 
     workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
                                                   workspace_size);
@@ -395,21 +355,13 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
             /*workspace=*/workspace_ptr,
             /*workSpaceSizeInBytes=*/workspace_size,
             /*reserveSpace=*/const_cast<T *>(reserve_space->template data<T>()),
-            /*reserveSpaceSizeInBytes=*/reserve_space_size),
-        platform::errors::External("The error has happened when calling "
-                                   "cudnnBatchNormalizationBackwardEx."));
+            /*reserveSpaceSizeInBytes=*/reserve_space_size));
 
     // clean when exit.
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(data_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(bn_param_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
   }
 };
 

diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
@@ -46,13 +46,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
     cudnnTensorDescriptor_t in_desc;
     cudnnTensorDescriptor_t out_desc;
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&in_desc),
-        platform::errors::External("Create cudnn tensor descriptor failed in "
-                                   "transpose_flatten_concat_fusion op."));
+        platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&out_desc),
-        platform::errors::External("Create cudnn tensor descriptor failed in "
-                                   "transpose_flatten_concat_fusion op."));
+        platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
     cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
 
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
@@ -91,24 +87,15 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
         dims_y[i] = 1;
       }
 
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnSetTensorNdDescriptor(
-              in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()),
-          platform::errors::External("Create cudnn tensorNd descriptor failed "
-                                     "in transpose_flatten_concat op."));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnSetTensorNdDescriptor(
-              out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()),
-          platform::errors::External("Create cudnn tensorNd descriptor failed "
-                                     "in transpose_flatten_concat op."));
-
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnTransformTensor(
-              handle, CudnnDataType<T>::kOne(), in_desc,
-              static_cast<const void*>(ins[k]->data<T>()),
-              CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)),
-          platform::errors::External("Create cudnn transform tensor failed in "
-                                     "transpose_flatten_concat op."));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+          in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+          out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));
+
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor(
+          handle, CudnnDataType<T>::kOne(), in_desc,
+          static_cast<const void*>(ins[k]->data<T>()),
+          CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
       if (concat_axis == 0) {
         odata += osize;
       } else {
@@ -117,13 +104,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
       }
     }
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(in_desc),
-        platform::errors::External(
-            "Destory cudnn descriptor failed in transpose_flatten_concat op."));
+        platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(out_desc),
-        platform::errors::External(
-            "Destory cudnn descriptor failed in transpose_flatten_concat op."));
+        platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
   }
 };
 

diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
@@ -60,13 +60,10 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
         DataLayout::kNCHW, framework::vectorize<int>(output->dims()));
 
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnSpatialTfSamplerForward(
-            handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
-            input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
-            output_data),
-        platform::errors::InvalidArgument(
-            "cudnnSpatialTfSamplerForward in Op(grid_sampler) failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward(
+        handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
+        input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
+        output_data));
   }
 };
 
@@ -122,9 +119,7 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
             input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc,
             input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc,
             output_grad_data, grid_data, CudnnDataType<T>::kZero(),
-            grid_grad_data),
-        platform::errors::InvalidArgument(
-            "cudnnSpatialTfSamplerBackward in Op(grid_sampler) failed"));
+            grid_grad_data));
   }
 };