From d49bd57783591261dd4106da4d723c8881c8ec87 Mon Sep 17 00:00:00 2001
From: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
Date: Sat, 28 Mar 2020 13:15:33 +0000
Subject: [PATCH 1/3] supports thread-binding stream, test=develop

---
 paddle/fluid/platform/CMakeLists.txt        |   3 +-
 paddle/fluid/platform/device_context.cc     | 103 ++++------
 paddle/fluid/platform/device_context.h      | 203 +++++++++++++++++---
 paddle/fluid/platform/stream/CMakeLists.txt |   3 +
 paddle/fluid/platform/stream/cuda_stream.cc |  60 ++++++
 paddle/fluid/platform/stream/cuda_stream.h  |  61 ++++++
 6 files changed, 343 insertions(+), 90 deletions(-)
 create mode 100644 paddle/fluid/platform/stream/CMakeLists.txt
 create mode 100644 paddle/fluid/platform/stream/cuda_stream.cc
 create mode 100644 paddle/fluid/platform/stream/cuda_stream.h
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index b954d1b7658ae..ddf3035a92754 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -44,6 +44,7 @@ cc_library(place SRCS place.cc DEPS enforce boost)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
 add_subdirectory(dynload)
+add_subdirectory(stream)
 
 cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce)
 cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper)
@@ -54,7 +55,7 @@ IF(WITH_DGC)
 ENDIF()
 
 IF(WITH_GPU)
-    set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
+    set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
 ENDIF()
 
 IF(WITH_MKLDNN)
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 322a32796787e..e319b395365e5 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -211,6 +211,34 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
   allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
 }
 
+thread_local std::unordered_map<const CUDADeviceContext*,
+                                std::unique_ptr<CUDAContext>>
+    CUDADeviceContext::thread_ctx_;
+thread_local std::mutex CUDADeviceContext::ctx_mtx_;
+
+void CUDAContext::InitEigenContext(const stream::CUDAStream& stream) {
+  eigen_stream_.reset(new EigenCudaStreamDevice());
+  eigen_stream_->Reinitialize(&stream.stream(), place_);
+  eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
+}
+
+CUDAContext::CUDAContext(const CUDAPlace& place,
+                         const enum stream::Priority& priority) {
+  place_ = place;
+  CUDADeviceGuard guard(place_.device);
+  stream_.Init(place, priority);
+  InitEigenContext(stream_);
+  InitCuBlasContext(stream_);
+  InitCuDNNContext(stream_);
+  InitCallbackManager(stream_);
+}
+
+CUDAContext::~CUDAContext() {
+  CUDADeviceGuard guard(place_.device);
+  DestoryCuDNNContext();
+  DestoryCuBlasContext();
+}
+
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
   CUDADeviceGuard guard(place_.device);
   compute_capability_ = GetCUDAComputeCapability(place_.device);
@@ -218,18 +246,6 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
   max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
   max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device);
   max_threads_per_block_ = GetCUDAMaxThreadsPerBlock(place_.device);
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
-  eigen_stream_.reset(new EigenCudaStreamDevice());
-  eigen_stream_->Reinitialize(&stream_, place);
-  eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
-  cublas_handle_.reset(new CublasHandleHolder(stream_, CUBLAS_DEFAULT_MATH));
-
-  if (TensorCoreAvailable()) {
-#if CUDA_VERSION >= 9000
-    cublas_tensor_core_handle_.reset(
-        new CublasHandleHolder(stream_, CUBLAS_TENSOR_OP_MATH));
-#endif
-  }
 
   driver_version_ = GetCUDADriverVersion(place_.device);
   runtime_version_ = GetCUDARuntimeVersion(place_.device);
@@ -263,48 +279,14 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
           << "Please recompile or reinstall Paddle with compatible CUDA "
              "version.";
     }
-
-    if (dynload::HasCUDNN()) {
-      auto local_cudnn_version = cudnn_dso_ver / 100;
-      auto compile_cudnn_version = CUDNN_VERSION / 100;
-      if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
-        LOG_FIRST_N(WARNING, 1)
-            << "WARNING: device: " << place_.device
-            << ". The installed Paddle is compiled with CUDNN "
-            << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
-            << ", but CUDNN version in your machine is "
-            << local_cudnn_version / 10 << "." << local_cudnn_version % 10
-            << ", which may cause serious incompatible bug. "
-            << "Please recompile or reinstall Paddle with compatible CUDNN "
-               "version.";
-      }
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          dynload::cudnnCreate(&cudnn_handle_),
-          "Failed to create Cudnn handle in DeviceContext");
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          dynload::cudnnSetStream(cudnn_handle_, stream_),
-          "Failed to set stream for Cudnn handle in DeviceContext");
-    } else {
-      cudnn_handle_ = nullptr;
-    }
   }
-
-  callback_manager_.reset(new StreamCallbackManager(stream_));
+  default_ctx_.reset(new CUDAContext(place_));
 }
 
 CUDADeviceContext::~CUDADeviceContext() {
   SetDeviceId(place_.device);
   Wait();
   WaitStreamCallback();
-  cublas_handle_.reset();
-  cublas_tensor_core_handle_.reset();
-  eigen_stream_.reset();
-  eigen_device_.reset();
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
-  if (cudnn_handle_) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_),
-                                "Failed to destory Cudnn handle");
-  }
 #if defined(PADDLE_WITH_NCCL)
   if (nccl_comm_) {
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
@@ -314,22 +296,7 @@ CUDADeviceContext::~CUDADeviceContext() {
 
 Place CUDADeviceContext::GetPlace() const { return place_; }
 
-void CUDADeviceContext::Wait() const {
-  cudaError_t e_sync = cudaSuccess;
-#if !defined(_WIN32)
-  e_sync = cudaStreamSynchronize(stream_);
-#else
-  while (e_sync = cudaStreamQuery(stream_)) {
-    if (e_sync == cudaErrorNotReady) continue;
-    break;
-  }
-#endif
-
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      e_sync, platform::errors::Fatal(
-                  "cudaStreamSynchronize raises error: %s, errono: %d",
-                  cudaGetErrorString(e_sync), static_cast<int>(e_sync)));
-}
+void CUDADeviceContext::Wait() const { context()->Wait(); }
 
 int CUDADeviceContext::GetComputeCapability() const {
   return compute_capability_;
@@ -346,24 +313,26 @@ int CUDADeviceContext::GetMaxThreadsPerBlock() const {
 }
 
 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
-  return eigen_device_.get();
+  return context()->EigenDevice().get();
 }
 
 bool CUDADeviceContext::tensor_core_available() const {
-  return cublas_tensor_core_handle_ != nullptr;
+  return context()->CublasTensorCoreHandle() != nullptr;
 }
 
 dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
   return max_grid_dim_size_;
 }
 
-cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
+cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
+  return context()->CudnnHandle();
+}
 
 CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
   return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
 }
 
-cudaStream_t CUDADeviceContext::stream() const { return stream_; }
+cudaStream_t CUDADeviceContext::stream() const { return context()->Stream(); }
 
 CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
   eigen_device_.reset(new Eigen::DefaultDevice());
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index d6b8cda94f20b..768a9d1deaadd 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -38,6 +38,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/stream/cuda_stream.h"
 #include "paddle/fluid/platform/stream_callback_manager.h"
 #endif
 #include "unsupported/Eigen/CXX11/Tensor"
@@ -80,6 +81,160 @@ struct DefaultDeviceContextType<platform::CPUPlace> {
 class EigenCudaStreamDevice;
 class CudnnWorkspaceHandle;
 
+class CUDAContext {
+ public:
+  CUDAContext() = default;
+  explicit CUDAContext(
+      const CUDAPlace& place,
+      const enum stream::Priority& priority = stream::Priority::NORMAL);
+
+  ~CUDAContext();
+
+  const CUDAPlace& Place() const { return place_; }
+
+  const std::unique_ptr<Eigen::GpuDevice>& EigenDevice() const {
+    return eigen_device_;
+  }
+
+  const std::unique_ptr<EigenCudaStreamDevice>& EigenStream() const {
+    return eigen_stream_;
+  }
+
+  const cudaStream_t& Stream() const { return stream_.stream(); }
+
+  const cudnnHandle_t& CudnnHandle() const { return cudnn_handle_; }
+
+  const std::unique_ptr<CublasHandleHolder>& CublasHandle() const {
+    return cublas_handle_;
+  }
+
+  const std::unique_ptr<CublasHandleHolder>& CublasTensorCoreHandle() const {
+    return cublas_tensor_core_handle_;
+  }
+
+  /*! \brief  Call cublas function safely. */
+  template <typename Callback>
+  inline void CublasCall(Callback&& callback) const {
+    cublas_handle_->Call(std::forward<Callback>(callback));
+  }
+
+  /*! \brief  Check whether tensor core is supported */
+  bool tensor_core_available() const;
+
+  /*! \brief  Call cublas function with Tensor Core safely. If
+      Tensor Core is not available, use DEFAULT_MATH instead. */
+  template <typename Callback>
+  inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const {
+    if (cublas_tensor_core_handle_) {
+      cublas_tensor_core_handle_->Call(std::forward<Callback>(callback));
+    } else {
+      cublas_handle_->Call(std::forward<Callback>(callback));
+    }
+  }
+
+  template <typename Callback>
+  void RecordEvent(cudaEvent_t ev, Callback callback) {
+    callback();
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaEventRecord(ev, stream_.stream()),
+        platform::errors::Fatal("CUDA event recording failed."));
+  }
+
+  template <typename Callback>
+  void AddStreamCallback(Callback&& callback) const {
+    callback_manager_->AddCallback(callback);
+  }
+
+  void WaitStreamCallback() const { callback_manager_->Wait(); }
+
+  void Wait() const {
+    cudaError_t e_sync = cudaSuccess;
+#if !defined(_WIN32)
+    e_sync = cudaStreamSynchronize(stream_.stream());
+#else
+    while (e_sync = cudaStreamQuery(stream_.stream())) {
+      if (e_sync == cudaErrorNotReady) continue;
+      break;
+    }
+#endif
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        e_sync, platform::errors::Fatal(
+                    "cudaStreamSynchronize raises error: %s, errono: %d",
+                    cudaGetErrorString(e_sync), static_cast<int>(e_sync)));
+  }
+
+ private:
+  void InitEigenContext(const stream::CUDAStream& stream);
+
+  void InitCuBlasContext(const stream::CUDAStream& stream) {
+    cublas_handle_.reset(
+        new CublasHandleHolder(stream.stream(), CUBLAS_DEFAULT_MATH));
+    if (TensorCoreAvailable()) {
+#if CUDA_VERSION >= 9000
+      cublas_tensor_core_handle_.reset(
+          new CublasHandleHolder(stream.stream(), CUBLAS_TENSOR_OP_MATH));
+#endif
+    }
+  }
+
+  void InitCallbackManager(const stream::CUDAStream& stream) {
+    callback_manager_.reset(new StreamCallbackManager(stream.stream()));
+  }
+
+  void InitCuDNNContext(const stream::CUDAStream& stream) {
+    if (dynload::HasCUDNN()) {
+      auto local_cudnn_version = dynload::cudnnGetVersion() / 100;
+      auto compile_cudnn_version = CUDNN_VERSION / 100;
+      if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
+        LOG_FIRST_N(WARNING, 1)
+            << "WARNING: device: " << place_.device
+            << ". The installed Paddle is compiled with CUDNN "
+            << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
+            << ", but CUDNN version in your machine is "
+            << local_cudnn_version / 10 << "." << local_cudnn_version % 10
+            << ", which may cause serious incompatible bug. "
+            << "Please recompile or reinstall Paddle with compatible CUDNN "
+               "version.";
+      }
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          dynload::cudnnCreate(&cudnn_handle_),
+          platform::errors::Fatal(
+              "Failed to create Cudnn handle in DeviceContext"));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          dynload::cudnnSetStream(cudnn_handle_, stream.stream()),
+          platform::errors::Fatal(
+              "Failed to set stream for Cudnn handle in DeviceContext"));
+    } else {
+      cudnn_handle_ = nullptr;
+    }
+  }
+
+  void DestoryCuDNNContext() {
+    if (cudnn_handle_) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          dynload::cudnnDestroy(cudnn_handle_),
+          platform::errors::Fatal("Failed to destory Cudnn handle"));
+    }
+    cudnn_handle_ = nullptr;
+  }
+
+  void DestoryCuBlasContext() {
+    cublas_handle_.reset();
+    cublas_tensor_core_handle_.reset();
+  }
+
+  CUDAPlace place_;
+  std::unique_ptr<Eigen::GpuDevice> eigen_device_;
+  std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
+  stream::CUDAStream stream_;
+  cudnnHandle_t cudnn_handle_;
+  std::unique_ptr<CublasHandleHolder> cublas_handle_;
+  std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
+  std::unique_ptr<StreamCallbackManager> callback_manager_;
+  DISABLE_COPY_AND_ASSIGN(CUDAContext);
+};
+
 class CUDADeviceContext : public DeviceContext {
  public:
   explicit CUDADeviceContext(CUDAPlace place);
@@ -112,7 +267,7 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Call cublas function safely. */
   template <typename Callback>
   inline void CublasCall(Callback&& callback) const {
-    cublas_handle_->Call(std::forward<Callback>(callback));
+    return context()->CublasCall(callback);
   }
 
   /*! \brief  Check whether tensor core is supported */
@@ -122,11 +277,7 @@ class CUDADeviceContext : public DeviceContext {
       Tensor Core is not available, use DEFAULT_MATH instead. */
   template <typename Callback>
   inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const {
-    if (cublas_tensor_core_handle_) {
-      cublas_tensor_core_handle_->Call(std::forward<Callback>(callback));
-    } else {
-      cublas_handle_->Call(std::forward<Callback>(callback));
-    }
+    return context()->TensorCoreCublasCallIfAvailable(callback);
   }
 
   /*! \brief  Return cudnn  handle in the device context. */
@@ -154,32 +305,43 @@ class CUDADeviceContext : public DeviceContext {
 
   template <typename Callback>
   void RecordEvent(cudaEvent_t ev, Callback callback) {
-    callback();
-    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_));
+    return context()->RecordEvent(ev, callback);
   }
 
   template <typename Callback>
   void AddStreamCallback(Callback&& callback) const {
-    callback_manager_->AddCallback(callback);
+    return context()->AddStreamCallback(callback);
   }
 
-  void WaitStreamCallback() const { callback_manager_->Wait(); }
+  void WaitStreamCallback() const { return context()->WaitStreamCallback(); }
+
+  void ResetDefaultContext(const enum stream::Priority& priority) {
+    default_ctx_.reset(new CUDAContext(place_, priority));
+  }
+
+  void ResetThreadContext(const enum stream::Priority& priority) {
+    std::lock_guard<std::mutex> guard(ctx_mtx_);
+    thread_ctx_[this].reset(new CUDAContext(place_, priority));
+  }
+
+  const std::unique_ptr<CUDAContext>& context() const {
+    if (!thread_ctx_.count(this)) {
+      return default_ctx_;
+    }
+    return thread_ctx_.at(this);
+  }
 
  private:
   CUDAPlace place_;
+  std::unique_ptr<CUDAContext> default_ctx_;
 
-  mutable std::once_flag init_cudnn_;
-
-  std::unique_ptr<Eigen::GpuDevice> eigen_device_;
-  std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
-  cudaStream_t stream_;
+  static thread_local std::unordered_map<const CUDADeviceContext*,
+                                         std::unique_ptr<CUDAContext>>
+      thread_ctx_;
+  static thread_local std::mutex ctx_mtx_;
 
-  cudnnHandle_t cudnn_handle_;
   mutable std::mutex cudnn_handle_mtx_;
 
-  std::unique_ptr<CublasHandleHolder> cublas_handle_;
-  std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
-
 #if defined(PADDLE_WITH_NCCL)
   // NCCL communicator (single process version) for NCCL collective operations.
   // NCCL collective operations provides fast collectives over multiple GPUs
@@ -197,9 +359,6 @@ class CUDADeviceContext : public DeviceContext {
   int max_threads_per_block_;
   dim3 max_grid_dim_size_;
 
-  // StreamCallbackManager is thread-safe
-  std::unique_ptr<StreamCallbackManager> callback_manager_;
-
   DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
 };
 
diff --git a/paddle/fluid/platform/stream/CMakeLists.txt b/paddle/fluid/platform/stream/CMakeLists.txt
new file mode 100644
index 0000000000000..78eb776b544d2
--- /dev/null
+++ b/paddle/fluid/platform/stream/CMakeLists.txt
@@ -0,0 +1,3 @@
+IF(WITH_GPU)
+cc_library(cuda_stream SRCS cuda_stream.cc DEPS enforce)
+ENDIF()
diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc
new file mode 100644
index 0000000000000..8880212b48cef
--- /dev/null
+++ b/paddle/fluid/platform/stream/cuda_stream.cc
@@ -0,0 +1,60 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/stream/cuda_stream.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+namespace stream {
+
+constexpr int64_t kHighPriority = -1;
+constexpr int64_t kNormalPriority = 0;
+constexpr unsigned int kDefaultFlag = cudaStreamDefault;
+
+bool CUDAStream::Init(const Place& place, const enum Priority& priority) {
+  PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
+                    platform::errors::InvalidArgument(
+                        "Cuda stream must be created using cuda place."));
+  place_ = place;
+  CUDADeviceGuard guard(boost::get<CUDAPlace>(place_).device);
+  if (priority == Priority::HIGH) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaStreamCreateWithPriority(&stream_, kDefaultFlag, kHighPriority),
+        platform::errors::Fatal("High priority cuda stream creation failed."));
+  } else if (priority == Priority::NORMAL) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaStreamCreateWithPriority(&stream_, kDefaultFlag, kNormalPriority),
+        platform::errors::Fatal(
+            "Normal priority cuda stream creation failed."));
+  }
+  VLOG(3) << "CUDAStream Init stream: " << stream_
+          << ", priority: " << static_cast<int>(priority);
+  return true;
+}
+
+void CUDAStream::Destroy() {
+  CUDADeviceGuard guard(boost::get<CUDAPlace>(place_).device);
+  if (stream_) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaStreamDestroy(stream_),
+        platform::errors::Fatal("Cuda stream destruction failed."));
+  }
+  stream_ = nullptr;
+}
+
+}  // namespace stream
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h
new file mode 100644
index 0000000000000..8c3317645d380
--- /dev/null
+++ b/paddle/fluid/platform/stream/cuda_stream.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/macros.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace platform {
+namespace stream {
+
+#ifdef PADDLE_WITH_CUDA
+
+enum class Priority : uint8_t {
+  NIL = 0x0,
+  HIGH = 0x1,
+  NORMAL = 0x2,
+};
+
+class CUDAStream final {
+ public:
+  CUDAStream() = default;
+  CUDAStream(const Place& place,
+             const enum Priority& priority = Priority::NORMAL) {
+    Init(place, priority);
+  }
+  virtual ~CUDAStream() { Destroy(); }
+
+  bool Init(const Place& place,
+            const enum Priority& priority = Priority::NORMAL);
+
+  const cudaStream_t& stream() const { return stream_; }
+  void Destroy();
+
+ private:
+  Place place_;
+  cudaStream_t stream_{nullptr};
+  Priority priority_{Priority::NORMAL};
+
+  DISABLE_COPY_AND_ASSIGN(CUDAStream);
+};
+
+#endif
+
+}  // namespace stream
+}  // namespace platform
+}  // namespace paddle

From 3f3ca4175cf64e378cab1e2cf1b4751f9467bf8e Mon Sep 17 00:00:00 2001
From: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
Date: Mon, 13 Apr 2020 12:01:51 +0000
Subject: [PATCH 2/3] avoid using thread_local variables in dtor, test=develop

---
 paddle/fluid/platform/device_context.cc     | 23 +++----
 paddle/fluid/platform/device_context.h      | 74 ++++++---------------
 paddle/fluid/platform/stream/cuda_stream.cc | 20 ++++++
 paddle/fluid/platform/stream/cuda_stream.h  | 21 +++++-
 4 files changed, 72 insertions(+), 66 deletions(-)

diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index e319b395365e5..8a6552549176e 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -212,13 +212,13 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
 }
 
 thread_local std::unordered_map<const CUDADeviceContext*,
-                                std::unique_ptr<CUDAContext>>
+                                std::shared_ptr<CUDAContext>>
     CUDADeviceContext::thread_ctx_;
 thread_local std::mutex CUDADeviceContext::ctx_mtx_;
 
-void CUDAContext::InitEigenContext(const stream::CUDAStream& stream) {
+void CUDAContext::InitEigenContext() {
   eigen_stream_.reset(new EigenCudaStreamDevice());
-  eigen_stream_->Reinitialize(&stream.stream(), place_);
+  eigen_stream_->Reinitialize(&RawStream(), place_);
   eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
 }
 
@@ -226,11 +226,10 @@ CUDAContext::CUDAContext(const CUDAPlace& place,
                          const enum stream::Priority& priority) {
   place_ = place;
   CUDADeviceGuard guard(place_.device);
-  stream_.Init(place, priority);
-  InitEigenContext(stream_);
-  InitCuBlasContext(stream_);
-  InitCuDNNContext(stream_);
-  InitCallbackManager(stream_);
+  stream_.reset(new stream::CUDAStream(place, priority));
+  InitEigenContext();
+  InitCuBlasContext();
+  InitCuDNNContext();
 }
 
 CUDAContext::~CUDAContext() {
@@ -285,8 +284,6 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
 
 CUDADeviceContext::~CUDADeviceContext() {
   SetDeviceId(place_.device);
-  Wait();
-  WaitStreamCallback();
 #if defined(PADDLE_WITH_NCCL)
   if (nccl_comm_) {
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
@@ -296,7 +293,7 @@ CUDADeviceContext::~CUDADeviceContext() {
 
 Place CUDADeviceContext::GetPlace() const { return place_; }
 
-void CUDADeviceContext::Wait() const { context()->Wait(); }
+void CUDADeviceContext::Wait() const { context()->Stream()->Wait(); }
 
 int CUDADeviceContext::GetComputeCapability() const {
   return compute_capability_;
@@ -332,7 +329,9 @@ CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
   return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
 }
 
-cudaStream_t CUDADeviceContext::stream() const { return context()->Stream(); }
+cudaStream_t CUDADeviceContext::stream() const {
+  return context()->RawStream();
+}
 
 CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
   eigen_device_.reset(new Eigen::DefaultDevice());
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 768a9d1deaadd..4edec11070160 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -39,7 +39,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/stream/cuda_stream.h"
-#include "paddle/fluid/platform/stream_callback_manager.h"
 #endif
 #include "unsupported/Eigen/CXX11/Tensor"
 
@@ -100,7 +99,9 @@ class CUDAContext {
     return eigen_stream_;
   }
 
-  const cudaStream_t& Stream() const { return stream_.stream(); }
+  const std::unique_ptr<stream::CUDAStream>& Stream() const { return stream_; }
+
+  const cudaStream_t& RawStream() { return stream_->raw_stream(); }
 
   const cudnnHandle_t& CudnnHandle() const { return cudnn_handle_; }
 
@@ -132,57 +133,21 @@ class CUDAContext {
     }
   }
 
-  template <typename Callback>
-  void RecordEvent(cudaEvent_t ev, Callback callback) {
-    callback();
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaEventRecord(ev, stream_.stream()),
-        platform::errors::Fatal("CUDA event recording failed."));
-  }
-
-  template <typename Callback>
-  void AddStreamCallback(Callback&& callback) const {
-    callback_manager_->AddCallback(callback);
-  }
-
-  void WaitStreamCallback() const { callback_manager_->Wait(); }
-
-  void Wait() const {
-    cudaError_t e_sync = cudaSuccess;
-#if !defined(_WIN32)
-    e_sync = cudaStreamSynchronize(stream_.stream());
-#else
-    while (e_sync = cudaStreamQuery(stream_.stream())) {
-      if (e_sync == cudaErrorNotReady) continue;
-      break;
-    }
-#endif
-
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        e_sync, platform::errors::Fatal(
-                    "cudaStreamSynchronize raises error: %s, errono: %d",
-                    cudaGetErrorString(e_sync), static_cast<int>(e_sync)));
-  }
-
  private:
-  void InitEigenContext(const stream::CUDAStream& stream);
+  void InitEigenContext();
 
-  void InitCuBlasContext(const stream::CUDAStream& stream) {
+  void InitCuBlasContext() {
     cublas_handle_.reset(
-        new CublasHandleHolder(stream.stream(), CUBLAS_DEFAULT_MATH));
+        new CublasHandleHolder(RawStream(), CUBLAS_DEFAULT_MATH));
     if (TensorCoreAvailable()) {
 #if CUDA_VERSION >= 9000
       cublas_tensor_core_handle_.reset(
-          new CublasHandleHolder(stream.stream(), CUBLAS_TENSOR_OP_MATH));
+          new CublasHandleHolder(RawStream(), CUBLAS_TENSOR_OP_MATH));
 #endif
     }
   }
 
-  void InitCallbackManager(const stream::CUDAStream& stream) {
-    callback_manager_.reset(new StreamCallbackManager(stream.stream()));
-  }
-
-  void InitCuDNNContext(const stream::CUDAStream& stream) {
+  void InitCuDNNContext() {
     if (dynload::HasCUDNN()) {
       auto local_cudnn_version = dynload::cudnnGetVersion() / 100;
       auto compile_cudnn_version = CUDNN_VERSION / 100;
@@ -202,7 +167,7 @@ class CUDAContext {
           platform::errors::Fatal(
               "Failed to create Cudnn handle in DeviceContext"));
       PADDLE_ENFORCE_CUDA_SUCCESS(
-          dynload::cudnnSetStream(cudnn_handle_, stream.stream()),
+          dynload::cudnnSetStream(cudnn_handle_, RawStream()),
           platform::errors::Fatal(
               "Failed to set stream for Cudnn handle in DeviceContext"));
     } else {
@@ -227,11 +192,10 @@ class CUDAContext {
   CUDAPlace place_;
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
   std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
-  stream::CUDAStream stream_;
+  std::unique_ptr<stream::CUDAStream> stream_;
   cudnnHandle_t cudnn_handle_;
   std::unique_ptr<CublasHandleHolder> cublas_handle_;
   std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
-  std::unique_ptr<StreamCallbackManager> callback_manager_;
   DISABLE_COPY_AND_ASSIGN(CUDAContext);
 };
 
@@ -304,16 +268,18 @@ class CUDADeviceContext : public DeviceContext {
 #endif
 
   template <typename Callback>
-  void RecordEvent(cudaEvent_t ev, Callback callback) {
-    return context()->RecordEvent(ev, callback);
+  void RecordEvent(cudaEvent_t ev, Callback callback) const {
+    return context()->Stream()->RecordEvent(ev, callback);
   }
 
   template <typename Callback>
   void AddStreamCallback(Callback&& callback) const {
-    return context()->AddStreamCallback(callback);
+    return context()->Stream()->AddCallback(callback);
   }
 
-  void WaitStreamCallback() const { return context()->WaitStreamCallback(); }
+  void WaitStreamCallback() const {
+    return context()->Stream()->WaitCallback();
+  }
 
   void ResetDefaultContext(const enum stream::Priority& priority) {
     default_ctx_.reset(new CUDAContext(place_, priority));
@@ -324,7 +290,7 @@ class CUDADeviceContext : public DeviceContext {
     thread_ctx_[this].reset(new CUDAContext(place_, priority));
   }
 
-  const std::unique_ptr<CUDAContext>& context() const {
+  std::shared_ptr<CUDAContext> context() const {
     if (!thread_ctx_.count(this)) {
       return default_ctx_;
     }
@@ -333,10 +299,12 @@ class CUDADeviceContext : public DeviceContext {
 
  private:
   CUDAPlace place_;
-  std::unique_ptr<CUDAContext> default_ctx_;
+  std::shared_ptr<CUDAContext> default_ctx_;
 
+  // The thread_local static variable will be released before the
+  // global static variable, so avoid using it in dtor.
   static thread_local std::unordered_map<const CUDADeviceContext*,
-                                         std::unique_ptr<CUDAContext>>
+                                         std::shared_ptr<CUDAContext>>
       thread_ctx_;
   static thread_local std::mutex ctx_mtx_;
 
diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc
index 8880212b48cef..138156e673b5c 100644
--- a/paddle/fluid/platform/stream/cuda_stream.cc
+++ b/paddle/fluid/platform/stream/cuda_stream.cc
@@ -40,6 +40,7 @@ bool CUDAStream::Init(const Place& place, const enum Priority& priority) {
         platform::errors::Fatal(
             "Normal priority cuda stream creation failed."));
   }
+  callback_manager_.reset(new StreamCallbackManager(stream_));
   VLOG(3) << "CUDAStream Init stream: " << stream_
           << ", priority: " << static_cast<int>(priority);
   return true;
@@ -47,6 +48,8 @@ bool CUDAStream::Init(const Place& place, const enum Priority& priority) {
 
 void CUDAStream::Destroy() {
   CUDADeviceGuard guard(boost::get<CUDAPlace>(place_).device);
+  Wait();
+  WaitCallback();
   if (stream_) {
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaStreamDestroy(stream_),
@@ -55,6 +58,23 @@ void CUDAStream::Destroy() {
   stream_ = nullptr;
 }
 
+void CUDAStream::Wait() const {
+  cudaError_t e_sync = cudaSuccess;
+#if !defined(_WIN32)
+  e_sync = cudaStreamSynchronize(stream_);
+#else
+  while (e_sync = cudaStreamQuery(stream_)) {
+    if (e_sync == cudaErrorNotReady) continue;
+    break;
+  }
+#endif
+
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      e_sync, platform::errors::Fatal(
+                  "cudaStreamSynchronize raises error: %s, errono: %d",
+                  cudaGetErrorString(e_sync), static_cast<int>(e_sync)));
+}
+
 }  // namespace stream
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h
index 8c3317645d380..44b9789d6e121 100644
--- a/paddle/fluid/platform/stream/cuda_stream.h
+++ b/paddle/fluid/platform/stream/cuda_stream.h
@@ -15,9 +15,11 @@ limitations under the License. */
 #pragma once
 
 #include <cstdint>
+#include <memory>
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/stream_callback_manager.h"
 
 namespace paddle {
 namespace platform {
@@ -43,13 +45,30 @@ class CUDAStream final {
   bool Init(const Place& place,
             const enum Priority& priority = Priority::NORMAL);
 
-  const cudaStream_t& stream() const { return stream_; }
+  template <typename Callback>
+  void AddCallback(Callback&& callback) const {
+    callback_manager_->AddCallback(callback);
+  }
+
+  template <typename Callback>
+  void RecordEvent(cudaEvent_t ev, Callback callback) const {
+    callback();
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaEventRecord(ev, stream_),
+        platform::errors::Fatal("CUDA event recording failed."));
+  }
+
+  void Wait() const;
+  void WaitCallback() const { callback_manager_->Wait(); }
+
+  const cudaStream_t& raw_stream() const { return stream_; }
   void Destroy();
 
  private:
   Place place_;
   cudaStream_t stream_{nullptr};
   Priority priority_{Priority::NORMAL};
+  std::unique_ptr<StreamCallbackManager> callback_manager_;
 
   DISABLE_COPY_AND_ASSIGN(CUDAStream);
 };

From 9f64567780bbbedf00aad5d00a13163d5c98ef36 Mon Sep 17 00:00:00 2001
From: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
Date: Tue, 14 Apr 2020 09:37:09 +0000
Subject: [PATCH 3/3] modify the stream priority enum, test=develop

---
 paddle/fluid/platform/device_context.h      |  2 +-
 paddle/fluid/platform/stream/cuda_stream.cc | 10 ++++-----
 paddle/fluid/platform/stream/cuda_stream.h  | 24 +++++++++++++++------
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 4edec11070160..e32c8d4ea6a52 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -85,7 +85,7 @@ class CUDAContext {
   CUDAContext() = default;
   explicit CUDAContext(
       const CUDAPlace& place,
-      const enum stream::Priority& priority = stream::Priority::NORMAL);
+      const enum stream::Priority& priority = stream::Priority::kNormal);
 
   ~CUDAContext();
 
diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc
index 138156e673b5c..739892eafd824 100644
--- a/paddle/fluid/platform/stream/cuda_stream.cc
+++ b/paddle/fluid/platform/stream/cuda_stream.cc
@@ -20,8 +20,6 @@ namespace paddle {
 namespace platform {
 namespace stream {
 
-constexpr int64_t kHighPriority = -1;
-constexpr int64_t kNormalPriority = 0;
 constexpr unsigned int kDefaultFlag = cudaStreamDefault;
 
 bool CUDAStream::Init(const Place& place, const enum Priority& priority) {
@@ -30,13 +28,13 @@ bool CUDAStream::Init(const Place& place, const enum Priority& priority) {
                         "Cuda stream must be created using cuda place."));
   place_ = place;
   CUDADeviceGuard guard(boost::get<CUDAPlace>(place_).device);
-  if (priority == Priority::HIGH) {
+  if (priority == Priority::kHigh) {
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaStreamCreateWithPriority(&stream_, kDefaultFlag, kHighPriority),
+        cudaStreamCreateWithPriority(&stream_, kDefaultFlag, -1),
         platform::errors::Fatal("High priority cuda stream creation failed."));
-  } else if (priority == Priority::NORMAL) {
+  } else if (priority == Priority::kNormal) {
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaStreamCreateWithPriority(&stream_, kDefaultFlag, kNormalPriority),
+        cudaStreamCreateWithPriority(&stream_, kDefaultFlag, 0),
         platform::errors::Fatal(
             "Normal priority cuda stream creation failed."));
   }
diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h
index 44b9789d6e121..f7149f1e13098 100644
--- a/paddle/fluid/platform/stream/cuda_stream.h
+++ b/paddle/fluid/platform/stream/cuda_stream.h
@@ -28,22 +28,22 @@ namespace stream {
 #ifdef PADDLE_WITH_CUDA
 
 enum class Priority : uint8_t {
-  NIL = 0x0,
-  HIGH = 0x1,
-  NORMAL = 0x2,
+  kNull = 0x0,
+  kHigh = 0x1,
+  kNormal = 0x2,
 };
 
 class CUDAStream final {
  public:
   CUDAStream() = default;
   CUDAStream(const Place& place,
-             const enum Priority& priority = Priority::NORMAL) {
+             const enum Priority& priority = Priority::kNormal) {
     Init(place, priority);
   }
   virtual ~CUDAStream() { Destroy(); }
 
   bool Init(const Place& place,
-            const enum Priority& priority = Priority::NORMAL);
+            const enum Priority& priority = Priority::kNormal);
 
   template <typename Callback>
   void AddCallback(Callback&& callback) const {
@@ -58,6 +58,18 @@ class CUDAStream final {
         platform::errors::Fatal("CUDA event recording failed."));
   }
 
+  void RecordEvent(cudaEvent_t ev) const {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaEventRecord(ev, stream_),
+        platform::errors::Fatal("CUDA event recording failed."));
+  }
+
+  void WaitEvent(cudaEvent_t ev) const {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaStreamWaitEvent(stream_, ev, 0),
+        platform::errors::Fatal("Failed to wait event."));
+  }
+
   void Wait() const;
   void WaitCallback() const { callback_manager_->Wait(); }
 
@@ -67,7 +79,7 @@ class CUDAStream final {
  private:
   Place place_;
   cudaStream_t stream_{nullptr};
-  Priority priority_{Priority::NORMAL};
+  Priority priority_{Priority::kNormal};
   std::unique_ptr<StreamCallbackManager> callback_manager_;
 
   DISABLE_COPY_AND_ASSIGN(CUDAStream);