Skip to content

Commit

Permalink
DeviceContext Split, test=develop (PaddlePaddle#23737)
Browse files Browse the repository at this point in the history
* supports thread-binding stream, test=develop

* avoid using thread_local variables in dtor, test=develop

* modify the stream priority enum, test=develop
  • Loading branch information
Shixiaowei02 committed May 11, 2020
1 parent 0b54e0b commit 4abe102
Show file tree
Hide file tree
Showing 6 changed files with 363 additions and 94 deletions.
3 changes: 2 additions & 1 deletion paddle/fluid/platform/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)

add_subdirectory(dynload)
add_subdirectory(stream)

cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce)
cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper)
Expand All @@ -54,7 +55,7 @@ IF(WITH_DGC)
ENDIF()

IF(WITH_GPU)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
ENDIF()

IF(WITH_MKLDNN)
Expand Down
106 changes: 37 additions & 69 deletions paddle/fluid/platform/device_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -211,25 +211,40 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
}

thread_local std::unordered_map<const CUDADeviceContext*,
std::shared_ptr<CUDAContext>>
CUDADeviceContext::thread_ctx_;
thread_local std::mutex CUDADeviceContext::ctx_mtx_;

void CUDAContext::InitEigenContext() {
eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&RawStream(), place_);
eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
}

CUDAContext::CUDAContext(const CUDAPlace& place,
const enum stream::Priority& priority) {
place_ = place;
CUDADeviceGuard guard(place_.device);
stream_.reset(new stream::CUDAStream(place, priority));
InitEigenContext();
InitCuBlasContext();
InitCuDNNContext();
}

CUDAContext::~CUDAContext() {
CUDADeviceGuard guard(place_.device);
DestoryCuDNNContext();
DestoryCuBlasContext();
}

CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
CUDADeviceGuard guard(place_.device);
compute_capability_ = GetCUDAComputeCapability(place_.device);
multi_process_ = GetCUDAMultiProcessors(place_.device);
max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device);
max_threads_per_block_ = GetCUDAMaxThreadsPerBlock(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream_, place);
eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
cublas_handle_.reset(new CublasHandleHolder(stream_, CUBLAS_DEFAULT_MATH));

if (TensorCoreAvailable()) {
#if CUDA_VERSION >= 9000
cublas_tensor_core_handle_.reset(
new CublasHandleHolder(stream_, CUBLAS_TENSOR_OP_MATH));
#endif
}

driver_version_ = GetCUDADriverVersion(place_.device);
runtime_version_ = GetCUDARuntimeVersion(place_.device);
Expand Down Expand Up @@ -263,48 +278,12 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
<< "Please recompile or reinstall Paddle with compatible CUDA "
"version.";
}

if (dynload::HasCUDNN()) {
auto local_cudnn_version = cudnn_dso_ver / 100;
auto compile_cudnn_version = CUDNN_VERSION / 100;
if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with CUDNN "
<< compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
<< ", but CUDNN version in your machine is "
<< local_cudnn_version / 10 << "." << local_cudnn_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDNN "
"version.";
}
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnCreate(&cudnn_handle_),
"Failed to create Cudnn handle in DeviceContext");
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnSetStream(cudnn_handle_, stream_),
"Failed to set stream for Cudnn handle in DeviceContext");
} else {
cudnn_handle_ = nullptr;
}
}

callback_manager_.reset(new StreamCallbackManager(stream_));
default_ctx_.reset(new CUDAContext(place_));
}

CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device);
Wait();
WaitStreamCallback();
cublas_handle_.reset();
cublas_tensor_core_handle_.reset();
eigen_stream_.reset();
eigen_device_.reset();
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
if (cudnn_handle_) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_),
"Failed to destory Cudnn handle");
}
#if defined(PADDLE_WITH_NCCL)
if (nccl_comm_) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
Expand All @@ -314,22 +293,7 @@ CUDADeviceContext::~CUDADeviceContext() {

Place CUDADeviceContext::GetPlace() const { return place_; }

void CUDADeviceContext::Wait() const {
cudaError_t e_sync = cudaSuccess;
#if !defined(_WIN32)
e_sync = cudaStreamSynchronize(stream_);
#else
while (e_sync = cudaStreamQuery(stream_)) {
if (e_sync == cudaErrorNotReady) continue;
break;
}
#endif

PADDLE_ENFORCE_CUDA_SUCCESS(
e_sync, platform::errors::Fatal(
"cudaStreamSynchronize raises error: %s, errono: %d",
cudaGetErrorString(e_sync), static_cast<int>(e_sync)));
}
void CUDADeviceContext::Wait() const { context()->Stream()->Wait(); }

int CUDADeviceContext::GetComputeCapability() const {
return compute_capability_;
Expand All @@ -346,24 +310,28 @@ int CUDADeviceContext::GetMaxThreadsPerBlock() const {
}

Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return eigen_device_.get();
return context()->EigenDevice().get();
}

bool CUDADeviceContext::tensor_core_available() const {
return cublas_tensor_core_handle_ != nullptr;
return context()->CublasTensorCoreHandle() != nullptr;
}

dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
return max_grid_dim_size_;
}

cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
return context()->CudnnHandle();
}

CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
}

cudaStream_t CUDADeviceContext::stream() const { return stream_; }
cudaStream_t CUDADeviceContext::stream() const {
return context()->RawStream();
}

CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
Expand Down
Loading

0 comments on commit 4abe102

Please sign in to comment.