Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DeviceContext Split, test=develop #23737

Merged
merged 3 commits into from
Apr 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion paddle/fluid/platform/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)

add_subdirectory(dynload)
add_subdirectory(stream)

cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce)
cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper)
Expand All @@ -54,7 +55,7 @@ IF(WITH_DGC)
ENDIF()

IF(WITH_GPU)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
ENDIF()

IF(WITH_MKLDNN)
Expand Down
106 changes: 37 additions & 69 deletions paddle/fluid/platform/device_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -211,25 +211,40 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
}

thread_local std::unordered_map<const CUDADeviceContext*,
std::shared_ptr<CUDAContext>>
CUDADeviceContext::thread_ctx_;
thread_local std::mutex CUDADeviceContext::ctx_mtx_;

void CUDAContext::InitEigenContext() {
eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&RawStream(), place_);
eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
}

CUDAContext::CUDAContext(const CUDAPlace& place,
const enum stream::Priority& priority) {
place_ = place;
CUDADeviceGuard guard(place_.device);
stream_.reset(new stream::CUDAStream(place, priority));
InitEigenContext();
InitCuBlasContext();
InitCuDNNContext();
}

CUDAContext::~CUDAContext() {
CUDADeviceGuard guard(place_.device);
DestoryCuDNNContext();
DestoryCuBlasContext();
}

CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
CUDADeviceGuard guard(place_.device);
compute_capability_ = GetCUDAComputeCapability(place_.device);
multi_process_ = GetCUDAMultiProcessors(place_.device);
max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device);
max_threads_per_block_ = GetCUDAMaxThreadsPerBlock(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream_, place);
eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
cublas_handle_.reset(new CublasHandleHolder(stream_, CUBLAS_DEFAULT_MATH));

if (TensorCoreAvailable()) {
#if CUDA_VERSION >= 9000
cublas_tensor_core_handle_.reset(
new CublasHandleHolder(stream_, CUBLAS_TENSOR_OP_MATH));
#endif
}

driver_version_ = GetCUDADriverVersion(place_.device);
runtime_version_ = GetCUDARuntimeVersion(place_.device);
Expand Down Expand Up @@ -263,48 +278,12 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
<< "Please recompile or reinstall Paddle with compatible CUDA "
"version.";
}

if (dynload::HasCUDNN()) {
auto local_cudnn_version = cudnn_dso_ver / 100;
auto compile_cudnn_version = CUDNN_VERSION / 100;
if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with CUDNN "
<< compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
<< ", but CUDNN version in your machine is "
<< local_cudnn_version / 10 << "." << local_cudnn_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDNN "
"version.";
}
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnCreate(&cudnn_handle_),
"Failed to create Cudnn handle in DeviceContext");
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnSetStream(cudnn_handle_, stream_),
"Failed to set stream for Cudnn handle in DeviceContext");
} else {
cudnn_handle_ = nullptr;
}
}

callback_manager_.reset(new StreamCallbackManager(stream_));
default_ctx_.reset(new CUDAContext(place_));
}

CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device);
Wait();
WaitStreamCallback();
cublas_handle_.reset();
cublas_tensor_core_handle_.reset();
eigen_stream_.reset();
eigen_device_.reset();
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
if (cudnn_handle_) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_),
"Failed to destory Cudnn handle");
}
#if defined(PADDLE_WITH_NCCL)
if (nccl_comm_) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
Expand All @@ -314,22 +293,7 @@ CUDADeviceContext::~CUDADeviceContext() {

Place CUDADeviceContext::GetPlace() const { return place_; }

void CUDADeviceContext::Wait() const {
cudaError_t e_sync = cudaSuccess;
#if !defined(_WIN32)
e_sync = cudaStreamSynchronize(stream_);
#else
while (e_sync = cudaStreamQuery(stream_)) {
if (e_sync == cudaErrorNotReady) continue;
break;
}
#endif

PADDLE_ENFORCE_CUDA_SUCCESS(
e_sync, platform::errors::Fatal(
"cudaStreamSynchronize raises error: %s, errono: %d",
cudaGetErrorString(e_sync), static_cast<int>(e_sync)));
}
void CUDADeviceContext::Wait() const { context()->Stream()->Wait(); }

int CUDADeviceContext::GetComputeCapability() const {
return compute_capability_;
Expand All @@ -346,24 +310,28 @@ int CUDADeviceContext::GetMaxThreadsPerBlock() const {
}

Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return eigen_device_.get();
return context()->EigenDevice().get();
}

bool CUDADeviceContext::tensor_core_available() const {
return cublas_tensor_core_handle_ != nullptr;
return context()->CublasTensorCoreHandle() != nullptr;
}

dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
return max_grid_dim_size_;
}

cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
return context()->CudnnHandle();
}

CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
}

cudaStream_t CUDADeviceContext::stream() const { return stream_; }
cudaStream_t CUDADeviceContext::stream() const {
return context()->RawStream();
}

CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
Expand Down
Loading