AllReduce and CPU Barrier hangs #176

Closed
sh1ng opened this issue Jan 29, 2019 · 2 comments

sh1ng commented Jan 29, 2019

NCCL: v2.2.13
Platform: x86_64
Docker: cuda:8.0-cudnn5-devel-centos7

We occasionally see tests hang. We use the Python wrapper of xgboost and run tests in multiple processes.

(gdb) bt
#0  0x00007f1dfe612d47 in sched_yield () from target:/usr/lib64/libc.so.6
#1  0x00007f1d0b0fede5 in ncclCpuBarrierWait (comm=comm@entry=0x563036e90de0)
    at misc/enqueue.cu:146
#2  0x00007f1d0b0ffbb2 in ncclEnqueueCheck (
    func=func@entry=0x7f1d0b117510 <ncclAllReduceFunc(void const*, void*, unsigned long, ncclDataType_t, ncclRedOp_t, int, ncclComm*, CUstream_st*)>, 
    primName=primName@entry=0x7f1d0b14c8a1 "AllReduce", sendbuff=0x7f1cc1a17800, 
    recvbuff=0x7f1cc1a17800, count=6890, type=ncclFloat64, op=ncclSum, root=root@entry=0, 
    comm=0x563036e90de0, stream=stream@entry=0x563037008dd0) at misc/enqueue.cu:205
#3  0x00007f1d0b117c3e in ncclAllReduce (sendbuff=<optimized out>, recvbuff=<optimized out>, 
    count=<optimized out>, datatype=<optimized out>, op=<optimized out>, comm=<optimized out>, 
    stream=0x563037008dd0) at collectives/all_reduce.cu:29
#4  0x00007f1d0b03f5d6 in dh::AllReducer::AllReduceSum (count=6890, recvbuff=0x7f1cc1a17800, 
    sendbuff=0x7f1cc1a17800, communication_group_idx=0, this=<optimized out>)
    at /root/repo/xgboost/src/tree/../common/device_helpers.cuh:941
#5  xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::AllReduceHist(int)::{lambda(int, std::unique_ptr<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> >, std::default_delete<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> > > >&)#1}::operator()(int, std::unique_ptr<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> >, std::default_delete<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> > > >&) const (idx=0, shard=..., __closure=<optimized out>)
    at /root/repo/xgboost/src/tree/updater_gpu_hist.cu:1043
#6  dh::ExecuteIndexShards<std::unique_ptr<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> >, std::default_delete<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> > > >, xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::AllReduceHist(int)::{lambda(int, std::unique_ptr<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> >, std::default_delete<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> > > >&)#1}>(std::vector<std::unique_ptr<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> >, std::default_delete<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> > > >, std::allocator<std::vector> >*, xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::AllReduceHist(int)::{lambda(int, std::unique_ptr<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> >, std::default_delete<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> > > >&)#1})::{lambda()#1}::operator() () at /root/repo/xgboost/src/tree/../common/device_helpers.cuh:1055
#7  0x00007f1d0b04d674 in dh::ExecuteIndexShards<std::unique_ptr<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> >, std::default_delete<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> > > >, xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::AllReduceHist(int)::{lambda(int, std::unique_ptr<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> >, std::default_delete<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> > > >&)#1}>(std::vector<std::unique_ptr<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> >, std::default_delete<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> > > >, std::allocator<std::vector> >*, xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::AllReduceHist(int)::{lambda(int, std::unique_ptr<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> >, std::default_delete<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> > > >&)#1})::{lambda()#1}::operator()() const (__closure=0x7ffc7bc5eee0)
    at /root/repo/xgboost/src/tree/../common/device_helpers.cuh:1053
#8  dh::SaveCudaContext::SaveCudaContext<dh::ExecuteIndexShards(std::vector<T>*, FunctionT) [with T = std::unique_ptr<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> >, std::default_delete<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> > > >; FunctionT = xgboost::tree::GPUHistMakerSpecialised<GradientSumT>::AllReduceHist(int) [with GradientSumT = xgboost::detail::GradientPairInternal<double>]::__lambda9]::__lambda2> (func=..., 
    this=0x7ffc7bc5eed0) at /root/repo/xgboost/src/tree/../common/device_helpers.cuh:1030
#9  dh::ExecuteIndexShards<std::unique_ptr<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> >, std::default_delete<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> > > >, xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::AllReduceHist(int)::{lambda(int, std::unique_ptr<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> >, std::default_delete<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> > > >&)#1}>(std::vector<std::unique_ptr<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> >, std::default_delete<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> > > >, std::allocator<std::vector> >*, xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::AllReduceHist(int)::{lambda(int, std::unique_ptr<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> >, std::default_delete<xgboost::tree::DeviceShard<xgboost::detail::GradientPairInternal<double> > > >&)#1}) (shards=shards@entry=0x563036eaa880, f=...)
    at /root/repo/xgboost/src/tree/../common/device_helpers.cuh:1052
#10 0x00007f1d0b04d763 in xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::AllReduceHist (this=this@entry=0x563036eaa6d0, nidx=nidx@entry=0)
    at /root/repo/xgboost/src/tree/updater_gpu_hist.cu:1039
#11 0x00007f1d0b05e9b6 in xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::InitRoot (this=this@entry=0x563036eaa6d0, p_tree=p_tree@entry=0x563036eaae30)
    at /root/repo/xgboost/src/tree/updater_gpu_hist.cu:1138
#12 0x00007f1d0b06477f in xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::UpdateTree (this=this@entry=0x563036eaa6d0, gpair=gpair@entry=0x563034b5ea78, 
    p_fmat=p_fmat@entry=0x563031d2bdd0, p_tree=0x563036eaae30)
    at /root/repo/xgboost/src/tree/updater_gpu_hist.cu:1230
#13 0x00007f1d0b065ce2 in xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::Update (this=0x563036eaa6d0, gpair=0x563034b5ea78, dmat=0x563031d2bdd0, trees=...)
    at /root/repo/xgboost/src/tree/updater_gpu_hist.cu:955
#14 0x00007f1d0ae61517 in xgboost::gbm::GBTree::BoostNewTrees (this=this@entry=0x563034bbcff0, 
    gpair=gpair@entry=0x563034b5ea78, p_fmat=p_fmat@entry=0x563031d2bdd0, 
    bst_group=bst_group@entry=0, ret=ret@entry=0x7ffc7bc5f900)
    at /root/repo/xgboost/src/gbm/gbtree.cc:293
#15 0x00007f1d0ae627d5 in xgboost::gbm::GBTree::DoBoost (this=0x563034bbcff0, 
    p_fmat=0x563031d2bdd0, in_gpair=0x563034b5ea78, obj=<optimized out>)
    at /root/repo/xgboost/src/gbm/gbtree.cc:180
#16 0x00007f1d0ae71030 in xgboost::LearnerImpl::UpdateOneIter (this=0x563034b5e920, iter=0, 
    train=0x563031d2bdd0) at /root/repo/xgboost/src/learner.cc:503
#17 0x00007f1d0adf4a85 in XGBoosterUpdateOneIter (handle=0x563030f8b720, iter=0, 
    dtrain=0x563031b6d1f0) at /root/repo/xgboost/src/c_api/c_api.cc:905
#18 0x00007f1df73a7ec0 in ffi_call_unix64 ()
   from target:/opt/h2oai/h2o4gpu/python/lib/python3.6/lib-dynload/../../libffi.so.6
#19 0x00007f1df73a787d in ffi_call ()
   from target:/opt/h2oai/h2o4gpu/python/lib/python3.6/lib-dynload/../../libffi.so.6
#20 0x00007f1df75bcdee in _ctypes_callproc ()
   from target:/opt/h2oai/h2o4gpu/python/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so
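
Reading the trace: frame #1 shows the thread spinning inside NCCL's CPU-side launch barrier (ncclCpuBarrierWait), so the AllReduce is stuck being enqueued rather than executing. Frames #4-#9 suggest one process issuing an in-place AllReduce per device shard. A minimal sketch of that launch pattern (illustrative names only, not xgboost's actual code):

    for (int i = 0; i < ndev; ++i) {   // one AllReduce per device shard
      cudaSetDevice(i);
      // In-place sum, matching the trace: sendbuff == recvbuff, ncclFloat64
      ncclAllReduce(hist[i], hist[i], count, ncclFloat64, ncclSum,
                    comms[i], streams[i]);
    }

With CUDA 8, a CUDA call made concurrently from another thread while such a sequence is in flight can leave the launch wedged in ncclCpuBarrierWait, which is the known issue referenced below.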

sjeaugey commented Feb 7, 2019

Aside from the Group call fixes, please refer to the paragraph named "Concurrency between NCCL and CUDA calls (NCCL up to 2.0.5 or CUDA 8)" in the documentation (it applies to all NCCL versions when using CUDA 8.0):
https://docs.nvidia.com/deeplearning/sdk/nccl-archived/nccl_2213/nccl-developer-guide/index.html#knownissues
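
A minimal sketch of the kind of workaround that paragraph describes, assuming a single process driving several GPUs; the mutex and helper names here are illustrative, not any real API. The idea is to group the per-device launches and ensure no other thread issues CUDA calls while they are being enqueued:

    #include <pthread.h>
    #include <cuda_runtime.h>
    #include <nccl.h>   // ncclGroupStart/End, ncclAllReduce

    static pthread_mutex_t launch_mu = PTHREAD_MUTEX_INITIALIZER;

    // Every NCCL launch in the process goes through this helper; any other
    // thread making CUDA calls (cudaMemcpy, kernel launches, ...) would need
    // to hold the same mutex while NCCL operations are being enqueued.
    void AllReduceAllGpus(int ndev, ncclComm_t* comms, double** buf,
                          size_t count, cudaStream_t* streams) {
      pthread_mutex_lock(&launch_mu);
      ncclGroupStart();                  // enqueue all devices as one group
      for (int i = 0; i < ndev; ++i)
        ncclAllReduce(buf[i], buf[i], count, ncclFloat64, ncclSum,
                      comms[i], streams[i]);
      ncclGroupEnd();                    // launch completes for all devices
      pthread_mutex_unlock(&launch_mu);
    }

On NCCL >= 2.1 with CUDA >= 9 the mutual exclusion should no longer be needed; the grouped form alone is the documented way to launch one collective across multiple devices from a single thread.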

sjeaugey commented May 6, 2019

Closing, as this is the expected behavior with CUDA 8; it should work fine on recent versions of NCCL+CUDA.

sjeaugey closed this as completed May 6, 2019