Add device_send, device_recv, device_sendrecv, device_multicast_sendrecv #144
@@ -112,9 +112,6 @@ class std_comms : public comms_iface {

  int get_rank() const { return rank_; }

  // FIXME: a temporary hack, should be removed
  ncclComm_t get_nccl_comm() const { return nccl_comm_; }

  std::unique_ptr<comms_iface> comm_split(int color, int key) const {
    mr::device::buffer<int> d_colors(device_allocator_, stream_, get_size());
    mr::device::buffer<int> d_keys(device_allocator_, stream_, get_size());
@@ -418,6 +415,51 @@ class std_comms : public comms_iface {
    }
  }

  // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
  void device_send(const void *buf, size_t size, int dest,
                   cudaStream_t stream) const {
    NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream));
  }

  // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
  void device_recv(void *buf, size_t size, int source,
                   cudaStream_t stream) const {
    NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream));
  }

  void device_sendrecv(const void *sendbuf, size_t sendsize, int dest,
                       void *recvbuf, size_t recvsize, int source,
                       cudaStream_t stream) const {
    // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock
    NCCL_TRY(ncclGroupStart());
    NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream));
    NCCL_TRY(
      ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream));
    NCCL_TRY(ncclGroupEnd());
  }

  void device_multicast_sendrecv(const void *sendbuf,
                                 std::vector<size_t> const &sendsizes,
                                 std::vector<size_t> const &sendoffsets,
                                 std::vector<int> const &dests, void *recvbuf,
                                 std::vector<size_t> const &recvsizes,
                                 std::vector<size_t> const &recvoffsets,
                                 std::vector<int> const &sources,
                                 cudaStream_t stream) const {
    // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock
Review comment: In practice, do these transfers get serialized on the same stream? The API doesn't seem to allow the backend to run on multiple streams.

Reply: All the send/receive operations are placed inside ncclGroupStart() and ncclGroupEnd(), so AFAIK they are executed concurrently after ncclGroupEnd() (at least logically; NCCL may or may not restrict parallelism internally to avoid congestion, depending on the interconnect). If the concern is the time spent queuing the ncclSend/ncclRecv operations (the cost of the for loops, which could become problematic if sendsizes.size() or recvsizes.size() grew very large, e.g. into the millions), I am assuming sendsizes.size() <= the number of GPUs, and the number of GPUs is unlikely to get that large. It would be great if our code scaled to a million GPUs everywhere else and this became the bottleneck, but I don't expect that in the foreseeable future.
    NCCL_TRY(ncclGroupStart());
    for (size_t i = 0; i < sendsizes.size(); ++i) {
      NCCL_TRY(ncclSend(static_cast<const char *>(sendbuf) + sendoffsets[i],
                        sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream));
    }
    for (size_t i = 0; i < recvsizes.size(); ++i) {
      NCCL_TRY(ncclRecv(static_cast<char *>(recvbuf) + recvoffsets[i],
                        recvsizes[i], ncclUint8, sources[i], nccl_comm_,
                        stream));
    }
    NCCL_TRY(ncclGroupEnd());
  }

 private:
  ncclComm_t nccl_comm_;
  cudaStream_t stream_;
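For anyone trying out the new primitives, a minimal caller-side sketch follows (not part of this PR; comm, d_send, d_recv, peers, sizes, and the helper names are illustrative, and it assumes access to a std_comms instance as defined in this diff). It shows the fused pairwise exchange and the byte-size/offset bookkeeping device_multicast_sendrecv expects:

#include <cstddef>
#include <cuda_runtime.h>
#include <vector>
// ...plus whatever header declares std_comms in this repo (path not shown in the diff)

// Symmetric exchange with a single peer: the fused call groups the underlying
// ncclSend/ncclRecv, whereas calling device_send and then device_recv on both
// ranks can deadlock once NCCL's internal buffering is exhausted.
void exchange_with_peer(const std_comms &comm, const float *d_send,
                        float *d_recv, size_t n, int peer,
                        cudaStream_t stream) {
  comm.device_sendrecv(d_send, n * sizeof(float), peer, d_recv,
                       n * sizeof(float), peer, stream);
}

// Exchange unequal chunks with several peers in one shot. Sizes and offsets
// are in bytes, matching the ncclUint8 element type used by the implementation.
void exchange_with_neighbors(const std_comms &comm, const char *d_send,
                             char *d_recv, const std::vector<int> &peers,
                             const std::vector<size_t> &sizes,
                             cudaStream_t stream) {
  std::vector<size_t> offsets(peers.size());
  size_t running = 0;
  for (size_t i = 0; i < peers.size(); ++i) {
    offsets[i] = running;
    running += sizes[i];
  }
  comm.device_multicast_sendrecv(d_send, sizes, offsets, peers, d_recv, sizes,
                                 offsets, peers, stream);
}

Note the multicast helper above sends to and receives from the same set of peers with identical sizes; the API itself allows the send and receive sides to differ.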
Review comment: Just curious, is there an MPI equivalent for that?

Reply: MPI has all-to-all, but AFAIK no multicast that receives data from a subset of the nodes and sends data to another subset. This multicast is really just ncclSend/ncclRecv operations placed inside ncclGroupStart() and ncclGroupEnd().
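To make the comparison concrete, the closest MPI analogue would be assembled by hand from nonblocking point-to-point calls. A hedged sketch using only standard MPI (multicast_sendrecv_mpi, tag, and the buffer parameters are illustrative names, not an existing API; buffers must be host memory unless the MPI is CUDA-aware):

#include <cstddef>
#include <mpi.h>
#include <vector>

// Rough MPI analogue of device_multicast_sendrecv: post all receives and
// sends as nonblocking requests, then wait on the whole set, which plays
// the same role as the ncclGroupStart()/ncclGroupEnd() bracket.
void multicast_sendrecv_mpi(const char *sendbuf,
                            const std::vector<size_t> &sendsizes,
                            const std::vector<size_t> &sendoffsets,
                            const std::vector<int> &dests, char *recvbuf,
                            const std::vector<size_t> &recvsizes,
                            const std::vector<size_t> &recvoffsets,
                            const std::vector<int> &sources, MPI_Comm comm,
                            int tag) {
  std::vector<MPI_Request> reqs;
  reqs.reserve(sendsizes.size() + recvsizes.size());
  for (size_t i = 0; i < recvsizes.size(); ++i) {
    reqs.emplace_back();
    MPI_Irecv(recvbuf + recvoffsets[i], static_cast<int>(recvsizes[i]),
              MPI_BYTE, sources[i], tag, comm, &reqs.back());
  }
  for (size_t i = 0; i < sendsizes.size(); ++i) {
    reqs.emplace_back();
    MPI_Isend(sendbuf + sendoffsets[i], static_cast<int>(sendsizes[i]),
              MPI_BYTE, dests[i], tag, comm, &reqs.back());
  }
  MPI_Waitall(static_cast<int>(reqs.size()), reqs.data(), MPI_STATUSES_IGNORE);
}

For the pairwise device_sendrecv case, MPI_Sendrecv is the direct equivalent, and when every rank exchanges with every other rank, MPI_Alltoallv covers the pattern; the sketch above is for the subset case discussed in this thread.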