Skip to content

Commit

Permalink
change order for wait_server_ready;test=develop
Browse files Browse the repository at this point in the history
  • Loading branch information
danleifeng committed Nov 11, 2020
1 parent f485474 commit bfdaffb
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 14 deletions.
34 changes: 21 additions & 13 deletions paddle/fluid/imperative/nccl_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,20 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep,
address.sin_port = htons(port);

int try_times = 0;
int retry_time = 0;
while (true) {
if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) {
retry_time = 3 * (try_times + 1);
LOG(WARNING) << "Socket bind worker " << ep
<< (try_times < 5 ? " failed, try again after 3 seconds."
: " failed, try again after 3 seconds. "
"Bind on endpoint %s failed. "
"Please confirm whether the "
"communication port or GPU card is "
"occupied.");
std::this_thread::sleep_for(std::chrono::seconds(3));
<< (try_times < 9
? " failed, try again after " +
std::to_string(retry_time) + " seconds."
: " failed, try again after " +
std::to_string(retry_time) +
" seconds. Bind on endpoint " + ep +
" failed. Please confirm whether the "
"communication port or GPU card is occupied.");
std::this_thread::sleep_for(std::chrono::seconds(retry_time));
++try_times;
continue;
}
Expand Down Expand Up @@ -129,16 +133,20 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep,
}

int try_times = 0;
int retry_time = 0;
while (true) {
if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
retry_time = 3 * (try_times + 1);
LOG(WARNING)
<< "Socket connect worker " << ep
<< (try_times < 5
? " failed, try again after 3 seconds."
: " failed, try again after 3 seconds. Maybe that "
"some process is occupied the GPUs of this node "
"now, and you should kill those process manually.");
std::this_thread::sleep_for(std::chrono::seconds(3));
<< (try_times < 9
? " failed, try again after " + std::to_string(retry_time) +
" seconds."
: " failed, try again after " + std::to_string(retry_time) +
" seconds. Maybe that some process is occupied the "
"GPUs of this node now, and you should kill those "
"process manually.");
std::this_thread::sleep_for(std::chrono::seconds(retry_time));
++try_times;
continue;
}
Expand Down
3 changes: 2 additions & 1 deletion python/paddle/distributed/parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,6 @@ def _check_var_exists(var_name):
http_server.daemon = True
http_server_d["running"] = True
http_server.start()
wait_server_ready([ParallelEnv().trainer_endpoints[0]])

# 4. init NCCL ParallelStrategy
strategy = ParallelStrategy()
Expand Down Expand Up @@ -166,6 +165,8 @@ def _check_var_exists(var_name):
# dividing init_gloo into two parts because nccl and gloo
# are separately looking for free ports, which sometimes
# leads to port conflicts.
wait_server_ready([ParallelEnv().trainer_endpoints[0]])

gloo_strategy = core.GlooParallelStrategy()
gloo_strategy.rank = ParallelEnv().rank
gloo_strategy.rank_num = ParallelEnv().world_size
Expand Down

1 comment on commit bfdaffb

@paddle-bot-old
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Congratulations! Your pull request passed all required CI checks. You can now ask reviewer(s) to approve and merge. 🎉

Please sign in to comment.