Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed pserver crashed #10626

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions paddle/fluid/framework/executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,8 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
}

void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars) {
bool create_local_scope, bool create_vars,
bool drop_scope_kids) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not a good way to add a bool switch here. I think a better way is to let listen_and_serv_op use a child scope.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll try remove the create_vars bool in executor to see if we can solve this

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree with using a sub-scope in listen_and_serv_op.

Scope* local_scope = scope;
if (create_vars) {
if (create_local_scope) {
Expand Down Expand Up @@ -351,10 +352,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
platform::DeviceContextPool::Instance().Get(place_)->Wait();
if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope);
} else {
// Delete the local scopes created in operators.
} else if (drop_scope_kids) {
scope->DropKids();
} else {
// need to call scop->DropKids() after this function call
}

if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: "
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/framework/executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class Executor {

void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope = true,
bool create_vars = true);
bool create_vars = true, bool drop_scop_kids = true);

void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets,
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ add_subdirectory(detail)
if(WITH_DISTRIBUTE)
if(WITH_GPU)
op_library(gen_nccl_id_op DEPS nccl_common)
cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op listen_and_serv_op executor)
else()
set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
endif()
Expand All @@ -207,7 +208,6 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor)
cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op listen_and_serv_op executor)
else()
set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op gen_nccl_id_op)
endif()
Expand Down
3 changes: 2 additions & 1 deletion paddle/fluid/operators/listen_and_serv_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,14 @@ static void ParallelExecuteBlocks(
int run_block = idx; // thread local
try {
executor->RunPreparedContext(prepared[run_block].get(), scope,
false, false);
false, false, false);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
}));
}
for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
scope->DropKids();
}

std::atomic_int ListenAndServOp::selected_port_{0};
Expand Down