-
Notifications
You must be signed in to change notification settings - Fork 6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix direct actor transport not treating some tasks as failed #5464
Changes from all commits
3266973
a62e134
5f9ea9f
53a96eb
b652866
a74c3b1
e0f95dd
68423a7
b0f5a22
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -63,7 +63,8 @@ Status CoreWorkerDirectActorTaskSubmitter::SubmitTask( | |
|
||
// Submit request. | ||
auto &client = rpc_clients_[actor_id]; | ||
return PushTask(*client, *request, task_id, num_returns); | ||
PushTask(*client, *request, task_id, num_returns); | ||
return Status::OK(); | ||
} else { | ||
// Actor is dead, treat the task as failure. | ||
RAY_CHECK(iter->second.state_ == ActorTableData::DEAD); | ||
|
@@ -92,6 +93,17 @@ Status CoreWorkerDirectActorTaskSubmitter::SubscribeActorUpdates() { | |
} else { | ||
// Remove rpc client if it's dead or being reconstructed. | ||
rpc_clients_.erase(actor_id); | ||
// If this actor is permanently dead and there are pending requests, treat | ||
// the pending tasks as failed. | ||
if (actor_data.state() == ActorTableData::DEAD && | ||
pending_requests_.count(actor_id) > 0) { | ||
for (const auto &request : pending_requests_[actor_id]) { | ||
TreatTaskAsFailed(TaskID::FromBinary(request->task_spec().task_id()), | ||
request->task_spec().num_returns(), | ||
rpc::ErrorType::ACTOR_DIED); | ||
} | ||
pending_requests_.erase(actor_id); | ||
} | ||
} | ||
|
||
RAY_LOG(INFO) << "received notification on actor, state=" | ||
|
@@ -114,17 +126,16 @@ void CoreWorkerDirectActorTaskSubmitter::ConnectAndSendPendingTasks( | |
auto &requests = pending_requests_[actor_id]; | ||
while (!requests.empty()) { | ||
const auto &request = *requests.front(); | ||
auto status = | ||
PushTask(*client, request, TaskID::FromBinary(request.task_spec().task_id()), | ||
request.task_spec().num_returns()); | ||
PushTask(*client, request, TaskID::FromBinary(request.task_spec().task_id()), | ||
request.task_spec().num_returns()); | ||
requests.pop_front(); | ||
} | ||
} | ||
|
||
Status CoreWorkerDirectActorTaskSubmitter::PushTask(rpc::DirectActorClient &client, | ||
const rpc::PushTaskRequest &request, | ||
const TaskID &task_id, | ||
int num_returns) { | ||
void CoreWorkerDirectActorTaskSubmitter::PushTask(rpc::DirectActorClient &client, | ||
const rpc::PushTaskRequest &request, | ||
const TaskID &task_id, | ||
int num_returns) { | ||
auto status = client.PushTask( | ||
request, | ||
[this, task_id, num_returns](Status status, const rpc::PushTaskReply &reply) { | ||
|
@@ -153,7 +164,9 @@ Status CoreWorkerDirectActorTaskSubmitter::PushTask(rpc::DirectActorClient &clie | |
store_provider_->Put(RayObject(data_buffer, metadata_buffer), object_id)); | ||
} | ||
}); | ||
return status; | ||
if (!status.ok()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In this scenario where the status is not ok, does that always mean that the actor has died? Could it mean that the actor is overloaded and some buffer for sending messages is full, or something like that? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If the buffer is full, the request should be blocked. But I guess it's possible that the network is temporarily disconnected. However, no matter which case it is, we should treat the task as failed and let the app decide what to do (retry, ignore, or error). I'll add a TODO here about making the error message more accurate, instead of just `ACTOR_DIED`. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, sounds good. |
||
TreatTaskAsFailed(task_id, num_returns, rpc::ErrorType::ACTOR_DIED); | ||
} | ||
} | ||
|
||
void CoreWorkerDirectActorTaskSubmitter::TreatTaskAsFailed( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,10 +49,10 @@ void CoreWorkerRayletTaskReceiver::HandleAssignTask( | |
/*transport_type=*/static_cast<int>(TaskTransportType::RAYLET)); | ||
Status status = object_interface_.Put(*results[i], id); | ||
if (!status.ok()) { | ||
// TODO (kfstorm): RAY_LOG(FATAL) except the error is about the object to put | ||
// already exists. | ||
RAY_LOG(WARNING) << "Task " << task_spec.TaskId() << " failed to put object " << id | ||
<< " in store: " << status.message(); | ||
// NOTE(hchen): `PlasmaObjectExists` error is already ignored inside | ||
// `ObjectInterface::Put`, we treat other error types as fatal here. | ||
RAY_LOG(FATAL) << "Task " << task_spec.TaskId() << " failed to put object " << id | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be good to add a comment here that we log FATAL for put errors, except when the object already exists. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good point. |
||
<< " in store: " << status.message(); | ||
} else { | ||
RAY_LOG(DEBUG) << "Task " << task_spec.TaskId() << " put object " << id | ||
<< " in store."; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what's the reason to return OK() in this case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because this can happen in normal cases. For example: 1) while reconstructing a task, an existing object could be put again; 2) when treating a task as failed, the task could already have succeeded without our knowing, so we put a duplicate object.
We already rely on this behavior in the Python/Java workers and the raylet.