Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix distributed_work segfault when local work generation fails #2413

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 28 additions & 13 deletions nano/node/distributed_work.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,23 +25,25 @@ need_resolve (peers_a),
difficulty (difficulty_a),
elapsed (nano::timer_state::started, "distributed work generation timer")
{
assert (!completed);
assert (!finished);
assert (status == work_generation_status::ongoing);
}

nano::distributed_work::~distributed_work ()
{
assert (status != work_generation_status::ongoing);
if (!node.stopped && node.websocket_server && node.websocket_server->any_subscriber (nano::websocket::topic::work))
{
nano::websocket::message_builder builder;
if (completed)
if (status == work_generation_status::success)
{
node.websocket_server->broadcast (builder.work_generation (root, work_result, difficulty, node.network_params.network.publish_threshold, elapsed.value (), winner, bad_peers));
}
else if (cancelled)
else if (status == work_generation_status::cancelled)
{
node.websocket_server->broadcast (builder.work_cancelled (root, difficulty, node.network_params.network.publish_threshold, elapsed.value (), bad_peers));
}
else
else if (status == work_generation_status::failure_local || status == work_generation_status::failure_peers)
{
node.websocket_server->broadcast (builder.work_failed (root, difficulty, node.network_params.network.publish_threshold, elapsed.value (), bad_peers));
}
Expand Down Expand Up @@ -102,9 +104,13 @@ void nano::distributed_work::start_work ()
{
this_l->set_once (*work_a);
}
else if (!this_l->cancelled && !this_l->completed)
else if (!this_l->finished.exchange (true))
{
this_l->callback (boost::none);
this_l->status = work_generation_status::failure_local;
if (this_l->callback)
{
this_l->callback (boost::none);
}
}
this_l->stop_once (false);
},
Expand Down Expand Up @@ -187,7 +193,7 @@ void nano::distributed_work::start_work ()
}
}

if (!local_generation_started && outstanding.empty ())
if (!local_generation_started && outstanding.empty () && callback)
{
callback (boost::none);
}
Expand Down Expand Up @@ -297,10 +303,14 @@ void nano::distributed_work::stop_once (bool const local_stop_a)

void nano::distributed_work::set_once (uint64_t work_a, std::string const & source_a)
{
if (!cancelled && !completed.exchange (true))
if (!finished.exchange (true))
{
elapsed.stop ();
callback (work_a);
status = work_generation_status::success;
if (callback)
{
callback (work_a);
}
winner = source_a;
work_result = work_a;
if (node.config.logging.work_generation_time ())
Expand All @@ -314,10 +324,14 @@ void nano::distributed_work::set_once (uint64_t work_a, std::string const & sour

void nano::distributed_work::cancel_once ()
{
if (!completed && !cancelled.exchange (true))
if (!finished.exchange (true))
{
elapsed.stop ();
callback (boost::none);
status = work_generation_status::cancelled;
if (callback)
{
callback (boost::none);
}
stop_once (true);
if (node.config.logging.work_generation_time ())
{
Expand All @@ -334,11 +348,12 @@ void nano::distributed_work::failure (boost::asio::ip::address const & address_a

void nano::distributed_work::handle_failure (bool const last_a)
{
if (last_a && !completed && !cancelled)
if (last_a && !finished)
{
node.unresponsive_work_peers = true;
if (!local_generation_started)
if (!local_generation_started && !finished.exchange (true))
{
status = work_generation_status::failure_peers;
if (backoff == 1 && node.config.logging.work_generation_time ())
{
node.logger.always_log ("Work peer(s) failed to generate work for root ", root.to_string (), ", retrying...");
Expand Down
13 changes: 11 additions & 2 deletions nano/node/distributed_work.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,15 @@ class work_peer_request final
*/
class distributed_work final : public std::enable_shared_from_this<nano::distributed_work>
{
enum class work_generation_status
{
ongoing,
success,
cancelled,
failure_local,
failure_peers
};

public:
distributed_work (nano::node &, nano::root const &, std::vector<std::pair<std::string, uint16_t>> const & peers_a, unsigned int, std::function<void(boost::optional<uint64_t>)> const &, uint64_t, boost::optional<nano::account> const & = boost::none);
~distributed_work ();
Expand Down Expand Up @@ -64,10 +73,10 @@ class distributed_work final : public std::enable_shared_from_this<nano::distrib
std::vector<std::pair<std::string, uint16_t>> need_resolve;
uint64_t difficulty;
uint64_t work_result{ 0 };
std::atomic<bool> completed{ false };
std::atomic<bool> cancelled{ false };
std::atomic<bool> finished{ false };
std::atomic<bool> stopped{ false };
std::atomic<bool> local_generation_started{ false };
work_generation_status status{ work_generation_status::ongoing };
nano::timer<std::chrono::milliseconds> elapsed; // logging only
std::vector<std::string> bad_peers; // websocket
std::string winner; // websocket
Expand Down