From 906ed996fbf690844d9a9382c3019a31fb4482da Mon Sep 17 00:00:00 2001 From: Jerry Keselman Date: Tue, 19 Dec 2017 10:10:07 -0500 Subject: [PATCH 1/3] Add Heartbeat Thread to SmartProxy Worker In order to fix an issue where long-running Smartstate jobs get killed under the mistaken assumption that they are being unresponsive when they are actually quite busy, a separate thread is being added to the SmartProxy Worker which just heartbeats every 30 seconds. Fixes https://bugzilla.redhat.com/show_bug.cgi?id=1519538 --- app/models/miq_smart_proxy_worker/runner.rb | 66 +++++++++++++++++++++ config/settings.yml | 3 +- 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/app/models/miq_smart_proxy_worker/runner.rb b/app/models/miq_smart_proxy_worker/runner.rb index 893dd38ea25..c0e6a7019d5 100644 --- a/app/models/miq_smart_proxy_worker/runner.rb +++ b/app/models/miq_smart_proxy_worker/runner.rb @@ -1,3 +1,69 @@ class MiqSmartProxyWorker::Runner < MiqQueueWorkerBase::Runner self.delay_startup_for_vim_broker = true # NOTE: For smartproxy role + + def do_before_work_loop + @tid = start_heartbeat_thread + end + + def before_exit(message, _exit_code) + @exit_requested = true + # + # Stop the Heartbeat Thread + # + safe_log("#{message} Stopping Heartbeat Thread.") + + # + # Wait for the Heartbeat Thread to stop + # + unless @tid.nil? + safe_log("#{message} Waiting for Heartbeat Thread to Stop.") + @tid.join(worker_settings[:heartbeat_thread_shutdown_timeout]) rescue nil + end + end + + def start_heartbeat_thread + @exit_requested = false + @heartbeat_started = Concurrent::Event.new + _log.info("#{log_prefix} Starting Heartbeat Thread") + + tid = Thread.new do + begin + heartbeat_thread + rescue => err + _log.error("#{log_prefix} Heartbeat Thread aborted because [#{err.message}]") + _log.log_backtrace(err) + Thread.exit + ensure + @heartbeat_started.set + end + end + + @heartbeat_started.wait + _log.info("#{log_prefix} Started Heartbeat Thread") + + tid + end + + def heartbeat_thread + @heartbeat_started.set + until @exit_requested do + heartbeat + sleep 30 + end + end + + def do_work + if @tid.nil? || !@tid.alive? + if !@tid.nil? && @tid.status.nil? + dead_tid, @tid = @tid, nil + _log.info("#{log_prefix} Waiting for the Heartbeat Thread to exit...") + dead_tid.join # raise the exception the dead thread failed with + end + + _log.info("#{log_prefix} Heartbeat Thread gone. Restarting...") + @tid = start_heartbeat_thread + end + + super + end end diff --git a/config/settings.yml b/config/settings.yml index 89ff1665d8b..a16d35b9337 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -972,7 +972,7 @@ :prefetch_stale_threshold: 30.seconds :rails_server: puma :remote_console_type: VMRC - :role: database_operations,event,reporting,scheduler,smartstate,ems_operations,ems_inventory,user_interface,websocket,web_services,automate + :role: database_operations,event,reporting,scheduler,smartstate,ems_operations,ems_inventory,user_interface,websocket,web_services,automate,smartproxy :server_dequeue_frequency: 5.seconds :server_log_frequency: 5.minutes :server_log_timings_threshold: 1.second @@ -1165,6 +1165,7 @@ :memory_threshold: 2.gigabytes :queue_timeout: 20.minutes :restart_interval: 6.hours + :heartbeat_thread_shutdown_timeout: 10.seconds :schedule_worker: :container_entities_purge_interval: 1.day :binary_blob_purge_interval: 1.hour From 46f9a9a743e1ef17327b80abe62d9d5881ffda4b Mon Sep 17 00:00:00 2001 From: Jerry Keselman Date: Tue, 19 Dec 2017 11:49:34 -0500 Subject: [PATCH 2/3] RuboCop Warning Cleanup Clean up two RuboCop warnings - One for a do in an until clause One for a 'rescue nil' modifier - changed to a begin-rescue-end block for the NoMethodError that would result if the thread id was nil. --- app/models/miq_smart_proxy_worker/runner.rb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/app/models/miq_smart_proxy_worker/runner.rb b/app/models/miq_smart_proxy_worker/runner.rb index c0e6a7019d5..ad1ebc1e887 100644 --- a/app/models/miq_smart_proxy_worker/runner.rb +++ b/app/models/miq_smart_proxy_worker/runner.rb @@ -17,7 +17,11 @@ def before_exit(message, _exit_code) # unless @tid.nil? safe_log("#{message} Waiting for Heartbeat Thread to Stop.") - @tid.join(worker_settings[:heartbeat_thread_shutdown_timeout]) rescue nil + begin + @tid.join(worker_settings[:heartbeat_thread_shutdown_timeout]) + rescue NoMethodError => join_err + safe_log(join_err) + end end end @@ -46,7 +50,7 @@ def start_heartbeat_thread def heartbeat_thread @heartbeat_started.set - until @exit_requested do + until @exit_requested heartbeat sleep 30 end From 5d0ce26ea2d11e7d68dace573f78095264c26e58 Mon Sep 17 00:00:00 2001 From: Jerry Keselman Date: Thu, 21 Dec 2017 17:07:44 -0500 Subject: [PATCH 3/3] Review Comments The settings file had an extra modified role that was not intended as part of this PR. Style comments in runner.rb --- app/models/miq_smart_proxy_worker/runner.rb | 4 ++-- config/settings.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/app/models/miq_smart_proxy_worker/runner.rb b/app/models/miq_smart_proxy_worker/runner.rb index ad1ebc1e887..3a99e8a2bfc 100644 --- a/app/models/miq_smart_proxy_worker/runner.rb +++ b/app/models/miq_smart_proxy_worker/runner.rb @@ -15,7 +15,7 @@ def before_exit(message, _exit_code) # # Wait for the Heartbeat Thread to stop # - unless @tid.nil? + if @tid safe_log("#{message} Waiting for Heartbeat Thread to Stop.") begin @tid.join(worker_settings[:heartbeat_thread_shutdown_timeout]) @@ -58,7 +58,7 @@ def heartbeat_thread def do_work if @tid.nil? || !@tid.alive? - if !@tid.nil? && @tid.status.nil? + unless @tid.try(:status) dead_tid, @tid = @tid, nil _log.info("#{log_prefix} Waiting for the Heartbeat Thread to exit...") dead_tid.join # raise the exception the dead thread failed with diff --git a/config/settings.yml b/config/settings.yml index a16d35b9337..e8cbc756d4e 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -972,7 +972,7 @@ :prefetch_stale_threshold: 30.seconds :rails_server: puma :remote_console_type: VMRC - :role: database_operations,event,reporting,scheduler,smartstate,ems_operations,ems_inventory,user_interface,websocket,web_services,automate,smartproxy + :role: database_operations,event,reporting,scheduler,smartstate,ems_operations,ems_inventory,user_interface,websocket,web_services,automate :server_dequeue_frequency: 5.seconds :server_log_frequency: 5.minutes :server_log_timings_threshold: 1.second