From e5f4bd3fe1299070e40d235be963428b4f9a2d14 Mon Sep 17 00:00:00 2001 From: Joe Rafaniello Date: Tue, 3 Jan 2017 11:18:07 -0500 Subject: [PATCH] Kill workers that don't stop after a configurable time Previously, we'd gracefully ask them to exit and if the queue work they're doing, takes 1 hour to do, they'd exceed memory thresholds, keep running until the work is done and finally respond to the exit request. Now, we mark them as 'stopping' when they exceed a threshold and they have up to 10 minutes to finish before we'd kill them. This value is configurable in the 'stopping_timeout' field in each worker's advanced settings. https://bugzilla.redhat.com/show_bug.cgi?id=1395736 --- .../miq_server/worker_management/monitor.rb | 8 ++++- app/models/miq_worker.rb | 8 +++++ config/settings.yml | 1 + .../worker_management/monitor_spec.rb | 34 +++++++++++++++++++ spec/models/miq_worker_spec.rb | 21 ++++++++++++ 5 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 spec/models/miq_server/worker_management/monitor_spec.rb diff --git a/app/models/miq_server/worker_management/monitor.rb b/app/models/miq_server/worker_management/monitor.rb index a026eede858..8cafc866ae9 100644 --- a/app/models/miq_server/worker_management/monitor.rb +++ b/app/models/miq_server/worker_management/monitor.rb @@ -89,7 +89,7 @@ def check_not_responding(class_name = nil) processed_workers = [] miq_workers.each do |w| next unless class_name.nil? || (w.type == class_name) - next unless [:not_responding, :memory_exceeded].include?(worker_get_monitor_reason(w.pid)) + next unless monitor_reason_not_responding?(w) next unless [:waiting_for_stop_before_restart, :waiting_for_stop].include?(worker_get_monitor_status(w.pid)) processed_workers << w worker_not_responding(w) @@ -99,6 +99,12 @@ def check_not_responding(class_name = nil) processed_workers.collect(&:id) end + NOT_RESPONDING = :not_responding + MEMORY_EXCEEDED = :memory_exceeded + def monitor_reason_not_responding?(w) + [NOT_RESPONDING, MEMORY_EXCEEDED].include?(worker_get_monitor_reason(w.pid)) || w.stopping_for_too_long? + end + def do_system_limit_exceeded self.class.monitor_class_names_in_kill_order.each do |class_name| workers = class_name.constantize.find_current.to_a diff --git a/app/models/miq_worker.rb b/app/models/miq_worker.rb index f40561d41e7..30605aac6d3 100644 --- a/app/models/miq_worker.rb +++ b/app/models/miq_worker.rb @@ -411,6 +411,14 @@ def enabled_or_running? !is_stopped? || actually_running? end + def stopping_for_too_long? + # Note, a 'stopping' worker heartbeats in DRb but NOT to + # the database, so we can see how long it's been + # 'stopping' by checking the last_heartbeat. + stopping_timeout = self.class.worker_settings[:stopping_timeout] || 10.minutes + status == MiqWorker::STATUS_STOPPING && last_heartbeat < stopping_timeout.seconds.ago + end + def validate_active_messages active_messages.each { |msg| msg.check_for_timeout(_log.prefix) } end diff --git a/config/settings.yml b/config/settings.yml index 9fb97fdab07..29dcbaa2196 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -1192,6 +1192,7 @@ :poll_method: :normal :restart_interval: 0.hours :starting_timeout: 10.minutes + :stopping_timeout: 10.minutes :embedded_ansible_worker: :poll: 10.seconds :memory_threshold: 0.megabytes diff --git a/spec/models/miq_server/worker_management/monitor_spec.rb b/spec/models/miq_server/worker_management/monitor_spec.rb new file mode 100644 index 00000000000..1ec24507db6 --- /dev/null +++ b/spec/models/miq_server/worker_management/monitor_spec.rb @@ -0,0 +1,34 @@ +describe MiqServer::WorkerManagement::Monitor do + context "#check_not_responding" do + let(:server) { EvmSpecHelper.local_miq_server } + let(:worker) do + FactoryGirl.create(:miq_worker, + :type => "MiqGenericWorker", + :miq_server => server, + :pid => 12345, + :last_heartbeat => 5.minutes.ago) + end + + before do + server.setup_drb_variables + server.worker_add(worker.pid) + end + + it "destroys an unresponsive 'stopping' worker" do + worker.update(:last_heartbeat => 20.minutes.ago) + server.stop_worker(worker) + server.check_not_responding + server.reload + expect(server.miq_workers).to be_empty + expect { worker.reload }.to raise_error(ActiveRecord::RecordNotFound) + end + + it "monitors recently heartbeated 'stopping' workers" do + worker.update(:last_heartbeat => 1.minute.ago) + server.stop_worker(worker) + server.check_not_responding + server.reload + expect(server.miq_workers.first.id).to eq(worker.id) + end + end +end diff --git a/spec/models/miq_worker_spec.rb b/spec/models/miq_worker_spec.rb index a779061eee0..bf458efb772 100644 --- a/spec/models/miq_worker_spec.rb +++ b/spec/models/miq_worker_spec.rb @@ -331,6 +331,27 @@ def check_has_required_role(worker_role_names, expected_result) expect(@worker.worker_options).to eq(:guid => @worker.guid) end + describe "#stopping_for_too_long?" do + subject { @worker.stopping_for_too_long? } + + it "false if started" do + @worker.update(:status => described_class::STATUS_STARTED) + expect(subject).to be_falsey + end + + it "true if stopping and not heartbeated recently" do + @worker.update(:status => described_class::STATUS_STOPPING, + :last_heartbeat => 30.minutes.ago) + expect(subject).to be_truthy + end + + it "false if stopping and heartbeated recently" do + @worker.update(:status => described_class::STATUS_STOPPING, + :last_heartbeat => 1.minute.ago) + expect(subject).to be_falsey + end + end + it "is_current? false when starting" do @worker.update_attribute(:status, described_class::STATUS_STARTING) expect(@worker.is_current?).not_to be_truthy