From 272a7eece77ad2bb3fd5b1f21bc3eb68dfddc0d8 Mon Sep 17 00:00:00 2001
From: Nick Carboni
Date: Thu, 22 Feb 2018 16:56:31 -0500
Subject: [PATCH 1/2] Delete container objects when the server is shutting down

Before this change, when #stop_worker was called we would attempt to
scale the deployment. That works when a different number of workers
(or zero) is configured in the settings, but we don't shut workers down
that way when the server itself is shutting down, because we want to
preserve the user's settings for the next startup.

This resulted in worker pods not exiting when the orchestrator pod
stopped, and much confusion when the next orchestrator started up.
---
 app/models/miq_server/worker_management/monitor/quiesce.rb | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/app/models/miq_server/worker_management/monitor/quiesce.rb b/app/models/miq_server/worker_management/monitor/quiesce.rb
index c17053bd32b..ab4038ab82a 100644
--- a/app/models/miq_server/worker_management/monitor/quiesce.rb
+++ b/app/models/miq_server/worker_management/monitor/quiesce.rb
@@ -35,7 +35,10 @@ def quiesce_workers_loop
     @quiesce_loop_timeout = @worker_monitor_settings[:quiesce_loop_timeout] || 5.minutes
     worker_monitor_poll = (@worker_monitor_settings[:poll] || 1.seconds).to_i_with_method
 
-    miq_workers.each { |w| stop_worker(w) }
+    miq_workers.each do |w|
+      MiqEnvironment::Command.is_podified? && w.containerized_worker? ? w.delete_container_objects : stop_worker(w)
+    end
+
     loop do
       reload # Reload from SQL this MiqServer AND its miq_workers association
       break if self.workers_quiesced?

From 6f52ccbf73d9077a4989dfe82ac38312eb924c26 Mon Sep 17 00:00:00 2001
From: Nick Carboni
Date: Fri, 23 Feb 2018 09:30:17 -0500
Subject: [PATCH 2/2] Catch errors raised when we try to delete objects that don't exist

This will happen from time to time because we call
#delete_container_objects on each worker instance. The calls to
OpenShift are asynchronous, so the next worker we tell to
#delete_container_objects may try to delete objects that have already
been removed.

This asynchrony also makes it difficult to track when objects have
actually been deleted. Ideally we would scale a deployment down and
have the last worker delete the deployment itself, but a previous
worker may not have deleted its worker record by the time we tell the
next worker to exit.
---
 lib/container_orchestrator.rb | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/lib/container_orchestrator.rb b/lib/container_orchestrator.rb
index d33a8d1b0bf..f536a2fd993 100644
--- a/lib/container_orchestrator.rb
+++ b/lib/container_orchestrator.rb
@@ -47,18 +47,26 @@ def delete_deployment_config(name)
     scale(name, 0)
     connection.delete_deployment_config(name, my_namespace)
     delete_replication_controller(rc.metadata.name) if rc
+  rescue KubeException => e
+    raise unless e.message =~ /not found/
   end
 
   def delete_replication_controller(name)
     kube_connection.delete_replication_controller(name, my_namespace)
+  rescue KubeException => e
+    raise unless e.message =~ /not found/
   end
 
   def delete_service(name)
     kube_connection.delete_service(name, my_namespace)
+  rescue KubeException => e
+    raise unless e.message =~ /not found/
   end
 
   def delete_secret(name)
     kube_connection.delete_secret(name, my_namespace)
+  rescue KubeException => e
+    raise unless e.message =~ /not found/
   end
 
   private
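
The rescue clauses added in the second patch can be illustrated with a small
standalone Ruby sketch of the same "raise unless not found" pattern. Everything
below is illustrative and not part of the patch: KubeException is redefined as
a plain StandardError stand-in for the kubeclient gem's exception class, and
FakeKubeConnection is a hypothetical object used only to exercise both branches.

    # Standalone sketch (not part of the patch) of the idempotent-delete pattern.
    KubeException = Class.new(StandardError) unless defined?(KubeException)

    # Hypothetical connection that mimics the two interesting API outcomes.
    class FakeKubeConnection
      def delete_service(name, _namespace)
        raise KubeException, "service \"#{name}\" not found" if name == "already-gone"
        :deleted
      end
    end

    def delete_service_idempotently(connection, name, namespace)
      connection.delete_service(name, namespace)
    rescue KubeException => e
      # Another worker may have cleaned the object up already; treat "not found"
      # as success and let any other API error propagate to the caller.
      raise unless e.message =~ /not found/
    end

    conn = FakeKubeConnection.new
    delete_service_idempotently(conn, "already-gone", "manageiq") # swallowed, returns nil
    delete_service_idempotently(conn, "miq-worker-1", "manageiq") # => :deleted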