From 0ad07b59a72fccdd18e6913b5a444d217f160e34 Mon Sep 17 00:00:00 2001 From: Nick Carboni Date: Tue, 17 Mar 2020 14:47:24 -0400 Subject: [PATCH] Don't recurse into poll_runner when in pods, loop instead For a sufficiently long-running job, we could exceed the stack size threshold, so this commit implements a loop in poll_runner that is only used when we're running in pods and waiting for the ansible runner process to finish. --- .../providers/ansible_runner_workflow.rb | 53 ++++++++++++------- .../providers/ansible_role_workflow_spec.rb | 16 +++--- 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/app/models/manageiq/providers/ansible_runner_workflow.rb b/app/models/manageiq/providers/ansible_runner_workflow.rb index cd5ae773a34c..3af2c584f88f 100644 --- a/app/models/manageiq/providers/ansible_runner_workflow.rb +++ b/app/models/manageiq/providers/ansible_runner_workflow.rb @@ -53,28 +53,40 @@ def execute end def poll_runner - response = Ansible::Runner::ResponseAsync.load(context[:ansible_runner_response]) - if response.running? - if started_on + options[:timeout] < Time.now.utc - response.stop - - route_signal(:abort, "ansible #{execution_type} has been running longer than timeout", "error") + loop do + response = Ansible::Runner::ResponseAsync.load(context[:ansible_runner_response]) + if response.running? + if started_on + options[:timeout] < Time.now.utc + response.stop + + route_signal(:abort, "ansible #{execution_type} has been running longer than timeout", "error") + else + if MiqEnvironment::Command.is_podified? + # If we're running in pods loop so we don't exhaust the stack limit in very long jobs + sleep options[:poll_interval] + next + else + queue_signal(:poll_runner, :deliver_on => deliver_on) + end + end else - route_signal(:poll_runner, :deliver_on => deliver_on) + result = response.response + + context[:ansible_runner_return_code] = result.return_code + context[:ansible_runner_stdout] = result.parsed_stdout + + if result.return_code != 0 + set_status("ansible #{execution_type} failed", "error") + _log.warn("ansible #{execution_type} failed:\n#{result.parsed_stdout.join("\n")}") + else + set_status("ansible #{execution_type} completed with no errors", "ok") + end + route_signal(:post_execute) end - else - result = response.response - - context[:ansible_runner_return_code] = result.return_code - context[:ansible_runner_stdout] = result.parsed_stdout - if result.return_code != 0 - set_status("ansible #{execution_type} failed", "error") - _log.warn("ansible #{execution_type} failed:\n#{result.parsed_stdout.join("\n")}") - else - set_status("ansible #{execution_type} completed with no errors", "ok") - end - route_signal(:post_execute) + # Break out of the loop when we've either queued a message + # or, if we're running in pods, the job has finished + break end end @@ -91,9 +103,10 @@ def post_execute protected + # Continue in the current process if we're running in pods, or queue the message for the next worker otherwise + # We can't queue in pods as jobs of this type depend on filesystem state def route_signal(*args, deliver_on: nil) if MiqEnvironment::Command.is_podified? - sleep(deliver_on - Time.now.utc) if deliver_on signal(*args) else queue_signal(*args, :deliver_on => deliver_on) diff --git a/spec/models/manageiq/providers/ansible_role_workflow_spec.rb b/spec/models/manageiq/providers/ansible_role_workflow_spec.rb index 213ca8e43fea..62e9f7fc4276 100644 --- a/spec/models/manageiq/providers/ansible_role_workflow_spec.rb +++ b/spec/models/manageiq/providers/ansible_role_workflow_spec.rb @@ -268,13 +268,17 @@ job.signal(:poll_runner) end - it "doesn't queue the next state when running in pods and ansible-runner still running" do - expect(MiqEnvironment::Command).to receive(:is_podified?).and_return(true) - now = Time.now.utc - allow(Time).to receive(:now).and_return(now) - expect(response_async).to receive(:running?).and_return(true) + it "if ansible-runner still runningin pods it loops until the job is done" do + expect(MiqEnvironment::Command).to receive(:is_podified?).twice.and_return(true) + expect(response_async).to receive(:running?).and_return(true, false) + + # First loop, the job is still running so we sleep for the poll interval expect(job).to receive(:sleep).with(1) - expect(job).to receive(:signal).with(:poll_runner) + + # Second loop we get a response and signal the post_execute state + response = Ansible::Runner::Response.new(response_async.dump.merge(:return_code => 0)) + expect(response_async).to receive(:response).and_return(response) + expect(job).to receive(:signal).with(:post_execute) job.poll_runner end