From 0ad07b59a72fccdd18e6913b5a444d217f160e34 Mon Sep 17 00:00:00 2001
From: Nick Carboni <ncarboni@redhat.com>
Date: Tue, 17 Mar 2020 14:47:24 -0400
Subject: [PATCH] Don't recurse into poll_runner when in pods, loop instead

For a sufficiently long-running job, we could exceed the stack
size threshold, so this commit implements a loop in poll_runner
that is only used when we're running in pods and waiting for the
ansible runner process to finish.
---
 .../providers/ansible_runner_workflow.rb      | 53 ++++++++++++-------
 .../providers/ansible_role_workflow_spec.rb   | 16 +++---
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/app/models/manageiq/providers/ansible_runner_workflow.rb b/app/models/manageiq/providers/ansible_runner_workflow.rb
index cd5ae773a34c..3af2c584f88f 100644
--- a/app/models/manageiq/providers/ansible_runner_workflow.rb
+++ b/app/models/manageiq/providers/ansible_runner_workflow.rb
@@ -53,28 +53,40 @@ def execute
   end
 
   def poll_runner
-    response = Ansible::Runner::ResponseAsync.load(context[:ansible_runner_response])
-    if response.running?
-      if started_on + options[:timeout] < Time.now.utc
-        response.stop
-
-        route_signal(:abort, "ansible #{execution_type} has been running longer than timeout", "error")
+    loop do
+      response = Ansible::Runner::ResponseAsync.load(context[:ansible_runner_response])
+      if response.running?
+        if started_on + options[:timeout] < Time.now.utc
+          response.stop
+
+          route_signal(:abort, "ansible #{execution_type} has been running longer than timeout", "error")
+        else
+          if MiqEnvironment::Command.is_podified?
+            # If we're running in pods loop so we don't exhaust the stack limit in very long jobs
+            sleep options[:poll_interval]
+            next
+          else
+            queue_signal(:poll_runner, :deliver_on => deliver_on)
+          end
+        end
       else
-        route_signal(:poll_runner, :deliver_on => deliver_on)
+        result = response.response
+
+        context[:ansible_runner_return_code] = result.return_code
+        context[:ansible_runner_stdout]      = result.parsed_stdout
+
+        if result.return_code != 0
+          set_status("ansible #{execution_type} failed", "error")
+          _log.warn("ansible #{execution_type} failed:\n#{result.parsed_stdout.join("\n")}")
+        else
+          set_status("ansible #{execution_type} completed with no errors", "ok")
+        end
+        route_signal(:post_execute)
       end
-    else
-      result = response.response
-
-      context[:ansible_runner_return_code] = result.return_code
-      context[:ansible_runner_stdout]      = result.parsed_stdout
 
-      if result.return_code != 0
-        set_status("ansible #{execution_type} failed", "error")
-        _log.warn("ansible #{execution_type} failed:\n#{result.parsed_stdout.join("\n")}")
-      else
-        set_status("ansible #{execution_type} completed with no errors", "ok")
-      end
-      route_signal(:post_execute)
+      # Break out of the loop when we've either queued a message
+      # or, if we're running in pods, the job has finished
+      break
     end
   end
 
@@ -91,9 +103,10 @@ def post_execute
 
   protected
 
+  # Continue in the current process if we're running in pods, or queue the message for the next worker otherwise
+  # We can't queue in pods as jobs of this type depend on filesystem state
   def route_signal(*args, deliver_on: nil)
     if MiqEnvironment::Command.is_podified?
-      sleep(deliver_on - Time.now.utc) if deliver_on
       signal(*args)
     else
       queue_signal(*args, :deliver_on => deliver_on)
diff --git a/spec/models/manageiq/providers/ansible_role_workflow_spec.rb b/spec/models/manageiq/providers/ansible_role_workflow_spec.rb
index 213ca8e43fea..62e9f7fc4276 100644
--- a/spec/models/manageiq/providers/ansible_role_workflow_spec.rb
+++ b/spec/models/manageiq/providers/ansible_role_workflow_spec.rb
@@ -268,13 +268,17 @@
       job.signal(:poll_runner)
     end
 
-    it "doesn't queue the next state when running in pods and ansible-runner still running" do
-      expect(MiqEnvironment::Command).to receive(:is_podified?).and_return(true)
-      now = Time.now.utc
-      allow(Time).to receive(:now).and_return(now)
-      expect(response_async).to receive(:running?).and_return(true)
+    it "if ansible-runner still runningin pods it loops until the job is done" do
+      expect(MiqEnvironment::Command).to receive(:is_podified?).twice.and_return(true)
+      expect(response_async).to receive(:running?).and_return(true, false)
+
+      # First loop, the job is still running so we sleep for the poll interval
       expect(job).to receive(:sleep).with(1)
-      expect(job).to receive(:signal).with(:poll_runner)
+
+      # Second loop we get a response and signal the post_execute state
+      response = Ansible::Runner::Response.new(response_async.dump.merge(:return_code => 0))
+      expect(response_async).to receive(:response).and_return(response)
+      expect(job).to receive(:signal).with(:post_execute)
 
       job.poll_runner
     end