Skip to content

Commit

Permalink
Don't recurse into poll_runner when in pods, loop instead
Browse files Browse the repository at this point in the history
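When running in pods, route_signal invokes the next state directly in the
current process, so every poll of a still-running job nested another
poll_runner call. For a sufficiently long-running job, this recursion could
exceed the stack size threshold, so this commit implements a loop in
poll_runner that is only used when we're running in pods and waiting for the
ansible runner process to finish.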
  • Loading branch information
carbonin committed Mar 17, 2020
1 parent 17d4194 commit 0ad07b5
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 26 deletions.
53 changes: 33 additions & 20 deletions app/models/manageiq/providers/ansible_runner_workflow.rb
Original file line number Diff line number Diff line change
Expand Up @@ -53,28 +53,40 @@ def execute
end

def poll_runner
response = Ansible::Runner::ResponseAsync.load(context[:ansible_runner_response])
if response.running?
if started_on + options[:timeout] < Time.now.utc
response.stop

route_signal(:abort, "ansible #{execution_type} has been running longer than timeout", "error")
loop do
response = Ansible::Runner::ResponseAsync.load(context[:ansible_runner_response])
if response.running?
if started_on + options[:timeout] < Time.now.utc
response.stop

route_signal(:abort, "ansible #{execution_type} has been running longer than timeout", "error")
else
if MiqEnvironment::Command.is_podified?
# If we're running in pods, loop so we don't exhaust the stack limit in very long jobs
sleep options[:poll_interval]
next
else
queue_signal(:poll_runner, :deliver_on => deliver_on)
end
end
else
route_signal(:poll_runner, :deliver_on => deliver_on)
result = response.response

context[:ansible_runner_return_code] = result.return_code
context[:ansible_runner_stdout] = result.parsed_stdout

if result.return_code != 0
set_status("ansible #{execution_type} failed", "error")
_log.warn("ansible #{execution_type} failed:\n#{result.parsed_stdout.join("\n")}")
else
set_status("ansible #{execution_type} completed with no errors", "ok")
end
route_signal(:post_execute)
end
else
result = response.response

context[:ansible_runner_return_code] = result.return_code
context[:ansible_runner_stdout] = result.parsed_stdout

if result.return_code != 0
set_status("ansible #{execution_type} failed", "error")
_log.warn("ansible #{execution_type} failed:\n#{result.parsed_stdout.join("\n")}")
else
set_status("ansible #{execution_type} completed with no errors", "ok")
end
route_signal(:post_execute)
# Break out of the loop when we've either queued a message
# or, if we're running in pods, the job has finished
break
end
end

Expand All @@ -91,9 +103,10 @@ def post_execute

protected

# Continue in the current process if we're running in pods, or queue the message for the next worker otherwise
# We can't queue in pods as jobs of this type depend on filesystem state
def route_signal(*args, deliver_on: nil)
if MiqEnvironment::Command.is_podified?
sleep(deliver_on - Time.now.utc) if deliver_on
signal(*args)
else
queue_signal(*args, :deliver_on => deliver_on)
Expand Down
16 changes: 10 additions & 6 deletions spec/models/manageiq/providers/ansible_role_workflow_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -268,13 +268,17 @@
job.signal(:poll_runner)
end

it "doesn't queue the next state when running in pods and ansible-runner still running" do
expect(MiqEnvironment::Command).to receive(:is_podified?).and_return(true)
now = Time.now.utc
allow(Time).to receive(:now).and_return(now)
expect(response_async).to receive(:running?).and_return(true)
it "if ansible-runner still runningin pods it loops until the job is done" do
expect(MiqEnvironment::Command).to receive(:is_podified?).twice.and_return(true)
expect(response_async).to receive(:running?).and_return(true, false)

# First loop, the job is still running so we sleep for the poll interval
expect(job).to receive(:sleep).with(1)
expect(job).to receive(:signal).with(:poll_runner)

# Second loop we get a response and signal the post_execute state
response = Ansible::Runner::Response.new(response_async.dump.merge(:return_code => 0))
expect(response_async).to receive(:response).and_return(response)
expect(job).to receive(:signal).with(:post_execute)

job.poll_runner
end
Expand Down

0 comments on commit 0ad07b5

Please sign in to comment.