diff --git a/app/runners/submission_runner.rb b/app/runners/submission_runner.rb index 81dbd05eb9..9f4301bab9 100644 --- a/app/runners/submission_runner.rb +++ b/app/runners/submission_runner.rb @@ -163,11 +163,19 @@ def execute timer = Thread.new do while Time.zone.now - before_time < time_limit before_stats = Time.zone.now - # Check if container is still running - if !Rails.env.test? && (Docker::Container.all.any? { |c| c.id.starts_with?(container.id) || container.id.starts_with?(container.id) } && container.refresh!.info['State']['Running']) - # If we don't pass these extra options gathering stats takes 1+ seconds (https://github.com/moby/moby/issues/23188#issuecomment-223211481) - stats = container.stats({ 'one-shot': true, stream: false }) - memory = [stats['memory_stats']['usage'] / (1024.0 * 1024.0), memory].max if stats['memory_stats']&.fetch('usage', nil) + + begin + # Check if container is still running + if !Rails.env.test? && (Docker::Container.all.any? { |c| c.id.starts_with?(container.id) || container.id.starts_with?(container.id) } && container.refresh!.info['State']['Running']) + # If we don't pass these extra options gathering stats takes 1+ seconds (https://github.com/moby/moby/issues/23188#issuecomment-223211481) + stats = container.stats({ 'one-shot': true, stream: false }) + memory = [stats['memory_stats']['usage'] / (1024.0 * 1024.0), memory].max if stats['memory_stats']&.fetch('usage', nil) + end + rescue Docker::Error::TimeoutError, Docker::Error::ServerError + # The docker container might be in a bad state + # We just ignore this and try again later + # The timeout will clean up the container if this lasts too long + Rails.logger.warn "Failed to get stats from docker container #{container.id} for submission #{@submission.id}" end # Gathering stats still takes a long time, so if we spent enough time on @@ -179,6 +187,9 @@ def execute timeout = true if timeout.nil? end end + # errors raised in the thread should also be raised in the main thread + # This ensures they are reported and handled correctly + timer.abort_on_exception = true begin outlines, errlines = container.tap(&:start).attach(