Use systemd-notify for worker heartbeating #20840

Merged · 6 commits · Jan 4, 2021
1 change: 1 addition & 0 deletions Gemfile
@@ -161,6 +161,7 @@ end

group :systemd, :optional => true do
gem "dbus-systemd", "~>1.1.0", :require => false
gem "sd_notify", "~>0.1.0", :require => false
Member commented:
FYI, "A pure-Ruby implementation of sd_notify(3) that can be used to communicate state changes of Ruby programs to systemd." (no other dependencies and no c extensions 👏 )

gem "systemd-journal", "~>1.4.2", :require => false
end

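For reference, a minimal sketch of how a Type=notify worker drives this gem at runtime; the SdNotify calls below are the ones this PR wraps in systemd_common.rb, while the loop itself is purely illustrative:

```ruby
require "sd_notify"

# Signal READY=1: systemd marks a Type=notify unit active only after this.
SdNotify.ready

# Heartbeat loop: send WATCHDOG=1 more often than the unit's WatchdogSec,
# otherwise systemd treats the service as hung and restarts it.
3.times do
  # ... a real worker loop would dequeue and process work here ...
  SdNotify.watchdog
  sleep 1
end

# Signal STOPPING=1 so systemd knows this is an orderly shutdown.
SdNotify.stopping
```

When NOTIFY_SOCKET is not set (for example, when the process is not started by systemd), these calls should be no-ops, so the same code path is safe for workers running outside systemd.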
2 changes: 1 addition & 1 deletion app/models/miq_queue_worker_base/runner.rb
@@ -144,7 +144,7 @@ def drb_dequeue_available?
def heartbeat_message_timeout(message)
if message.msg_timeout
timeout = worker_settings[:poll] + message.msg_timeout
heartbeat_to_file(timeout)
systemd_worker? ? worker.sd_notify_watchdog_usec(timeout) : heartbeat_to_file(timeout)
end
end
end
24 changes: 17 additions & 7 deletions app/models/miq_server/worker_management/monitor.rb
@@ -27,13 +27,7 @@ def monitor_workers

cleanup_failed_workers

# Monitor all remaining current worker records
miq_workers.where(:status => MiqWorker::STATUSES_CURRENT_OR_STARTING).each do |worker|
# Push the heartbeat into the database
persist_last_heartbeat(worker)
# Check the worker record for heartbeat timeouts
validate_worker(worker)
end
monitor_active_workers

do_system_limit_exceeded if self.kill_workers_due_to_resources_exhausted?
end
@@ -84,6 +78,22 @@ def cleanup_orphaned_worker_rows
end
end

def monitor_active_workers
# When k8s or systemd is operating as the worker monitor then all of the
# worker monitoring (liveness, memory threshold) is handled by those
# systems. Only when workers are run as standalone processes does MiqServer
# have to monitor the workers itself.
return if podified? || systemd?
Member Author commented:
So looking at this method, it:

  1. Updates MiqWorker#last_heartbeat from the tmp/guid.hb heartbeat files
  2. Validates the worker heartbeat and memory limits [ref]

After this PR, neither of these will be necessary on k8s (no access to heartbeat files, and mem/cpu limits are handled by k8s) or on systemd (systemd-notify for heartbeating and cgroups for mem/cpu limits).

Member Author commented:
Side note: in a lot of places we ask if podified? and if systemd?, but we have no way of asking whether we're using the third monitor type, process. Ideally here I'd like to say return unless worker_monitor_type_process?, since we have three worker monitor types: k8s, systemd, and legacy/process.

Member commented:
Side note 2: I am seeing a few categories of "worker_monitor_types" that we make decisions on:

  1. external_liveness_monitor?
  2. external_readiness_monitor?
  3. external_resource_limit_monitor?
  4. isolated processes: decisions about where a piece of work can run (you can't get process information for workers from the server, or vice versa, in pods)

I'm sure there are others, but maybe we can come up with good names for these concerns.

Member Author commented:
This is a good point. I've been assuming that any "external worker monitor" (i.e. k8s or systemd) will take care of all or most of the "every 15 seconds, make sure the workers are okay" work for us, but some may provide most of it rather than all. I'm thinking that when we move to a more pluggable worker monitor class, these won't be questions that have to be asked but rather methods that are implemented or not, which should clean things up nicely (:crossed_fingers:)

I'm unsure about readiness vs liveness: is readiness the "are there enough resources to start this new worker" check?

Can you expand on "decisions about where a piece of work can run"? Is that "make MiqServer more like k8s and schedule workers across the appliances"? If so, yeah, that is a big one that I haven't even started to tackle yet; I think it's going to be a fundamental rethink of how we schedule workers (per server vs per region). We "got away with it" on podified because we went to a single MiqServer, but with actual appliance VMs maybe the worker counts should be at the region level and divvied up by the master MiqServer?

Member commented:
> I'm thinking that when we move to a more pluggable worker monitor class, these won't be questions that have to be asked but rather methods that are implemented or not, which should clean things up nicely

👍

> I'm unsure about readiness vs liveness: is readiness the "are there enough resources to start this new worker" check?

liveness == heartbeat check
readiness == the Rails server is available (port 3000) for Puma-based workers

> Can you expand on "decisions about where a piece of work can run"?

There are places that have expectations that the server and workers reside on the same process space / filesystem. For example: #20835

The workaround for these has been to make the right process fetch the environmental information for itself instead of doing it for someone else. Generally, it's been places where the assumption was that we needed to run on the MiqServer. Previously, any worker on that server was fine, but that doesn't work in pods, so it makes sense to make everything queue things for the correct process instead of assuming one process can see other processes. Like this: jrafanie@1c3d8a7

Another example: #20290

Member Author commented:
> There are places that have expectations that the server and workers reside on the same process space / filesystem. For example

Ahh yes, okay, I see what you mean: basically anything doing "worker management" outside of MiqServer on the assumption that it can do so based on the PID (processInfo(pid) in the first example and kill(pid) in the second).

Member commented:
Exactly, and I believe we had a similar change for embedded Ansible, since that relied on the filesystem having the Ansible repo checked out. It might have changed since, but the idea was that you can't assume the "server with embedded ansible" can access the locally checked-out Ansible repo, only the process that's guaranteed to have checked it out. I don't know what to call this, but the isolation of process space (kill/processInfo(pid)) and filesystems (the Ansible repo as an example) is what I'm describing.
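A rough sketch of the pattern being described here, contrasting the old co-location assumption with the queue-based workaround; the :server_guid targeting and the report_process_info method are illustrative, not the exact shape of the linked commits:

```ruby
# Before: the server assumes it shares a process space with the worker, so it
# can inspect the worker's PID directly. That breaks in pods, where the PID
# belongs to a different container.
def worker_process_info(worker)
  MiqProcess.processInfo(worker.pid)
end

# After: queue the request so the process that owns the information reports it
# for itself, instead of assuming one process can see another.
def request_worker_process_info(worker)
  MiqQueue.put(
    :class_name  => worker.class.name,
    :instance_id => worker.id,
    :method_name => "report_process_info", # hypothetical method
    :server_guid => worker.miq_server&.guid
  )
end
```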


# Monitor all remaining current worker records
miq_workers.where(:status => MiqWorker::STATUSES_CURRENT_OR_STARTING).each do |worker|
# Push the heartbeat into the database
persist_last_heartbeat(worker)
# Check the worker record for heartbeat timeouts
validate_worker(worker)
end
end

def cleanup_failed_workers
check_not_responding
check_pending_stop
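A rough sketch of the "pluggable worker monitor" idea floated in the thread above; every class and method name here is hypothetical and none of this exists in the PR:

```ruby
# Hypothetical sketch: instead of scattering `if podified? || systemd?` checks,
# each monitor type either implements a concern or inherits a no-op for it.
class WorkerMonitor
  # Legacy/process monitor: MiqServer checks heartbeats and memory itself.
  def external_liveness_monitor?
    false
  end

  def external_resource_limit_monitor?
    false
  end

  def monitor_active_workers(workers)
    workers.each { |worker| worker.validate_heartbeat_and_memory } # hypothetical helper
  end
end

class SystemdWorkerMonitor < WorkerMonitor
  def external_liveness_monitor?
    true # WatchdogSec plus sd_notify watchdog pings
  end

  def external_resource_limit_monitor?
    true # cgroups, e.g. MemoryHigh
  end

  def monitor_active_workers(_workers)
    # Nothing to do: systemd restarts hung workers and enforces memory limits.
  end
end

class KubernetesWorkerMonitor < WorkerMonitor
  def external_liveness_monitor?
    true # liveness probes
  end

  def external_resource_limit_monitor?
    true # pod resource requests/limits
  end

  def monitor_active_workers(_workers)
    # Nothing to do: the kubelet handles liveness and resource enforcement.
  end
end
```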
6 changes: 5 additions & 1 deletion app/models/miq_worker/runner.rb
@@ -9,6 +9,8 @@ class TemporaryFailure < RuntimeError
attr_accessor :last_hb, :worker, :worker_settings
attr_reader :active_roles, :server

delegate :systemd_worker?, :to => :worker

INTERRUPT_SIGNALS = %w[SIGINT SIGTERM].freeze

SAFE_SLEEP_SECONDS = 60
@@ -142,6 +144,7 @@ def starting_worker_record

def started_worker_record
reload_worker_record
@worker.sd_notify_started if systemd_worker?
@worker.status = "started"
@worker.last_heartbeat = Time.now.utc
@worker.update_spid
@@ -191,6 +194,7 @@ def update_worker_record_at_exit(exit_code)
@worker.stopped_on = Time.now.utc
@worker.save

@worker.sd_notify_stopping if systemd_worker?
@worker.status_update
@worker.log_status
end
@@ -288,7 +292,7 @@ def heartbeat
# Heartbeats can be expensive, so do them only when needed
return if @last_hb.kind_of?(Time) && (@last_hb + worker_settings[:heartbeat_freq]) >= now

heartbeat_to_file
systemd_worker? ? @worker.sd_notify_watchdog : heartbeat_to_file

if config_out_of_date?
_log.info("#{log_prefix} Synchronizing configuration...")
26 changes: 26 additions & 0 deletions app/models/miq_worker/systemd_common.rb
@@ -70,6 +70,7 @@ def unit_file
Environment=BUNDLER_GROUPS=#{bundler_groups.join(",")}
ExecStart=/bin/bash -lc '#{exec_start}'
Restart=no
Type=notify
Slice=#{slice_name}
UNIT_FILE
end
@@ -115,6 +116,23 @@ def stop_systemd_unit(mode: "replace")
systemd.StopUnit(unit_name, mode)
end

def sd_notify_started
sd_notify.ready
end

def sd_notify_stopping
sd_notify.stopping
end

def sd_notify_watchdog
sd_notify.watchdog
end

def sd_notify_watchdog_usec(timeout_in_seconds)
usec = timeout_in_seconds * 1_000_000
sd_notify.notify("WATCHDOG_USEC=#{usec}", false)
end

private

def systemd
@@ -124,6 +142,13 @@ def systemd
end
end

def sd_notify
@sd_notify ||= begin
require "sd_notify"
SdNotify
end
end

def service_base_name
self.class.service_base_name
end
@@ -166,6 +191,7 @@ def unit_config_file
MemoryHigh=#{worker_settings[:memory_threshold].bytes}
TimeoutStartSec=#{worker_settings[:starting_timeout]}
TimeoutStopSec=#{worker_settings[:stopping_timeout]}
WatchdogSec=#{worker_settings[:heartbeat_timeout]}
#{unit_environment_variables.map { |env_var| "Environment=#{env_var}" }.join("\n")}
UNIT_CONFIG_FILE
end