From 3f57c90a45b0302ac206fc032fd54aa581b06166 Mon Sep 17 00:00:00 2001 From: bugclerk <40872210+bugclerk@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:49:06 -0700 Subject: [PATCH] Make vrrp_master more robust wrt iSCSI ALUA on newly booted node (#14366) (cherry picked from commit 5df0d721d10d11e84fa836f13638513b35624660) Co-authored-by: Brian M --- .../middlewared/plugins/failover_/event.py | 61 +++++++++++++------ 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/src/middlewared/middlewared/plugins/failover_/event.py b/src/middlewared/middlewared/plugins/failover_/event.py index 0195686e98abb..b2e73c21928f9 100644 --- a/src/middlewared/middlewared/plugins/failover_/event.py +++ b/src/middlewared/middlewared/plugins/failover_/event.py @@ -350,6 +350,37 @@ def fenced_start_loop(self, max_retries=3): return fenced_error + def iscsi_cleanup_alua_state(self): + """ + Cleanup iSCSI ALUA state if we are now becoming ACTIVE node, and + previously were STANDBY node. + """ + # We will suspend iSCSI and then close any existing iSCSI sessions + # to avoid inflight I/O interfering with the LUN replacement during + # become_active. Suspending iSCSI means BUSY will be returned. + suspended = cleaned = False + try: + try: + logger.info('Suspending iSCSI') + self.run_call('iscsi.scst.suspend', 30) + suspended = True + logger.info('Suspended iSCSI') + except FileNotFoundError: + # This can occur if we are booting into ACTIVE node + # rather than becoming ACTIVE from STANDBY. + logger.info('Did not suspend iSCSI') + else: + logger.info('Closing iSCSI sessions') + self.run_call('iscsi.alua.force_close_sessions') + logger.info('Closed iSCSI sessions') + logger.info('calling iscsi ALUA active elected') + self.run_call('iscsi.alua.active_elected') + logger.info('done calling iscsi ALUA active elected') + cleaned = True + except Exception: + logger.exception('Unexpected failure setting up iscsi') + return (suspended, cleaned) + @job(lock=FAILOVER_LOCK_NAME) def vrrp_master(self, job, fobj, ifname, event): @@ -477,23 +508,13 @@ def vrrp_master(self, job, fobj, ifname, event): # Kick off a job to clean up any left-over ALUA state from when we were STANDBY/BACKUP. logger.info('Verifying iSCSI service') + iscsi_suspended = iscsi_cleaned = False if self.run_call('service.started_or_enabled', 'iscsitarget'): logger.info('Checking if ALUA is enabled') handle_alua = self.run_call('iscsi.global.alua_enabled') logger.info('Done checking if ALUA is enabled') if handle_alua: - # We will suspend iSCSI and then close any existing iSCSI sessions - # to avoid inflight I/O interfering with the LUN replacement during - # become_active. Suspending iSCSI means BUSY will be returned. - logger.info('Suspending iSCSI') - self.run_call('iscsi.scst.suspend', 30) - logger.info('Suspended iSCSI') - logger.info('Closing iSCSI sessions') - self.run_call('iscsi.alua.force_close_sessions') - logger.info('Closed iSCSI sessions') - logger.info('calling iscsi ALUA active elected') - self.run_call('iscsi.alua.active_elected') - logger.info('done calling iscsi ALUA active elected') + iscsi_suspended, iscsi_cleaned = self.iscsi_cleanup_alua_state() else: handle_alua = False logger.info('Done verifying iSCSI service') @@ -626,7 +647,7 @@ def vrrp_master(self, job, fobj, ifname, event): logger.info('Volume imports complete') # Now that the volumes have been imported, get a head-start on activating extents. - if handle_alua: + if handle_alua and iscsi_cleaned: logger.info('Activating ALUA extents') self.run_call('iscsi.alua.activate_extents') logger.info('Done activating ALUA extents') @@ -683,11 +704,15 @@ def vrrp_master(self, job, fobj, ifname, event): logger.info('Done starting failover background jobs') if handle_alua: - logger.info('Clearing iSCSI suspend') - if self.run_call('iscsi.scst.clear_suspend'): - logger.info('Cleared iSCSI suspend') - # Kick off a job to start clearing up HA targets from when we were STANDBY - self.run_call('iscsi.alua.reset_active') + try: + if iscsi_suspended: + logger.info('Clearing iSCSI suspend') + if self.run_call('iscsi.scst.clear_suspend'): + logger.info('Cleared iSCSI suspend') + # Kick off a job to start clearing up HA targets from when we were STANDBY + self.run_call('iscsi.alua.reset_active') + except Exception: + logger.exception('Failed to complete iSCSI bringup') # restart the remaining "non-critical" services logger.info('Restarting remaining services')