From 9d3251c90fb2bb6bcd9cc6a3b8afc264347a5101 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Tue, 14 Feb 2023 14:26:25 +0800 Subject: [PATCH] [Mellanox] set select timeout to no more than 1 sec to make sure fast shutdown (#13611) - Why I did it Commit sonic-net/sonic-platform-daemons@153ea47 changed SfpStateUpdateTask from Process to Thread. In this commit, it raises an exception in SfpStateUpdateTask to make shutdown flow fast. But it does not work on Nvidia platform as Nvidia platform is passing timeout parameter of get_change_event to select. Linux select function can not be interrupted by a Python exception. There is no such issue on Nvidia platform before that commit. However, in order to comply with the commit and make shutdown flow fast, we decided to change Nvidia platform API implementation. To fix issue #13591. - How I did it The select call in get_change_event should use no more than 1 second as timeout parameter. Outside the select call, add a while loop to make sure timeout parameter of get_change_event work as expected - How to verify it Manual test --- .../sonic_platform/chassis.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index d01b1bfc901a..d72ea333cc64 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -31,10 +31,10 @@ from . import utils from .device_data import DeviceDataManager import re + import time except ImportError as e: raise ImportError (str(e) + "- required module not found") -MAX_SELECT_DELAY = 3600 RJ45_TYPE = "RJ45" @@ -387,26 +387,30 @@ def get_change_event(self, timeout=0): self.sfp_event.initialize() wait_for_ever = (timeout == 0) + # select timeout should be no more than 1000ms to ensure fast shutdown flow + select_timeout = 1000.0 if timeout >= 1000 else float(timeout) port_dict = {} error_dict = {} - if wait_for_ever: - timeout = MAX_SELECT_DELAY - while True: - status = self.sfp_event.check_sfp_status(port_dict, error_dict, timeout) - if bool(port_dict): + begin = time.time() + while True: + status = self.sfp_event.check_sfp_status(port_dict, error_dict, select_timeout) + if bool(port_dict): + break + + if not wait_for_ever: + elapse = time.time() - begin + if elapse * 1000 > timeout: break - else: - status = self.sfp_event.check_sfp_status(port_dict, error_dict, timeout) if status: if port_dict: self.reinit_sfps(port_dict) - result_dict = {'sfp':port_dict} + result_dict = {'sfp': port_dict} if error_dict: result_dict['sfp_error'] = error_dict return True, result_dict else: - return True, {'sfp':{}} + return True, {'sfp': {}} def reinit_sfps(self, port_dict): """