diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index e0cc62ede8c8..362289aa5e6a 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -16,6 +16,7 @@ from .led import SharedLed, ComponentFaultyIndicator from .utils import read_int_from_file, read_str_from_file, write_file + from .thermal import Thermal except ImportError as e: raise ImportError (str(e) + "- required module not found") @@ -29,7 +30,6 @@ CONFIG_PATH = "/var/run/hw-management/config" # fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches FAN_DIR = "/var/run/hw-management/system/fan_dir" -COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state" # Platforms with unplugable FANs: # 1. don't have fanX_status and should be treated as always present @@ -42,9 +42,6 @@ class Fan(FanBase): """Platform-specific Fan class""" STATUS_LED_COLOR_ORANGE = "orange" - min_cooling_level = 2 - MIN_VALID_COOLING_LEVEL = 1 - MAX_VALID_COOLING_LEVEL = 10 # Fan drawer leds fan_drawer_leds = {} @@ -119,7 +116,7 @@ def get_direction(self): depending on fan direction Notes: - What Mellanox calls forward: + What Mellanox calls forward: Air flows from fans side to QSFP side, for example: MSN2700-CS2F which means intake in community What Mellanox calls reverse: @@ -213,7 +210,7 @@ def get_target_speed(self): if self.is_psu_fan: try: # Get PSU fan target speed according to current system cooling level - cooling_level = self.get_cooling_level() + cooling_level = Thermal.get_cooling_level() return int(self.PSU_FAN_SPEED[cooling_level], 16) except Exception: return self.get_speed() @@ -230,7 +227,7 @@ def set_speed(self, speed): in the range 0 (off) to 100 (full speed) Returns: - bool: True if set success, False if fail. + bool: True if set success, False if fail. """ status = True @@ -254,18 +251,13 @@ def set_speed(self, speed): return False try: - cooling_level = int(speed / 10) - if cooling_level < self.min_cooling_level: - cooling_level = self.min_cooling_level - speed = self.min_cooling_level * 10 - self.set_cooling_level(cooling_level, cooling_level) pwm = int(round(PWM_MAX*speed/100.0)) write_file(os.path.join(FAN_PATH, self.fan_speed_set_path), pwm, raise_exception=True) except (ValueError, IOError): status = False return status - + def _get_led_capability(self): cap_list = None try: @@ -274,7 +266,7 @@ def _get_led_capability(self): cap_list = caps.split() except (ValueError, IOError): status = 0 - + return cap_list def set_status_led(self, color): @@ -296,7 +288,7 @@ def _set_status_led(self, color): fan module status LED Returns: - bool: True if set success, False if fail. + bool: True if set success, False if fail. """ led_cap_list = self._get_led_capability() if led_cap_list is None: @@ -340,38 +332,3 @@ def get_speed_tolerance(self): """ # The tolerance value is fixed as 50% for all the Mellanox platform return 50 - - @classmethod - def set_cooling_level(cls, level, cur_state): - """ - Change cooling level. The input level should be an integer value [1, 10]. - 1 means 10%, 2 means 20%, 10 means 100%. - """ - if not isinstance(level, int): - raise RuntimeError("Failed to set cooling level, input parameter must be integer") - - if level < cls.MIN_VALID_COOLING_LEVEL or level > cls.MAX_VALID_COOLING_LEVEL: - raise RuntimeError("Failed to set cooling level, level value must be in range [{}, {}], got {}".format( - cls.MIN_VALID_COOLING_LEVEL, - cls.MAX_VALID_COOLING_LEVEL, - level - )) - - try: - # Reset FAN cooling level vector. According to low level team, - # if we need set cooling level to X, we need first write a (10+X) - # to cooling_cur_state file to reset the cooling level vector. - write_file(COOLING_STATE_PATH, level + 10, raise_exception=True) - - # We need set cooling level after resetting the cooling level vector - write_file(COOLING_STATE_PATH, cur_state, raise_exception=True) - except (ValueError, IOError) as e: - raise RuntimeError("Failed to set cooling level - {}".format(e)) - - @classmethod - def get_cooling_level(cls): - try: - return read_int_from_file(COOLING_STATE_PATH, raise_exception=True) - except (ValueError, IOError) as e: - raise RuntimeError("Failed to get cooling level - {}".format(e)) - diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index 28db60474281..f36962c65931 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -15,6 +15,9 @@ from os.path import isfile, join import io import os.path + import glob + + from . import utils except ImportError as e: raise ImportError (str(e) + "- required module not found") @@ -48,7 +51,17 @@ THERMAL_ZONE_MODE = "thermal_zone_mode" THERMAL_ZONE_POLICY = "thermal_zone_policy" THERMAL_ZONE_TEMPERATURE = "thermal_zone_temp" -THERMAL_ZONE_NORMAL_TEMPERATURE = "temp_trip_norm" +THERMAL_ZONE_HOT_THRESHOLD = "temp_trip_hot" +THERMAL_ZONE_HIGH_THRESHOLD = "temp_trip_high" +THERMAL_ZONE_NORMAL_THRESHOLD = "temp_trip_norm" +THERMAL_ZONE_FOLDER_WILDCARD = '/run/hw-management/thermal/mlxsw*' +THERMAL_ZONE_HYSTERESIS = 5000 +COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state" +# Min allowed cooling level when all thermal zones are in normal state +MIN_COOLING_LEVEL_FOR_NORMAL = 2 +# Min allowed cooling level when any thermal zone is in high state but no thermal zone is in emergency state +MIN_COOLING_LEVEL_FOR_HIGH = 4 +MAX_COOLING_LEVEL = 10 MODULE_TEMPERATURE_FAULT_PATH = "/var/run/hw-management/thermal/module{}_temp_fault" @@ -92,14 +105,14 @@ THERMAL_DEV_BOARD_AMBIENT : "Ambient Board Temp" } thermal_api_handlers = { - THERMAL_DEV_CATEGORY_CPU_CORE : thermal_api_handler_cpu_core, + THERMAL_DEV_CATEGORY_CPU_CORE : thermal_api_handler_cpu_core, THERMAL_DEV_CATEGORY_CPU_PACK : thermal_api_handler_cpu_pack, THERMAL_DEV_CATEGORY_MODULE : thermal_api_handler_module, THERMAL_DEV_CATEGORY_PSU : thermal_api_handler_psu, THERMAL_DEV_CATEGORY_GEARBOX : thermal_api_handler_gearbox } thermal_name = { - THERMAL_DEV_CATEGORY_CPU_CORE : "CPU Core {} Temp", + THERMAL_DEV_CATEGORY_CPU_CORE : "CPU Core {} Temp", THERMAL_DEV_CATEGORY_CPU_PACK : "CPU Pack Temp", THERMAL_DEV_CATEGORY_MODULE : "xSFP module {} Temp", THERMAL_DEV_CATEGORY_PSU : "PSU-{} Temp", @@ -335,6 +348,14 @@ def initialize_thermals(platform, thermal_list, psu_list): class Thermal(ThermalBase): thermal_profile = None thermal_algorithm_status = False + # Expect cooling level, used for caching the cooling level value before commiting to hardware + expect_cooling_level = None + # Expect cooling state + expect_cooling_state = None + # Last committed cooling level + last_set_cooling_level = None + last_set_cooling_state = None + last_set_psu_cooling_level = None def __init__(self, category, index, has_index, dependency = None): """ @@ -405,7 +426,7 @@ def get_temperature(self): Returns: A float number of current temperature in Celsius up to nearest thousandth - of one degree Celsius, e.g. 30.125 + of one degree Celsius, e.g. 30.125 """ if self.dependency: status, hint = self.dependency() @@ -472,7 +493,7 @@ def get_high_critical_threshold(self): @classmethod def _write_generic_file(cls, filename, content): """ - Generic functions to write content to a specified file path if + Generic functions to write content to a specified file path if the content has changed. """ try: @@ -492,8 +513,8 @@ def set_thermal_algorithm_status(cls, status, force=True): only adjust fan speed when temperature across some "edge", e.g temperature changes to exceed high threshold. When disable kernel thermal algorithm, kernel no longer adjust fan speed. - We usually disable the algorithm when we want to set a fix speed. E.g, when - a fan unit is removed from system, we will set fan speed to 100% and disable + We usually disable the algorithm when we want to set a fix speed. E.g, when + a fan unit is removed from system, we will set fan speed to 100% and disable the algorithm to avoid it adjust the speed. Returns: @@ -527,51 +548,38 @@ def set_thermal_algorithm_status(cls, status, force=True): return True @classmethod - def check_thermal_zone_temperature(cls): - """ - Check thermal zone current temperature with normal temperature + def get_min_allowed_cooling_level_by_thermal_zone(cls): + """Get min allowed cooling level according to thermal zone status: + 1. If temperature of all thermal zones is less than normal threshold, min allowed cooling level is + $MIN_COOLING_LEVEL_FOR_NORMAL = 2 + 2. If temperature of any thermal zone is greater than normal threshold, but no thermal zone temperature + is greater than high threshold, min allowed cooling level is $MIN_COOLING_LEVEL_FOR_HIGH = 4 + 3. Otherwise, there is no minimum allowed value and policy should not adjust cooling level Returns: - True if all thermal zones current temperature less or equal than normal temperature + int: minimum allowed cooling level """ - if not cls.thermal_profile: - raise Exception("Fail to get thermal profile for this switch") - - if not cls._check_thermal_zone_temperature(THERMAL_ZONE_ASIC_PATH): - return False - - if THERMAL_DEV_CATEGORY_MODULE in cls.thermal_profile: - start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] - if count != 0: - for index in range(count): - if not cls._check_thermal_zone_temperature(THERMAL_ZONE_MODULE_PATH.format(start + index)): - return False - - if THERMAL_DEV_CATEGORY_GEARBOX in cls.thermal_profile: - start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_GEARBOX] - if count != 0: - for index in range(count): - if not cls._check_thermal_zone_temperature(THERMAL_ZONE_GEARBOX_PATH.format(start + index)): - return False - - return True + min_allowed = MIN_COOLING_LEVEL_FOR_NORMAL + thermal_zone_present = False + try: + for thermal_zone_folder in glob.iglob(THERMAL_ZONE_FOLDER_WILDCARD): + thermal_zone_present = True + normal_thresh = utils.read_int_from_file(os.path.join(thermal_zone_folder, THERMAL_ZONE_NORMAL_THRESHOLD)) + current = utils.read_int_from_file(os.path.join(thermal_zone_folder, THERMAL_ZONE_TEMPERATURE)) + if current < normal_thresh - THERMAL_ZONE_HYSTERESIS: + continue - @classmethod - def _check_thermal_zone_temperature(cls, thermal_zone_path): - normal_temp_path = join(thermal_zone_path, THERMAL_ZONE_NORMAL_TEMPERATURE) - current_temp_path = join(thermal_zone_path, THERMAL_ZONE_TEMPERATURE) - normal = None - current = None - try: - with open(normal_temp_path, 'r') as file_obj: - normal = float(file_obj.read()) - - with open(current_temp_path, 'r') as file_obj: - current = float(file_obj.read()) - - return current <= normal + hot_thresh = utils.read_int_from_file(os.path.join(thermal_zone_folder, THERMAL_ZONE_HIGH_THRESHOLD)) + if current < hot_thresh - THERMAL_ZONE_HYSTERESIS: + min_allowed = MIN_COOLING_LEVEL_FOR_HIGH + else: + min_allowed = None + break except Exception as e: - logger.log_info("Fail to check thermal zone temperature for file {} due to {}".format(thermal_zone_path, repr(e))) + logger.log_error('Failed to get thermal zone status for {} - {}'.format(thermal_zone_folder, repr(e))) + return None + + return min_allowed if thermal_zone_present else None @classmethod def check_module_temperature_trustable(cls): @@ -595,3 +603,85 @@ def get_min_amb_temperature(cls): fan_ambient_temp = int(cls._read_generic_file(fan_ambient_path, 0)) port_ambient_temp = int(cls._read_generic_file(port_ambient_path, 0)) return fan_ambient_temp if fan_ambient_temp < port_ambient_temp else port_ambient_temp + + @classmethod + def set_cooling_level(cls, level): + """ + Change cooling level. The input level should be an integer value [1, 10]. + 1 means 10%, 2 means 20%, 10 means 100%. + """ + if cls.last_set_cooling_level != level: + utils.write_file(COOLING_STATE_PATH, level + 10, raise_exception=True) + cls.last_set_cooling_level = level + + @classmethod + def set_cooling_state(cls, state): + """Change cooling state. + + Args: + state (int): cooling state + """ + if cls.last_set_cooling_state != state: + utils.write_file(COOLING_STATE_PATH, state, raise_exception=True) + cls.last_set_cooling_state = state + + @classmethod + def get_cooling_level(cls): + try: + return utils.read_int_from_file(COOLING_STATE_PATH, raise_exception=True) + except (ValueError, IOError) as e: + raise RuntimeError("Failed to get cooling level - {}".format(e)) + + @classmethod + def set_expect_cooling_level(cls, expect_value): + """During thermal policy running, cache the expect cooling level generated by policies. The max expect + cooling level will be committed to hardware. + + Args: + expect_value (int): Expected cooling level value + """ + if cls.expect_cooling_level is None or cls.expect_cooling_level < expect_value: + cls.expect_cooling_level = int(expect_value) + + @classmethod + def commit_cooling_level(cls, thermal_info_dict): + """Commit cooling level to hardware. This will affect system fan and PSU fan speed. + + Args: + thermal_info_dict (dict): Thermal information dictionary + """ + if cls.expect_cooling_level is not None: + cls.set_cooling_level(cls.expect_cooling_level) + + if cls.expect_cooling_state is not None: + cls.set_cooling_state(cls.expect_cooling_state) + elif cls.expect_cooling_level is not None: + cls.set_cooling_state(cls.expect_cooling_level) + + cls.expect_cooling_level = None + # We need to set system fan speed here because kernel will automaticlly adjust fan speed according to cooling level and cooling state + + # Commit PSU fan speed with current state + from .thermal_infos import ChassisInfo + if ChassisInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[ChassisInfo.INFO_NAME], ChassisInfo): + cooling_level = cls.get_cooling_level() + if cls.last_set_psu_cooling_level == cooling_level: + return + speed = cooling_level * 10 + chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() + for psu in chassis.get_all_psus(): + for psu_fan in psu.get_all_fans(): + psu_fan.set_speed(speed) + cls.last_set_psu_cooling_level = cooling_level + + @classmethod + def monitor_asic_themal_zone(cls): + """This is a protection for asic thermal zone, if asic temperature is greater than hot threshold + THERMAL_ZONE_HYSTERESIS, + and if cooling state is not MAX, we need enforce the cooling state to MAX + """ + asic_temp = utils.read_int_from_file(os.path.join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_TEMPERATURE), raise_exception=True) + hot_thresh = utils.read_int_from_file(os.path.join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_HOT_THRESHOLD), raise_exception=True) + if asic_temp >= hot_thresh + THERMAL_ZONE_HYSTERESIS: + cls.expect_cooling_state = MAX_COOLING_LEVEL + else: + cls.expect_cooling_state = None diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index e7436bd0a5b7..c5c2645b7915 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -1,6 +1,5 @@ from sonic_platform_base.sonic_thermal_control.thermal_action_base import ThermalPolicyActionBase from sonic_platform_base.sonic_thermal_control.thermal_json_object import thermal_json_object -from .thermal import logger class SetFanSpeedAction(ThermalPolicyActionBase): @@ -48,120 +47,30 @@ def execute(self, thermal_info_dict): :param thermal_info_dict: A dictionary stores all thermal information. :return: """ - from .thermal_infos import FanInfo - if FanInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[FanInfo.INFO_NAME], FanInfo): - fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] - for fan in fan_info_obj.get_presence_fans(): - fan.set_speed(self.speed) - logger.log_info('Set all system FAN speed to {}'.format(self.speed)) - - SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, self.speed) - - @classmethod - def set_psu_fan_speed(cls, thermal_info_dict, speed): - from .thermal_infos import ChassisInfo - if ChassisInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[ChassisInfo.INFO_NAME], ChassisInfo): - chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() - for psu in chassis.get_all_psus(): - for psu_fan in psu.get_all_fans(): - psu_fan.set_speed(speed) - - -@thermal_json_object('fan.all.check_and_set_speed') -class CheckAndSetAllFanSpeedAction(SetAllFanSpeedAction): - """ - Action to check thermal zone temperature and recover speed for all fans - """ - def execute(self, thermal_info_dict): - """ - Check thermal zone and set speed for all fans - :param thermal_info_dict: A dictionary stores all thermal information. - :return: - """ from .thermal import Thermal - if Thermal.check_thermal_zone_temperature(): - SetAllFanSpeedAction.execute(self, thermal_info_dict) - - -@thermal_json_object('thermal_control.control') -class ControlThermalAlgoAction(ThermalPolicyActionBase): - """ - Action to control the thermal control algorithm - """ - # JSON field definition - JSON_FIELD_STATUS = 'status' - - def __init__(self): - self.status = True - - def load_from_json(self, json_obj): - """ - Construct ControlThermalAlgoAction via JSON. JSON example: - { - "type": "thermal_control.control" - "status": "true" - } - :param json_obj: A JSON object representing a ControlThermalAlgoAction action. - :return: - """ - if ControlThermalAlgoAction.JSON_FIELD_STATUS in json_obj: - status_str = json_obj[ControlThermalAlgoAction.JSON_FIELD_STATUS].lower() - if status_str == 'true': - self.status = True - elif status_str == 'false': - self.status = False - else: - raise ValueError('Invalid {} field value, please specify true of false'. - format(ControlThermalAlgoAction.JSON_FIELD_STATUS)) - else: - raise ValueError('ControlThermalAlgoAction ' - 'missing mandatory field {} in JSON policy file'. - format(ControlThermalAlgoAction.JSON_FIELD_STATUS)) - - def execute(self, thermal_info_dict): - """ - Disable thermal control algorithm - :param thermal_info_dict: A dictionary stores all thermal information. - :return: - """ - from .thermal_infos import FanInfo - from .thermal import Thermal - from .thermal_conditions import UpdateCoolingLevelToMinCondition - from .fan import Fan - status_changed = Thermal.set_thermal_algorithm_status(self.status, False) - - # Only update cooling level if thermal algorithm status changed - if status_changed: - if self.status: - # Check thermal zone temperature, if all thermal zone temperature - # back to normal, set it to minimum allowed speed to - # save power - UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict) - - logger.log_info('Changed thermal algorithm status to {}'.format(self.status)) + Thermal.set_expect_cooling_level(self.speed / 10) @thermal_json_object('thermal.recover') class ThermalRecoverAction(ThermalPolicyActionBase): - def execute(self, thermal_info_dict): - UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict) - - -class ChangeMinCoolingLevelAction(ThermalPolicyActionBase): UNKNOWN_SKU_COOLING_LEVEL = 6 + def execute(self, thermal_info_dict): from .device_data import DEVICE_DATA - from .fan import Fan + from .thermal import Thermal, MAX_COOLING_LEVEL, MIN_COOLING_LEVEL_FOR_HIGH, logger from .thermal_infos import ChassisInfo - from .thermal_conditions import MinCoolingLevelChangeCondition - from .thermal_conditions import UpdateCoolingLevelToMinCondition + Thermal.monitor_asic_themal_zone() + # Calculate dynamic minimum cooling level + dynamic_min_cooling_level = None chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() if chassis.platform_name not in DEVICE_DATA or 'thermal' not in DEVICE_DATA[chassis.platform_name] or 'minimum_table' not in DEVICE_DATA[chassis.platform_name]['thermal']: - Fan.min_cooling_level = ChangeMinCoolingLevelAction.UNKNOWN_SKU_COOLING_LEVEL + # If there is no minimum_table defined, set dynamic_min_cooling_level to default value + dynamic_min_cooling_level = ThermalRecoverAction.UNKNOWN_SKU_COOLING_LEVEL else: - trust_state = MinCoolingLevelChangeCondition.trust_state - temperature = MinCoolingLevelChangeCondition.temperature + trust_state = Thermal.check_module_temperature_trustable() + temperature = Thermal.get_min_amb_temperature() + temperature = int(temperature / 1000) minimum_table = DEVICE_DATA[chassis.platform_name]['thermal']['minimum_table']['unk_{}'.format(trust_state)] for key, cooling_level in minimum_table.items(): @@ -169,41 +78,19 @@ def execute(self, thermal_info_dict): temp_min = int(temp_range[0].strip()) temp_max = int(temp_range[1].strip()) if temp_min <= temperature <= temp_max: - Fan.min_cooling_level = cooling_level - 10 + dynamic_min_cooling_level = cooling_level - 10 break - - current_cooling_level = Fan.get_cooling_level() - if current_cooling_level < Fan.min_cooling_level: - Fan.set_cooling_level(Fan.min_cooling_level, Fan.min_cooling_level) - SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, Fan.min_cooling_level * 10) - else: - Fan.set_cooling_level(Fan.min_cooling_level, current_cooling_level) - UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict) - -class UpdatePsuFanSpeedAction(ThermalPolicyActionBase): - def execute(self, thermal_info_dict): - from .thermal_conditions import CoolingLevelChangeCondition - SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, CoolingLevelChangeCondition.cooling_level * 10) + if not dynamic_min_cooling_level: + # Should not go to this branch, just in case + logger.log_error('Failed to get dynamic minimum cooling level') + dynamic_min_cooling_level = MAX_COOLING_LEVEL - -class UpdateCoolingLevelToMinAction(ThermalPolicyActionBase): - def execute(self, thermal_info_dict): - self.update_cooling_level_to_minimum(thermal_info_dict) - - @classmethod - def update_cooling_level_to_minimum(cls, thermal_info_dict): - from .fan import Fan - from .thermal import Thermal - from .thermal_conditions import UpdateCoolingLevelToMinCondition - from .thermal_infos import FanInfo - if Thermal.check_thermal_zone_temperature(): - fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] - speed = Fan.min_cooling_level * 10 - for fan in fan_info_obj.get_presence_fans(): - fan.set_speed(speed) - SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, speed) - UpdateCoolingLevelToMinCondition.enable = False + if Thermal.last_set_cooling_level is not None and dynamic_min_cooling_level > Thermal.last_set_cooling_level and dynamic_min_cooling_level >= MIN_COOLING_LEVEL_FOR_HIGH: + # No need to check thermal zone as dynamic_min_cooling_level is greater than previous value and MIN_COOLING_LEVEL_FOR_HIGH + Thermal.set_expect_cooling_level(dynamic_min_cooling_level) else: - UpdateCoolingLevelToMinCondition.enable = True - + min_cooling_level_by_tz = Thermal.get_min_allowed_cooling_level_by_thermal_zone() + if min_cooling_level_by_tz is not None: + cooling_level = max(dynamic_min_cooling_level, min_cooling_level_by_tz) + Thermal.set_expect_cooling_level(cooling_level) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py index 94e18a2e00b0..94366c2c27ac 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py @@ -74,53 +74,3 @@ class AllPsuPresenceCondition(PsuCondition): def is_match(self, thermal_info_dict): psu_info_obj = self.get_psu_info(thermal_info_dict) return len(psu_info_obj.get_absence_psus()) == 0 if psu_info_obj else False - - -class MinCoolingLevelChangeCondition(ThermalPolicyConditionBase): - trust_state = None - temperature = None - - def is_match(self, thermal_info_dict): - from .thermal import Thermal - - trust_state = Thermal.check_module_temperature_trustable() - temperature = Thermal.get_min_amb_temperature() - temperature = temperature / 1000 - - change_cooling_level = False - if trust_state != MinCoolingLevelChangeCondition.trust_state: - MinCoolingLevelChangeCondition.trust_state = trust_state - change_cooling_level = True - - if temperature != MinCoolingLevelChangeCondition.temperature: - MinCoolingLevelChangeCondition.temperature = temperature - change_cooling_level = True - - return change_cooling_level - - -class CoolingLevelChangeCondition(ThermalPolicyConditionBase): - cooling_level = None - - def is_match(self, thermal_info_dict): - from .fan import Fan - current_cooling_level = Fan.get_cooling_level() - if current_cooling_level != CoolingLevelChangeCondition.cooling_level: - CoolingLevelChangeCondition.cooling_level = current_cooling_level - return True - else: - return False - - -class UpdateCoolingLevelToMinCondition(ThermalPolicyConditionBase): - enable = False - def is_match(self, thermal_info_dict): - if not UpdateCoolingLevelToMinCondition.enable: - return False - - from .fan import Fan - current_cooling_level = Fan.get_cooling_level() - if current_cooling_level == Fan.min_cooling_level: - UpdateCoolingLevelToMinCondition.enable = False - return False - return True diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index c0eae332e435..1bbc60d81857 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -1,20 +1,11 @@ from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase -from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy from .thermal_actions import * # lgtm [py/polluting-import] from .thermal_conditions import * # lgtm [py/polluting-import] from .thermal_infos import * # lgtm [py/polluting-import] +from .thermal import logger, MAX_COOLING_LEVEL, Thermal class ThermalManager(ThermalManagerBase): - @classmethod - def initialize(cls): - """ - Initialize thermal manager, including register thermal condition types and thermal action types - and any other vendor specific initialization. - :return: - """ - cls._add_private_thermal_policy() - @classmethod def deinitialize(cls): """ @@ -30,9 +21,8 @@ def start_thermal_control_algorithm(cls): Start thermal control algorithm Returns: - bool: True if set success, False if fail. + bool: True if set success, False if fail. """ - from .thermal import Thermal Thermal.set_thermal_algorithm_status(True) @classmethod @@ -41,24 +31,31 @@ def stop_thermal_control_algorithm(cls): Stop thermal control algorithm Returns: - bool: True if set success, False if fail. + bool: True if set success, False if fail. """ - from .thermal import Thermal Thermal.set_thermal_algorithm_status(False) @classmethod - def _add_private_thermal_policy(cls): - dynamic_min_speed_policy = ThermalPolicy() - dynamic_min_speed_policy.conditions[MinCoolingLevelChangeCondition] = MinCoolingLevelChangeCondition() - dynamic_min_speed_policy.actions[ChangeMinCoolingLevelAction] = ChangeMinCoolingLevelAction() - cls._policy_dict['DynamicMinCoolingLevelPolicy'] = dynamic_min_speed_policy - - update_psu_fan_speed_policy = ThermalPolicy() - update_psu_fan_speed_policy.conditions[CoolingLevelChangeCondition] = CoolingLevelChangeCondition() - update_psu_fan_speed_policy.actions[UpdatePsuFanSpeedAction] = UpdatePsuFanSpeedAction() - cls._policy_dict['UpdatePsuFanSpeedPolicy'] = update_psu_fan_speed_policy + def run_policy(cls, chassis): + if not cls._policy_dict: + return + + try: + cls._collect_thermal_information(chassis) + except Exception as e: + logger.log_error('Failed to collect thermal information {}'.format(repr(e))) + Thermal.set_expect_cooling_level(MAX_COOLING_LEVEL) + Thermal.commit_cooling_level(cls._thermal_info_dict) + return + + for policy in cls._policy_dict.values(): + try: + if policy.is_match(cls._thermal_info_dict): + policy.do_action(cls._thermal_info_dict) + except Exception as e: + logger.log_error('Failed to run thermal policy {} - {}'.format(policy.name, repr(e))) + # In case there is an exception, we put cooling level to max value + Thermal.set_expect_cooling_level(MAX_COOLING_LEVEL) + + Thermal.commit_cooling_level(cls._thermal_info_dict) - update_cooling_level_policy = ThermalPolicy() - update_cooling_level_policy.conditions[UpdateCoolingLevelToMinCondition] = UpdateCoolingLevelToMinCondition() - update_cooling_level_policy.actions[UpdateCoolingLevelToMinAction] = UpdateCoolingLevelToMinAction() - cls._policy_dict['UpdateCoolingLevelPolicy'] = update_cooling_level_policy diff --git a/platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json b/platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json index ace291be1c55..9efe773a9b07 100644 --- a/platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json +++ b/platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json @@ -27,8 +27,7 @@ ], "actions": [ { - "type": "thermal_control.control", - "status": "false" + "type": "thermal.recover" }, { "type": "fan.all.set_speed", @@ -45,8 +44,7 @@ ], "actions": [ { - "type": "thermal_control.control", - "status": "false" + "type": "thermal.recover" }, { "type": "fan.all.set_speed", @@ -66,8 +64,7 @@ ], "actions": [ { - "type": "thermal_control.control", - "status": "true" + "type": "thermal.recover" } ] } diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index a5505feae4ad..ac9b0da32755 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -2,7 +2,7 @@ import sys import pytest import json -from mock import MagicMock +from mock import MagicMock, patch from .mock_platform import MockChassis, MockFan, MockPsu test_path = os.path.dirname(os.path.abspath(__file__)) @@ -11,10 +11,7 @@ from sonic_platform.thermal_manager import ThermalManager from sonic_platform.thermal_infos import FanInfo, PsuInfo -from sonic_platform.thermal import Thermal - -Thermal.check_thermal_zone_temperature = MagicMock() -Thermal.set_thermal_algorithm_status = MagicMock() +from sonic_platform.thermal import Thermal, MAX_COOLING_LEVEL @pytest.fixture(scope='session', autouse=True) @@ -86,51 +83,58 @@ def test_psu_info(): assert not psu_info.is_status_changed() -def test_fan_policy(thermal_manager): +@patch('sonic_platform.thermal.Thermal.monitor_asic_themal_zone', MagicMock()) +@patch('sonic_platform.thermal.Thermal.get_min_allowed_cooling_level_by_thermal_zone', MagicMock(return_value=2)) +@patch('sonic_platform.thermal.Thermal.get_cooling_level', MagicMock(return_value=6)) +@patch('sonic_platform.thermal.Thermal.set_cooling_state') +@patch('sonic_platform.thermal.Thermal.set_cooling_level') +def test_fan_policy(mock_set_cooling_level, mock_set_cooling_state, thermal_manager): chassis = MockChassis() chassis.make_fan_absence() chassis.fan_list.append(MockFan()) + chassis.platform_name = 'some_platform' + thermal_manager.run_policy(chassis) + mock_set_cooling_level.assert_called_with(MAX_COOLING_LEVEL) + mock_set_cooling_state.assert_called_with(MAX_COOLING_LEVEL) + Thermal.expect_cooling_level = None fan_list = chassis.get_all_fans() - assert fan_list[1].speed == 100 - Thermal.set_thermal_algorithm_status.assert_called_with(False, False) - fan_list[0].presence = True - Thermal.check_thermal_zone_temperature = MagicMock(return_value=True) thermal_manager.run_policy(chassis) - Thermal.set_thermal_algorithm_status.assert_called_with(True, False) - assert Thermal.check_thermal_zone_temperature.call_count == 2 - assert fan_list[0].speed == 60 - assert fan_list[1].speed == 60 + mock_set_cooling_level.assert_called_with(6) + mock_set_cooling_state.assert_called_with(6) + Thermal.expect_cooling_level = None fan_list[0].status = False thermal_manager.run_policy(chassis) - Thermal.set_thermal_algorithm_status.assert_called_with(False, False) + mock_set_cooling_level.assert_called_with(MAX_COOLING_LEVEL) + Thermal.expect_cooling_level = None fan_list[0].status = True - Thermal.check_thermal_zone_temperature = MagicMock(return_value=False) thermal_manager.run_policy(chassis) - Thermal.set_thermal_algorithm_status.assert_called_with(True, False) - assert Thermal.check_thermal_zone_temperature.call_count == 2 - assert fan_list[0].speed == 100 - assert fan_list[1].speed == 100 + mock_set_cooling_level.assert_called_with(6) + mock_set_cooling_state.assert_called_with(6) -def test_psu_policy(thermal_manager): +@patch('sonic_platform.thermal.Thermal.monitor_asic_themal_zone', MagicMock()) +@patch('sonic_platform.thermal.Thermal.get_min_allowed_cooling_level_by_thermal_zone', MagicMock(return_value=2)) +@patch('sonic_platform.thermal.Thermal.get_cooling_level', MagicMock(return_value=6)) +@patch('sonic_platform.thermal.Thermal.set_cooling_state') +@patch('sonic_platform.thermal.Thermal.set_cooling_level') +def test_psu_policy(mock_set_cooling_level, mock_set_cooling_state, thermal_manager): chassis = MockChassis() chassis.make_psu_absence() - chassis.fan_list.append(MockFan()) + chassis.platform_name = 'some_platform' thermal_manager.run_policy(chassis) - - fan_list = chassis.get_all_fans() - assert fan_list[0].speed == 100 - Thermal.set_thermal_algorithm_status.assert_called_with(False, False) + mock_set_cooling_level.assert_called_with(MAX_COOLING_LEVEL) + mock_set_cooling_state.assert_called_with(MAX_COOLING_LEVEL) psu_list = chassis.get_all_psus() psu_list[0].presence = True thermal_manager.run_policy(chassis) - Thermal.set_thermal_algorithm_status.assert_called_with(True, False) + mock_set_cooling_level.assert_called_with(6) + mock_set_cooling_state.assert_called_with(6) def test_any_fan_absence_condition(): @@ -282,6 +286,7 @@ def test_load_set_fan_speed_action(): action.load_from_json(json_obj) +@patch('sonic_platform.thermal.Thermal.set_cooling_level', MagicMock()) def test_execute_set_fan_speed_action(): chassis = MockChassis() fan_list = chassis.get_all_fans() @@ -290,83 +295,13 @@ def test_execute_set_fan_speed_action(): fan_info = FanInfo() fan_info.collect(chassis) + Thermal.expect_cooling_level = None from sonic_platform.thermal_actions import SetAllFanSpeedAction action = SetAllFanSpeedAction() - action.speed = 99 + action.speed = 20 action.execute({'fan_info': fan_info}) - assert fan_list[0].speed == 99 - assert fan_list[1].speed == 99 - - -def test_load_control_thermal_algo_action(): - from sonic_platform.thermal_actions import ControlThermalAlgoAction - action = ControlThermalAlgoAction() - json_str = '{\"status\": \"false\"}' - json_obj = json.loads(json_str) - action.load_from_json(json_obj) - assert not action.status - - json_str = '{\"status\": \"true\"}' - json_obj = json.loads(json_str) - action.load_from_json(json_obj) - assert action.status + assert Thermal.expect_cooling_level == 2 - json_str = '{\"status\": \"invalid\"}' - json_obj = json.loads(json_str) - with pytest.raises(ValueError): - action.load_from_json(json_obj) - - json_str = '{\"invalid\": \"true\"}' - json_obj = json.loads(json_str) - with pytest.raises(ValueError): - action.load_from_json(json_obj) - -def test_load_check_and_set_speed_action(): - from sonic_platform.thermal_actions import CheckAndSetAllFanSpeedAction - action = CheckAndSetAllFanSpeedAction() - json_str = '{\"speed\": \"40\"}' - json_obj = json.loads(json_str) - action.load_from_json(json_obj) - assert action.speed == 40 - - json_str = '{\"speed\": \"-1\"}' - json_obj = json.loads(json_str) - with pytest.raises(ValueError): - action.load_from_json(json_obj) - - json_str = '{\"speed\": \"101\"}' - json_obj = json.loads(json_str) - with pytest.raises(ValueError): - action.load_from_json(json_obj) - - json_str = '{\"invalid\": \"60\"}' - json_obj = json.loads(json_str) - with pytest.raises(ValueError): - action.load_from_json(json_obj) - -def test_execute_check_and_set_fan_speed_action(): - chassis = MockChassis() - fan_list = chassis.get_all_fans() - fan_list.append(MockFan()) - fan_list.append(MockFan()) - fan_info = FanInfo() - fan_info.collect(chassis) - Thermal.check_thermal_zone_temperature = MagicMock(return_value=True) - - from sonic_platform.thermal_actions import CheckAndSetAllFanSpeedAction - action = CheckAndSetAllFanSpeedAction() - action.speed = 99 - action.execute({'fan_info': fan_info}) - assert fan_list[0].speed == 99 - assert fan_list[1].speed == 99 - - Thermal.check_thermal_zone_temperature = MagicMock(return_value=False) - fan_list[0].speed = 100 - fan_list[1].speed = 100 - action.speed = 60 - action.execute({'fan_info': fan_info}) - assert fan_list[0].speed == 100 - assert fan_list[1].speed == 100 def test_load_duplicate_condition(): from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy @@ -449,48 +384,63 @@ def check_minimum_table_data(platform, minimum_table): assert cooling_level > previous_cooling_level previous_cooling_level = cooling_level -def test_dynamic_minimum_policy(thermal_manager): - from sonic_platform.thermal_conditions import MinCoolingLevelChangeCondition - from sonic_platform.thermal_actions import ChangeMinCoolingLevelAction - from sonic_platform.thermal_infos import ChassisInfo - from sonic_platform.thermal import Thermal - from sonic_platform.fan import Fan - ThermalManager.initialize() - assert 'DynamicMinCoolingLevelPolicy' in thermal_manager._policy_dict - policy = thermal_manager._policy_dict['DynamicMinCoolingLevelPolicy'] - assert MinCoolingLevelChangeCondition in policy.conditions - assert ChangeMinCoolingLevelAction in policy.actions - - condition = policy.conditions[MinCoolingLevelChangeCondition] - action = policy.actions[ChangeMinCoolingLevelAction] - Thermal.check_module_temperature_trustable = MagicMock(return_value='trust') - Thermal.get_min_amb_temperature = MagicMock(return_value=35000) - assert condition.is_match(None) - assert MinCoolingLevelChangeCondition.trust_state == 'trust' - assert MinCoolingLevelChangeCondition.temperature == 35 - assert not condition.is_match(None) - - Thermal.check_module_temperature_trustable = MagicMock(return_value='untrust') - assert condition.is_match(None) - assert MinCoolingLevelChangeCondition.trust_state == 'untrust' - - Thermal.get_min_amb_temperature = MagicMock(return_value=25000) - assert condition.is_match(None) - assert MinCoolingLevelChangeCondition.temperature == 25 +@patch('sonic_platform.thermal.Thermal.monitor_asic_themal_zone', MagicMock()) +@patch('sonic_platform.thermal.Thermal.get_min_allowed_cooling_level_by_thermal_zone') +@patch('sonic_platform.thermal.Thermal.get_min_amb_temperature') +@patch('sonic_platform.thermal.Thermal.check_module_temperature_trustable') +def test_thermal_recover_policy(mock_check_trustable, mock_get_min_amb, moc_get_min_allowed): + from sonic_platform.thermal_infos import ChassisInfo + from sonic_platform.thermal_actions import ThermalRecoverAction chassis = MockChassis() chassis.platform_name = 'invalid' info = ChassisInfo() info._chassis = chassis thermal_info_dict = {ChassisInfo.INFO_NAME: info} - Fan.get_cooling_level = MagicMock(return_value=5) - Fan.set_cooling_level = MagicMock() + + Thermal.expect_cooling_level = None + action = ThermalRecoverAction() + moc_get_min_allowed.return_value = 2 action.execute(thermal_info_dict) - assert Fan.min_cooling_level == 6 - Fan.set_cooling_level.assert_called_with(6, 6) - Fan.set_cooling_level.call_count = 0 + assert Thermal.expect_cooling_level == 6 + Thermal.last_set_cooling_level = Thermal.expect_cooling_level + Thermal.expect_cooling_level = None chassis.platform_name = 'x86_64-mlnx_msn2700-r0' + mock_check_trustable.return_value = 'trust' + mock_get_min_amb.return_value = 29999 + moc_get_min_allowed.return_value = None + action.execute(thermal_info_dict) + assert Thermal.expect_cooling_level is None + + moc_get_min_allowed.return_value = 4 action.execute(thermal_info_dict) - assert Fan.min_cooling_level == 3 - Fan.set_cooling_level.assert_called_with(3, 5) + assert Thermal.expect_cooling_level == 4 + Thermal.last_set_cooling_level = Thermal.expect_cooling_level + + mock_check_trustable.return_value = 'untrust' + mock_get_min_amb.return_value = 31001 + action.execute(thermal_info_dict) + assert Thermal.expect_cooling_level == 5 + + +@patch('sonic_platform.utils.read_int_from_file') +def test_monitor_asic_themal_zone(mock_read_int): + mock_read_int.side_effect = [111000, 105000] + Thermal.monitor_asic_themal_zone() + assert Thermal.expect_cooling_state == MAX_COOLING_LEVEL + mock_read_int.reset() + mock_read_int.side_effect = [104000, 105000] + Thermal.monitor_asic_themal_zone() + assert Thermal.expect_cooling_state is None + + +def test_set_expect_cooling_level(): + Thermal.set_expect_cooling_level(5) + assert Thermal.expect_cooling_level == 5 + + Thermal.set_expect_cooling_level(3) + assert Thermal.expect_cooling_level == 5 + + Thermal.set_expect_cooling_level(10) + assert Thermal.expect_cooling_level == 10 diff --git a/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json b/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json index 413211b21220..4a0dd28a2d15 100644 --- a/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json +++ b/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json @@ -23,10 +23,6 @@ } ], "actions": [ - { - "type": "thermal_control.control", - "status": "false" - }, { "type": "fan.all.set_speed", "speed": "100" @@ -41,10 +37,6 @@ } ], "actions": [ - { - "type": "thermal_control.control", - "status": "false" - }, { "type": "fan.all.set_speed", "speed": "100" @@ -59,10 +51,6 @@ } ], "actions": [ - { - "type": "thermal_control.control", - "status": "false" - }, { "type": "fan.all.set_speed", "speed": "100" @@ -84,12 +72,7 @@ ], "actions": [ { - "type": "thermal_control.control", - "status": "true" - }, - { - "type": "fan.all.check_and_set_speed", - "speed": "60" + "type": "thermal.recover" } ] }