Skip to content

Commit

Permalink
[Smartswitch][Mellanox] Enable thermal updater class for DPUs on swit…
Browse files Browse the repository at this point in the history
…ch (sonic-net#20945)

- Why I did it
In order to use the thermal data from DPUs to be used by hw-mgmt algorithms for controlling the speed of the fans.
Changes in

thermal_manager.py : Updated condition to consider if DPUs are present along with host management mode
thermal_updater.py: Added new APIs for DPUs and made configuration to update start and stop based on the initialization

- How I did it
Parse the DPUs information from the chassis_instance modules, get the temperature information from the module.get_temperature_dict information, This will be added to the Platform API PR

- How i tested it
Added new unit test to cover the thermal algo.
  • Loading branch information
gpunathilell authored Dec 5, 2024
1 parent 8e0f1c6 commit 7c9f85e
Show file tree
Hide file tree
Showing 4 changed files with 454 additions and 5 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from . import utils
from .thermal_updater import ThermalUpdater
from sonic_py_common import logger

import sys

sys.path.append('/run/hw-management/bin')

try:
import hw_management_dpu_thermal_update
except ImportError:
# For unit test and for non-smartswitch systems, these functions should not be called
from unittest import mock
hw_management_dpu_thermal_update = mock.MagicMock()
hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set = mock.MagicMock()
hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set = mock.MagicMock()
hw_management_dpu_thermal_update.thermal_data_dpu_drive_set = mock.MagicMock()
hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear = mock.MagicMock()
hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear = mock.MagicMock()
hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear = mock.MagicMock()

CRIT_THRESH = "critical_high_threshold"
HIGH_THRESH = "high_threshold"
TEMPERATURE_DATA = "temperature"
DPU_STATUS_OFFLINE = "Offline"
DPU_STATUS_ONLINE = "Online"
CPU_FIELD = "CPU"
NVME_FIELD = "NVME"
DDR_FIELD = "DDR"
dpu_func_dict = {
CPU_FIELD: hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set,
NVME_FIELD: hw_management_dpu_thermal_update.thermal_data_dpu_drive_set,
DDR_FIELD: hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set,
}

ERROR_READ_THERMAL_DATA = 254000

TC_CONFIG_FILE = '/run/hw-management/config/tc_config.json'
logger = logger.Logger('smart-switch-thermal-updater')


class SmartswitchThermalUpdater(ThermalUpdater):
def __init__(self, sfp_list, dpu_list, is_host_mgmt_mode=True):
super().__init__(sfp_list=sfp_list)
self._dpu_list = dpu_list
self._dpu_status = {}
self.host_mgmt_mode = is_host_mgmt_mode

def load_tc_config_dpu(self):
dpu_poll_interval = 3
data = utils.load_json_file(TC_CONFIG_FILE, log_func=None)
if data:
dev_parameters = data.get('dev_parameters', {})
dpu_parameter = dev_parameters.get('dpu\\d+_module', {})
dpu_poll_interval_config = dpu_parameter.get('poll_time')
dpu_poll_interval = int(dpu_poll_interval_config) / 2 if dpu_poll_interval_config else dpu_poll_interval
else:
logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval')
logger.log_notice(f'DPU polling interval: {dpu_poll_interval}')
self._timer.schedule(dpu_poll_interval, self.update_dpu)

def start(self):
self.clean_thermal_data_dpu()
self.load_tc_config_dpu()
if self.host_mgmt_mode:
super().start()
else:
self._timer.start()

def stop(self):
if self.host_mgmt_mode:
super().stop()
else:
self._timer.stop()

def clean_thermal_data_dpu(self):
for dpu in self._dpu_list:
self.thermal_data_dpu_clear(dpu.get_hw_mgmt_id())

def thermal_data_dpu_clear(self, dpu_index):
hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear(dpu_index)
hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear(dpu_index)
hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear(dpu_index)

def get_dpu_temperature_data_from_dict_obj(self, dpu_component_temperature_data, field_name):
value = dpu_component_temperature_data.get(field_name)
fault_state = False
if not value:
fault_state = True
return 0, fault_state
try:
int_value = int(float(value))
except ValueError:
logger.log_error(f"Unable to obtain temperature data for DPU {field_name}: {value}")
int_value = 0
fault_state = True
return int_value, fault_state

def get_dpu_component_temperature_data(self, dpu_temperature_data, component_name):
dpu_component_temperature_data = dpu_temperature_data.get(component_name, {})
output_dict = {}
output_false_state = False
for value in [TEMPERATURE_DATA, HIGH_THRESH, CRIT_THRESH]:
output_dict[value], fault_state = self.get_dpu_temperature_data_from_dict_obj(dpu_component_temperature_data, value)
output_false_state = output_false_state or fault_state
return output_dict[TEMPERATURE_DATA], output_dict[HIGH_THRESH], output_dict[CRIT_THRESH], ERROR_READ_THERMAL_DATA if output_false_state else 0

def update_dpu_temperature(self, dpu, fault_state=False):
dpu_temperature_data = dpu.get_temperature_dict() if not fault_state else {}
for key, func in dpu_func_dict.items():
temp_data, temp_thresh, temp_crit_thresh, fault_val = self.get_dpu_component_temperature_data(dpu_temperature_data, key)
return_val = func(dpu.get_hw_mgmt_id(), temp_data, temp_thresh, temp_crit_thresh, fault_val)
if not return_val:
logger.log_error(f"Unable to update Temperature data to hw-mgmt for {key} for {dpu.get_name()}")

def update_single_dpu(self, dpu):
try:
dpu_oper_status = dpu.get_oper_status()
pre_oper_status = self._dpu_status.get(dpu.get_name())
if dpu_oper_status == DPU_STATUS_ONLINE:
self.update_dpu_temperature(dpu)
elif pre_oper_status != dpu_oper_status:
# If dpu is shutdown from previous execution
self.thermal_data_dpu_clear(dpu.get_hw_mgmt_id())
if pre_oper_status != dpu_oper_status:
# If there is a change in oper_status (irrespective of type of change)
self._dpu_status[dpu.get_name()] = dpu_oper_status
except Exception as e:
logger.log_error(f'Failed to update DPU {dpu.get_hw_mgmt_id()} thermal data - {e}')
self.update_dpu_temperature(dpu, fault_state=True)

def update_dpu(self):
for dpu in self._dpu_list:
self.update_single_dpu(dpu)
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
# limitations under the License.
#
from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase
from . import thermal_updater
from . import thermal_updater
from . import smartswitch_thermal_updater
from .device_data import DeviceDataManager


Expand All @@ -33,18 +34,27 @@ def initialize(cls):
and any other vendor specific initialization.
:return:
"""
if DeviceDataManager.is_module_host_management_mode():
dpus_present = DeviceDataManager.get_platform_dpus_data()
host_mgmt_mode = DeviceDataManager.is_module_host_management_mode()
if not dpus_present and host_mgmt_mode:
# Non smart switch behaviour has highest priority
from .chassis import Chassis
cls.thermal_updater_task = thermal_updater.ThermalUpdater(Chassis.chassis_instance.get_all_sfps())
cls.thermal_updater_task = thermal_updater.ThermalUpdater(sfp_list=Chassis.chassis_instance.get_all_sfps())
elif dpus_present:
from .chassis import Chassis
dpus = Chassis.chassis_instance.get_all_modules()
cls.thermal_updater_task = smartswitch_thermal_updater.SmartswitchThermalUpdater(sfp_list=Chassis.chassis_instance.get_all_sfps(),
dpu_list=dpus,
is_host_mgmt_mode=host_mgmt_mode)
if cls.thermal_updater_task:
cls.thermal_updater_task.start()


@classmethod
def deinitialize(cls):
"""
Destroy thermal manager, including any vendor specific cleanup. The default behavior of this function
is a no-op.
:return:
"""
if DeviceDataManager.is_module_host_management_mode() and cls.thermal_updater_task:
if cls.thermal_updater_task:
cls.thermal_updater_task.stop()
Loading

0 comments on commit 7c9f85e

Please sign in to comment.