Skip to content

Commit

Permalink
[thermalctld] Update line card thermal sensor status to DB (sonic-net…
Browse files Browse the repository at this point in the history
…#211)

Update line card thermal sensor status to DB, includes PSU thermal sensors and SFP thermal sensors on line card. Depends on sonic-net#8422.

#### Description

In thermal update function, update PSU, SFP and direct thermal of line card

#### Motivation and Context

To support modular chassis

#### How Has This Been Tested?

1. Full platform regression, 100% passed
2. Unit test passed
  • Loading branch information
Junchao-Mellanox authored Oct 18, 2021
1 parent 294995b commit 1565a23
Show file tree
Hide file tree
Showing 3 changed files with 160 additions and 86 deletions.
202 changes: 124 additions & 78 deletions sonic-thermalctld/scripts/thermalctld
Original file line number Diff line number Diff line change
Expand Up @@ -538,11 +538,17 @@ class TemperatureUpdater(logger.Logger):

self.is_chassis_system = chassis.is_modular_chassis()
if self.is_chassis_system:
self.module_thermals = set()
my_slot = try_get(chassis.get_my_slot, INVALID_SLOT)
if my_slot != INVALID_SLOT:
table_name = TemperatureUpdater.TEMPER_INFO_TABLE_NAME+'_'+str(my_slot)
chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB")
self.chassis_table = swsscommon.Table(chassis_state_db, table_name)
try:
# Modular chassis does not have to have table CHASSIS_STATE_DB.
# So catch the exception here and ignore it.
table_name = TemperatureUpdater.TEMPER_INFO_TABLE_NAME+'_'+str(my_slot)
chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB")
self.chassis_table = swsscommon.Table(chassis_state_db, table_name)
except Exception as e:
self.chassis_table = None

def deinit(self):
"""
Expand Down Expand Up @@ -576,31 +582,61 @@ class TemperatureUpdater(logger.Logger):
for index, thermal in enumerate(self.chassis.get_all_thermals()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index)
except Exception as e:
self.log_warning('Failed to update thermal status - {}'.format(repr(e)))

self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index)

for psu_index, psu in enumerate(self.chassis.get_all_psus()):
parent_name = 'PSU {}'.format(psu_index + 1)
for thermal_index, thermal in enumerate(psu.get_all_thermals()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_temperature_status(parent_name, thermal, thermal_index)
except Exception as e:
self.log_warning('Failed to update thermal status - {}'.format(repr(e)))

self._refresh_temperature_status(parent_name, thermal, thermal_index)

for sfp_index, sfp in enumerate(self.chassis.get_all_sfps()):
parent_name = 'SFP {}'.format(sfp_index + 1)
for thermal_index, thermal in enumerate(sfp.get_all_thermals()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_temperature_status(parent_name, thermal, thermal_index)
except Exception as e:
self.log_warning('Failed to update thermal status - {}'.format(repr(e)))

self._refresh_temperature_status(parent_name, thermal, thermal_index)

if self.is_chassis_system:
available_thermals = set()
for module_index, module in enumerate(self.chassis.get_all_modules()):
module_name = try_get(module.get_name, 'Module {}'.format(module_index + 1))

for thermal_index, thermal in enumerate(module.get_all_thermals()):
if self.task_stopping_event.is_set():
return

available_thermals.add((thermal, module_name, thermal_index))
self._refresh_temperature_status(module_name, thermal, thermal_index)

for sfp_index, sfp in enumerate(module.get_all_sfps()):
sfp_name = '{} SFP {}'.format(module_name, sfp_index + 1)
for thermal_index, thermal in enumerate(sfp.get_all_thermals()):
if self.task_stopping_event.is_set():
return

available_thermals.add((thermal, sfp_name, thermal_index))
self._refresh_temperature_status(sfp_name, thermal, thermal_index)

for psu_index, psu in enumerate(module.get_all_psus()):
psu_name = '{} PSU {}'.format(module_name, psu_index + 1)
for thermal_index, thermal in enumerate(psu.get_all_thermals()):
if self.task_stopping_event.is_set():
return

available_thermals.add((thermal, psu_name, thermal_index))
self._refresh_temperature_status(psu_name, thermal, thermal_index)


thermals_to_remove = self.module_thermals - available_thermals
self.module_thermals = available_thermals
for thermal, parent_name, thermal_index in thermals_to_remove:
self._remove_thermal_from_db(thermal, parent_name, thermal_index)

self.log_debug("End temperature updating")

def _refresh_temperature_status(self, parent_name, thermal, thermal_index):
Expand All @@ -611,72 +647,82 @@ class TemperatureUpdater(logger.Logger):
:param thermal_index: Index of the thermal object in platform chassis
:return:
"""
name = try_get(thermal.get_name, '{} Thermal {}'.format(parent_name, thermal_index + 1))
try:
name = try_get(thermal.get_name, '{} Thermal {}'.format(parent_name, thermal_index + 1))

# Only save entity info for thermals that belong to chassis and PSU
# for SFP thermal, they don't need save entity info because snmp can deduce the relation from TRANSCEIVER_DOM_SENSOR
# and as we save logical port in TRANSCEIVER_INFO table, for split cable, a SFP thermal might have multiple parent
# logical port
if 'SFP' not in parent_name:
update_entity_info(self.phy_entity_table, parent_name, name, thermal, thermal_index + 1)

if name not in self.temperature_status_dict:
self.temperature_status_dict[name] = TemperatureStatus()

temperature_status = self.temperature_status_dict[name]

high_threshold = NOT_AVAILABLE
low_threshold = NOT_AVAILABLE
high_critical_threshold = NOT_AVAILABLE
low_critical_threshold = NOT_AVAILABLE
maximum_temperature = NOT_AVAILABLE
minimum_temperature = NOT_AVAILABLE
temperature = try_get(thermal.get_temperature)
is_replaceable = try_get(thermal.is_replaceable, False)
if temperature != NOT_AVAILABLE:
temperature_status.set_temperature(name, temperature)
minimum_temperature = try_get(thermal.get_minimum_recorded)
maximum_temperature = try_get(thermal.get_maximum_recorded)
high_threshold = try_get(thermal.get_high_threshold)
low_threshold = try_get(thermal.get_low_threshold)
high_critical_threshold = try_get(thermal.get_high_critical_threshold)
low_critical_threshold = try_get(thermal.get_low_critical_threshold)

warning = False
if temperature != NOT_AVAILABLE and temperature_status.set_over_temperature(temperature, high_threshold):
self._log_on_status_changed(not temperature_status.over_temperature,
'High temperature warning cleared: {} temperature restored to {}C, high threshold {}C'.
format(name, temperature, high_threshold),
'High temperature warning: {} current temperature {}C, high threshold {}C'.
format(name, temperature, high_threshold)
)
warning = warning | temperature_status.over_temperature

if temperature != NOT_AVAILABLE and temperature_status.set_under_temperature(temperature, low_threshold):
self._log_on_status_changed(not temperature_status.under_temperature,
'Low temperature warning cleared: {} temperature restored to {}C, low threshold {}C'.
format(name, temperature, low_threshold),
'Low temperature warning: {} current temperature {}C, low threshold {}C'.
format(name, temperature, low_threshold)
)
warning = warning | temperature_status.under_temperature

fvs = swsscommon.FieldValuePairs(
[('temperature', str(temperature)),
('minimum_temperature', str(minimum_temperature)),
('maximum_temperature', str(maximum_temperature)),
('high_threshold', str(high_threshold)),
('low_threshold', str(low_threshold)),
('warning_status', str(warning)),
('critical_high_threshold', str(high_critical_threshold)),
('critical_low_threshold', str(low_critical_threshold)),
('is_replaceable', str(is_replaceable)),
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
])

# Only save entity info for thermals that belong to chassis and PSU
# for SFP thermal, they don't need save entity info because snmp can deduce the relation from TRANSCEIVER_DOM_SENSOR
# and as we save logical port in TRANSCEIVER_INFO table, for split cable, a SFP thermal might have multiple parent
# logical port
if 'SFP' not in parent_name:
update_entity_info(self.phy_entity_table, parent_name, name, thermal, thermal_index + 1)

if name not in self.temperature_status_dict:
self.temperature_status_dict[name] = TemperatureStatus()

temperature_status = self.temperature_status_dict[name]

high_threshold = NOT_AVAILABLE
low_threshold = NOT_AVAILABLE
high_critical_threshold = NOT_AVAILABLE
low_critical_threshold = NOT_AVAILABLE
maximum_temperature = NOT_AVAILABLE
minimum_temperature = NOT_AVAILABLE
temperature = try_get(thermal.get_temperature)
is_replaceable = try_get(thermal.is_replaceable, False)
if temperature != NOT_AVAILABLE:
temperature_status.set_temperature(name, temperature)
minimum_temperature = try_get(thermal.get_minimum_recorded)
maximum_temperature = try_get(thermal.get_maximum_recorded)
high_threshold = try_get(thermal.get_high_threshold)
low_threshold = try_get(thermal.get_low_threshold)
high_critical_threshold = try_get(thermal.get_high_critical_threshold)
low_critical_threshold = try_get(thermal.get_low_critical_threshold)

warning = False
if temperature != NOT_AVAILABLE and temperature_status.set_over_temperature(temperature, high_threshold):
self._log_on_status_changed(not temperature_status.over_temperature,
'High temperature warning cleared: {} temperature restored to {}C, high threshold {}C'.
format(name, temperature, high_threshold),
'High temperature warning: {} current temperature {}C, high threshold {}C'.
format(name, temperature, high_threshold)
)
warning = warning | temperature_status.over_temperature

if temperature != NOT_AVAILABLE and temperature_status.set_under_temperature(temperature, low_threshold):
self._log_on_status_changed(not temperature_status.under_temperature,
'Low temperature warning cleared: {} temperature restored to {}C, low threshold {}C'.
format(name, temperature, low_threshold),
'Low temperature warning: {} current temperature {}C, low threshold {}C'.
format(name, temperature, low_threshold)
)
warning = warning | temperature_status.under_temperature
self.table.set(name, fvs)
if self.is_chassis_system and self.chassis_table is not None:
self.chassis_table.set(name, fvs)
except Exception as e:
self.log_warning('Failed to update thermal status for {} - {}'.format(name, repr(e)))

fvs = swsscommon.FieldValuePairs(
[('temperature', str(temperature)),
('minimum_temperature', str(minimum_temperature)),
('maximum_temperature', str(maximum_temperature)),
('high_threshold', str(high_threshold)),
('low_threshold', str(low_threshold)),
('warning_status', str(warning)),
('critical_high_threshold', str(high_critical_threshold)),
('critical_low_threshold', str(low_critical_threshold)),
('is_replaceable', str(is_replaceable)),
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
])
def _remove_thermal_from_db(self, thermal, parent_name, thermal_index):
name = try_get(thermal.get_name, '{} Thermal {}'.format(parent_name, thermal_index + 1))
self.table._del(name)

self.table.set(name, fvs)
if self.is_chassis_system and self.chassis_table is not None:
self.chassis_table.set(name, fvs)
if self.chassis_table is not None:
self.chassis_table._del(name)


class ThermalMonitor(ProcessTaskBase):
Expand Down
16 changes: 16 additions & 0 deletions sonic-thermalctld/tests/mock_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,17 @@ def make_error_thermal(self):
thermal = MockErrorThermal()
self._thermal_list.append(thermal)

def make_module_thermal(self):
module = MockModule()
self._module_list.append(module)
sfp = MockSfp()
sfp._thermal_list.append(MockThermal())
psu = MockPsu()
psu._thermal_list.append(MockThermal())
module._sfp_list.append(sfp)
module._psu_list.append(psu)
module._thermal_list.append(MockThermal())

def is_modular_chassis(self):
return self._is_chassis_system

Expand Down Expand Up @@ -430,3 +441,8 @@ def get_position_in_parent(self):

def is_replaceable(self):
return self._replaceable


class MockModule(module_base.ModuleBase):
def __init__(self):
super(MockModule, self).__init__()
28 changes: 20 additions & 8 deletions sonic-thermalctld/tests/test_thermalctld.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,15 +470,15 @@ def test_update_psu_thermals(self):
temperature_updater.update()
assert temperature_updater.log_warning.call_count == 0

temperature_updater._refresh_temperature_status = mock.MagicMock(side_effect=Exception("Test message"))
mock_thermal.get_temperature = mock.MagicMock(side_effect=Exception("Test message"))
temperature_updater.update()
assert temperature_updater.log_warning.call_count == 1

# TODO: Clean this up once we no longer need to support Python 2
if sys.version_info.major == 3:
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message')")
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for PSU 1 Thermal 1 - Exception('Test message')")
else:
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message',)")
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for PSU 1 Thermal 1 - Exception('Test message',)")

def test_update_sfp_thermals(self):
chassis = MockChassis()
Expand All @@ -490,15 +490,15 @@ def test_update_sfp_thermals(self):
temperature_updater.update()
assert temperature_updater.log_warning.call_count == 0

temperature_updater._refresh_temperature_status = mock.MagicMock(side_effect=Exception("Test message"))
mock_thermal.get_temperature = mock.MagicMock(side_effect=Exception("Test message"))
temperature_updater.update()
assert temperature_updater.log_warning.call_count == 1

# TODO: Clean this up once we no longer need to support Python 2
if sys.version_info.major == 3:
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message')")
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for SFP 1 Thermal 1 - Exception('Test message')")
else:
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message',)")
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for SFP 1 Thermal 1 - Exception('Test message',)")

def test_update_thermal_with_exception(self):
chassis = MockChassis()
Expand All @@ -514,16 +514,28 @@ def test_update_thermal_with_exception(self):
# TODO: Clean this up once we no longer need to support Python 2
if sys.version_info.major == 3:
expected_calls = [
mock.call("Failed to update thermal status - Exception('Failed to get temperature')"),
mock.call("Failed to update thermal status for chassis 1 Thermal 1 - Exception('Failed to get temperature')"),
mock.call('High temperature warning: chassis 1 Thermal 2 current temperature 3C, high threshold 2C')
]
else:
expected_calls = [
mock.call("Failed to update thermal status - Exception('Failed to get temperature',)"),
mock.call("Failed to update thermal status for chassis 1 Thermal 1 - Exception('Failed to get temperature',)"),
mock.call('High temperature warning: chassis 1 Thermal 2 current temperature 3C, high threshold 2C')
]
assert temperature_updater.log_warning.mock_calls == expected_calls

def test_update_module_thermals(self):
chassis = MockChassis()
chassis.make_module_thermal()
chassis.set_modular_chassis(True)
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
temperature_updater.update()
assert len(temperature_updater.module_thermals) == 3

chassis._module_list = []
temperature_updater.update()
assert len(temperature_updater.module_thermals) == 0


# Modular chassis-related tests

Expand Down

0 comments on commit 1565a23

Please sign in to comment.