From 91775cf09c93304f9c2266b86e5c04aa624945d6 Mon Sep 17 00:00:00 2001
From: Salvatore Daniele
Date: Wed, 22 Jan 2025 13:30:50 -0500
Subject: [PATCH] Add preconfig step ExtraConfigMevFwUp

Provide a pre-config to put the MeV firmware into a good state.
Reflashing the firmware takes a very long time. By default, it will not
do this if the firmware is already on the desired version. However, if
we specify to force, we will reflash regardless of the current state.

Signed-off-by: Salvatore Daniele
---
Reviewer notes (text between the three-dash line and the first
"diff --git" is not part of the commit):

 * ExtraConfigMevFwUp asserts cc.kind == "iso", cc.masters[0].kind ==
   "ipu" and that host_side_bmc is set before doing any work.
 * An empty cfg.mev_version is overwritten in place with LATEST_MEV_FW.
 * Unless force_mev_fw_up is set, the flash is skipped when the IMC
   answers ping and /etc/issue.net already contains the requested
   version. If the IMC does not answer ping, the flash proceeds.
 * The "futures" argument is accepted but not used by this step.
 * host.py: run_in_container moves from just above class HostWithBF2 to
   directly after run_or_die, and its log level changes from
   logging.DEBUG to logging.INFO.
 * NOTE(review): line breaks and indentation in this copy were
   reconstructed from the hunk line counts. Confirm the indentation of
   the context lines against the tree, or apply with
   --ignore-whitespace.

 clustersConfig.py    |  4 +++
 extraConfigMev.py    | 61 ++++++++++++++++++++++++++++++++++++++++++++
 extraConfigRunner.py |  2 ++
 host.py              | 16 ++++++------
 4 files changed, 75 insertions(+), 8 deletions(-)
 create mode 100644 extraConfigMev.py

diff --git a/clustersConfig.py b/clustersConfig.py
index 5b834b9f0..fb581db9e 100644
--- a/clustersConfig.py
+++ b/clustersConfig.py
@@ -58,6 +58,10 @@ class ExtraConfigArgs:
 
     base_image: str = ""
 
+    mev_version: str = ""
+
+    force_mev_fw_up: bool = False
+
     def pre_check(self) -> None:
         if self.sriov_network_operator_local:
             if self.name != "sriov_network_operator":
diff --git a/extraConfigMev.py b/extraConfigMev.py
new file mode 100644
index 000000000..b526332d1
--- /dev/null
+++ b/extraConfigMev.py
@@ -0,0 +1,61 @@
+from clustersConfig import ClustersConfig
+import host
+from logger import logger
+from clustersConfig import ExtraConfigArgs
+from bmc import BMC
+from concurrent.futures import Future
+from typing import Optional
+import time
+
+LATEST_MEV_FW = "1.8.0.10052"
+
+
+def ExtraConfigMevFwUp(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[str, Future[Optional[host.Result]]]) -> None:
+    logger.info("Running pre config step to flash MeV firmware on IPU IMC")
+
+    # This preconfig step is expected to run on an IMC only
+    assert cc.kind == "iso"
+    master = cc.masters[0]
+    assert master.kind == "ipu"
+    assert master.host_side_bmc is not None
+    imc = host.Host(master.bmc)
+
+    # Check if a particular firmware version is being requested or if we will use default
+    if cfg.mev_version == "":
+        logger.info("Desired MeV fw release not specified, will install the latest by default")
+        cfg.mev_version = LATEST_MEV_FW
+    logger.info(f"Will ensure {master.bmc} is on firmware version: {cfg.mev_version}")
+
+    # We should only perform an update if it is required, or if the user insists we do so
+    if not cfg.force_mev_fw_up:
+        logger.info("Checking if firmware update is required")
+        if imc.ping():
+            imc.ssh_connect(master.bmc_user, master.bmc_password)
+            ret = imc.run("cat /etc/issue.net")
+            if cfg.mev_version in ret.out:
+                logger.info(f"Current MeV fw version is {ret.out.strip()}, no need to update")
+                return
+
+    # Perform upgrade
+    lh = host.LocalHost()
+
+    fw_up_cmd = f"--dpu-type ipu --imc-address {master.bmc} firmware up --version {cfg.mev_version}"
+
+    ret = lh.run_in_container(fw_up_cmd, interactive=True)
+
+    if not ret.success():
+        logger.error_and_exit(f"Failed to flash new firmware. Error: {ret.err}")
+
+    # Perform coldboot to apply the change
+    ipu_host_bmc = BMC.from_bmc(master.host_side_bmc)
+    ipu_host_bmc.cold_boot()
+    # Cold boot should also reboot IMC, give time to settle before trying to ping IMC
+    time.sleep(20)
+
+    # Access the IMC to validate the flash was successful
+    imc.ssh_connect(master.bmc_user, master.bmc_password)
+    ret = imc.run("cat /etc/issue.net")
+    if cfg.mev_version not in ret.out or ret.returncode != 0:
+        logger.error_and_exit(f"Mev firmware release is not the expected version: {ret.out}")
+
+    logger.info("MeV firmware flash complete")
diff --git a/extraConfigRunner.py b/extraConfigRunner.py
index f20761b20..2836c12ff 100644
--- a/extraConfigRunner.py
+++ b/extraConfigRunner.py
@@ -14,6 +14,7 @@
 from extraConfigMicroshift import ExtraConfigMicroshift
 from extraConfigRhSubscription import ExtraConfigRhSubscription
 from extraConfigDpu import ExtraConfigDpu, ExtraConfigDpuHost
+from extraConfigMev import ExtraConfigMevFwUp
 from clustersConfig import ClustersConfig
 from clustersConfig import ExtraConfigArgs
 from concurrent.futures import Future
@@ -51,6 +52,7 @@ def __init__(self, cc: ClustersConfig):
             "rh_subscription": ExtraConfigRhSubscription,
             "dpu_operator_host": ExtraConfigDpuHost,
             "dpu_operator_dpu": ExtraConfigDpu,
+            "mev_firmware_up": ExtraConfigMevFwUp,
         }
 
     def run(self, to_run: ExtraConfigArgs, futures: dict[str, Future[Optional[host.Result]]]) -> None:
diff --git a/host.py b/host.py
index 8c24fdb48..d675347a0 100644
--- a/host.py
+++ b/host.py
@@ -322,6 +322,14 @@ def run_or_die(self, cmd: str) -> Result:
             logger.debug(ret.out.strip())
         return ret
 
+    def run_in_container(self, cmd: str, interactive: bool = False, verbose: bool = True, dry_run: bool = False) -> Result:
+        name = "dpu-tools"
+        it = "-it" if interactive else ""
+        v = "--verbose" if verbose else ""
+        d = "--dry-run" if dry_run else ""
+        full_command = f"sudo podman run {it} --rm --pull always --replace --pid host --network host --user 0 --name {name} --privileged -v /dev:/dev quay.io/bnemeth/bf {v} {d} {cmd}"
+        return self.run(full_command, logging.INFO)
+
     def close(self) -> None:
         assert self._host is not None
         self._host.close()
@@ -443,14 +451,6 @@ def cx_firmware_upgrade(self) -> Result:
         logger.info("Upgrading CX firmware")
         return self.run_in_container("utils cx-fwup")
 
-    def run_in_container(self, cmd: str, interactive: bool = False, verbose: bool = True, dry_run: bool = False) -> Result:
-        name = "dpu-tools"
-        it = "-it" if interactive else ""
-        v = "--verbose" if verbose else ""
-        d = "--dry-run" if dry_run else ""
-        full_command = f"sudo podman run {it} --rm --pull always --replace --pid host --network host --user 0 --name {name} --privileged -v /dev:/dev quay.io/bnemeth/bf {v} {d} {cmd}"
-        return self.run(full_command, logging.DEBUG)
-
 
 class HostWithBF2(Host):
     def connect_to_bf(self, bf_addr: str) -> None: