diff --git a/tests/e2e-test-framework/conftest.py b/tests/e2e-test-framework/conftest.py
index c6df5aff4..8912e28e7 100644
--- a/tests/e2e-test-framework/conftest.py
+++ b/tests/e2e-test-framework/conftest.py
@@ -1,6 +1,6 @@
 import logging
 from datetime import datetime
-from typing import Generator
+from typing import Generator, Dict
 import pytest
 import re

@@ -111,7 +111,7 @@ def get_utils(request) -> Utils:
         namespace=request.config.getoption("--namespace")
     )

-def get_ssh_executors(request) -> dict[str, SSHCommandExecutor]:
+def get_ssh_executors(request) -> Dict[str, SSHCommandExecutor]:
     utils = get_utils(request)
     ips = utils.get_worker_ips() + utils.get_controlplane_ips()
     executors = {ip: SSHCommandExecutor(ip_address=ip, username=utils.vm_user, password=utils.vm_cred) for ip in ips}
@@ -122,11 +122,11 @@ def utils(request) -> Utils:
     return get_utils(request)

 @pytest.fixture(scope="session")
-def ssh_executors(request) -> dict[str, SSHCommandExecutor]:
+def ssh_executors(request) -> Dict[str, SSHCommandExecutor]:
     return get_ssh_executors(request)

 @pytest.fixture(scope="session")
-def drive_utils_executors(request) -> dict[str, DriveUtils]:
+def drive_utils_executors(request) -> Dict[str, DriveUtils]:
     ssh_execs = get_ssh_executors(request)
     return {ip: DriveUtils(executor) for ip, executor in ssh_execs.items()}

@@ -138,7 +138,7 @@ def link_requirements_in_background(request):
     pytest.threads.append(requirements_thread)

 @pytest.fixture(autouse=True)
-def keep_drive_count(drive_utils_executors: dict[str, DriveUtils]) -> Generator[None, None, None]:
+def keep_drive_count(drive_utils_executors: Dict[str, DriveUtils]) -> Generator[None, None, None]:
     hosts_per_node_before = {ip: drive_utils.get_all_hosts() for ip, drive_utils in drive_utils_executors.items()}
     yield
     hosts_per_node_after = {ip: drive_utils.get_all_hosts() for ip, drive_utils in drive_utils_executors.items()}
@@ -146,7 +146,7 @@ def keep_drive_count(drive_utils_executors: dict[str, DriveUtils]) -> Generator[
         drive_utils.rescan_missing_hosts(before=hosts_per_node_before[ip], after=hosts_per_node_after[ip])

 @pytest.fixture(autouse=True)
-def wipe_drives(drive_utils_executors: dict[str, DriveUtils]) -> Generator[None, None, None]:
+def wipe_drives(drive_utils_executors: Dict[str, DriveUtils]) -> Generator[None, None, None]:
     yield
     for _, drive_utils in drive_utils_executors.items():
         drive_utils.wipe_drives()
diff --git a/tests/e2e-test-framework/framework/const.py b/tests/e2e-test-framework/framework/const.py
index 1bc48755b..2b6b5ebd0 100644
--- a/tests/e2e-test-framework/framework/const.py
+++ b/tests/e2e-test-framework/framework/const.py
@@ -31,6 +31,7 @@
 # statuses
 STATUS_ONLINE = "ONLINE"
 STATUS_OFFLINE = "OFFLINE"
+STATUS_OPERATIVE = "OPERATIVE"

 # annotation keys
 DRIVE_HEALTH_ANNOTATION = "health"
@@ -48,6 +49,11 @@
 # fake attach events
 FAKE_ATTACH_INVOLVED = "FakeAttachInvolved"
 FAKE_ATTACH_CLEARED = "FakeAttachCleared"
+DRIVE_HEALTH_FAILURE = "DriveHealthFailure"
+DRIVE_READY_FOR_REMOVAL = "DriveReadyForRemoval"
+VOLUME_BAD_HEALTH = "VolumeBadHealth"
+DRIVE_READY_FOR_PHYSICAL_REMOVAL = "DriveReadyForPhysicalRemoval"
+DRIVE_SUCCESSFULLY_REMOVED = "DriveSuccessfullyRemoved"

 # drive events
 DRIVE_HEALTH_FAILURE_EVENT = "DriveHealthFailure"
@@ -60,3 +66,6 @@
 ACR_PLURAL = "availablecapacityreservations"
 LVG_PLURAL = "logicalvolumegroups"
 VOLUMES_PLURAL = "volumes"
+
+# led
+LED_STATE = "1,2"
diff --git a/tests/e2e-test-framework/framework/utils.py b/tests/e2e-test-framework/framework/utils.py
index 6fd77a4f1..0dd9acfb4 100644
--- a/tests/e2e-test-framework/framework/utils.py
+++ b/tests/e2e-test-framework/framework/utils.py
@@ -391,7 +391,8 @@ def wait_volume(
         expected_status: Optional[str] = None,
         expected_health: Optional[str] = None,
         expected_usage: Optional[str] = None,
-        timeout: int = 60,
+        expected_operational_status: Optional[str] = None,
+        timeout: int = 90,
     ) -> bool:
         """
         Waits for a volume with the given name to meet the expected status, health, and usage within the given timeout.
@@ -401,6 +402,7 @@ def wait_volume(
             expected_status (Optional[str], optional): The expected status of the volume. Defaults to None.
             expected_health (Optional[str], optional): The expected health of the volume. Defaults to None.
             expected_usage (Optional[str], optional): The expected usage of the volume. Defaults to None.
-            timeout (int): The maximum time to wait for the volume in seconds. Defaults to 60.
+            expected_operational_status (Optional[str], optional): The expected operational status of the volume. Defaults to None.
+            timeout (int): The maximum time to wait for the volume in seconds. Defaults to 90.

         Returns:
@@ -413,6 +415,8 @@ def wait_volume(
             expected["Usage"] = expected_usage
         if expected_health:
             expected["Health"] = expected_health
+        if expected_operational_status:
+            expected["OperationalStatus"] = expected_operational_status

         def callback():
             return self.list_volumes(name)[0]
@@ -427,7 +431,8 @@ def wait_drive(
         expected_status: Optional[str] = None,
         expected_health: Optional[str] = None,
         expected_usage: Optional[str] = None,
-        timeout: int = 60,
+        expected_led_state: Optional[str] = None,
+        timeout: int = 90,
     ) -> bool:
         """
         Waits for a drive with the given name to meet the expected status, health, and usage within the given timeout.
@@ -437,6 +442,7 @@ def wait_drive(
             expected_status (Optional[str], optional): The expected status of the drive. Defaults to None.
             expected_health (Optional[str], optional): The expected health of the drive. Defaults to None.
             expected_usage (Optional[str], optional): The expected usage of the drive. Defaults to None.
-            timeout (int): The maximum time to wait for the drive in seconds. Defaults to 60.
+            expected_led_state (Optional[str], optional): The expected LED state of the drive. Defaults to None.
+            timeout (int): The maximum time to wait for the drive in seconds. Defaults to 90.

         Returns:
@@ -449,6 +455,8 @@ def wait_drive(
             expected["Usage"] = expected_usage
         if expected_health:
             expected["Health"] = expected_health
+        if expected_led_state:
+            expected["LEDState"] = expected_led_state

         def callback():
             return self.custom_objects_api.get_cluster_custom_object(
@@ -463,7 +471,7 @@ def _wait_cr(
         self,
         expected: Dict[str, str],
         get_cr_fn: Callable[[None], Any],
-        timeout: int = 60,
+        timeout: int = 90,
     ) -> bool:
         """
         Waits for the custom resource (CR) to reach the expected state.
@@ -471,7 +479,7 @@ def _wait_cr(
         Args:
             expected (dict): The expected state of the CR's spec.
             get_cr_fn (callable): The function to get the CR.
-            timeout (int, optional): The timeout for checking the CR, defaults to 60.
+            timeout (int, optional): The timeout for checking the CR, defaults to 90.

         Returns:
             bool: True if the CR meets the expected state within the given timeout, False otherwise.
@@ -487,7 +495,7 @@ def _wait_cr(
             cr = get_cr_fn()

             for key, value in expected.items():
-                if cr["spec"][key] == value:
+                if cr["spec"][key] in value:
                     assertions[key] = True

             if all(assertions.values()):
@@ -694,14 +702,14 @@ def recreate_pod(self, name: str, namespace: str) -> V1Pod:
         time.sleep(5)
         pod = self.list_pods(name, namespace=namespace)[0]
         assert self.is_pod_ready(
-            name, timeout=120
-        ), "pod not ready after 120 seconds timeout"
+            name, timeout=150
+        ), "pod not ready after 150 seconds timeout"
         logging.info(f"pod {name} is ready")
         return pod

     def wait_for_event_with_reason(
-        self, reason: str, timeout_seconds: int = 60
+        self, reason: str, timeout_seconds: int = 90
     ) -> bool:
         """
         Wait for an event with a specified reason in the Kubernetes cluster.
@@ -729,29 +737,82 @@ def wait_for_event_with_reason(
         return False

     def clear_pvc_and_pod(
-        self, pod_name: str, pvc_name: str, volume_name: str, namespace: str
+        self, pod_name: str, namespace: str, pvc_name: Optional[str] = None, volume_name: Optional[str] = None
    ) -> None:
         """
         Clears the PersistentVolumeClaim (PVC) and the Pod with the specified names in the Kubernetes cluster.
+        If the PVC or volume name is not specified, all PVCs used by the given Pod are cleared.

         Args:
             pod_name (str): The name of the Pod to be cleared.
-            pvc_name (str): The name of the PersistentVolumeClaim to be cleared.
-            volume_name (str): The name of the volume to be checked.
             namespace (str): The namespace of the PersistentVolumeClaim and Pod.
+            pvc_name (Optional[str], optional): The name of the PersistentVolumeClaim to be cleared.
+            volume_name (Optional[str], optional): The name of the volume to be checked.

         Returns:
             None: This function does not return anything.
         """
-        logging.info(f"clearing pvc {pvc_name} and pod {pod_name}")
-        self.core_v1_api.delete_namespaced_persistent_volume_claim(
-            name=pvc_name,
-            namespace=namespace,
-        )
+        if pvc_name and volume_name:
+            logging.info(f"clearing pvc {pvc_name}")
+            self.core_v1_api.delete_namespaced_persistent_volume_claim(
+                name=pvc_name,
+                namespace=namespace,
+            )
+            assert self.wait_volume(
+                name=volume_name,
+                expected_usage=const.USAGE_RELEASED,
+            ), f"Volume: {volume_name} failed to reach expected usage: {const.USAGE_RELEASED}"
+        else:
+            pvcs = self.list_persistent_volume_claims(
+                namespace=namespace, pod_name=pod_name
+            )
+            for pvc in pvcs:
+                logging.info(f"clearing pvc {pvc.metadata.name}")
+                self.core_v1_api.delete_namespaced_persistent_volume_claim(
+                    name=pvc.metadata.name,
+                    namespace=namespace,
+                )
+            for pvc in pvcs:
+                assert self.wait_volume(
+                    name=pvc.spec.volume_name,
+                    expected_usage=const.USAGE_RELEASED,
+                ), f"Volume: {pvc.spec.volume_name} failed to reach expected usage: {const.USAGE_RELEASED}"
+                logging.info(f"volume: {pvc.spec.volume_name} reached expected usage: {const.USAGE_RELEASED}")
+
+        time.sleep(30)  # give the cluster time to settle before recreating the pod
+        self.recreate_pod(name=pod_name, namespace=namespace)
+
+    def check_drive_cr_not_exist(self, drive_name: str, timeout: int = 120) -> bool:
+        """
+        Checks if a custom resource (CR) representing a drive with the given name does not exist.

-        assert self.wait_volume(
-            name=volume_name,
-            expected_usage=const.USAGE_RELEASED,
-        ), f"Volume: {volume_name} failed to reach expected usage: {const.USAGE_RELEASED}"
+        Args:
+            drive_name (str): The name of the drive CR.
+            timeout (int, optional): The timeout for checking the CR, defaults to 120.

-        self.recreate_pod(name=pod_name, namespace=namespace)
+        Returns:
+            bool: True if the drive CR was removed within the given timeout, False otherwise.
+ """ + end_time = time.time() + timeout + while time.time() < end_time: + try: + self.custom_objects_api.get_cluster_custom_object( + group=const.CR_GROUP, + version=const.CR_VERSION, + plural="drives", + name=drive_name, + ) + logging.warning(f"Drive CR '{drive_name}' still exists.") + except ApiException as e: + if e.status == 404: + logging.info(f"Drive CR {drive_name} does not exist.") + return True + else: + raise + time.sleep(2) + logging.warning( + f"Drive CR '{drive_name}' still exists after {timeout} seconds timeout." + ) + return False + + \ No newline at end of file diff --git a/tests/e2e-test-framework/tests/test_drive_replacement_multi_volumes.py b/tests/e2e-test-framework/tests/test_drive_replacement_multi_volumes.py new file mode 100644 index 000000000..4d2d92ee2 --- /dev/null +++ b/tests/e2e-test-framework/tests/test_drive_replacement_multi_volumes.py @@ -0,0 +1,180 @@ +import pytest +import logging +from typing import Dict + +import framework.const as const + +from framework.sts import STS +from framework.utils import Utils +from framework.drive import DriveUtils + + + + +class TestAutoDriveReplacementWithMultipleVolumesPerPod: + @classmethod + @pytest.fixture(autouse=True) + def setup_class( + cls, + namespace: str, + drive_utils_executors: Dict[str, DriveUtils], + utils: Utils, + ): + cls.namespace = namespace + cls.name = "test-auto-drive-replacement-multiple-volumes" + cls.timeout = 120 + cls.replicas = 1 + + cls.utils = utils + + cls.drive_utils = drive_utils_executors + cls.sts = STS(cls.namespace, cls.name, cls.replicas) + cls.sts.delete() + cls.sts.create(storage_classes=[const.SSD_SC, const.HDD_SC]) + + yield + + cls.sts.delete() + + @pytest.mark.hal + def test_5921_auto_drive_replacement_with_multiple_volumes_per_pod(self): + # 1. get volume and volume groups for deployed pod + assert ( + self.sts.verify(self.timeout) is True + ), f"STS: {self.name} failed to reach desired number of replicas: {self.replicas}" + pod = self.utils.list_pods(name_prefix=self.name)[0] + node_ip = self.utils.get_pod_node_ip( + pod_name=pod.metadata.name, namespace=self.namespace + ) + volumes = self.utils.list_volumes(pod_name=pod.metadata.name) + # get all drives + drives = [] + for volume in volumes: + drive = self.utils.get_drive_cr( + volume_name=volume["metadata"]["name"], + namespace=volume["metadata"]["namespace"]) + drives.append(drive) + # 2. simulate drive failure. Annotate drive used by pod with health=BAD + for drive in drives: + drive_name = drive["metadata"]["name"] + self.utils.annotate_custom_resource( + resource_name=drive_name, + resource_type="drives", + annotation_key="health", + annotation_value="BAD" + ) + logging.info(f"drive: {drive_name} was annotated with health=BAD") + # 3. wait until drive health is BAD, status=ONLINE, usage=RELEASING. + for drive in drives: + drive_name = drive["metadata"]["name"] + logging.info(f"Waiting for drive: {drive_name}") + assert self.utils.wait_drive( + name=drive_name, + expected_status=const.STATUS_ONLINE, + expected_health=const.HEALTH_BAD, + expected_usage=const.USAGE_RELEASING + ), f"Drive {drive_name} failed to reach expected Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}" + logging.info(f"drive {drive_name} went in Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}") + # 4. wait until volume health is BAD, status=OPERATIVE, usage=RELEASING. 
+        for volume in volumes:
+            volume_name = volume["metadata"]["name"]
+            logging.info(f"Waiting for volume: {volume_name}")
+            assert self.utils.wait_volume(
+                name=volume_name,
+                expected_health=const.HEALTH_BAD,
+                expected_usage=const.USAGE_RELEASING,
+                expected_operational_status=const.STATUS_OPERATIVE
+            ), f"Volume {volume_name} failed to reach OperationalStatus: {const.STATUS_OPERATIVE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}"
+            logging.info(f"volume {volume_name} went in OperationalStatus: {const.STATUS_OPERATIVE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}")
+        # 5. check events and locate event related to DriveHealthFailure
+        for drive in drives:
+            drive_name = drive["metadata"]["name"]
+            assert self.utils.event_in(
+                resource_name=drive_name,
+                reason=const.DRIVE_HEALTH_FAILURE,
+            ), f"event {const.DRIVE_HEALTH_FAILURE} for drive {drive_name} not found"
+        # 6. annotate volume with release=done
+        for volume in volumes:
+            volume_name = volume["metadata"]["name"]
+            self.utils.annotate_custom_resource(
+                resource_name=volume_name,
+                resource_type="volumes",
+                annotation_key="release",
+                annotation_value="done",
+                namespace=volume["metadata"]["namespace"]
+            )
+            logging.info(f"volume: {volume_name} was annotated with release=done")
+        # 7. check drive usages are RELEASED
+        for drive in drives:
+            assert self.utils.wait_drive(
+                name=drive["metadata"]["name"],
+                expected_usage=const.USAGE_RELEASED
+            ), f"Drive {drive['metadata']['name']} failed to reach expected Usage: {const.USAGE_RELEASED}"
+            logging.info(f"drive {drive['metadata']['name']} went in Usage: {const.USAGE_RELEASED}")
+        # 8. check volumes are RELEASED
+        for volume in volumes:
+            assert self.utils.wait_volume(
+                name=volume["metadata"]["name"],
+                expected_usage=const.USAGE_RELEASED
+            ), f"Volume {volume['metadata']['name']} failed to reach expected Usage {const.USAGE_RELEASED}"
+            logging.info(f"volume {volume['metadata']['name']} went in Usage: {const.USAGE_RELEASED}")
+        # 9. check event DriveReadyForRemoval is generated
+        for drive in drives:
+            drive_name = drive["metadata"]["name"]
+            assert self.utils.event_in(
+                resource_name=drive_name,
+                reason=const.DRIVE_READY_FOR_REMOVAL,
+            ), f"event {const.DRIVE_READY_FOR_REMOVAL} for drive {drive_name} not found"
+        # 10. check events and locate event related to VolumeBadHealth
+        for volume in volumes:
+            volume_name = volume["metadata"]["name"]
+            assert self.utils.event_in(
+                resource_name=volume_name,
+                reason=const.VOLUME_BAD_HEALTH,
+            ), f"event {const.VOLUME_BAD_HEALTH} for volume {volume_name} not found"
+        # 11. delete pod and pvc
+        self.utils.clear_pvc_and_pod(pod_name=pod.metadata.name, namespace=self.namespace)
+        # 12. check drive Usage is REMOVED, Status is ONLINE, Health is BAD and LED state is 1 (drive supports LED) or 2 (drive does not support LED)  # TODO: LED state 2 => separate test case
+        for drive in drives:
+            assert self.utils.wait_drive(
+                name=drive["metadata"]["name"],
+                expected_status=const.STATUS_ONLINE,
+                expected_usage=const.USAGE_REMOVED,
+                expected_health=const.HEALTH_BAD,
+                expected_led_state=const.LED_STATE
+            ), f"Drive {drive['metadata']['name']} failed to reach expected Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_REMOVED}, LEDState in {const.LED_STATE}"
+            logging.info(f"drive {drive['metadata']['name']} went in Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_REMOVED}, LEDState in {const.LED_STATE}")
+        # 13. check for events: DriveReadyForPhysicalRemoval
+        for drive in drives:
+            drive_name = drive["metadata"]["name"]
+            assert self.utils.event_in(
+                resource_name=drive_name,
+                reason=const.DRIVE_READY_FOR_PHYSICAL_REMOVAL,
+            ), f"event {const.DRIVE_READY_FOR_PHYSICAL_REMOVAL} for drive {drive_name} not found"
+        # 14. obtain the path and SCSI id of each affected drive on its node and remove the drive from that node
+        for drive in drives:
+            drive_name = drive["metadata"]["name"]
+            drive_path = drive["spec"]["Path"]
+            assert drive_path, f"Drive path for drive {drive_name} not found"
+            logging.info(f"drive_path: {drive_path}")
+
+            host_num = self.drive_utils[node_ip].get_host_num(drive_path)
+            scsi_id = self.drive_utils[node_ip].get_scsi_id(drive_path)
+            assert scsi_id, f"scsi_id for drive {drive_name} not found"
+            logging.info(f"host_num: {host_num}, scsi_id: {scsi_id}")
+
+            self.drive_utils[node_ip].remove(scsi_id)
+            logging.info(f"drive {drive_path}, {scsi_id} removed")
+        # 15. check drive CRs are successfully removed
+        for drive in drives:
+            drive_name = drive["metadata"]["name"]
+            assert self.utils.check_drive_cr_not_exist(
+                drive_name=drive_name
+            ), f"Drive CR {drive_name} still exists"
+        # 16. check for DriveSuccessfullyRemoved events in Kubernetes events
+        for drive in drives:
+            drive_name = drive["metadata"]["name"]
+            assert self.utils.event_in(
+                resource_name=drive_name,
+                reason=const.DRIVE_SUCCESSFULLY_REMOVED,
+            ), f"event {const.DRIVE_SUCCESSFULLY_REMOVED} for drive {drive_name} not found"
diff --git a/tests/e2e-test-framework/tests/test_fake_attach.py b/tests/e2e-test-framework/tests/test_fake_attach.py
index 103adc390..05634cc65 100644
--- a/tests/e2e-test-framework/tests/test_fake_attach.py
+++ b/tests/e2e-test-framework/tests/test_fake_attach.py
@@ -6,6 +6,7 @@
 from framework.sts import STS
 from framework.utils import Utils
 from framework.drive import DriveUtils
+from typing import Dict


 class TestFakeAttach:
@@ -14,7 +15,7 @@ class TestFakeAttach:
     def setup_class(
         cls,
         namespace: str,
-        drive_utils_executors: dict[str, DriveUtils],
+        drive_utils_executors: Dict[str, DriveUtils],
         utils: Utils,
     ):
         cls.namespace = namespace
diff --git a/tests/e2e-test-framework/tests/test_fake_attach_dr.py b/tests/e2e-test-framework/tests/test_fake_attach_dr.py
index be7d57549..b43b67fa3 100644
--- a/tests/e2e-test-framework/tests/test_fake_attach_dr.py
+++ b/tests/e2e-test-framework/tests/test_fake_attach_dr.py
@@ -1,6 +1,7 @@
 import logging
 import time
 import pytest
+from typing import Dict

 import framework.const as const

@@ -15,7 +16,7 @@ class TestFakeAttachMultipleVolumesPerPod:
     def setup_class(
         cls,
         namespace: str,
-        drive_utils_executors: dict[str, DriveUtils],
+        drive_utils_executors: Dict[str, DriveUtils],
         utils: Utils,
     ):
         cls.namespace = namespace