Skip to content

Commit

Permalink
Fix method name and refactor clustermgtd_conf_path parameter
Browse files Browse the repository at this point in the history
Signed-off-by: Luca Carrogu <[email protected]>
  • Loading branch information
lukeseawalker committed Jan 23, 2024
1 parent 175f0a3 commit 87fd5ae
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 18 deletions.
4 changes: 2 additions & 2 deletions tests/integration-tests/tests/common/assertions.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from time_utils import minutes, seconds
from utils import (
get_cfn_resources,
get_compute_nodes_count,
get_compute_nodes_instance_count,
get_compute_nodes_instance_ids,
get_compute_nodes_instance_ips,
)
Expand Down Expand Up @@ -130,7 +130,7 @@ def submit_job_and_assert_logs(


def assert_no_node_in_ec2(region, stack_name, instance_types=None):
assert_that(get_compute_nodes_count(stack_name, region, instance_types)).is_equal_to(0)
assert_that(get_compute_nodes_instance_count(stack_name, region, instance_types)).is_equal_to(0)


def assert_scaling_worked(
Expand Down
6 changes: 3 additions & 3 deletions tests/integration-tests/tests/common/scaling_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from remote_command_executor import RemoteCommandExecutor
from retrying import RetryError, retry
from time_utils import seconds
from utils import CWMetric, get_compute_instances_count, publish_metrics_to_cloudwatch
from utils import CWMetric, get_compute_nodes_instance_count, publish_metrics_to_cloudwatch

SCALING_COMMON_DATADIR = pathlib.Path(__file__).parent / "scaling"

Expand Down Expand Up @@ -70,7 +70,7 @@ def _collect_metrics():
compute_node_count = int(headnode_metrics.get("NodeCount"))
pending_jobs_count = int(headnode_metrics.get("PendingJobsCount"))
running_jobs_count = int(headnode_metrics.get("RunningJobsCount"))
ec2_capacity = get_compute_instances_count(cluster_name, region)
ec2_capacity = get_compute_nodes_instance_count(cluster_name, region)
if publish_metrics:
# Use the cluster name as a dimension for each scaling metric
scaling_metrics = [
Expand Down Expand Up @@ -148,7 +148,7 @@ def get_compute_nodes_allocation(
)
def _watch_compute_nodes_allocation():
compute_nodes = len(scheduler_commands.get_unique_static_nodes())
ec2_capacity = get_compute_instances_count(stack_name, region)
ec2_capacity = get_compute_nodes_instance_count(stack_name, region)
timestamp = time.time()

# add values only if there is a transition.
Expand Down
21 changes: 9 additions & 12 deletions tests/integration-tests/tests/schedulers/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,8 +375,7 @@ def test_slurm_custom_partitions(
scheduler_commands.set_partition_state(custom_partitions[0], "UP")

logging.info("Decreasing protected failure count for quicker enter protected mode...")
clustermgtd_conf_path = retrieve_clustermgtd_conf_path(remote_command_executor)
set_protected_failure_count(remote_command_executor, 2, clustermgtd_conf_path)
set_protected_failure_count(remote_command_executor, 2)
failing_partition = "ondemand1"
logging.info("Testing protected mode is skipped while job running and activated when no jobs are in the queue...")
pending_job_id = _test_active_job_running(
Expand Down Expand Up @@ -495,20 +494,19 @@ def test_slurm_protected_mode(
cluster_config = pcluster_config_reader(bucket=bucket_name, scaling_strategy=scaling_strategy)
cluster = clusters_factory(cluster_config)
remote_command_executor = RemoteCommandExecutor(cluster)
clustermgtd_conf_path = retrieve_clustermgtd_conf_path(remote_command_executor)
scheduler_commands = scheduler_commands_factory(remote_command_executor)

remote_command_executor.clear_clustermgtd_log()
remote_command_executor.clear_slurm_resume_log()

_test_disable_protected_mode(
remote_command_executor, cluster, bucket_name, pcluster_config_reader, clustermgtd_conf_path, scaling_strategy
remote_command_executor, cluster, bucket_name, pcluster_config_reader, scaling_strategy
)

# Re-enable protected mode
_enable_protected_mode(remote_command_executor, clustermgtd_conf_path)
_enable_protected_mode(remote_command_executor)
# Decrease protected failure count for quicker enter protected mode.
set_protected_failure_count(remote_command_executor, 2, clustermgtd_conf_path)
set_protected_failure_count(remote_command_executor, 2)

partition = "half-broken"
pending_job_id = _test_active_job_running(
Expand Down Expand Up @@ -1924,24 +1922,23 @@ def _wait_until_protected_mode_failure_count_set(cluster):
"ClusterManager Startup",
],
)
clustermgtd_conf_path = retrieve_clustermgtd_conf_path(remote_command_executor)
assert_that(clustermgtd_conf_path).is_not_empty()
set_protected_failure_count(remote_command_executor, 3, clustermgtd_conf_path)
set_protected_failure_count(remote_command_executor, 3)

return remote_command_executor


def _enable_protected_mode(remote_command_executor, clustermgtd_conf_path):
def _enable_protected_mode(remote_command_executor):
"""Enable protected mode by removing lines related to protected mode in the config, so it will be set to default."""
clustermgtd_conf_path = retrieve_clustermgtd_conf_path(remote_command_executor)
remote_command_executor.run_remote_command(f"sudo sed -i '/'protected_failure_count'/d' {clustermgtd_conf_path}")


def _test_disable_protected_mode(
remote_command_executor, cluster, bucket_name, pcluster_config_reader, clustermgtd_conf_path, scaling_strategy
remote_command_executor, cluster, bucket_name, pcluster_config_reader, scaling_strategy
):
"""Test Bootstrap failures have no affect on cluster when protected mode is disabled."""
# Disable protected_mode by setting protected_failure_count to -1
set_protected_failure_count(remote_command_executor, -1, clustermgtd_conf_path)
set_protected_failure_count(remote_command_executor, -1)
_inject_bootstrap_failures(cluster, bucket_name, pcluster_config_reader, scaling_strategy)
# wait till the node failed
retry(wait_fixed=seconds(20), stop_max_delay=minutes(7))(assert_lines_in_logs)(
Expand Down
3 changes: 2 additions & 1 deletion tests/integration-tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,7 @@ def get_substacks(stack_name, region=None, sub_stack_name=None):
return [r.get("PhysicalResourceId") for r in stacks]


def get_compute_instances_count(stack_name, region, instance_types=None):
def get_compute_nodes_instance_count(stack_name, region, instance_types=None):
return len(get_compute_nodes_instance_ids(stack_name, region, instance_types=instance_types))


Expand Down Expand Up @@ -857,6 +857,7 @@ def retrieve_clustermgtd_conf_path(remote_command_executor):
).stdout
if clustermgtd_conf_path_override:
clustermgtd_conf_path = clustermgtd_conf_path_override
assert_that(clustermgtd_conf_path).is_not_empty()
return clustermgtd_conf_path


Expand Down

0 comments on commit 87fd5ae

Please sign in to comment.