diff --git a/source/cdk/cdk_slurm_stack.py b/source/cdk/cdk_slurm_stack.py index 6c4602b0..17f21ec5 100644 --- a/source/cdk/cdk_slurm_stack.py +++ b/source/cdk/cdk_slurm_stack.py @@ -903,9 +903,10 @@ def update_config_for_exostellar(self): if self.slurm_compute_node_sg_id: if self.slurm_compute_node_sg_id not in self.config['slurm']['Xio']['WorkerSecurityGroupIds']: self.config['slurm']['Xio']['WorkerSecurityGroupIds'].append(self.slurm_compute_node_sg_id) - if self.res_dcv_security_group_id: - if self.res_dcv_security_group_id not in self.config['slurm']['Xio']['WorkerSecurityGroupIds']: - self.config['slurm']['Xio']['WorkerSecurityGroupIds'].append(self.res_dcv_security_group_id) + if 'RESStackName' in self.config: + if self.res_dcv_security_group_id: + if self.res_dcv_security_group_id not in self.config['slurm']['Xio']['WorkerSecurityGroupIds']: + self.config['slurm']['Xio']['WorkerSecurityGroupIds'].append(self.res_dcv_security_group_id) # Get values from stack outputs ems_ip_address = None @@ -1376,8 +1377,9 @@ def check_regions_config(self): self.instance_types = sorted(self.instance_types) # Filter the instance types by architecture due to PC limitation to 1 architecture - # Also require at least 2 GB of memory. + # Also require at least 4 GB of memory. 
# Also filter by the CPU vendor from the config + MIN_COMPUTE_NODE_GB = 4 cluster_architecture = self.config['slurm']['ParallelClusterConfig']['Architecture'] logger.info(f"ParallelCluster Architecture: {cluster_architecture}") filtered_instance_types = [] @@ -1387,7 +1389,7 @@ def check_regions_config(self): logger.warning(f"Excluding {instance_type} because architecture ({instance_architecture}) != {cluster_architecture}") continue mem_gb = int(self.plugin.get_MemoryInMiB(self.cluster_region, instance_type) / 1024) - if mem_gb < 2: - logger.warning(f"Excluding {instance_type} because has less than 2 GiB of memory.") + if mem_gb < MIN_COMPUTE_NODE_GB: + logger.warning(f"Excluding {instance_type} because it has less than {MIN_COMPUTE_NODE_GB} GiB of memory.") continue cpu_vendor = self.plugin.get_cpu_vendor(self.cluster_region, instance_type) @@ -1425,9 +1427,7 @@ def create_parallel_cluster_lambdas(self): aws_lambda.Architecture.X86_64, ], compatible_runtimes = [ - aws_lambda.Runtime.PYTHON_3_9, - # aws_lambda.Runtime.PYTHON_3_10, # Doesn't work: No module named 'rpds.rpds' - # aws_lambda.Runtime.PYTHON_3_11, # Doesn't work: No module named 'rpds.rpds' + aws_lambda.Runtime.PYTHON_3_12, ], ) @@ -1437,7 +1437,7 @@ def create_parallel_cluster_lambdas(self): function_name=f"{self.stack_name}-CreateBuildFiles", description="Create ParallelCluster build configuration files", memory_size=2048, - runtime=aws_lambda.Runtime.PYTHON_3_9, + runtime=aws_lambda.Runtime.PYTHON_3_12, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(2), log_retention=logs.RetentionDays.INFINITE, @@ -1499,7 +1499,7 @@ def create_parallel_cluster_lambdas(self): function_name=f"{self.stack_name}-CreateParallelClusterConfig", description="Create ParallelCluster config", memory_size=2048, - runtime=aws_lambda.Runtime.PYTHON_3_9, + runtime=aws_lambda.Runtime.PYTHON_3_12, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, @@ -1547,7 +1547,7 @@
function_name=f"{self.stack_name}-CreateParallelCluster", description="Create ParallelCluster", memory_size=2048, - runtime=aws_lambda.Runtime.PYTHON_3_9, + runtime=aws_lambda.Runtime.PYTHON_3_12, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, @@ -1846,7 +1846,7 @@ def create_parallel_cluster_lambdas(self): function_name=f"{self.stack_name}-CreateHeadNodeARecord", description="Create head node A record", memory_size=2048, - runtime=aws_lambda.Runtime.PYTHON_3_9, + runtime=aws_lambda.Runtime.PYTHON_3_12, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, @@ -1893,7 +1893,7 @@ def create_parallel_cluster_lambdas(self): function_name=f"{self.stack_name}-UpdateHeadNode", description="Update head node", memory_size=2048, - runtime=aws_lambda.Runtime.PYTHON_3_9, + runtime=aws_lambda.Runtime.PYTHON_3_12, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, @@ -1935,7 +1935,7 @@ def create_parallel_cluster_lambdas(self): function_name=f"{self.stack_name}-ConfigUsersGroupsJson", description="Configure users and groups json file", memory_size=2048, - runtime=aws_lambda.Runtime.PYTHON_3_9, + runtime=aws_lambda.Runtime.PYTHON_3_12, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, @@ -1983,7 +1983,7 @@ def create_parallel_cluster_lambdas(self): function_name=f"{self.stack_name}-ConfigExternalLoginNodes", description="Configure external login nodes", memory_size=2048, - runtime=aws_lambda.Runtime.PYTHON_3_9, + runtime=aws_lambda.Runtime.PYTHON_3_12, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, @@ -2030,7 +2030,7 @@ def create_parallel_cluster_lambdas(self): function_name=f"{self.stack_name}-DeconfigUsersGroupsJson", description="Deconfigure 
RES users and groups json file", memory_size=2048, - runtime=aws_lambda.Runtime.PYTHON_3_9, + runtime=aws_lambda.Runtime.PYTHON_3_12, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, @@ -2072,7 +2072,7 @@ def create_parallel_cluster_lambdas(self): function_name=f"{self.stack_name}-DeconfigExternalLoginNodes", description="Deconfigure external login nodes", memory_size=2048, - runtime=aws_lambda.Runtime.PYTHON_3_9, + runtime=aws_lambda.Runtime.PYTHON_3_12, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, @@ -2114,7 +2114,7 @@ def create_callSlurmRestApiLambda(self): function_name=f"{self.stack_name}-CallSlurmRestApiLambda", description="Example showing how to call Slurm REST API", memory_size=128, - runtime=aws_lambda.Runtime.PYTHON_3_9, + runtime=aws_lambda.Runtime.PYTHON_3_12, architecture=aws_lambda.Architecture.ARM_64, timeout=Duration.minutes(1), log_retention=logs.RetentionDays.INFINITE, diff --git a/source/cdk/config_schema.py b/source/cdk/config_schema.py index 92702499..61c91bc4 100644 --- a/source/cdk/config_schema.py +++ b/source/cdk/config_schema.py @@ -90,6 +90,10 @@ # 3.11.0: # * Add support for ap-southeast-3 # * login node enhancements +# 3.11.1: +# * Disable Pyxis Spack plugin by default +# * Upgrade Python runtime to 3.12 +# * Upgrade libjwt to version 1.17.0. MIN_PARALLEL_CLUSTER_VERSION = parse_version('3.6.0') # Update source/resources/default_config.yml with latest version when this is updated. 
PARALLEL_CLUSTER_VERSIONS = [ @@ -106,14 +110,17 @@ '3.10.0', '3.10.1', '3.11.0', + '3.11.1', ] PARALLEL_CLUSTER_ENROOT_VERSIONS = { # This can be found on the head node by running 'yum info enroot' '3.11.0': '3.4.1', # confirmed + '3.11.1': '3.4.1', # confirmed } PARALLEL_CLUSTER_PYXIS_VERSIONS = { # This can be found on the head node at /opt/parallelcluster/sources '3.11.0': '0.20.0', # confirmed + '3.11.1': '0.20.0', # confirmed } PARALLEL_CLUSTER_MUNGE_VERSIONS = { # This can be found on the head node at /opt/parallelcluster/sources @@ -131,6 +138,7 @@ '3.10.0': '0.5.16', # confirmed '3.10.1': '0.5.16', # confirmed '3.11.0': '0.5.16', # confirmed + '3.11.1': '0.5.16', # confirmed } PARALLEL_CLUSTER_PYTHON_VERSIONS = { # This can be found on the head node at /opt/parallelcluster/pyenv/versions @@ -147,6 +155,7 @@ '3.10.0': '3.9.19', # confirmed '3.10.1': '3.9.19', # confirmed '3.11.0': '3.9.20', # confirmed + '3.11.1': '3.9.20', # confirmed } PARALLEL_CLUSTER_SLURM_VERSIONS = { # This can be found on the head node at /etc/chef/local-mode-cache/cache/ @@ -163,6 +172,7 @@ '3.10.0': '23.11.7', # confirmed '3.10.1': '23.11.7', # confirmed '3.11.0': '23.11.10', # confirmed + '3.11.1': '23.11.10', # confirmed } PARALLEL_CLUSTER_PC_SLURM_VERSIONS = { # This can be found on the head node at /etc/chef/local-mode-cache/cache/ @@ -179,6 +189,7 @@ '3.10.0': '23-11-7-1', # confirmed '3.10.1': '23-11-7-1', # confirmed '3.11.0': '23-11-10-1', # confirmed + '3.11.1': '23-11-10-1', # confirmed } SLURM_REST_API_VERSIONS = { '23-02-2-1': '0.0.39', diff --git a/source/resources/lambdas/CreateBuildFiles/CreateBuildFiles.py b/source/resources/lambdas/CreateBuildFiles/CreateBuildFiles.py index 631af244..d0831d15 100644 --- a/source/resources/lambdas/CreateBuildFiles/CreateBuildFiles.py +++ b/source/resources/lambdas/CreateBuildFiles/CreateBuildFiles.py @@ -139,6 +139,10 @@ def lambda_handler(event, context): else: raise KeyError(error_message) + if requestType == 'Delete': + 
cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name) + return + ami_builds = json.loads(environ['AmiBuildsJson']) assets_bucket = environ['AssetsBucket'] assets_base_key = environ['AssetsBaseKey'] diff --git a/source/resources/parallel-cluster/config/bin/on_compute_node_configured.sh b/source/resources/parallel-cluster/config/bin/on_compute_node_configured.sh index 15fbdbf5..9d22be0f 100755 --- a/source/resources/parallel-cluster/config/bin/on_compute_node_configured.sh +++ b/source/resources/parallel-cluster/config/bin/on_compute_node_configured.sh @@ -63,7 +63,9 @@ fi export PATH=/usr/sbin:$PATH echo "Creating users and groups" -$config_bin_dir/create_users_groups.py -i $config_dir/users_groups.json +if [[ -e $config_dir/users_groups.json ]]; then + $config_bin_dir/create_users_groups.py -i $config_dir/users_groups.json +fi # ansible_compute_node_vars_yml_s3_url="s3://$assets_bucket/$assets_base_key/config/ansible/ansible_compute_node_vars.yml" diff --git a/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/files/opt/slurm/etc/exostellar/configure_xio.py b/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/files/opt/slurm/etc/exostellar/configure_xio.py index 9ee7a4e0..f0f7166a 100755 --- a/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/files/opt/slurm/etc/exostellar/configure_xio.py +++ b/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/files/opt/slurm/etc/exostellar/configure_xio.py @@ -23,8 +23,6 @@ import json import logging import logging.handlers -import os -import pycurl import requests import yaml diff --git a/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/tasks/main.yml b/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/tasks/main.yml index c3dec3ac..9a84ec21 100644 --- a/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/tasks/main.yml +++ 
b/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/tasks/main.yml @@ -143,6 +143,8 @@ cmd: | set -ex + yum -y install python3.11-pip + python3.11 -m pip install requests PyYaml {{ exostellar_dir }}/configure_xio.py - name: Create {{ exostellar_dir }}/xspot.slurm.conf