From 0a8a45ce975b7d3772bc6af30f45ddecae3be86f Mon Sep 17 00:00:00 2001
From: Allan Carter
Date: Mon, 26 Feb 2024 21:27:34 +0000
Subject: [PATCH] Update config files and fix errors found in testing new configs

Add --RESEnvironmentName to the installer.

Eases initial integration with Research and Engineering Studio (RES).
Automatically adds the correct submitter security groups and configures the /home directory.

Resolves #207
============================
Update template config files.

Added more comments to clarify that these are examples that should be copied and customized by users.
Added comments for typical configuration options.
Deleted obsolete configs that were from v1.

Resolves #203
============================
Set the default head node instance type based on architecture.

Resolves #206
============================
Clean up ansible-lint errors and warnings.

Arm architecture clusters were failing because of an incorrect condition in the ansible playbook that is flagged by lint.
============================
Use the VDI controller instead of the cluster manager for users and groups info.

The cluster manager stopped being domain joined for some reason.
============================
Paginate describe_instances when creating the head node A record.

Otherwise, the cluster head node instance may not be found.
============================
Add a default MungeKeySecret.

This should be the default; otherwise you can't access multiple clusters from the same server.
============================
Increase the timeout for the SSM command that configures submitters.

The extra time is needed to compile Slurm.
============================
Force Slurm to be rebuilt for submitters of all OS distributions, even if they match the OS of the cluster.

Otherwise, errors occur because PluginDir can't be found in the same location as when Slurm was compiled.
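For reference, both pagination changes (the head node A record lambda and the UpdateHeadNode lambda)
follow the standard boto3 paginator pattern. A minimal sketch, with illustrative tag filters that
assume the default ParallelCluster instance tags rather than being copied from the lambdas:

    import boto3

    def find_head_node_instance(cluster_name, region):
        """Return the first running head node instance for cluster_name, or None.

        Paginates describe_instances so the head node is found even when the
        results span multiple pages.
        """
        ec2_client = boto3.client('ec2', region_name=region)
        paginator = ec2_client.get_paginator('describe_instances')
        for page in paginator.paginate(
                Filters=[
                    # Illustrative filters; assumes default ParallelCluster tags.
                    {'Name': 'tag:parallelcluster:cluster-name', 'Values': [cluster_name]},
                    {'Name': 'tag:parallelcluster:node-type', 'Values': ['HeadNode']},
                    {'Name': 'instance-state-name', 'Values': ['running']},
                ]):
            for reservation in page['Reservations']:
                for instance in reservation['Instances']:
                    return instance
        return None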
============================== Paginate describe_instances in UpdateHeadNode lambda ============================== Add check for min memory of 4 GB for slurm controller --- source/cdk/cdk_slurm_stack.py | 357 +++++++++++++++--- source/cdk/config_schema.py | 49 ++- source/resources/config/default_config.yml | 72 +++- .../config/slurm_all_arm_instance_types.yml | 84 +++++ .../config/slurm_all_instance_types.yml | 26 -- source/resources/config/slurm_all_os.yml | 37 -- .../config/slurm_all_x86_instance_types.yml | 84 +++++ source/resources/config/slurm_alma_linux.yml | 29 -- source/resources/config/slurm_eda.yml | 37 -- source/resources/config/slurm_eda_az1.yml | 46 --- source/resources/config/slurm_eda_az2.yml | 47 --- source/resources/config/slurm_eda_az3.yml | 48 --- .../resources/config/slurm_elasticsearch.yml | 50 --- source/resources/config/slurm_fpga_dev.yml | 46 --- source/resources/config/slurm_lustre.yml | 32 -- source/resources/config/slurm_multi_az.yml | 97 ----- source/resources/config/slurm_ontap.yml | 29 -- .../slurm_recommended_arm_instance_types.yml | 81 ++++ .../slurm_recommended_x86_instance_types.yml | 81 ++++ source/resources/config/slurm_rocky_linux.yml | 25 -- source/resources/config/slurm_zfs.yml | 30 -- .../ConfigureRESClusterManager.py | 91 ----- .../ConfigureRESSubmitters.py | 11 +- .../ConfigureRESUsersGroupsJson.py | 133 +++++++ .../CreateHeadNodeARecord.py | 24 +- .../CreateParallelCluster.py | 17 +- .../CreateParallelClusterConfig.py | 7 +- .../DeconfigureRESUsersGroupsJson.py} | 88 +++-- .../cfnresponse.py | 0 .../lambdas/UpdateHeadNode/UpdateHeadNode.py | 58 ++- .../config/bin/on_head_node_configured.sh | 6 +- .../tasks/main.yml | 22 +- .../tasks/main.yml | 28 +- .../tasks/config-licenses.yml | 34 +- .../tasks/config-slurmdb-accounts.yml | 16 +- .../tasks/config-slurmrestd.yml | 36 +- .../tasks/config-submitter-access.yml | 86 ++--- .../tasks/config-users-groups.yml | 2 +- .../tasks/main.yml | 39 +- .../tasks/main.yml | 22 +- .../playbooks/roles/all/tasks/main.yml | 84 ++--- .../playbooks/roles/bug_fixes/tasks/main.yml | 2 - .../create_users_groups_json/tasks/main.yml | 6 +- .../playbooks/roles/eda_tools/tasks/main.yml | 95 ++--- .../roles/install_slurm/tasks/main.yml | 127 ++++--- .../roles/lustre-client/tasks/main.yml | 12 +- .../roles/mount_extra_fs/tasks/main.yml | 4 +- .../roles/mount_slurm_fs/tasks/main.yml | 30 +- .../tasks/main.yml | 2 +- .../roles/security_updates/tasks/main.yml | 3 +- .../find_existing_resources.py | 59 +++ source/slurm_installer/installer.py | 62 +-- 52 files changed, 1442 insertions(+), 1151 deletions(-) create mode 100644 source/resources/config/slurm_all_arm_instance_types.yml delete mode 100644 source/resources/config/slurm_all_instance_types.yml delete mode 100644 source/resources/config/slurm_all_os.yml create mode 100644 source/resources/config/slurm_all_x86_instance_types.yml delete mode 100644 source/resources/config/slurm_alma_linux.yml delete mode 100644 source/resources/config/slurm_eda.yml delete mode 100644 source/resources/config/slurm_eda_az1.yml delete mode 100644 source/resources/config/slurm_eda_az2.yml delete mode 100644 source/resources/config/slurm_eda_az3.yml delete mode 100644 source/resources/config/slurm_elasticsearch.yml delete mode 100644 source/resources/config/slurm_fpga_dev.yml delete mode 100644 source/resources/config/slurm_lustre.yml delete mode 100644 source/resources/config/slurm_multi_az.yml delete mode 100644 source/resources/config/slurm_ontap.yml create mode 100644 
source/resources/config/slurm_recommended_arm_instance_types.yml create mode 100644 source/resources/config/slurm_recommended_x86_instance_types.yml delete mode 100644 source/resources/config/slurm_rocky_linux.yml delete mode 100644 source/resources/config/slurm_zfs.yml delete mode 100644 source/resources/lambdas/ConfigureRESClusterManager/ConfigureRESClusterManager.py create mode 100644 source/resources/lambdas/ConfigureRESUsersGroupsJson/ConfigureRESUsersGroupsJson.py rename source/resources/lambdas/{DeconfigureRESClusterManager/DeconfigureRESClusterManager.py => DeconfigureRESUsersGroupsJson/DeconfigureRESUsersGroupsJson.py} (63%) rename source/resources/lambdas/{DeconfigureRESClusterManager => DeconfigureRESUsersGroupsJson}/cfnresponse.py (100%) diff --git a/source/cdk/cdk_slurm_stack.py b/source/cdk/cdk_slurm_stack.py index dafacf70..d7bf1322 100644 --- a/source/cdk/cdk_slurm_stack.py +++ b/source/cdk/cdk_slurm_stack.py @@ -95,6 +95,8 @@ class CdkSlurmStack(Stack): def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: super().__init__(scope, construct_id, **kwargs) + self.ec2InstanceTypeInfo = None + self.onprem_cidr = None self.principals_suffix = { @@ -138,6 +140,22 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: self.create_fault_injection_templates() + def get_ec2InstanceTypeInfo(self): + if not self.ec2InstanceTypeInfo: + self.ec2InstanceTypeInfo = EC2InstanceTypeInfo([self.config['Region']], get_savings_plans=False, json_filename='/tmp/instance_type_info.json', debug=False) + self.instance_type_and_family_info = self.ec2InstanceTypeInfo.instance_type_and_family_info[self.config['Region']] + self.instance_families_info = self.instance_type_and_family_info['instance_families'] + self.instance_types_info = self.instance_type_and_family_info['instance_types'] + return self.ec2InstanceTypeInfo + + def get_instance_family_info(self, instance_family): + self.get_ec2InstanceTypeInfo() + return self.instance_tfamiles_info[instance_family] + + def get_instance_type_info(self, instance_type): + self.get_ec2InstanceTypeInfo() + return self.instance_types_info[instance_type] + def get_config(self, context_var, default_path): default_config_file_path = realpath(f"{dirname(realpath(__file__))}/../resources/config/") config_file_path = self.node.try_get_context(context_var) @@ -188,9 +206,12 @@ def override_config_with_context(self): ''' Override the config using context variables ''' + # Config keys: [context_key, command_line_switch] + # command_line_switch is None if not required. config_keys = { 'Region': ['region', 'region'], 'SshKeyPair': ['SshKeyPair', 'ssh-keypair'], + 'RESEnvironmentName': ['RESEnvironmentName', None], 'VpcId': ['VpcId', 'vpc-id'], 'CIDR': ['CIDR', 'cidr'], 'SubnetId': ['SubnetId', None], @@ -223,7 +244,9 @@ def override_config_with_context(self): def check_config(self): ''' - Check config, set defaults, and sanity check the configuration + Check config, set defaults, and sanity check the configuration. + + If RESEnvironmentName is configured then update configuration from RES stacks. ''' config_errors = 0 @@ -237,6 +260,9 @@ def check_config(self): logger.error(f"You must provide --stack-name on the command line or StackName in the config file.") config_errors += 1 + if 'RESEnvironmentName' in self.config: + self.update_config_for_res() + if 'ErrorSnsTopicArn' not in self.config: logger.warning(f"ErrorSnsTopicArn not set. 
Provide error-sns-topic-arn on the command line or ErrorSnsTopicArn in the config file to get error notifications.") @@ -399,6 +425,13 @@ def check_config(self): logger.error(f"ParallelCluster requires VolumeId for {mount_dir} in slurm/storage/ExtraMounts") config_errors += 1 + # Check to make sure controller instance type has at least 4 GB of memmory. + slurmctl_instance_type = self.config['slurm']['SlurmCtl']['instance_type'] + slurmctl_memory_in_gb = int(self.get_instance_type_info(slurmctl_instance_type)['MemoryInMiB'] / 1024) + if slurmctl_memory_in_gb < 4: + logger.error(f"Configured SlurmCtl instance type ({slurmctl_instance_type}) has {slurmctl_memory_in_gb} GB and needs at least 4.") + config_errors += 1 + if config_errors: exit(1) @@ -412,6 +445,221 @@ def check_config(self): exit(1) self.config = validated_config + def update_config_for_res(self): + ''' + Update config with information from RES stacks + + Add Submitter security groups. + Configure /home file system. + ''' + res_environment_name = self.config['RESEnvironmentName'] + logger.info(f"Updating configuration for RES environment: {res_environment_name}") + cloudformation_client = boto3.client('cloudformation', region_name=self.config['Region']) + res_stack_name = None + stack_statuses = {} + stack_dicts = {} + for stack_dict in cloudformation_client.list_stacks( + StackStatusFilter=[ + 'CREATE_COMPLETE', + 'ROLLBACK_COMPLETE', + 'UPDATE_COMPLETE', + 'UPDATE_ROLLBACK_COMPLETE', + 'IMPORT_COMPLETE', + 'IMPORT_ROLLBACK_COMPLETE' + ] + )["StackSummaries"]: + stack_name = stack_dict['StackName'] + if stack_name == res_environment_name: + res_stack_name = stack_dict['StackName'] + # Don't break here so get all of the stack names + stack_status = stack_dict['StackStatus'] + stack_statuses[stack_name] = stack_status + stack_dicts[stack_name] = stack_dict + if not res_stack_name: + message = f"CloudFormation RES stack named {res_environment_name} not found. Existing stacks:" + for stack_name in sorted(stack_statuses): + message += f"\n {stack_name:32}: status={stack_statuses[stack_name]}" + logger.error(message) + exit(1) + + # Get VpcId, SubnetId from RES stack + stack_parameters = cloudformation_client.describe_stacks(StackName=res_stack_name)['Stacks'][0]['Parameters'] + vpc_id = None + subnet_ids = [] + for stack_parameter_dict in stack_parameters: + if stack_parameter_dict['ParameterKey'] == 'VpcId': + vpc_id = stack_parameter_dict['ParameterValue'] + elif stack_parameter_dict['ParameterKey'] == 'PrivateSubnets': + subnet_ids = stack_parameter_dict['ParameterValue'].split(',') + if not vpc_id: + logger.error(f"VpcId parameter not found in {res_environment_name} RES stack.") + exit(1) + if 'VpcId' in self.config and self.config['VpcId'] != vpc_id: + logger.error(f"Config file VpcId={self.config['VpcId']} is not the same as RESEnvironmentName VpcId={vpc_id}.") + exit(1) + if 'VpcId' not in self.config: + self.config['VpcId'] = vpc_id + logger.info(f" VpcId: {vpc_id}") + if not subnet_ids: + logger.error(f"PrivateSubnets parameter not found in {res_environment_name} RES stack.") + exit(1) + if 'SubnetId' in self.config and self.config['SubnetId'] not in subnet_ids: + logger.error(f"Config file SubnetId={self.config['SubnetId']} is not a RES private subnet. 
RES private subnets: {subnet_ids}.") + exit(1) + if 'SubnetId' not in self.config: + self.config['SubnetId'] = subnet_ids[0] + logger.info(f" SubnetId: {self.config['SubnetId']}") + + submitter_security_group_ids = [] + if 'SubmitterSecurityGroupIds' not in self.config['slurm']: + self.config['slurm']['SubmitterSecurityGroupIds'] = {} + else: + for security_group_name, security_group_ids in self.config['slurm']['SubmitterSecurityGroupIds'].items(): + submitter_security_group_ids.append(security_group_ids) + + # Get RES VDI Security Group + res_vdc_stack_name = f"{res_stack_name}-vdc" + if res_vdc_stack_name not in stack_statuses: + message = f"CloudFormation RES stack named {res_vdc_stack_name} not found. Existing stacks:" + for stack_name in sorted(stack_statuses): + message += f"\n {stack_name:32}: status={stack_statuses[stack_name]}" + logger.error(message) + exit(1) + res_dcv_security_group_id = None + list_stack_resources_paginator = cloudformation_client.get_paginator('list_stack_resources') + for stack_resource_summaries in list_stack_resources_paginator.paginate(StackName=res_vdc_stack_name): + for stack_resource_summary_dict in stack_resource_summaries['StackResourceSummaries']: + if stack_resource_summary_dict['LogicalResourceId'].startswith('vdcdcvhostsecuritygroup'): + res_dcv_security_group_id = stack_resource_summary_dict['PhysicalResourceId'] + break + if res_dcv_security_group_id: + break + if not res_dcv_security_group_id: + logger.error(f"RES VDI security group not found.") + exit(1) + if res_dcv_security_group_id not in submitter_security_group_ids: + res_dcv_security_group_name = f"{res_environment_name}-dcv-sg" + logger.info(f" SubmitterSecurityGroupIds['{res_dcv_security_group_name}'] = '{res_dcv_security_group_id}'") + self.config['slurm']['SubmitterSecurityGroupIds'][res_dcv_security_group_name] = res_dcv_security_group_id + submitter_security_group_ids.append(res_dcv_security_group_id) + + # Get cluster manager Security Group + logger.debug(f"Searching for cluster manager security group id") + res_cluster_manager_stack_name = f"{res_stack_name}-cluster-manager" + if res_cluster_manager_stack_name not in stack_statuses: + message = f"CloudFormation RES stack named {res_cluster_manager_stack_name} not found. 
Existing stacks:" + for stack_name in sorted(stack_statuses): + message += f"\n {stack_name:32}: status={stack_statuses[stack_name]}" + logger.error(message) + exit(1) + res_cluster_manager_security_group_id = None + list_stack_resources_paginator = cloudformation_client.get_paginator('list_stack_resources') + for stack_resource_summaries in list_stack_resources_paginator.paginate(StackName=res_cluster_manager_stack_name): + for stack_resource_summary_dict in stack_resource_summaries['StackResourceSummaries']: + if stack_resource_summary_dict['LogicalResourceId'].startswith('clustermanagersecuritygroup'): + res_cluster_manager_security_group_id = stack_resource_summary_dict['PhysicalResourceId'] + break + if res_cluster_manager_security_group_id: + break + if not res_cluster_manager_security_group_id: + logger.error(f"RES cluster manager security group not found.") + exit(1) + if res_cluster_manager_security_group_id not in submitter_security_group_ids: + res_cluster_manager_security_group_name = f"{res_environment_name}-cluster-manager-sg" + logger.info(f" SubmitterSecurityGroupIds['{res_cluster_manager_security_group_name}'] = '{res_cluster_manager_security_group_id}'") + self.config['slurm']['SubmitterSecurityGroupIds'][res_cluster_manager_security_group_name] = res_cluster_manager_security_group_id + submitter_security_group_ids.append(res_cluster_manager_security_group_id) + + # Get vdc controller Security Group + logger.debug(f"Searching for VDC controller security group id") + res_vdc_stack_name = f"{res_stack_name}-vdc" + if res_vdc_stack_name not in stack_statuses: + message = f"CloudFormation RES stack named {res_vdc_stack_name} not found. Existing stacks:" + for stack_name in sorted(stack_statuses): + message += f"\n {stack_name:32}: status={stack_statuses[stack_name]}" + logger.error(message) + exit(1) + res_vdc_controller_security_group_id = None + list_stack_resources_paginator = cloudformation_client.get_paginator('list_stack_resources') + for stack_resource_summaries in list_stack_resources_paginator.paginate(StackName=res_vdc_stack_name): + logger.debug(f" stack resource summaries for {res_vdc_stack_name}:") + for stack_resource_summary_dict in stack_resource_summaries['StackResourceSummaries']: + logger.debug(f" LogicalResourceId: {stack_resource_summary_dict['LogicalResourceId']}") + if stack_resource_summary_dict['LogicalResourceId'].startswith('vdccontrollersecuritygroup'): + res_vdc_controller_security_group_id = stack_resource_summary_dict['PhysicalResourceId'] + break + if res_vdc_controller_security_group_id: + break + if not res_vdc_controller_security_group_id: + logger.error(f"RES VDC controller security group not found.") + exit(1) + if res_vdc_controller_security_group_id not in submitter_security_group_ids: + res_vdc_controller_security_group_name = f"{res_environment_name}-vdc-controller-sg" + logger.info(f" SubmitterSecurityGroupIds['{res_vdc_controller_security_group_name}'] = '{res_vdc_controller_security_group_id}'") + self.config['slurm']['SubmitterSecurityGroupIds'][res_vdc_controller_security_group_name] = res_vdc_controller_security_group_id + submitter_security_group_ids.append(res_vdc_controller_security_group_id) + + # Configure the /home mount from RES if /home not already configured + home_mount_found = False + for extra_mount in self.config['slurm'].get('storage', {}).get('ExtraMounts', []): + if extra_mount['dest'] == '/home': + home_mount_found = True + break + if home_mount_found: + logger.warning(f"Config file already has a mount for /home 
configured:\n{json.dumps(extra_mount, indent=4)}.") + else: + # RES takes the shared file system for /home as a parameter; it is not created by RES. + # parameter SharedHomeFileSystemId + logger.setLevel(logging.DEBUG) + logger.debug(f"Searching for RES /home file system") + res_shared_storage_stack_name = f"{res_stack_name}" + if res_shared_storage_stack_name not in stack_statuses: + message = f"CloudFormation RES stack named {res_shared_storage_stack_name} not found. Existing stacks:" + for stack_name in sorted(stack_statuses): + message += f"\n {stack_name:32}: status={stack_statuses[stack_name]}" + logger.error(message) + exit(1) + res_home_efs_id = None + for stack_parameter_dict in cloudformation_client.describe_stacks(StackName=res_stack_name)['Stacks'][0]['Parameters']: + if stack_parameter_dict['ParameterKey'] == 'SharedHomeFileSystemId': + res_home_efs_id = stack_parameter_dict['ParameterValue'] + break + if not res_home_efs_id: + logger.error(f"RES shared /home EFS storage id not found.") + exit(1) + logger.info(f" /home efs id: {res_home_efs_id}") + if 'storage' not in self.config['slurm']: + self.config['slurm']['storage'] = {} + if 'ExtraMounts' not in self.config['slurm']['storage']: + self.config['slurm']['storage']['ExtraMounts'] = [] + self.config['slurm']['storage']['ExtraMounts'].append( + { + 'dest': '/home', + 'StorageType': 'Efs', + 'FileSystemId': res_home_efs_id, + 'src': f"{res_home_efs_id}.efs.{self.config['Region']}.amazonaws.com:/", + 'type': 'nfs4', + 'options': 'nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport' + } + ) + if 'ExtraMountSecurityGroups' not in self.config['slurm']['storage']: + self.config['slurm']['storage']['ExtraMountSecurityGroups'] = {} + if 'nfs' not in self.config['slurm']['storage']['ExtraMountSecurityGroups']: + self.config['slurm']['storage']['ExtraMountSecurityGroups']['nfs'] = {} + res_home_mount_sg_id = res_dcv_security_group_id + home_sg_found = False + for extra_mount_sg in self.config['slurm']['storage']['ExtraMountSecurityGroups']['nfs']: + extra_mount_sg_id = self.config['slurm']['storage']['ExtraMountSecurityGroups']['nfs'][extra_mount_sg] + if extra_mount_sg_id == res_home_mount_sg_id: + home_sg_found = True + break + if home_sg_found: + logger.info(f" {extra_mount_sg}({res_home_mount_sg_id}) already configured in config['slurm']['storage']['ExtraMountSecurityGroups']['nfs']") + else: + res_home_mount_sg = f"{res_environment_name}-DCV-Host" + self.config['slurm']['storage']['ExtraMountSecurityGroups']['nfs'][res_home_mount_sg] = res_home_mount_sg_id + logger.info(f" ExtraMountSecurityGroup: {res_home_mount_sg}({res_home_mount_sg_id})") + def create_parallel_cluster_assets(self): # Create a secure hash of all of the assets so that changes can be easily detected to trigger cluster updates. 
self.assets_hash = sha512() @@ -478,12 +726,12 @@ def create_parallel_cluster_assets(self): self.suppress_cfn_nag(self.create_head_node_a_record_sns_topic, 'W47', 'Use default KMS key.') if 'RESEnvironmentName' in self.config: # SNS topic that gets notified when cluster is created and triggers a lambda to configure the cluster manager - self.configure_res_cluster_manager_sns_topic = sns.Topic( - self, "ConfigureRESClusterManagerSnsTopic", - topic_name = f"{self.config['slurm']['ClusterName']}ConfigureRESClusterManager" + self.configure_res_users_groups_json_sns_topic = sns.Topic( + self, "ConfigureRESUsersGroupsJsonSnsTopic", + topic_name = f"{self.config['slurm']['ClusterName']}ConfigureRESUsersGroupsJson" ) # W47:SNS Topic should specify KmsMasterKeyId property - self.suppress_cfn_nag(self.configure_res_cluster_manager_sns_topic, 'W47', 'Use default KMS key.') + self.suppress_cfn_nag(self.configure_res_users_groups_json_sns_topic, 'W47', 'Use default KMS key.') # SNS topic that gets notified when cluster is created and triggers a lambda to configure the cluster manager self.configure_res_submitters_sns_topic = sns.Topic( self, "ConfigureRESSubmittersSnsTopic", @@ -526,13 +774,13 @@ def create_parallel_cluster_assets(self): os.remove(playbooks_zipfile_filename) if 'RESEnvironmentName' in self.config: - self.configure_res_cluster_manager_sns_topic_arn_parameter_name = f"/{self.config['slurm']['ClusterName']}/ConfigureRESClusterManagerSnsTopicArn" - self.configure_res_cluster_manager_sns_topic_arn_parameter = ssm.StringParameter( - self, f"ConfigureRESClusterManagerSnsTopicArnParameter", - parameter_name = self.configure_res_cluster_manager_sns_topic_arn_parameter_name, - string_value = self.configure_res_cluster_manager_sns_topic.topic_arn + self.configure_res_users_groups_json_sns_topic_arn_parameter_name = f"/{self.config['slurm']['ClusterName']}/ConfigureRESUsersGroupsJsonSnsTopicArn" + self.configure_res_users_groups_json_sns_topic_arn_parameter = ssm.StringParameter( + self, f"ConfigureRESUsersGroupsJsonSnsTopicArnParameter", + parameter_name = self.configure_res_users_groups_json_sns_topic_arn_parameter_name, + string_value = self.configure_res_users_groups_json_sns_topic.topic_arn ) - self.configure_res_cluster_manager_sns_topic_arn_parameter.grant_read(self.parallel_cluster_asset_read_policy) + self.configure_res_users_groups_json_sns_topic_arn_parameter.grant_read(self.parallel_cluster_asset_read_policy) self.configure_res_submitters_sns_topic_arn_parameter_name = f"/{self.config['slurm']['ClusterName']}/ConfigureRESSubmittersSnsTopicArn" self.configure_res_submitters_sns_topic_arn_parameter = ssm.StringParameter( @@ -554,7 +802,7 @@ def create_parallel_cluster_assets(self): 'assets_bucket': self.assets_bucket, 'assets_base_key': self.assets_base_key, 'ClusterName': self.config['slurm']['ClusterName'], - 'ConfigureRESClusterManagerSnsTopicArnParameter': '', + 'ConfigureRESUsersGroupsJsonSnsTopicArnParameter': '', 'ConfigureRESSubmittersSnsTopicArnParameter': '', 'CreateHeadNodeARecordSnsTopicArnParameter': self.create_head_node_a_record_sns_topic_arn_parameter_name, 'ErrorSnsTopicArn': self.config.get('ErrorSnsTopicArn', ''), @@ -572,7 +820,7 @@ def create_parallel_cluster_assets(self): else: template_vars['HomeMountSrc'] = '' if 'RESEnvironmentName' in self.config: - template_vars['ConfigureRESClusterManagerSnsTopicArnParameter'] = self.configure_res_cluster_manager_sns_topic_arn_parameter_name + template_vars['ConfigureRESUsersGroupsJsonSnsTopicArnParameter'] = 
self.configure_res_users_groups_json_sns_topic_arn_parameter_name template_vars['ConfigureRESSubmittersSnsTopicArnParameter'] = self.configure_res_submitters_sns_topic_arn_parameter_name # Additions or deletions to the list should be reflected in config_scripts in on_head_node_start.sh. @@ -869,11 +1117,14 @@ def create_vpc(self): else: # Subnet not specified so pick the first private or isolated subnet, otherwise first public subnet if self.vpc.private_subnets: - self.subnet = self.private_subnets[0] - elif self.isolated_subnets: - self.subnet = self.isolated_subnets[0] + self.subnet = self.vpc.private_subnets[0] + elif self.vpc.isolated_subnets: + self.subnet = self.vpc.isolated_subnets[0] + elif self.vpc.public_subnets: + self.subnet = self.vpc.public_subnets[0] else: - self.subnet = self.public_subnets[0] + logger.error(f"No private, isolated, or public subnets found in {self.config['VpcId']}") + exit(1) self.config['SubnetId'] = self.subnet.subnet_id logger.info(f"Subnet set to {self.config['SubnetId']}") logger.info(f"availability zone: {self.subnet.availability_zone}") @@ -917,7 +1168,7 @@ def check_regions_config(self): self.compute_region_cidrs_dict[compute_region] = compute_region_cidr logger.info(f"{len(self.compute_regions)} regions configured: {sorted(self.compute_regions)}") - self.eC2InstanceTypeInfo = EC2InstanceTypeInfo(self.compute_regions, get_savings_plans=False, json_filename='/tmp/instance_type_info.json', debug=False) + self.eC2InstanceTypeInfo = self.get_ec2InstanceTypeInfo() self.plugin = SlurmPlugin(slurm_config_file=None, region=self.cluster_region) self.plugin.instance_type_and_family_info = self.eC2InstanceTypeInfo.instance_type_and_family_info @@ -1464,26 +1715,30 @@ def create_parallel_cluster_lambdas(self): ) if 'RESEnvironmentName' in self.config: - configureRESClusterManagerLambdaAsset = s3_assets.Asset(self, "ConfigureRESClusterManagerAsset", path="resources/lambdas/ConfigureRESClusterManager") - self.configure_res_cluster_manager_lambda = aws_lambda.Function( - self, "ConfigRESClusterManagerLambda", - function_name=f"{self.stack_name}-ConfigRESClusterManager", - description="Configure RES cluster manager", + configureRESUsersGroupsJsonLambdaAsset = s3_assets.Asset(self, "ConfigureRESUsersGroupsJsonAsset", path="resources/lambdas/ConfigureRESUsersGroupsJson") + self.configure_res_users_groups_json_lambda = aws_lambda.Function( + self, "ConfigRESUsersGroupsJsonLambda", + function_name=f"{self.stack_name}-ConfigRESUsersGroupsJson", + description="Configure RES users and groups json file", memory_size=2048, runtime=aws_lambda.Runtime.PYTHON_3_9, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, - handler="ConfigureRESClusterManager.lambda_handler", - code=aws_lambda.Code.from_bucket(configureRESClusterManagerLambdaAsset.bucket, configureRESClusterManagerLambdaAsset.s3_object_key), + handler="ConfigureRESUsersGroupsJson.lambda_handler", + code=aws_lambda.Code.from_bucket(configureRESUsersGroupsJsonLambdaAsset.bucket, configureRESUsersGroupsJsonLambdaAsset.s3_object_key), environment = { 'ClusterName': self.config['slurm']['ClusterName'], 'ErrorSnsTopicArn': self.config.get('ErrorSnsTopicArn', ''), 'Region': self.cluster_region, - 'RESEnvironmentName': self.config['RESEnvironmentName'] + 'RESEnvironmentName': self.config['RESEnvironmentName'], + 'RESDomainJoinedInstanceName': f"{self.config['RESEnvironmentName']}-vdc-controller", + 'RESDomainJoinedInstanceModuleName': 
'virtual-desktop-controller', + 'RESDomainJoinedInstanceModuleId': 'vdc', + 'RESDomainJoinedInstanceNodeType': 'app' } ) - self.configure_res_cluster_manager_lambda.add_to_role_policy( + self.configure_res_users_groups_json_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -1495,7 +1750,7 @@ def create_parallel_cluster_lambdas(self): ) ) if 'ErrorSnsTopicArn' in self.config: - self.configure_res_cluster_manager_lambda.add_to_role_policy( + self.configure_res_users_groups_json_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -1504,10 +1759,10 @@ def create_parallel_cluster_lambdas(self): resources=[self.config['ErrorSnsTopicArn']] ) ) - self.configure_res_cluster_manager_lambda.add_event_source( - lambda_event_sources.SnsEventSource(self.configure_res_cluster_manager_sns_topic) + self.configure_res_users_groups_json_lambda.add_event_source( + lambda_event_sources.SnsEventSource(self.configure_res_users_groups_json_sns_topic) ) - self.configure_res_cluster_manager_sns_topic.grant_publish(self.parallel_cluster_sns_publish_policy) + self.configure_res_users_groups_json_sns_topic.grant_publish(self.parallel_cluster_sns_publish_policy) configureRESSubmittersLambdaAsset = s3_assets.Asset(self, "ConfigureRESSubmittersAsset", path="resources/lambdas/ConfigureRESSubmitters") self.configure_res_submitters_lambda = aws_lambda.Function( @@ -1554,26 +1809,30 @@ def create_parallel_cluster_lambdas(self): ) self.configure_res_submitters_sns_topic.grant_publish(self.parallel_cluster_sns_publish_policy) - self.deconfigureRESClusterManagerLambdaAsset = s3_assets.Asset(self, "DeconfigureRESClusterManagerAsset", path="resources/lambdas/DeconfigureRESClusterManager") - self.deconfigure_res_cluster_manager_lambda = aws_lambda.Function( - self, "DeconfigRESClusterManagerLambda", - function_name=f"{self.stack_name}-DeconfigRESClusterManager", - description="Deconfigure RES cluster manager", + self.deconfigureRESUsersGroupsJsonLambdaAsset = s3_assets.Asset(self, "DeconfigureRESUsersGroupsJsonAsset", path="resources/lambdas/DeconfigureRESUsersGroupsJson") + self.deconfigure_res_users_groups_json_lambda = aws_lambda.Function( + self, "DeconfigRESUsersGroupsJsonLambda", + function_name=f"{self.stack_name}-DeconfigRESUsersGroupsJson", + description="Deconfigure RES users and groups json file", memory_size=2048, runtime=aws_lambda.Runtime.PYTHON_3_9, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, - handler="DeconfigureRESClusterManager.lambda_handler", - code=aws_lambda.Code.from_bucket(self.deconfigureRESClusterManagerLambdaAsset.bucket, self.deconfigureRESClusterManagerLambdaAsset.s3_object_key), + handler="DeconfigureRESUsersGroupsJson.lambda_handler", + code=aws_lambda.Code.from_bucket(self.deconfigureRESUsersGroupsJsonLambdaAsset.bucket, self.deconfigureRESUsersGroupsJsonLambdaAsset.s3_object_key), environment = { 'ClusterName': self.config['slurm']['ClusterName'], 'ErrorSnsTopicArn': self.config.get('ErrorSnsTopicArn', ''), 'Region': self.cluster_region, - 'RESEnvironmentName': self.config['RESEnvironmentName'] + 'RESEnvironmentName': self.config['RESEnvironmentName'], + 'RESDomainJoinedInstanceName': f"{self.config['RESEnvironmentName']}-vdc-controller", + 'RESDomainJoinedInstanceModuleName': 'virtual-desktop-controller', + 'RESDomainJoinedInstanceModuleId': 'vdc', + 'RESDomainJoinedInstanceNodeType': 'app' } ) - 
self.deconfigure_res_cluster_manager_lambda.add_to_role_policy( + self.deconfigure_res_users_groups_json_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -1585,7 +1844,7 @@ def create_parallel_cluster_lambdas(self): ) ) if 'ErrorSnsTopicArn' in self.config: - self.deconfigure_res_cluster_manager_lambda.add_to_role_policy( + self.deconfigure_res_users_groups_json_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -2853,7 +3112,8 @@ def create_parallel_cluster_config(self): for extra_mount_dict in self.config['slurm'].get('storage', {}).get('ExtraMounts', {}): mount_dir = extra_mount_dict['dest'] if mount_dir == '/home' and not config_schema.PARALLEL_CLUSTER_SUPPORTS_HOME_MOUNT(self.PARALLEL_CLUSTER_VERSION): - continue + logger.error(f"Mounting /home is not supported in this version of ParallelCluster.") + exit(1) storage_type = extra_mount_dict['StorageType'] if storage_type == 'Efs': parallel_cluster_storage_dict = { @@ -2883,6 +3143,7 @@ def create_parallel_cluster_config(self): 'MountDir': mount_dir, 'FsxOpenZfsSettings': {'VolumeId': extra_mount_dict['VolumeId']}, } + logger.debug(f"Adding SharedStorage:\n{json.dumps(parallel_cluster_storage_dict, indent=4)}") self.parallel_cluster_config['SharedStorage'].append(parallel_cluster_storage_dict) # Save the config template to s3. @@ -2949,7 +3210,7 @@ def create_parallel_cluster_config(self): self.parallel_cluster.node.add_dependency(self.update_head_node_lambda) # The lambdas to configure instances must exist befor the cluster so they can be called. if 'RESEnvironmentName' in self.config: - self.parallel_cluster.node.add_dependency(self.configure_res_cluster_manager_lambda) + self.parallel_cluster.node.add_dependency(self.configure_res_users_groups_json_lambda) self.parallel_cluster.node.add_dependency(self.configure_res_submitters_lambda) # Build config files need to be created before cluster so that they can be downloaded as part of on_head_node_configures self.parallel_cluster.node.add_dependency(self.build_config_files) @@ -2962,20 +3223,20 @@ def create_parallel_cluster_config(self): self, "UpdateHeadNode", service_token = self.update_head_node_lambda.function_arn, properties = { - 'ParallelClusterConfigHash': self.parallel_cluster_config_yaml_hash, + 'ParallelClusterConfigHash': self.assets_hash.hexdigest(), } ) self.update_head_node.node.add_dependency(self.parallel_cluster) if 'RESEnvironmentName' in self.config: # Custom resource to deconfigure cluster manager before deleting cluster - self.deconfigure_res_cluster_manager = CustomResource( - self, "DeconfigureRESClusterManager", - service_token = self.deconfigure_res_cluster_manager_lambda.function_arn, + self.deconfigure_res_users_groups_json = CustomResource( + self, "DeconfigureRESUsersGroupsJson", + service_token = self.deconfigure_res_users_groups_json_lambda.function_arn, properties = { } ) - self.deconfigure_res_cluster_manager.node.add_dependency(self.parallel_cluster) + self.deconfigure_res_users_groups_json.node.add_dependency(self.parallel_cluster) # Custom resource to deconfigure submitters before deleting cluster self.deconfigure_res_submitters = CustomResource( diff --git a/source/cdk/config_schema.py b/source/cdk/config_schema.py index 35f08d70..217ae327 100644 --- a/source/cdk/config_schema.py +++ b/source/cdk/config_schema.py @@ -35,6 +35,7 @@ logger.setLevel(logging.INFO) # MIN_PARALLEL_CLUSTER_VERSION +# Releases: 
https://github.com/aws/aws-parallelcluster/releases # 3.2.0: # * Add support for memory-based job scheduling in Slurm # 3.3.0: @@ -61,7 +62,7 @@ # * Fix pmix CVE # * Use Slurm 23.02.5 MIN_PARALLEL_CLUSTER_VERSION = parse_version('3.6.0') -DEFAULT_PARALLEL_CLUSTER_VERSION = parse_version('3.8.0') +# Update source/resources/default_config.yml with latest version when this is updated. PARALLEL_CLUSTER_VERSIONS = [ '3.6.0', '3.6.1', @@ -124,7 +125,7 @@ ] def get_parallel_cluster_version(config): - return config['slurm']['ParallelClusterConfig'].get('Version', str(DEFAULT_PARALLEL_CLUSTER_VERSION)) + return config['slurm']['ParallelClusterConfig']['Version'] def get_PARALLEL_CLUSTER_MUNGE_VERSION(config): parallel_cluster_version = get_parallel_cluster_version(config) @@ -185,6 +186,38 @@ def PARALLEL_CLUSTER_SUPPORTS_HOME_MOUNT(parallel_cluster_version): logger.error(f"{fg('red')}Unable to list all AWS regions. Make sure you have set your IAM credentials. {err} {attr('reset')}") exit(1) +VALID_ARCHITECTURES = ['arm64', 'x86_64'] + +DEFAULT_ARCHITECTURE = 'x86_64' + +# Controller needs at least 4 GB or will hit OOM + +DEFAULT_ARM_CONTROLLER_INSTANCE_TYPE = 'c6g.large' + +DEFAULT_X86_CONTROLLER_INSTANCE_TYPE = 'c6a.large' + +def default_controller_instance_type(config): + architecture = config['slurm']['ParallelClusterConfig'].get('Architecture', DEFAULT_ARCHITECTURE) + if architecture == 'x86_64': + return DEFAULT_X86_CONTROLLER_INSTANCE_TYPE + elif architecture == 'arm64': + return DEFAULT_ARM_CONTROLLER_INSTANCE_TYPE + else: + raise ValueError(f"Invalid architecture: {architecture}") + +DEFAULT_ARM_OS = 'rhel8' + +DEFAULT_X86_OS = 'rhel8' + +def DEFAULT_OS(config): + architecture = config['slurm']['ParallelClusterConfig'].get('Architecture', DEFAULT_ARCHITECTURE) + if architecture == 'x86_64': + return DEFAULT_X86_OS + elif architecture == 'arm64': + return DEFAULT_ARM_OS + else: + raise ValueError(f"Invalid architecture: {architecture}") + filesystem_lifecycle_policies = [ 'None', 'AFTER_14_DAYS', @@ -350,12 +383,12 @@ def get_config_schema(config): 'slurm': { Optional('ParallelClusterConfig'): { Optional('Enable', default=True): And(bool, lambda s: s == True), - Optional('Version', default=str(DEFAULT_PARALLEL_CLUSTER_VERSION)): And(str, lambda version: version in PARALLEL_CLUSTER_VERSIONS, lambda version: parse_version(version) >= MIN_PARALLEL_CLUSTER_VERSION), - Optional('Image', default={'Os': 'centos7'}): { - 'Os': And(str, lambda s: s in PARALLEL_CLUSTER_ALLOWED_OSES, ), + 'Version': And(str, lambda version: version in PARALLEL_CLUSTER_VERSIONS, lambda version: parse_version(version) >= MIN_PARALLEL_CLUSTER_VERSION), + Optional('Image', default={'Os': DEFAULT_OS(config)}): { + 'Os': And(str, lambda s: s in PARALLEL_CLUSTER_ALLOWED_OSES), Optional('CustomAmi'): And(str, lambda s: s.startswith('ami-')), }, - Optional('Architecture', default='x86_64'): And(str, lambda s: s in ['arm64', 'x86_64']), + Optional('Architecture', default=DEFAULT_ARCHITECTURE): And(str, lambda s: s in VALID_ARCHITECTURES), Optional('ComputeNodeAmi'): And(str, lambda s: s.startswith('ami-')), Optional('DisableSimultaneousMultithreading', default=True): bool, # Recommend to not use EFA unless necessary to avoid insufficient capacity errors when starting new instances in group or when multiple instance types in the group @@ -424,13 +457,13 @@ def get_config_schema(config): # If the secret doesn't exist one will be created, but won't be part of the cloudformation stack # so that it won't be deleted when the 
stack is deleted. # Required if your submitters need to use more than 1 cluster. - Optional('MungeKeySecret'): str, + Optional('MungeKeySecret', default='/slurm/munge_key'): str, # # SlurmCtl: # Required, but can be an empty dict to accept all of the defaults 'SlurmCtl': { Optional('SlurmdPort', default=6818): int, - Optional('instance_type', default='c6a.large'): str, + Optional('instance_type', default=default_controller_instance_type(config)): str, Optional('volume_size', default=200): int, Optional('CloudWatchPeriod', default=5): int, Optional('PreemptMode', default='REQUEUE'): And(str, lambda s: s in ['OFF', 'CANCEL', 'GANG', 'REQUEUE', 'SUSPEND']), diff --git a/source/resources/config/default_config.yml b/source/resources/config/default_config.yml index e20f5193..9820e834 100644 --- a/source/resources/config/default_config.yml +++ b/source/resources/config/default_config.yml @@ -2,7 +2,17 @@ #==================================================================== # Sample configuraton that creates a minimal Slurm cluster # +# NOTE: This is just an example. +# Please create your own revision controlled config file. +# # No SlurmDbd in this configuration. +# Configure 5 each of t3 instance types. +# +# This config doesn't provide required parameters like VpcId so you must +# use the --prompt option with it. +# To use: +# source setup.sh +# ./install.sh --config-file source/config/default_config.yml --prompt # # Defaults and valid configuration options are in source/config_schema.py. # Command line values override values in the config file. @@ -10,20 +20,74 @@ StackName: slurmminimal-config +# @TODO: Add Region +# Region: {{Region}} + +# @TODO: Add your SshKeyPair +# SshKeyPair: {{SshKeyPair}} + +# @TODO: Update with your VPC +# VpcId: vpc-xxxxxxxxxxxxxxxxx + +# @TODO: Update with your private subnet in your VPC +# SubnetId: subnet-xxxxxxxxxxxxxxxxx + +# @TODO: Update with your SNS Topic. Make sure to subscribe your email address to the topic and confirm the subscription +# ErrorSnsTopicArn: arn:aws:sns:{{Region}}:{{AccountId}}:{{TopicName}} + +# @TODO: Add your preferred timezone so times aren't in UTC +# TimeZone: America/Chicago # America/Los_Angeles or America/Denver or America/New_York + +# @TODO: If using Research and Engineering Studio, update with environment name +# RESEnvironmentName: {{ResEnvironmentName}} + slurm: ParallelClusterConfig: - Enable: true + Version: 3.8.0 + # @TODO: Choose the CPU architecture: x86_64, arm64. Default: x86_64 + # Architecture: x86_64 + # @TODO: Update DatabaseStackName with stack name you deployed ParallelCluster database into. 
See: https://docs.aws.amazon.com/parallelcluster/latest/ug/tutorials_07_slurm-accounting-v3.html#slurm-accounting-db-stack-v3 + # Database: + # DatabaseStackName: {{DatabaseStackName}} + + MungeKeySecret: SlurmMungeKey SlurmCtl: {} - # InstanceConfig: - # Configure the instances used by the cluster - # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true Include: + # @TODO: Update InstanceFamiles and InstanceTypes to use in your cluster InstanceFamilies: - t3 InstanceTypes: [] NodeCounts: + # @TODO: Update the max number of each instance type to configure DefaultMaxCount: 5 + # @TODO: You can update the max instance count for each compute resource + # ComputeResourceCounts: + # od-1024gb-16-cores: # x2iedn.8xlarge', x2iezn.8xlarge + # MaxCount: 1 + # sp-1024gb-16-cores: # x2iedn.8xlarge', x2iezn.8xlarge + # MaxCount: 2 + + # @TODO: Configure storage mounts + # storage: + # ExtraMounts: + # - dest: /home + # StorageType: Efs + # FileSystemId: 'fs-xxxxxxxxxxxxxxxxx' + # src: fs-xxxxxxxxxxxxxxxxx.efs.{{Region}}.amazonaws.com:/ + # type: nfs4 + # options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport + # ExtraMountSecurityGroups: + # nfs: + # DCV-Host: sg-xxxxxxxxxxxxxxxxx + +# @TODO: Configure license counts +Licenses: + vcs: + Count: 10 + Server: synopsys_licenses + Port: '24680' + ServerType: flexlm diff --git a/source/resources/config/slurm_all_arm_instance_types.yml b/source/resources/config/slurm_all_arm_instance_types.yml new file mode 100644 index 00000000..77ae245b --- /dev/null +++ b/source/resources/config/slurm_all_arm_instance_types.yml @@ -0,0 +1,84 @@ +--- +#==================================================================== +# Minimal cluster with all X86_64 instance types +# +# NOTE: This is just an example. +# Please create your own revision controlled config file. +# +# No SlurmDbd in this configuration. +# Configure 10 each of all x86_64 instance types. +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. +#==================================================================== + +StackName: slurm-all-arm-config + +# @TODO: Add Region +# Region: {{Region}} + +# @TODO: Add your SshKeyPair +# SshKeyPair: {{SshKeyPair}} + +# @TODO: Update with your VPC +# VpcId: vpc-xxxxxxxxxxxxxxxxx + +# @TODO: Update with your private subnet in your VPC +# SubnetId: subnet-xxxxxxxxxxxxxxxxx + +# @TODO: Update with your SNS Topic. Make sure to subscribe your email address to the topic and confirm the subscription +# ErrorSnsTopicArn: arn:aws:sns:{{Region}}:{{AccountId}}:{{TopicName}} + +# @TODO: Add your preferred timezone so times aren't in UTC +# TimeZone: America/Chicago # America/Los_Angeles or America/Denver or America/New_York + +# @TODO: If using Research and Engineering Studio, update with environment name +# RESEnvironmentName: {{ResEnvironmentName}} + +slurm: + ParallelClusterConfig: + Version: 3.8.0 + Architecture: arm64 + # @TODO: Update DatabaseStackName with stack name you deployed ParallelCluster database into. 
See: https://docs.aws.amazon.com/parallelcluster/latest/ug/tutorials_07_slurm-accounting-v3.html#slurm-accounting-db-stack-v3 + # Database: + # DatabaseStackName: {{DatabaseStackName}} + + MungeKeySecret: SlurmMungeKey + + SlurmCtl: {} + + InstanceConfig: + UseSpot: true + Include: + InstanceFamilies: ['.*'] + InstanceTypes: [] + NodeCounts: + # @TODO: Update the max number of each instance type to configure + DefaultMaxCount: 5 + # @TODO: You can update the max instance count for each compute resource + ComputeResourceCounts: + od-1024gb-64-cores: # x2gd.16xlarge + MaxCount: 1 + sp-1024gb-64-cores: # x2gd.16xlarge + MaxCount: 2 + + # @TODO: Configure storage mounts + # storage: + # ExtraMounts: + # - dest: /home + # StorageType: Efs + # FileSystemId: 'fs-xxxxxxxxxxxxxxxxx' + # src: fs-xxxxxxxxxxxxxxxxx.efs.{{Region}}.amazonaws.com:/ + # type: nfs4 + # options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport + # ExtraMountSecurityGroups: + # nfs: + # DCV-Host: sg-xxxxxxxxxxxxxxxxx + +# @TODO: Configure license counts +Licenses: + vcs: + Count: 10 + Server: synopsys_licenses + Port: '24680' + ServerType: flexlm diff --git a/source/resources/config/slurm_all_instance_types.yml b/source/resources/config/slurm_all_instance_types.yml deleted file mode 100644 index 5e88f29c..00000000 --- a/source/resources/config/slurm_all_instance_types.yml +++ /dev/null @@ -1,26 +0,0 @@ ---- -#==================================================================== -# Create a minimal cluster with all instance types -# -# Defaults and valid configuration options are in source/config_schema.py. -# Command line values override values in the config file. -#==================================================================== - -StackName: slurmalltypes - -slurm: - # InstanceConfig: - # Configure the instances used by the cluster - # A partition will be created for each combination of Base OS, Architecture, and Spot - InstanceConfig: - UseSpot: true - NodesPerInstanceType: 10 - BaseOsArchitecture: - AlmaLinux: {8: [x86_64, arm64]} - Include: - InstanceFamilies: [] - InstanceTypes: [] - - # Use defaults from schema - SlurmCtl: {} - storage: {'zfs': {}} diff --git a/source/resources/config/slurm_all_os.yml b/source/resources/config/slurm_all_os.yml deleted file mode 100644 index 6a08d427..00000000 --- a/source/resources/config/slurm_all_os.yml +++ /dev/null @@ -1,37 +0,0 @@ ---- -#==================================================================== -# Slurm cluster with all supported OS distributions and versions. -# -# Note that CentOS 8 has been discontinued and support has been removed. -# -# Defaults and valid configuration options are in source/config_schema.py. -# Command line values override values in the config file. 
-#==================================================================== - -StackName: slurmallos - -slurm: - # InstanceConfig: - # Configure the instances used by the cluster - # A partition will be created for each combination of Base OS, Architecture, and Spot - InstanceConfig: - UseSpot: true - NodesPerInstanceType: 10 - BaseOsArchitecture: - AlmaLinux: {8: [x86_64, arm64]} - Amazon: {2: [x86_64, arm64]} - CentOS: - 7: [x86_64] - RedHat: - 7: [x86_64] - 8: [x86_64, arm64] - Rocky: {8: [x86_64, arm64]} - Include: - InstanceFamilies: - - t3 - - t4g - InstanceTypes: [] - - # Use defaults from schema - SlurmCtl: {} - storage: {'zfs': {}} diff --git a/source/resources/config/slurm_all_x86_instance_types.yml b/source/resources/config/slurm_all_x86_instance_types.yml new file mode 100644 index 00000000..e39ac0d5 --- /dev/null +++ b/source/resources/config/slurm_all_x86_instance_types.yml @@ -0,0 +1,84 @@ +--- +#==================================================================== +# Minimal cluster with all X86_64 instance types +# +# NOTE: This is just an example. +# Please create your own revision controlled config file. +# +# No SlurmDbd in this configuration. +# Configure 10 each of all x86_64 instance types. +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. +#==================================================================== + +StackName: slurm-all-x86-config + +# @TODO: Add Region +# Region: {{Region}} + +# @TODO: Add your SshKeyPair +# SshKeyPair: {{SshKeyPair}} + +# @TODO: Update with your VPC +# VpcId: vpc-xxxxxxxxxxxxxxxxx + +# @TODO: Update with your private subnet in your VPC +# SubnetId: subnet-xxxxxxxxxxxxxxxxx + +# @TODO: Update with your SNS Topic. Make sure to subscribe your email address to the topic and confirm the subscription +# ErrorSnsTopicArn: arn:aws:sns:{{Region}}:{{AccountId}}:{{TopicName}} + +# @TODO: Add your preferred timezone so times aren't in UTC +# TimeZone: America/Chicago # America/Los_Angeles or America/Denver or America/New_York + +# @TODO: If using Research and Engineering Studio, update with environment name +# RESEnvironmentName: {{ResEnvironmentName}} + +slurm: + ParallelClusterConfig: + Version: 3.8.0 + Architecture: x86_64 + # @TODO: Update DatabaseStackName with stack name you deployed ParallelCluster database into. 
See: https://docs.aws.amazon.com/parallelcluster/latest/ug/tutorials_07_slurm-accounting-v3.html#slurm-accounting-db-stack-v3 + # Database: + # DatabaseStackName: {{DatabaseStackName}} + + MungeKeySecret: SlurmMungeKey + + SlurmCtl: {} + + InstanceConfig: + UseSpot: true + Include: + InstanceFamilies: ['.*'] + InstanceTypes: [] + NodeCounts: + # @TODO: Update the max number of each instance type to configure + DefaultMaxCount: 5 + # @TODO: You can update the max instance count for each compute resource + ComputeResourceCounts: + od-1024gb-16-cores: # x2iedn.8xlarge', x2iezn.8xlarge + MaxCount: 1 + sp-1024gb-16-cores: # x2iedn.8xlarge', x2iezn.8xlarge + MaxCount: 2 + + # @TODO: Configure storage mounts + # storage: + # ExtraMounts: + # - dest: /home + # StorageType: Efs + # FileSystemId: 'fs-xxxxxxxxxxxxxxxxx' + # src: fs-xxxxxxxxxxxxxxxxx.efs.{{Region}}.amazonaws.com:/ + # type: nfs4 + # options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport + # ExtraMountSecurityGroups: + # nfs: + # DCV-Host: sg-xxxxxxxxxxxxxxxxx + +# @TODO: Configure license counts +Licenses: + vcs: + Count: 10 + Server: synopsys_licenses + Port: '24680' + ServerType: flexlm diff --git a/source/resources/config/slurm_alma_linux.yml b/source/resources/config/slurm_alma_linux.yml deleted file mode 100644 index d2e1ac5f..00000000 --- a/source/resources/config/slurm_alma_linux.yml +++ /dev/null @@ -1,29 +0,0 @@ ---- -#==================================================================== -# Slurm cluster to test Alma Linux support -# -# Defaults and valid configuration options are in source/config_schema.py. -# Command line values override values in the config file. -#==================================================================== - - -StackName: slurmalma - -slurm: - # InstanceConfig: - # Configure the instances used by the cluster - # A partition will be created for each combination of Base OS, Architecture, and Spot - InstanceConfig: - UseSpot: true - NodesPerInstanceType: 10 - BaseOsArchitecture: - AlmaLinux: {8: [x86_64, arm64]} - Include: - InstanceFamilies: - - t3 - - t4g - InstanceTypes: [] - - # Use defaults from schema - SlurmCtl: {} - storage: {'zfs': {}} diff --git a/source/resources/config/slurm_eda.yml b/source/resources/config/slurm_eda.yml deleted file mode 100644 index 97590bff..00000000 --- a/source/resources/config/slurm_eda.yml +++ /dev/null @@ -1,37 +0,0 @@ ---- -#==================================================================== -# Slurm cluster for EDA -# -# Redundant controllers and typical instances used by EDA. -# Uses CentOS 7 and AlmaLinux 8 and both x86_64 and arm64 architectures. -# -# Defaults and valid configuration options are in source/config_schema.py. -# Command line values override values in the config file. 
-#==================================================================== - -StackName: slurmeda - -slurm: - SlurmCtl: - # For high availability configure multiple controllers - NumberOfControllers: 2 - - # The accounting database is required to enable fairshare scheduling - # It is managed by the Slurm Database Daemon (slurmdbd) instance - SlurmDbd: {} - - # Configure typical EDA instance types - # A partition will be created for each combination of Base OS, Architecture, and Spot - InstanceConfig: - UseSpot: true - NodesPerInstanceType: 5 - BaseOsArchitecture: - AlmaLinux: {8: [x86_64, arm64]} - CentOS: - 7: [x86_64] - # OnPremComputeNodes: - # ConfigFile: '/path/slurm_nodes_on_prem.conf' - # CIDR: 'x.x.x.x/16' - - # Use defaults from schema - storage: {'zfs': {}} diff --git a/source/resources/config/slurm_eda_az1.yml b/source/resources/config/slurm_eda_az1.yml deleted file mode 100644 index 5ec9f19d..00000000 --- a/source/resources/config/slurm_eda_az1.yml +++ /dev/null @@ -1,46 +0,0 @@ ---- -#==================================================================== -# Federated Slurm cluster for EDA -# -# This is the first AZ that other AZs will reference. -# Other federated clusters will share this cluster's SlurmDbd instance. -# Redundant controllers and typical instances used by EDA. -# Uses CentOS 7 and AlmaLinux 8 and both x86_64 and arm64 architectures. -# -# Defaults and valid configuration options are in source/config_schema.py. -# Command line values override values in the config file. -#==================================================================== - -StackName: slurmedaaz1 - -# Add your subnet id -SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 - -# This is optional, but highly recommended -#ErrorSnsTopicArn: arn:aws:sns:{{region}}:{AccountId}:{TopicName} - -slurm: - SlurmCtl: - # For high availability configure multiple controllers - NumberOfControllers: 2 - - # The accounting database is required to enable fairshare scheduling - # It is managed by the Slurm Database Daemon (slurmdbd) instance - SlurmDbd: {} - - Federation: - Name: slurmeda - FederatedClusterStackNames: [] - - # Configure typical EDA instance types - # A partition will be created for each combination of Base OS, Architecture, and Spot - InstanceConfig: - UseSpot: true - NodesPerInstanceType: 5 - BaseOsArchitecture: - AlmaLinux: {8: [x86_64, arm64]} - CentOS: - 7: [x86_64] - - # Use defaults from schema - storage: {'zfs': {}} diff --git a/source/resources/config/slurm_eda_az2.yml b/source/resources/config/slurm_eda_az2.yml deleted file mode 100644 index 09fd2fb4..00000000 --- a/source/resources/config/slurm_eda_az2.yml +++ /dev/null @@ -1,47 +0,0 @@ ---- -#==================================================================== -# Federated Slurm cluster for EDA -# -# This is the 2nd AZ that must be created after the 1st cluster. -# Shares the SlurmDbd instance from the 1st AZ's cluster. -# Redundant controllers and typical instances used by EDA. -# Uses CentOS 7 and AlmaLinux 8 and both x86_64 and arm64 architectures. -# -# Defaults and valid configuration options are in source/config_schema.py. -# Command line values override values in the config file. 
-#==================================================================== - -StackName: slurmedaaz2 - -# Add your subnet id -SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 - -# This is optional, but highly recommended -#ErrorSnsTopicArn: arn:aws:sns:{{region}}:{AccountId}:{TopicName} - -slurm: - SlurmCtl: - # For high availability configure multiple controllers - NumberOfControllers: 2 - - # Re-use the SlurmDbd instance from slurmedaaz1 - ExistingSlurmDbd: - StackName: slurmedaaz1 - - Federation: - Name: slurmeda - FederatedClusterStackNames: - - slurmedaaz1 - - # Configure typical EDA instance types - # A partition will be created for each combination of Base OS, Architecture, and Spot - InstanceConfig: - UseSpot: true - NodesPerInstanceType: 5 - BaseOsArchitecture: - AlmaLinux: {8: [x86_64, arm64]} - CentOS: - 7: [x86_64] - - # Use defaults from schema - storage: {'zfs': {}} diff --git a/source/resources/config/slurm_eda_az3.yml b/source/resources/config/slurm_eda_az3.yml deleted file mode 100644 index b66421d6..00000000 --- a/source/resources/config/slurm_eda_az3.yml +++ /dev/null @@ -1,48 +0,0 @@ ---- -#==================================================================== -# Federated Slurm cluster for EDA -# -# This is the 3rd AZ that must be created after the 1st AZ and 2nd clusters. -# Shares the SlurmDbd instance from the 1st AZ's cluster. -# Redundant controllers and typical instances used by EDA. -# Uses CentOS 7 and AlmaLinux 8 and both x86_64 and arm64 architectures. -# -# Defaults and valid configuration options are in source/config_schema.py. -# Command line values override values in the config file. -#==================================================================== - -StackName: slurmedaaz3 - -# Add your subnet id -SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 - -# This is optional, but highly recommended -#ErrorSnsTopicArn: arn:aws:sns:{{region}}:{AccountId}:{TopicName} - -slurm: - SlurmCtl: - # For high availability configure multiple controllers - NumberOfControllers: 2 - - # Re-use the SlurmDbd instance from slurmedaaz1 - ExistingSlurmDbd: - StackName: slurmedaaz1 - - Federation: - Name: slurmeda - FederatedClusterStackNames: - - slurmedaaz1 - - slurmedaaz2 - - # Configure typical EDA instance types - # A partition will be created for each combination of Base OS, Architecture, and Spot - InstanceConfig: - UseSpot: true - NodesPerInstanceType: 5 - BaseOsArchitecture: - AlmaLinux: {8: [x86_64, arm64]} - CentOS: - 7: [x86_64] - - # Use defaults from schema - storage: {'zfs': {}} diff --git a/source/resources/config/slurm_elasticsearch.yml b/source/resources/config/slurm_elasticsearch.yml deleted file mode 100644 index 95fd7c8e..00000000 --- a/source/resources/config/slurm_elasticsearch.yml +++ /dev/null @@ -1,50 +0,0 @@ ---- -#==================================================================== -# Minimal Slurm cluster with an ElasticSearch domain -# -# Creates a new domain and configures Slurm to write completed job information to the domain. -# -# Defaults and valid configuration options are in source/config_schema.py. -# Command line values override values in the config file. 
-#==================================================================== - -StackName: slurmes - -slurm: - # ElasticSearch: - # data_nodes: Must be a multiple of number_of_azs - ElasticSearch: - ebs_volume_size: 20 - ebs_volume_type: GP2 - enable_version_upgrade: False - number_of_azs: 2 - master_nodes: 3 - master_node_instance_type: m5.large.search - data_nodes: 2 - data_node_instance_type: m5.large.search - warm_nodes: 0 - warm_instance_type: ultrawarm.medium.search - - # JobCompType: - # Values: - # jobcomp/none - # jobcomp/elasticsearch - # jobcomp/filetxt - JobCompType: jobcomp/elasticsearch - - InstanceConfig: - UseSpot: true - NodesPerInstanceType: 10 - BaseOsArchitecture: - AlmaLinux: {8: [x86_64, arm64]} - CentOS: - 7: [x86_64] - Include: - InstanceFamilies: - - t3 - - t4g - InstanceTypes: [] - - # Use defaults from schema - SlurmCtl: {} - storage: {'zfs': {}} diff --git a/source/resources/config/slurm_fpga_dev.yml b/source/resources/config/slurm_fpga_dev.yml deleted file mode 100644 index 05922465..00000000 --- a/source/resources/config/slurm_fpga_dev.yml +++ /dev/null @@ -1,46 +0,0 @@ ---- -#==================================================================== -# Slurm cluster that uses the AWS FPGA Developer AMI as the base AMI for compute nodes. -# -# Based on the EDA configuration. -# -# Defaults and valid configuration options are in source/config_schema.py. -# Command line values override values in the config file. -#==================================================================== - -StackName: slurmedafpga - -slurm: - SlurmNodeAmis: - # AWS FPGA Developer AMIs - BaseAmis: - us-east-1: - Amazon: {2: {x86_64: {ImageId: ami-0efdec76678df9a64, RootDeviceSize: '+5'}}} - CentOS: {7: {x86_64: {ImageId: ami-02155c6289e76719a, RootDeviceSize: '+5'}}} - - InstanceConfig: - UseSpot: true - NodesPerInstanceType: 5 - BaseOsArchitecture: - Amazon: - 2: [x86_64] - CentOS: - 7: [x86_64] - Include: - InstanceFamilies: - - 'c6a' # AMD EPYC 7R13 Processor 3.6 GHz - - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - - 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz - - 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz - - 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB - - 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB - - 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB - - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB - - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz - InstanceTypes: [] - - # Use defaults from schema - SlurmCtl: {} - storage: {'zfs': {}} diff --git a/source/resources/config/slurm_lustre.yml b/source/resources/config/slurm_lustre.yml deleted file mode 100644 index fa5a0d6e..00000000 --- a/source/resources/config/slurm_lustre.yml +++ /dev/null @@ -1,32 +0,0 @@ ---- -#==================================================================== -# Slurm cluster that uses Lustre for storing the Slurm configuration and tool files -# -# EFS should be adequate and more cost effective for most uses. -# You might consider Lustre for very large, dynamic clusters if they are putting a strain on EFS metadata. -# -# Defaults and valid configuration options are in source/config_schema.py. -# Command line values override values in the config file. 
-#==================================================================== - -StackName: slurmlustre - -slurm: - - storage: - provider: lustre - lustre': {} - - InstanceConfig: - UseSpot: true - NodesPerInstanceType: 5 - BaseOsArchitecture: - CentOS: - 7: [x86_64] - Include: - InstanceFamilies: - - t3 - InstanceTypes: [] - - # Use defaults from schema - SlurmCtl: {} diff --git a/source/resources/config/slurm_multi_az.yml b/source/resources/config/slurm_multi_az.yml deleted file mode 100644 index 72a3e62d..00000000 --- a/source/resources/config/slurm_multi_az.yml +++ /dev/null @@ -1,97 +0,0 @@ ---- -# Multi-region Slurm cluster with Netapp Ontap - -StackName: slurmmultiaz - -#Region: us-east-1 - -#SshKeyPair: name of your ec2 keypair - -#VpcId: vpc-xxxxxxxxxxxxxxxxx - -#SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 - -#HostedZoneId: XXXXXXXXXXXXXXXXXXX - -# This is optional, but highly recommended -#ErrorSnsTopicArn: arn:aws:sns:{{region}}:{AccountId}:{TopicName} - -#TimeZone: 'US/Central' - -slurm: - MungeKeySsmParameter: "/slurm/munge_key" - - SlurmCtl: - NumberOfControllers: 2 - - SlurmDbd: {} - - # External security groups that should be able to use the cluster - # SubmitterSecurityGroupIds: - # soca-ComputeNodeSG: sg-xxxxxxxxxxxxxxxxx - - # SubmitterInstanceTags: - # 'soca:ClusterId': ['soca-xyz'] - - InstanceConfig: - UseSpot: true - NodesPerInstanceType: 10 - BaseOsArchitecture: - AlmaLinux: {8: [x86_64, arm64]} - CentOS: - 7: [x86_64] - Include: - InstanceFamilies: - - t3 - - t4g - InstanceTypes: [] - Regions: - eu-west-1: - VpcId: vpc-xxxxxxxxxxxxxxxxx - CIDR: 10.1.0.0/16 - SshKeyPair: admin-eu-west-1 - AZs: - - Priority: 10 - Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 - - Priority: 9 - Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 - - Priority: 8 - Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 - us-east-1: - VpcId: vpc-xxxxxxxxxxxxxxxxx - CIDR: 10.2.0.0/16 - SshKeyPair: admin-us-east-1 - AZs: - - Priority: 7 - Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 - - Priority: 6 - Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 - - Priority: 5 - Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 - us-west-2: - VpcId: vpc-xxxxxxxxxxxxxxxxx - CIDR: 10.3.0.0/16 - SshKeyPair: admin-us-west-2 - #SecurityGroupId: sg-0addccc8388e008fd - AZs: - - Priority: 4 - Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 - - Priority: 3 - Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 - - Priority: 2 - Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 - - storage: - provider: ontap - removal_policy: DESTROY - ontap: {} - - #ExtraMounts: - # - dest: /apps - # src: fs-xxxxxxxx.efs.us-east-1.amazonaws.com:/ - # type: nfs4 - # options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport - # - dest: /data - # src: fs-xxxxxxxx.efs.us-east-1.amazonaws.com:/ - # type: nfs4 - # options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport diff --git a/source/resources/config/slurm_ontap.yml b/source/resources/config/slurm_ontap.yml deleted file mode 100644 index ccf02007..00000000 --- a/source/resources/config/slurm_ontap.yml +++ /dev/null @@ -1,29 +0,0 @@ ---- -#==================================================================== -# Slurm cluster that uses FSx for NetApp ONTAP for storing the Slurm configuration and tool files -# -# Defaults and valid configuration options are in source/config_schema.py. -# Command line values override values in the config file. 
-#==================================================================== - -StackName: slurmontap -slurm: - storage: - provider: ontap - ontap: {} # This causes the defaults from the schema to be applied. - - InstanceConfig: - UseSpot: true - NodesPerInstanceType: 10 - BaseOsArchitecture: - AlmaLinux: {8: [x86_64, arm64]} - CentOS: - 7: [x86_64] - Include: - InstanceFamilies: - - t3 - - t4g - InstanceTypes: [] - - # Use defaults from schema - SlurmCtl: {} diff --git a/source/resources/config/slurm_recommended_arm_instance_types.yml b/source/resources/config/slurm_recommended_arm_instance_types.yml new file mode 100644 index 00000000..c419f70f --- /dev/null +++ b/source/resources/config/slurm_recommended_arm_instance_types.yml @@ -0,0 +1,81 @@ +--- +#==================================================================== +# Minimal cluster with recommended arm64 instance types +# +# NOTE: This is just an example. +# Please create your own revision controlled config file. +# +# No SlurmDbd in this configuration. +# Configures the recommended arm64 instance types with a default of 5 nodes each. +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. +#==================================================================== + +StackName: slurm-arm-config + +# @TODO: Add Region +# Region: {{Region}} + +# @TODO: Add your SshKeyPair +# SshKeyPair: {{SshKeyPair}} + +# @TODO: Update with your VPC +# VpcId: vpc-xxxxxxxxxxxxxxxxx + +# @TODO: Update with your private subnet in your VPC +# SubnetId: subnet-xxxxxxxxxxxxxxxxx + +# @TODO: Update with your SNS Topic. Make sure to subscribe your email address to the topic and confirm the subscription +# ErrorSnsTopicArn: arn:aws:sns:{{Region}}:{{AccountId}}:{{TopicName}} + +# @TODO: Add your preferred timezone so times aren't in UTC +# TimeZone: America/Chicago # America/Los_Angeles or America/Denver or America/New_York + +# @TODO: If using Research and Engineering Studio, update with environment name +# RESEnvironmentName: {{ResEnvironmentName}} + +slurm: + ParallelClusterConfig: + Version: 3.8.0 + Architecture: arm64 + # @TODO: Update DatabaseStackName with stack name you deployed ParallelCluster database into.
See: https://docs.aws.amazon.com/parallelcluster/latest/ug/tutorials_07_slurm-accounting-v3.html#slurm-accounting-db-stack-v3 + # Database: + # DatabaseStackName: {{DatabaseStackName}} + + MungeKeySecret: SlurmMungeKey + + SlurmCtl: {} + + InstanceConfig: + UseSpot: true + NodeCounts: + # @TODO: Update the max number of each instance type to configure + DefaultMaxCount: 5 + # @TODO: You can update the max instance count for each compute resource + ComputeResourceCounts: + od-1024gb-64-cores: # x2gd.16xlarge + MaxCount: 1 + sp-1024gb-64-cores: # x2gd.16xlarge + MaxCount: 2 + + # @TODO: Configure storage mounts + # storage: + # ExtraMounts: + # - dest: /home + # StorageType: Efs + # FileSystemId: 'fs-xxxxxxxxxxxxxxxxx' + # src: fs-xxxxxxxxxxxxxxxxx.efs.{{Region}}.amazonaws.com:/ + # type: nfs4 + # options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport + # ExtraMountSecurityGroups: + # nfs: + # DCV-Host: sg-xxxxxxxxxxxxxxxxx + +# @TODO: Configure license counts +Licenses: + vcs: + Count: 10 + Server: synopsys_licenses + Port: '24680' + ServerType: flexlm diff --git a/source/resources/config/slurm_recommended_x86_instance_types.yml b/source/resources/config/slurm_recommended_x86_instance_types.yml new file mode 100644 index 00000000..f5eecf5f --- /dev/null +++ b/source/resources/config/slurm_recommended_x86_instance_types.yml @@ -0,0 +1,81 @@ +--- +#==================================================================== +# Minimal cluster with recommended x86_64 instance types +# +# NOTE: This is just an example. +# Please create your own revision controlled config file. +# +# No SlurmDbd in this configuration. +# Configures the recommended x86_64 instance types with a default of 5 nodes each. +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. +#==================================================================== + +StackName: slurm-x86-config + +# @TODO: Add Region +# Region: {{Region}} + +# @TODO: Add your SshKeyPair +# SshKeyPair: {{SshKeyPair}} + +# @TODO: Update with your VPC +# VpcId: vpc-xxxxxxxxxxxxxxxxx + +# @TODO: Update with your private subnet in your VPC +# SubnetId: subnet-xxxxxxxxxxxxxxxxx + +# @TODO: Update with your SNS Topic. Make sure to subscribe your email address to the topic and confirm the subscription +# ErrorSnsTopicArn: arn:aws:sns:{{Region}}:{{AccountId}}:{{TopicName}} + +# @TODO: Add your preferred timezone so times aren't in UTC +# TimeZone: America/Chicago # America/Los_Angeles or America/Denver or America/New_York + +# @TODO: If using Research and Engineering Studio, update with environment name +# RESEnvironmentName: {{ResEnvironmentName}} + +slurm: + ParallelClusterConfig: + Version: 3.8.0 + Architecture: x86_64 + # @TODO: Update DatabaseStackName with stack name you deployed ParallelCluster database into.
See: https://docs.aws.amazon.com/parallelcluster/latest/ug/tutorials_07_slurm-accounting-v3.html#slurm-accounting-db-stack-v3 + # Database: + # DatabaseStackName: {{DatabaseStackName}} + + MungeKeySecret: SlurmMungeKey + + SlurmCtl: {} + + InstanceConfig: + UseSpot: true + NodeCounts: + # @TODO: Update the max number of each instance type to configure + DefaultMaxCount: 5 + # @TODO: You can update the max instance count for each compute resource + ComputeResourceCounts: + od-1024gb-16-cores: # x2iedn.8xlarge, x2iezn.8xlarge + MaxCount: 1 + sp-1024gb-16-cores: # x2iedn.8xlarge, x2iezn.8xlarge + MaxCount: 2 + + # @TODO: Configure storage mounts + # storage: + # ExtraMounts: + # - dest: /home + # StorageType: Efs + # FileSystemId: 'fs-xxxxxxxxxxxxxxxxx' + # src: fs-xxxxxxxxxxxxxxxxx.efs.{{Region}}.amazonaws.com:/ + # type: nfs4 + # options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport + # ExtraMountSecurityGroups: + # nfs: + # DCV-Host: sg-xxxxxxxxxxxxxxxxx + +# @TODO: Configure license counts +Licenses: + vcs: + Count: 10 + Server: synopsys_licenses + Port: '24680' + ServerType: flexlm diff --git a/source/resources/config/slurm_rocky_linux.yml b/source/resources/config/slurm_rocky_linux.yml deleted file mode 100644 index 7d4ba1ee..00000000 --- a/source/resources/config/slurm_rocky_linux.yml +++ /dev/null @@ -1,25 +0,0 @@ ---- -#==================================================================== -# Test Rocky linux support -# -# Defaults and valid configuration options are in source/config_schema.py. -# Command line values override values in the config file. -#==================================================================== - -StackName: slurmrocky - -slurm: - InstanceConfig: - UseSpot: true - NodesPerInstanceType: 5 - BaseOsArchitecture: - Rocky: {8: [x86_64, arm64]} - Include: - InstanceFamilies: - - t3 - - t4g - InstanceTypes: [] - - # Use defaults from schema - SlurmCtl: {} - storage: {'zfs': {}} diff --git a/source/resources/config/slurm_zfs.yml b/source/resources/config/slurm_zfs.yml deleted file mode 100644 index 7f013467..00000000 --- a/source/resources/config/slurm_zfs.yml +++ /dev/null @@ -1,30 +0,0 @@ ---- -#==================================================================== -# Slurm cluster that uses FSx for OpenZfs for storing the Slurm configuration and tool files -# -# Defaults and valid configuration options are in source/config_schema.py. -# Command line values override values in the config file. -#==================================================================== - -StackName: slurmzfs - -slurm: - storage: - provider: zfs - zfs: {} # This causes the defaults from the schema to be applied. - - InstanceConfig: - UseSpot: true - NodesPerInstanceType: 10 - BaseOsArchitecture: - AlmaLinux: {8: [x86_64, arm64]} - CentOS: - 7: [x86_64] - Include: - InstanceFamilies: - - t3 - - t4g - InstanceTypes: [] - - # Use defaults from schema - SlurmCtl: {} diff --git a/source/resources/lambdas/ConfigureRESClusterManager/ConfigureRESClusterManager.py b/source/resources/lambdas/ConfigureRESClusterManager/ConfigureRESClusterManager.py deleted file mode 100644 index d3c1191f..00000000 --- a/source/resources/lambdas/ConfigureRESClusterManager/ConfigureRESClusterManager.py +++ /dev/null @@ -1,91 +0,0 @@ -""" -Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-SPDX-License-Identifier: MIT-0 - -Permission is hereby granted, free of charge, to any person obtaining a copy of this -software and associated documentation files (the "Software"), to deal in the Software -without restriction, including without limitation the rights to use, copy, modify, -merge, publish, distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A -PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -""" - -''' -Call /opt/slurm/{{ClusterName}}/config/bin/create_users_groups_json_configure.sh using ssm run command. -''' -import boto3 -import json -import logging -from os import environ as environ - -logger=logging.getLogger(__file__) -logger_formatter = logging.Formatter('%(levelname)s: %(message)s') -logger_streamHandler = logging.StreamHandler() -logger_streamHandler.setFormatter(logger_formatter) -logger.addHandler(logger_streamHandler) -logger.setLevel(logging.INFO) -logger.propagate = False - -def lambda_handler(event, context): - try: - logger.info(f"event:\n{json.dumps(event, indent=4)}") - - cluster_name = environ['ClusterName'] - cluster_region = environ['Region'] - environment_name = environ['RESEnvironmentName'] - logger.info(f"Update RES cluster={environment_name} manager for {cluster_name} in {cluster_region}") - - ec2_client = boto3.client('ec2', region_name=cluster_region) - - cluster_manager_info = ec2_client.describe_instances( - Filters = [ - {'Name': 'tag:res:EnvironmentName', 'Values': [environment_name]}, - {'Name': 'tag:res:ModuleId', 'Values': ['cluster-manager']} - ] - )['Reservations'][0]['Instances'][0] - cluster_manager_instance_id = cluster_manager_info['InstanceId'] - logger.info(f"cluster manager instance id: {cluster_manager_instance_id}") - - ssm_client = boto3.client('ssm', region_name=cluster_region) - commands = f""" -set -ex - -if ! [ -e /opt/slurm/{cluster_name} ]; then - sudo mkdir -p /opt/slurm/{cluster_name} -fi -if ! mountpoint /opt/slurm/{cluster_name} ; then - sudo mount head_node.{cluster_name}.pcluster:/opt/slurm /opt/slurm/{cluster_name} || true -fi - -script="/opt/slurm/{cluster_name}/config/bin/create_users_groups_json_configure.sh" -if ! 
[ -e $script ]; then - echo "$script doesn't exist" - exit 1 -fi - -sudo $script - """ - response = ssm_client.send_command( - DocumentName = 'AWS-RunShellScript', - InstanceIds = [cluster_manager_instance_id], - Parameters = {'commands': [commands]}, - Comment = f"Configure {environment_name} cluster manager for {cluster_name}" - ) - logger.info(f"Sent SSM command {response['Command']['CommandId']}") - - except Exception as e: - logger.exception(str(e)) - sns_client = boto3.client('sns') - sns_client.publish( - TopicArn = environ['ErrorSnsTopicArn'], - Subject = f"{cluster_name} CreateHeadNodeARecord failed", - Message = str(e) - ) - logger.info(f"Published error to {environ['ErrorSnsTopicArn']}") - raise diff --git a/source/resources/lambdas/ConfigureRESSubmitters/ConfigureRESSubmitters.py b/source/resources/lambdas/ConfigureRESSubmitters/ConfigureRESSubmitters.py index 45813a00..e09ae81d 100644 --- a/source/resources/lambdas/ConfigureRESSubmitters/ConfigureRESSubmitters.py +++ b/source/resources/lambdas/ConfigureRESSubmitters/ConfigureRESSubmitters.py @@ -90,15 +90,16 @@ def lambda_handler(event, context): sudo $script """ - # @todo Command is failing because the DCV instance doesn't have permissions to describe instances in the playbook - # I should be able to pass it in as a variable. - response = ssm_client.send_command( + TIMEOUT_MINUTES = 90 + TIMEOUT_SECONDS = TIMEOUT_MINUTES * 60 + send_command_response = ssm_client.send_command( DocumentName = 'AWS-RunShellScript', InstanceIds = submitter_instance_ids, Parameters = {'commands': [commands]}, - Comment = f"Configure {environment_name} submitters for {cluster_name}" + Comment = f"Configure {environment_name} submitters for {cluster_name}", + TimeoutSeconds = TIMEOUT_SECONDS ) - logger.info(f"Sent SSM command {response['Command']['CommandId']}") + logger.info(f"Sent SSM command {send_command_response['Command']['CommandId']}") except Exception as e: logger.exception(str(e)) diff --git a/source/resources/lambdas/ConfigureRESUsersGroupsJson/ConfigureRESUsersGroupsJson.py b/source/resources/lambdas/ConfigureRESUsersGroupsJson/ConfigureRESUsersGroupsJson.py new file mode 100644 index 00000000..71a3cabf --- /dev/null +++ b/source/resources/lambdas/ConfigureRESUsersGroupsJson/ConfigureRESUsersGroupsJson.py @@ -0,0 +1,133 @@ +""" +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +SPDX-License-Identifier: MIT-0 + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, +merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +""" + +''' +Call /opt/slurm/{{ClusterName}}/config/bin/create_users_groups_json_configure.sh using ssm run command. 
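+
+The domain-joined RES instance is located by its res:EnvironmentName, Name, res:ModuleName,
+res:ModuleId, and res:NodeType tags, and the script is run on it with an SSM AWS-RunShellScript
+command; the handler then waits for the command to complete and checks its status.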
+''' +import boto3 +import json +import logging +from os import environ as environ + +logger=logging.getLogger(__file__) +logger_formatter = logging.Formatter('%(levelname)s: %(message)s') +logger_streamHandler = logging.StreamHandler() +logger_streamHandler.setFormatter(logger_formatter) +logger.addHandler(logger_streamHandler) +logger.setLevel(logging.INFO) +logger.propagate = False + +def lambda_handler(event, context): + try: + logger.info(f"event:\n{json.dumps(event, indent=4)}") + + cluster_name = environ['ClusterName'] + cluster_region = environ['Region'] + environment_name = environ['RESEnvironmentName'] + res_domain_joined_instance_name = environ['RESDomainJoinedInstanceName'] + res_domain_joined_instance_module_name = environ['RESDomainJoinedInstanceModuleName'] + res_domain_joined_instance_module_id = environ['RESDomainJoinedInstanceModuleId'] + res_domain_joined_instance_node_type = environ['RESDomainJoinedInstanceNodeType'] + logger.info(f"Update RES /opt/slurm/{cluster_name}/config/users_groups.json from {environment_name} domain joined instance with ModuleId={res_domain_joined_instance_module_name}") + + domain_joined_instance_id = None + ec2_client = boto3.client('ec2', region_name=cluster_region) + describe_instances_paginator = ec2_client.get_paginator('describe_instances') + describe_instances_kwargs = { + 'Filters': [ + {'Name': 'tag:res:EnvironmentName', 'Values': [environment_name]}, + {'Name': 'tag:Name', 'Values': [res_domain_joined_instance_name]}, + {'Name': 'tag:res:ModuleName', 'Values': [res_domain_joined_instance_module_name]}, + {'Name': 'tag:res:ModuleId', 'Values': [res_domain_joined_instance_module_id]}, + {'Name': 'tag:res:NodeType', 'Values': [res_domain_joined_instance_node_type]}, + {'Name': 'instance-state-name', 'Values': ['running']} + ] + } + for describe_instances_response in describe_instances_paginator.paginate(**describe_instances_kwargs): + for reservation_dict in describe_instances_response['Reservations']: + domain_joined_instance_info = reservation_dict['Instances'][0] + domain_joined_instance_id = domain_joined_instance_info['InstanceId'] + logger.info(f"Domain joined instance id: {domain_joined_instance_id}") + if not domain_joined_instance_id: + raise RuntimeError(f"No running instances found with tags res:EnvironmentName={environment_name} and res:ModuleId={res_domain_joined_instance_module_name}") + + ssm_client = boto3.client('ssm', region_name=cluster_region) + commands = f""" +set -ex + +if ! [ -e /opt/slurm/{cluster_name} ]; then + sudo mkdir -p /opt/slurm/{cluster_name} +fi +if ! mountpoint /opt/slurm/{cluster_name} ; then + timeout 5s sudo mount head_node.{cluster_name}.pcluster:/opt/slurm /opt/slurm/{cluster_name} || true +fi + +script="/opt/slurm/{cluster_name}/config/bin/create_users_groups_json_configure.sh" +if ! 
[ -e $script ]; then + echo "$script doesn't exist" + exit 1 +fi + +sudo $script + """ + send_command_response = ssm_client.send_command( + DocumentName = 'AWS-RunShellScript', + InstanceIds = [domain_joined_instance_id], + Parameters = {'commands': [commands]}, + Comment = f"Configure {environment_name} users and groups for {cluster_name}", + TimeoutSeconds = 5 * 60 # 5 minutes + ) + logger.info(f"Sent SSM command {send_command_response['Command']['CommandId']}") + + # Wait for SSM command to complete + MAX_WAIT_TIME = 15 * 60 + DELAY = 10 + MAX_ATTEMPTS = int(MAX_WAIT_TIME / DELAY) + waiter = ssm_client.get_waiter('command_executed') + waiter.wait( + CommandId=send_command_response['Command']['CommandId'], + InstanceId=domain_joined_instance_id, + WaiterConfig={ + 'Delay': DELAY, + 'MaxAttempts': MAX_ATTEMPTS + } + ) + + # Check the result of the command + get_command_invocation_response = ssm_client.get_command_invocation( + CommandId = send_command_response['Command']['CommandId'], + InstanceId = domain_joined_instance_id + ) + command_status = get_command_invocation_response['Status'] + if command_status in ['Success']: + logger.info(f"Command passed on {domain_joined_instance_id}") + else: + logger.error(f"Command {command_status} on {domain_joined_instance_id}") + raise RuntimeError(f"Configure command {command_status} on {domain_joined_instance_id}") + + except Exception as e: + logger.exception(str(e)) + sns_client = boto3.client('sns') + sns_client.publish( + TopicArn = environ['ErrorSnsTopicArn'], + Subject = f"{cluster_name} CreateRESUsersGroups failed", + Message = str(e) + ) + logger.info(f"Published error to {environ['ErrorSnsTopicArn']}") + raise diff --git a/source/resources/lambdas/CreateHeadNodeARecord/CreateHeadNodeARecord.py b/source/resources/lambdas/CreateHeadNodeARecord/CreateHeadNodeARecord.py index 99faa826..92deb034 100644 --- a/source/resources/lambdas/CreateHeadNodeARecord/CreateHeadNodeARecord.py +++ b/source/resources/lambdas/CreateHeadNodeARecord/CreateHeadNodeARecord.py @@ -61,7 +61,6 @@ def lambda_handler(event, context): raise ValueError(f"No private hosted zone named {hosted_zone_name} found.") # Check to see if the A record already exists - # /hostedzone/Z007151912IBQH21Y4P5H list_resource_record_sets_paginator = route53_client.get_paginator('list_resource_record_sets') list_resource_record_sets_iterator = list_resource_record_sets_paginator.paginate(HostedZoneId=hosted_zone_id) head_node_a_record_name = f"head_node.{hosted_zone_name}" @@ -75,18 +74,23 @@ def lambda_handler(event, context): logger.info(f"Creating {head_node_a_record_name} A record.") + head_node_ip_address = None + head_node_instance_id = None ec2_client = boto3.client('ec2', region_name=cluster_region) - instances_info = ec2_client.describe_instances( - Filters = [ + describe_instances_paginator = ec2_client.get_paginator('describe_instances') + describe_instances_kwargs = { + 'Filters': [ {'Name': 'tag:parallelcluster:cluster-name', 'Values': [cluster_name]}, - {'Name': 'tag:parallelcluster:node-type', 'Values': ['HeadNode']} + {'Name': 'tag:parallelcluster:node-type', 'Values': ['HeadNode']}, + {'Name': 'instance-state-name', 'Values': ['running']} ] - ) - head_node_ip_address = None - for reservation_dict in instances_info['Reservations']: - for instance_dict in reservation_dict['Instances']: - head_node_ip_address = instance_dict.get('PrivateIpAddress', None) - head_node_instance_id = instance_dict['InstanceId'] + } + for describe_instances_response in
describe_instances_paginator.paginate(**describe_instances_kwargs): + for reservation_dict in describe_instances_response['Reservations']: + for instance_dict in reservation_dict['Instances']: + head_node_ip_address = instance_dict.get('PrivateIpAddress', None) + if head_node_ip_address: + head_node_instance_id = instance_dict['InstanceId'] if not head_node_ip_address: raise ValueError(f"No head node private IP address found for {cluster_name}") logger.info(f"head node instance id: {head_node_instance_id}") diff --git a/source/resources/lambdas/CreateParallelCluster/CreateParallelCluster.py b/source/resources/lambdas/CreateParallelCluster/CreateParallelCluster.py index 85304ed7..aa6b5aad 100644 --- a/source/resources/lambdas/CreateParallelCluster/CreateParallelCluster.py +++ b/source/resources/lambdas/CreateParallelCluster/CreateParallelCluster.py @@ -38,12 +38,23 @@ logger.setLevel(logging.INFO) logger.propagate = False +def get_clusters(cluster_region): + clusters = [] + list_clusters_kwargs = {'region': cluster_region} + while list_clusters_kwargs: + clusters_dict = pc.list_clusters(**list_clusters_kwargs) + if 'nextToken' in clusters_dict: + list_clusters_kwargs['next_token'] = clusters_dict['nextToken'] + else: + list_clusters_kwargs = None + for cluster in clusters_dict['clusters']: + clusters.append(cluster) + return clusters + def get_cluster_status(cluster_name, cluster_region): logger.info("Listing clusters to get cluster status") - clusters_dict = pc.list_clusters(region=cluster_region) - logger.info(f"clusters_dict:\n{json.dumps(clusters_dict, indent=4)}") cluster_status = None - for cluster_dict in clusters_dict['clusters']: + for cluster_dict in get_clusters(cluster_region): if cluster_dict['clusterName'] != cluster_name: continue logger.info(f"cluster_dict:\n{json.dumps(cluster_dict, indent=4)}") diff --git a/source/resources/lambdas/CreateParallelClusterConfig/CreateParallelClusterConfig.py b/source/resources/lambdas/CreateParallelClusterConfig/CreateParallelClusterConfig.py index 222bfd2f..6179067e 100644 --- a/source/resources/lambdas/CreateParallelClusterConfig/CreateParallelClusterConfig.py +++ b/source/resources/lambdas/CreateParallelClusterConfig/CreateParallelClusterConfig.py @@ -42,7 +42,7 @@ def lambda_handler(event, context): try: logger.info(f"event:\n{json.dumps(event, indent=4)}") - cluster_name = None + cluster_name = environ.get('ClusterName', None) requestType = event['RequestType'] properties = event['ResourceProperties'] required_properties = [ @@ -91,6 +91,7 @@ def lambda_handler(event, context): except: pass else: # Create or Update + logger.info(f"Getting Parallel Cluster yaml config template from {yaml_template_s3_url}") parallel_cluster_config_yaml_template = Template( s3_client.get_object( Bucket = environ['ParallelClusterConfigS3Bucket'], @@ -118,7 +119,7 @@ def lambda_handler(event, context): except Exception as e: logger.exception(str(e)) - cfnresponse.send(event, context, cfnresponse.FAILED, {'error': str(e)}, physicalResourceId=cluster_name) + cfnresponse.send(event, context, cfnresponse.FAILED, {'error': str(e)}, physicalResourceId=yaml_s3_url) sns_client = boto3.client('sns') sns_client.publish( TopicArn = environ['ErrorSnsTopicArn'], @@ -128,4 +129,4 @@ logger.info(f"Published error to {environ['ErrorSnsTopicArn']}") raise - cfnresponse.send(event, context, cfnresponse.SUCCESS, {'ConfigTemplateYamlS3Url': yaml_template_s3_url, 'ConfigYamlS3Url': yaml_s3_url, 'ConfigYamlHash':
parallel_cluster_config_hash.hexdigest()}, physicalResourceId=cluster_name) + cfnresponse.send(event, context, cfnresponse.SUCCESS, {'ConfigTemplateYamlS3Url': yaml_template_s3_url, 'ConfigYamlS3Url': yaml_s3_url, 'ConfigYamlHash': parallel_cluster_config_hash.hexdigest()}, physicalResourceId=yaml_s3_url) diff --git a/source/resources/lambdas/DeconfigureRESClusterManager/DeconfigureRESClusterManager.py b/source/resources/lambdas/DeconfigureRESUsersGroupsJson/DeconfigureRESUsersGroupsJson.py similarity index 63% rename from source/resources/lambdas/DeconfigureRESClusterManager/DeconfigureRESClusterManager.py rename to source/resources/lambdas/DeconfigureRESUsersGroupsJson/DeconfigureRESUsersGroupsJson.py index 9ce76018..c3e0ca01 100644 --- a/source/resources/lambdas/DeconfigureRESClusterManager/DeconfigureRESClusterManager.py +++ b/source/resources/lambdas/DeconfigureRESUsersGroupsJson/DeconfigureRESUsersGroupsJson.py @@ -17,7 +17,7 @@ """ ''' -Update the head node when the config assets hash changes. +Deconfigure users and groups crontab using ssm run command. ''' import boto3 import cfnresponse @@ -59,6 +59,10 @@ def lambda_handler(event, context): cluster_name = environ['ClusterName'] cluster_region = environ['Region'] environment_name = environ['RESEnvironmentName'] + res_domain_joined_instance_name = environ['RESDomainJoinedInstanceName'] + res_domain_joined_instance_module_name = environ['RESDomainJoinedInstanceModuleName'] + res_domain_joined_instance_module_id = environ['RESDomainJoinedInstanceModuleId'] + res_domain_joined_instance_node_type = environ['RESDomainJoinedInstanceNodeType'] logger.info(f"{requestType} request for {cluster_name} in {cluster_region}") if requestType != 'Delete': @@ -66,15 +70,26 @@ def lambda_handler(event, context): cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name) return + domain_joined_instance_id = None ec2_client = boto3.client('ec2', region_name=cluster_region) - cluster_manager_info = ec2_client.describe_instances( - Filters = [ + describe_instances_paginator = ec2_client.get_paginator('describe_instances') + describe_instances_kwargs = { + 'Filters': [ {'Name': 'tag:res:EnvironmentName', 'Values': [environment_name]}, - {'Name': 'tag:res:ModuleId', 'Values': ['cluster-manager']} + {'Name': 'tag:Name', 'Values': [res_domain_joined_instance_name]}, + {'Name': 'tag:res:ModuleName', 'Values': [res_domain_joined_instance_module_name]}, + {'Name': 'tag:res:ModuleId', 'Values': [res_domain_joined_instance_module_id]}, + {'Name': 'tag:res:NodeType', 'Values': [res_domain_joined_instance_node_type]}, + {'Name': 'instance-state-name', 'Values': ['running']} ] - )['Reservations'][0]['Instances'][0] - cluster_manager_instance_id = cluster_manager_info['InstanceId'] - logger.info(f"cluster manager instance id: {cluster_manager_instance_id}") + } + for describe_instances_response in describe_instances_paginator.paginate(**describe_instances_kwargs): + for reservation_dict in describe_instances_response['Reservations']: + domain_joined_instance_info = reservation_dict['Instances'][0] + domain_joined_instance_id = domain_joined_instance_info['InstanceId'] + logger.info(f"Domain joined instance id: {domain_joined_instance_id}") + if not domain_joined_instance_id: + raise RuntimeError(f"No running instances found with tags res:EnvironmentName={environment_name} and res:ModuleId={res_domain_joined_instance_module_name}") ssm_client = boto3.client('ssm', region_name=cluster_region) commands = f""" @@ -120,46 +135,43 @@ def 
lambda_handler(event, context): fi """ logger.info(f"Submitting SSM command") - response = ssm_client.send_command( + send_command_response = ssm_client.send_command( DocumentName = 'AWS-RunShellScript', - InstanceIds = [cluster_manager_instance_id], + InstanceIds = [domain_joined_instance_id], Parameters = {'commands': [commands]}, - Comment = f"Deconfigure {environment_name} cluster manager for {cluster_name}" + Comment = f"Deconfigure {environment_name} users and groups for {cluster_name}" ) - command_id = response['Command']['CommandId'] + command_id = send_command_response['Command']['CommandId'] logger.info(f"Sent SSM command {command_id}") # Wait for the command invocations to be made time.sleep(5) + # Wait for the command to complete before returning so that the cluster resources aren't removed before the command completes. - num_errors = 0 - MAX_WAIT_TIME = 13 * 60 - wait_time = 0 - instance_id = cluster_manager_instance_id - command_complete = False - while not command_complete: - response = ssm_client.get_command_invocation( - CommandId = command_id, - InstanceId = instance_id + MAX_WAIT_TIME = 5 * 60 + DELAY = 10 + MAX_ATTEMPTS = int(MAX_WAIT_TIME / DELAY) + waiter = ssm_client.get_waiter('command_executed') + waiter.wait( + CommandId=command_id, + InstanceId=domain_joined_instance_id, + WaiterConfig={ + 'Delay': DELAY, + 'MaxAttempts': MAX_ATTEMPTS + } ) - command_status = response['Status'] - if command_status in ['Success']: - logger.info(f"Command passed on {instance_id}") - break - elif command_status in ['Cancelled', 'TimedOut', 'Failed', 'Cancelling']: - logger.error(f"Command {command_status} on {instance_id}") - num_errors += 1 - break - else: - logger.info(f"Command still running on {instance_id}") - if wait_time >= MAX_WAIT_TIME: - logger.error(f"Timed out waiting for command completion.") - num_errors += 1 - break - time.sleep(10) - wait_time += 10 - if num_errors: - cfnresponse.send(event, context, cfnresponse.FAILED, {'error': f"Denconfigure command failed."}, physicalResourceId=cluster_name) + + # Check the result of the command + get_command_invocation_response = ssm_client.get_command_invocation( + CommandId = command_id, + InstanceId = domain_joined_instance_id + ) + command_status = get_command_invocation_response['Status'] + if command_status in ['Success']: + logger.info(f"Command passed on {domain_joined_instance_id}") + else: + logger.error(f"Command {command_status} on {domain_joined_instance_id}") + cfnresponse.send(event, context, cfnresponse.FAILED, {'error': f"Deconfigure command failed."}, physicalResourceId=cluster_name) return except Exception as e: diff --git a/source/resources/lambdas/DeconfigureRESClusterManager/cfnresponse.py b/source/resources/lambdas/DeconfigureRESUsersGroupsJson/cfnresponse.py similarity index 100% rename from source/resources/lambdas/DeconfigureRESClusterManager/cfnresponse.py rename to source/resources/lambdas/DeconfigureRESUsersGroupsJson/cfnresponse.py diff --git a/source/resources/lambdas/UpdateHeadNode/UpdateHeadNode.py b/source/resources/lambdas/UpdateHeadNode/UpdateHeadNode.py index c63e2ef8..d7318a60 100644 --- a/source/resources/lambdas/UpdateHeadNode/UpdateHeadNode.py +++ b/source/resources/lambdas/UpdateHeadNode/UpdateHeadNode.py @@ -71,32 +71,31 @@ def lambda_handler(event, context): cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name) return + head_node_ip_address = None + head_node_instance_id = None ec2_client = boto3.client('ec2', region_name=cluster_region) - 
reservations_info = ec2_client.describe_instances( - Filters = [ + describe_instances_paginator = ec2_client.get_paginator('describe_instances') + describe_instances_kwargs = { + 'Filters': [ {'Name': 'tag:parallelcluster:cluster-name', 'Values': [cluster_name]}, {'Name': 'tag:parallelcluster:node-type', 'Values': ['HeadNode']}, {'Name': 'instance-state-name', 'Values': ['running']} ] - )['Reservations'] - # If the cluster hasn't deployed yet or didn't successfully deploy initially, then the head node might not exist. - # This shouldn't cause the custom resource to fail. - if not reservations_info: + } + for describe_instances_response in describe_instances_paginator.paginate(**describe_instances_kwargs): + for reservation_dict in describe_instances_response['Reservations']: + if reservation_dict['Instances']: + head_node_info = reservation_dict['Instances'][0] + if 'PrivateIpAddress' in head_node_info: + head_node_ip_address = head_node_info['PrivateIpAddress'] + head_node_instance_id = head_node_info['InstanceId'] + break + if not head_node_instance_id: + # If the cluster hasn't deployed yet or didn't successfully deploy initially, then the head node might not exist. + # This shouldn't cause the custom resource to fail. logger.info(f"No head node instance found.") cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name) return - instances_info = reservations_info[0]['Instances'] - if not instances_info: - logger.info(f"No head node instance found.") - cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name) - return - head_node_info = instances_info[0] - if 'PrivateIpAddress' not in head_node_info: - logger.info(f"No head node private IP address found.") - cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name) - return - head_node_ip_address = head_node_info['PrivateIpAddress'] - head_node_instance_id = head_node_info['InstanceId'] logger.info(f"head node instance id: {head_node_instance_id}") logger.info(f"head node ip address: {head_node_ip_address}") @@ -112,13 +111,32 @@ def lambda_handler(event, context): sudo $script """ - response = ssm_client.send_command( + send_command_response = ssm_client.send_command( DocumentName = 'AWS-RunShellScript', InstanceIds = [head_node_instance_id], Parameters = {'commands': [commands]}, Comment = f"''Update head node of {cluster_name}({head_node_instance_id})" ) - logger.info(f"Sent SSM command {response['Command']['CommandId']}") + command_id = send_command_response['Command']['CommandId'] + logger.info(f"Sent SSM command {command_id}") + + # If I wait then the stack creation or update won't finish until the command completes. + # I like the idea that the stack waits for the update to complete. + MAX_WAIT_TIME = 15 * 60 + DELAY = 5 + MAX_ATTEMPTS = int(MAX_WAIT_TIME / DELAY) + waiter = ssm_client.get_waiter('command_executed') + waiter.wait( + CommandId=command_id, + InstanceId=head_node_instance_id, + WaiterConfig={ + 'Delay': DELAY, + 'MaxAttempts': MAX_ATTEMPTS + } + ) + + # I want the custom resource to be successful whether script passes or not so + # don't need to check the return status. 
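+        # If the stack should instead fail when the head node update script fails, the command
+        # status could be checked here along these lines (a sketch only, not enabled):
+        #   status = ssm_client.get_command_invocation(
+        #       CommandId=command_id, InstanceId=head_node_instance_id)['Status']
+        #   if status != 'Success':
+        #       raise RuntimeError(f"Update command {status} on {head_node_instance_id}")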
except Exception as e: logger.exception(str(e)) diff --git a/source/resources/parallel-cluster/config/bin/on_head_node_configured.sh b/source/resources/parallel-cluster/config/bin/on_head_node_configured.sh index 6dcc1de1..21b4184c 100755 --- a/source/resources/parallel-cluster/config/bin/on_head_node_configured.sh +++ b/source/resources/parallel-cluster/config/bin/on_head_node_configured.sh @@ -84,9 +84,9 @@ ansible-playbook $PLAYBOOKS_PATH/ParallelClusterHeadNode.yml \ popd # Notify SNS topic that trigger configuration of cluster manager and submitters -ConfigureRESClusterManagerSnsTopicArnParameter={{ConfigureRESClusterManagerSnsTopicArnParameter}} -if ! [[ -z "$ConfigureRESClusterManagerSnsTopicArnParameter" ]]; then - ConfigureRESClusterManagerSnsTopicArn=$(aws ssm get-parameter --name $ConfigureRESClusterManagerSnsTopicArnParameter --query 'Parameter.Value' --output text) +ConfigureRESUsersGroupsJsonSnsTopicArnParameter={{ConfigureRESUsersGroupsJsonSnsTopicArnParameter}} +if ! [[ -z "$ConfigureRESUsersGroupsJsonSnsTopicArnParameter" ]]; then + ConfigureRESClusterManagerSnsTopicArn=$(aws ssm get-parameter --name $ConfigureRESUsersGroupsJsonSnsTopicArnParameter --query 'Parameter.Value' --output text) aws sns publish --topic-arn $ConfigureRESClusterManagerSnsTopicArn --message 'Configure {{ClusterName}} RES ClusterManager' fi diff --git a/source/resources/playbooks/roles/ParallelClusterCreateUsersGroupsJsonConfigure/tasks/main.yml b/source/resources/playbooks/roles/ParallelClusterCreateUsersGroupsJsonConfigure/tasks/main.yml index 0c1f8379..b211f601 100644 --- a/source/resources/playbooks/roles/ParallelClusterCreateUsersGroupsJsonConfigure/tasks/main.yml +++ b/source/resources/playbooks/roles/ParallelClusterCreateUsersGroupsJsonConfigure/tasks/main.yml @@ -4,27 +4,29 @@ - name: Show vars used in this playbook debug: msg: | - ClusterName: {{ClusterName}} - Region: {{Region}} - SlurmConfigDir: {{SlurmConfigDir}} + ClusterName: {{ ClusterName }} + Region: {{ Region }} + SlurmConfigDir: {{ SlurmConfigDir }} -- name: Add /opt/slurm/{{ClusterName}} to /etc/fstab +- name: Add /opt/slurm/{{ ClusterName }} to /etc/fstab mount: - path: /opt/slurm/{{ClusterName}} - src: "head_node.{{ClusterName}}.pcluster:/opt/slurm" + path: /opt/slurm/{{ ClusterName }} + src: "head_node.{{ ClusterName }}.pcluster:/opt/slurm" fstype: nfs backup: true state: present # Should already be mounted -- name: Create {{SlurmConfigDir}}/users_groups.json +- name: Create {{ SlurmConfigDir }}/users_groups.json shell: | set -ex - {{SlurmConfigDir}}/bin/create_or_update_users_groups_json.sh + {{ SlurmConfigDir }}/bin/create_or_update_users_groups_json.sh + args: + creates: '{{ SlurmConfigDir }}/users_groups.json' -- name: Create cron to refresh {{SlurmConfigDir}}/users_groups.json every hour +- name: Create cron to refresh {{ SlurmConfigDir }}/users_groups.json every hour template: - dest: /etc/cron.d/slurm_{{ClusterName}}_update_users_groups_json + dest: /etc/cron.d/slurm_{{ ClusterName }}_update_users_groups_json src: etc/cron.d/slurm_update_users_groups_json owner: root group: root diff --git a/source/resources/playbooks/roles/ParallelClusterCreateUsersGroupsJsonDeconfigure/tasks/main.yml b/source/resources/playbooks/roles/ParallelClusterCreateUsersGroupsJsonDeconfigure/tasks/main.yml index 0de04e5c..734c96bb 100644 --- a/source/resources/playbooks/roles/ParallelClusterCreateUsersGroupsJsonDeconfigure/tasks/main.yml +++ b/source/resources/playbooks/roles/ParallelClusterCreateUsersGroupsJsonDeconfigure/tasks/main.yml @@ 
-4,37 +4,37 @@ - name: Show vars used in this playbook debug: msg: | - ClusterName: {{ClusterName}} + ClusterName: {{ ClusterName }} -- name: Delete cron to refresh {{SlurmConfigDir}}/users_groups.json every hour +- name: Delete cron to refresh {{ SlurmConfigDir }}/users_groups.json every hour file: - dest: /etc/cron.d/slurm_{{ClusterName}}_update_users_groups_json + dest: /etc/cron.d/slurm_{{ ClusterName }}_update_users_groups_json state: absent -- name: Unmount /opt/slurm/{{ClusterName}} +- name: Unmount /opt/slurm/{{ ClusterName }} shell: | set -ex # Handle case where cluster was already deleted so the mountpoint is hung - if ! timeout 1s /opt/slurm/{{ClusterName}}; then + if ! timeout 1s /opt/slurm/{{ ClusterName }}; then echo "Mount point is hung. Source has already been deleted." - umount -lf /opt/slurm/{{ClusterName}} + umount -lf /opt/slurm/{{ ClusterName }} fi - if ! mountpoint /opt/slurm/{{ClusterName}}; then - echo "/opt/slurm/{{ClusterName}} already unmounted." + if ! mountpoint /opt/slurm/{{ ClusterName }}; then + echo "/opt/slurm/{{ ClusterName }} already unmounted." exit 0 fi - umount -lf /opt/slurm/{{ClusterName}} || lsof /opt/slurm/{{ClusterName}} + umount -lf /opt/slurm/{{ ClusterName }} || lsof /opt/slurm/{{ ClusterName }} register: umount_results - name: Show umount results debug: msg: | - umount_results: {{umount_results}} + umount_results: {{ umount_results }} -- name: Remove /opt/slurm/{{ClusterName}} from /etc/fstab and ignore errors +- name: Remove /opt/slurm/{{ ClusterName }} from /etc/fstab and ignore errors mount: - path: /opt/slurm/{{ClusterName}} + path: /opt/slurm/{{ ClusterName }} backup: true fstype: nfs state: absent @@ -42,9 +42,9 @@ # Retry it again without ignoring errors. ignore_errors: true -- name: Remove /opt/slurm/{{ClusterName}} from /etc/fstab +- name: Remove /opt/slurm/{{ ClusterName }} from /etc/fstab mount: - path: /opt/slurm/{{ClusterName}} + path: /opt/slurm/{{ ClusterName }} backup: true fstype: nfs state: absent diff --git a/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-licenses.yml b/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-licenses.yml index 83e64e06..725b2c9b 100644 --- a/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-licenses.yml +++ b/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-licenses.yml @@ -1,48 +1,58 @@ --- +- name: Show vars used in this playbook + debug: + msg: | + AccountingStorageHost: {{ AccountingStorageHost }} + ClusterName: {{ ClusterName }} + Licenses: {{ Licenses }} + PrimaryController: {{ PrimaryController }} + SlurmBinDir: {{ SlurmBinDir }} + PrimaryController: {{ PrimaryController }} + - name: Configure remote licenses # This uses sacctmcr so must do this after slurmctld and slurmd are working. - when: PrimaryController|bool and (AccountingStorageHost != '') and Licenses + when: PrimaryController|bool and AccountingStorageHost and Licenses shell: cmd: | set -ex # Add or update configured licenses declare -A licenses {% for lic in Licenses -%} - license='{{lic}}' + license='{{ lic }}' # Using '@' for the port separator instead of ':' because sbatch doesn't work if ':' is in the server name. 
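+      # e.g. the example license vcs with Server synopsys_licenses and Port 24680 is stored in slurmdbd as name=vcs server=synopsys_licenses@24680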
- server='{% if 'Server' in Licenses[lic] %}{{Licenses[lic].Server}}{% if 'Port' in Licenses[lic] %}@{{Licenses[lic].Port}}{% endif %}{% else %}slurmdb{% endif %}' - count='{{Licenses[lic].Count}}' + server='{% if 'Server' in Licenses[lic] %}{{ Licenses[lic].Server }}{% if 'Port' in Licenses[lic] %}@{{ Licenses[lic].Port }}{% endif %}{% else %}slurmdb{% endif %}' + count='{{ Licenses[lic].Count }}' licenses["$license@$server"]="$count" # Check to see if license has already been created - slurm_license=$({{SlurmBinDir}}/sacctmgr -i show resource $license --parsable2 --noheader) + slurm_license=$({{ SlurmBinDir }}/sacctmgr -i show resource $license --parsable2 --noheader) if [ -z $slurm_license ]; then echo "$license license not in slurmdbd so add it" - {{SlurmBinDir}}/sacctmgr -i add resource type=License name=$license server=$server{% if 'ServerType' in Licenses[lic] %} servertype={{Licenses[lic].ServerType}}{% endif %} count={{Licenses[lic].Count}} cluster={{ClusterName}} percentallowed=100 + {{ SlurmBinDir }}/sacctmgr -i add resource type=License name=$license server=$server{% if 'ServerType' in Licenses[lic] %} servertype={{ Licenses[lic].ServerType }}{% endif %} count={{ Licenses[lic].Count }} cluster={{ ClusterName }} percentallowed=100 else echo "$license already in slurmdbd so check count and percent allowed." - slurmdb_count=$({{SlurmBinDir}}/sacctmgr -i show resource $license --parsable2 --noheader | cut -d '|' -f 4) + slurmdb_count=$({{ SlurmBinDir }}/sacctmgr -i show resource $license --parsable2 --noheader | cut -d '|' -f 4) echo "slurmdb count=$slurmdb_count" if [[ $count != $slurmdb_count ]]; then echo "Update $license count from $slurmdb_count to $count" - {{SlurmBinDir}}/sacctmgr -i modify resource name=$license server=$server set count=$count + {{ SlurmBinDir }}/sacctmgr -i modify resource name=$license server=$server set count=$count fi - slurmdb_percent_allowed=$({{SlurmBinDir}}/sacctmgr -i show resource $license --parsable2 --noheader | cut -d '|' -f 6) + slurmdb_percent_allowed=$({{ SlurmBinDir }}/sacctmgr -i show resource $license --parsable2 --noheader | cut -d '|' -f 6) if [[ "100" != $slurmdb_percent_allowed ]]; then - {{SlurmBinDir}}/sacctmgr -i modify resource name=$license server=$server cluster={{ClusterName}} set percentallowed=100 + {{ SlurmBinDir }}/sacctmgr -i modify resource name=$license server=$server cluster={{ ClusterName }} set percentallowed=100 fi fi {% endfor -%} # Remove deleted licenses - configured_licenses_and_servers=( $({{SlurmBinDir}}/sacctmgr --noheader --parsable2 show resource Clusters={{ClusterName}} format=name,server) ) + configured_licenses_and_servers=( $({{ SlurmBinDir }}/sacctmgr --noheader --parsable2 show resource Clusters={{ ClusterName }} format=name,server) ) echo ${configured_licenses_and_servers[@]} for configured_license_and_server in ${configured_licenses_and_servers[@]}; do configured_license=$(echo $configured_license_and_server | cut -d '|' -f 1) configured_server=$(echo $configured_license_and_server | cut -d '|' -f 2) if [ -z ${licenses["$configured_license@$configured_server"]} ]; then - {{SlurmBinDir}}/sacctmgr -i delete resource name=$configured_license server=$configured_server + {{ SlurmBinDir }}/sacctmgr -i delete resource name=$configured_license server=$configured_server fi done diff --git a/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-slurmdb-accounts.yml b/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-slurmdb-accounts.yml index cc985ce9..5ec818e4 100644 --- 
a/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-slurmdb-accounts.yml +++ b/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-slurmdb-accounts.yml @@ -14,19 +14,19 @@ name: - pyyaml -- name: Create {{SlurmConfigDir}}/bin/create_slurm_accounts.py +- name: Create {{ SlurmConfigDir }}/bin/create_slurm_accounts.py when: PrimaryController|bool copy: - dest: "{{SlurmConfigDir}}/bin/create_slurm_accounts.py" + dest: "{{ SlurmConfigDir }}/bin/create_slurm_accounts.py" src: opt/slurm/config/bin/create_slurm_accounts.py owner: root group: root mode: 0755 -- name: Create {{SlurmConfigDir}}/accounts.yml +- name: Create {{ SlurmConfigDir }}/accounts.yml when: PrimaryController|bool copy: - dest: "{{SlurmConfigDir}}/accounts.yml" + dest: "{{ SlurmConfigDir }}/accounts.yml" src: opt/slurm/config/accounts.yml.example owner: root group: root @@ -44,15 +44,15 @@ group: root mode: 0755 -- name: Run {{SlurmConfigDir}}/bin/create_slurm_accounts.py to make sure it works +- name: Run {{ SlurmConfigDir }}/bin/create_slurm_accounts.py to make sure it works # This uses sacctmcr so must do this after slurmctld and slurmdbd are working. - when: PrimaryController|bool and (AccountingStorageHost != '') and Licenses + when: PrimaryController|bool and AccountingStorageHost and Licenses shell: cmd: | set -ex - export SLURM_ROOT={{SlurmRoot}} - {{SlurmConfigDir}}/bin/create_slurm_accounts.py --accounts {{SlurmConfigDir}}/accounts.yml --users {{SlurmConfigDir}}/users_groups.json --default-account unassigned -d + export SLURM_ROOT={{ SlurmRoot }} + {{ SlurmConfigDir }}/bin/create_slurm_accounts.py --accounts {{ SlurmConfigDir }}/accounts.yml --users {{ SlurmConfigDir }}/users_groups.json --default-account unassigned -d - name: Create /etc/cron.d/slurm_accounts when: PrimaryController|bool diff --git a/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-slurmrestd.yml b/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-slurmrestd.yml index 8995177c..f4daef3a 100644 --- a/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-slurmrestd.yml +++ b/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-slurmrestd.yml @@ -21,17 +21,17 @@ make -j make install -- name: Build {{SlurmSbinDir}}/slurmrestd +- name: Build {{ SlurmSbinDir }}/slurmrestd when: PrimaryController|bool args: - creates: "{{SlurmSbinDir}}/slurmrestd" + creates: "{{ SlurmSbinDir }}/slurmrestd" shell: cmd: | set -ex - source /opt/parallelcluster/pyenv/versions/{{ParallelClusterPythonVersion}}/envs/cookbook_virtualenv/bin/activate + source /opt/parallelcluster/pyenv/versions/{{ ParallelClusterPythonVersion }}/envs/cookbook_virtualenv/bin/activate - cd /etc/chef/local-mode-cache/cache/slurm-slurm-{{PCSlurmVersion}} + cd /etc/chef/local-mode-cache/cache/slurm-slurm-{{ PCSlurmVersion }} ./configure --prefix /opt/slurm --with-pmix=/opt/pmix --with-slurmrestd --enable-slurmrestd &> configure.log CORES=$(grep processor /proc/cpuinfo | wc -l) make -j $CORES &> make_all.log @@ -43,22 +43,22 @@ user: name: slurmrestd system: yes - uid: '{{SlurmrestdUid}}' + uid: '{{ SlurmrestdUid }}' create_home: no -- name: Create {{SlurmrestdSocketDir}} +- name: Create {{ SlurmrestdSocketDir }} when: PrimaryController|bool file: - path: "{{SlurmrestdSocketDir}}" + path: "{{ SlurmrestdSocketDir }}" state: directory owner: slurmrestd group: slurmrestd mode: 0755 -- name: Create {{SlurmSpoolDir}} +- name: Create {{ SlurmSpoolDir }} when: PrimaryController|bool file: - path: 
"{{SlurmSpoolDir}}" + path: "{{ SlurmSpoolDir }}" state: directory owner: slurmrestd group: slurmrestd @@ -67,14 +67,14 @@ - name: Create jwt key for slurmrestd when: PrimaryController|bool args: - creates: "{{SlurmSpoolDir}}/jwt_hs256.key" + creates: "{{ SlurmSpoolDir }}/jwt_hs256.key" shell: cmd: | set -ex - dd if=/dev/random of={{SlurmSpoolDir}}/jwt_hs256.key bs=32 count=1 - chown slurm:slurm {{SlurmSpoolDir}}/jwt_hs256.key - chmod 0600 {{SlurmSpoolDir}}/jwt_hs256.key + dd if=/dev/random of={{ SlurmSpoolDir }}/jwt_hs256.key bs=32 count=1 + chown slurm:slurm {{ SlurmSpoolDir }}/jwt_hs256.key + chmod 0600 {{ SlurmSpoolDir }}/jwt_hs256.key - name: Create /etc/sysconfig/slurmrestd template: @@ -143,15 +143,15 @@ state: started register: slurmrestd_started -- name: Wait for {{SlurmBinDir}}/scontrol +- name: Wait for {{ SlurmBinDir }}/scontrol wait_for: - path: "{{SlurmBinDir}}/scontrol" + path: "{{ SlurmBinDir }}/scontrol" timeout: 1800 # 30 minutes -- name: Wait for slurmctld to accept requests on port {{SlurmctldPortMin}} +- name: Wait for slurmctld to accept requests on port {{ SlurmctldPortMin }} wait_for: host: "127.0.0.1" - port: "{{SlurmctldPortMin}}" + port: "{{ SlurmctldPortMin }}" timeout: 1800 # 30 minutes - name: Test scontrol token @@ -159,4 +159,4 @@ shell: | set -xe - {{SlurmBinDir}}/scontrol token + {{ SlurmBinDir }}/scontrol token diff --git a/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-submitter-access.yml b/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-submitter-access.yml index b4fd4fd0..44d89e6f 100644 --- a/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-submitter-access.yml +++ b/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-submitter-access.yml @@ -2,34 +2,34 @@ - name: Set variables used by the role set_fact: - SlurmOSDir: /opt/slurm/{{ClusterName}} + SlurmOSDir: /opt/slurm/{{ ClusterName }} - name: Show vars used in this playbook debug: msg: | - Architecture: {{Architecture}} - ClusterName: {{ClusterName}} - DefaultPartition: {{DefaultPartition}} - distribution: {{distribution}} - distribution_major_version: {{distribution_major_version}} - ParallelClusterVersion: {{ParallelClusterVersion}} - PCModulefilesBaseDir: {{PCModulefilesBaseDir}} - SlurmConfigDir: {{SlurmConfigDir}} - SlurmOSDir: {{SlurmOSDir}} + Architecture: {{ Architecture }} + ClusterName: {{ ClusterName }} + DefaultPartition: {{ DefaultPartition }} + distribution: {{ distribution }} + distribution_major_version: {{ distribution_major_version }} + ParallelClusterVersion: {{ ParallelClusterVersion }} + PCModulefilesBaseDir: {{ PCModulefilesBaseDir }} + SlurmConfigDir: {{ SlurmConfigDir }} + SlurmOSDir: {{ SlurmOSDir }} -- name: Create /opt/slurm/{{ClusterName}} symbolic link +- name: Create /opt/slurm/{{ ClusterName }} symbolic link # All head nodes are originally configured to use /opt/slurm # This doesn't work for submitter nodes that need to access more than one cluster because they can't all be mounted at /opt/slurm. 
- # So create a path, /opt/slurm/{{ClusterName}} that can exist both on the Head node and the submitter node + # So create a path, /opt/slurm/{{ ClusterName }} that can exist both on the Head node and the submitter node file: - path: "/opt/slurm/{{ClusterName}}" + path: "/opt/slurm/{{ ClusterName }}" src: "/opt/slurm" state: link owner: root group: root mode: '0775' -- name: Change /opt/slurm to /opt/slurm/{{ClusterName}} in slurm.conf +- name: Change /opt/slurm to /opt/slurm/{{ ClusterName }} in slurm.conf shell: cmd: | set -ex @@ -38,8 +38,8 @@ backup_suffix=".$(date '+%Y-%m-%dT%H:%M:%S')~" num_changed=0 for conf_file in ${conf_files[*]}; do - sed --in-place=$backup_suffix 's%/opt/slurm/etc%/opt/slurm/{{ClusterName}}/etc%' $conf_file - sed --in-place=$backup_suffix 's%/opt/slurm/lib%/opt/slurm/{{ClusterName}}/lib%' $conf_file + sed --in-place=$backup_suffix 's%/opt/slurm/etc%/opt/slurm/{{ ClusterName }}/etc%' $conf_file + sed --in-place=$backup_suffix 's%/opt/slurm/lib%/opt/slurm/{{ ClusterName }}/lib%' $conf_file backup_conf_file="${conf_file}${backup_suffix}" if diff -q $backup_conf_file $conf_file; then @@ -58,15 +58,15 @@ - name: Fix permissions on config dir so users can access it to get the modulefiles file: - path: "{{SlurmConfigDir}}" + path: "{{ SlurmConfigDir }}" state: directory owner: root group: root mode: '0755' -- name: Create {{PCModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}} +- name: Create {{ PCModulefilesBaseDir }}/{{ distribution }}/{{ distribution_major_version }}/{{ Architecture }}/{{ ClusterName }} file: - path: "{{PCModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}}" + path: "{{ PCModulefilesBaseDir }}/{{ distribution }}/{{ distribution_major_version }}/{{ Architecture }}/{{ ClusterName }}" state: directory owner: root group: root @@ -81,29 +81,29 @@ chmod -R 0755 /opt/slurm/config/modules -- name: Create slurm modulefile .template - template: - dest: "{{PCModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}}/.template" - src: opt/slurm/modules/modulefiles/slurm/.template - owner: root - group: root - mode: '0664' - force: yes +# - name: Create slurm modulefile .template +# template: +# dest: "{{ PCModulefilesBaseDir }}/{{ distribution }}/{{ distribution_major_version }}/{{ Architecture }}/{{ ClusterName }}/.template" +# src: opt/slurm/modules/modulefiles/slurm/.template +# owner: root +# group: root +# mode: '0664' +# force: yes -- name: Create slurm modulefile - file: - path: "{{PCModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}}/{{ParallelClusterVersion}}" - src: ".template" - state: link - owner: root - group: root - mode: '0664' +# - name: Create slurm modulefile +# file: +# path: "{{ PCModulefilesBaseDir }}/{{ distribution }}/{{ distribution_major_version }}/{{ Architecture }}/{{ ClusterName }}/{{ ParallelClusterVersion }}" +# src: ".template" +# state: link +# owner: root +# group: root +# mode: '0664' -- name: Create slurm modulefile .version - template: - dest: "{{PCModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}}/.version" - src: opt/slurm/modules/modulefiles/slurm/.version - owner: root - group: root - mode: '0664' - force: yes +# - name: Create slurm modulefile .version +# template: +# dest: "{{ PCModulefilesBaseDir }}/{{ distribution }}/{{ distribution_major_version }}/{{ Architecture }}/{{ 
ClusterName }}/.version" +# src: opt/slurm/modules/modulefiles/slurm/.version +# owner: root +# group: root +# mode: '0664' +# force: yes diff --git a/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-users-groups.yml b/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-users-groups.yml index 37e7cff9..5c3755d9 100644 --- a/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-users-groups.yml +++ b/source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-users-groups.yml @@ -14,4 +14,4 @@ shell: | set -ex - {{SlurmConfigDir}}/bin//create_users_groups.py -i {{SlurmConfigDir}}/users_groups.json + {{ SlurmConfigDir }}/bin//create_users_groups.py -i {{ SlurmConfigDir }}/users_groups.json diff --git a/source/resources/playbooks/roles/ParallelClusterSubmitterConfigure/tasks/main.yml b/source/resources/playbooks/roles/ParallelClusterSubmitterConfigure/tasks/main.yml index 8e18fd1b..bd64c986 100644 --- a/source/resources/playbooks/roles/ParallelClusterSubmitterConfigure/tasks/main.yml +++ b/source/resources/playbooks/roles/ParallelClusterSubmitterConfigure/tasks/main.yml @@ -3,15 +3,15 @@ - name: Show vars used in this playbook debug: msg: | - ClusterName: {{ClusterName}} - distribution: {{distribution}} - Region: {{Region}} - SlurmBaseDir: {{SlurmBaseDir}} + ClusterName: {{ ClusterName }} + distribution: {{ distribution }} + Region: {{ Region }} + SlurmBaseDir: {{ SlurmBaseDir }} -- name: Add /opt/slurm/{{ClusterName}} to /etc/fstab +- name: Add /opt/slurm/{{ ClusterName }} to /etc/fstab mount: - path: /opt/slurm/{{ClusterName}} - src: "head_node.{{ClusterName}}.pcluster:/opt/slurm" + path: /opt/slurm/{{ ClusterName }} + src: "head_node.{{ ClusterName }}.pcluster:/opt/slurm" fstype: nfs backup: true state: present # Should already be mounted @@ -53,7 +53,7 @@ state: present create_home: no -- name: Build munge version used by ParallelCluster ({{ParallelClusterMungeVersion}}) +- name: Build munge version used by ParallelCluster ({{ ParallelClusterMungeVersion }}) args: creates: /usr/sbin/munged shell: | @@ -62,14 +62,14 @@ cd /usr/local/src mkdir -p munge cd munge - rm -f munge-{{ParallelClusterMungeVersion}}.tar.xz - rm -f munge-{{ParallelClusterMungeVersion}}.tar - rm -rf munge-{{ParallelClusterMungeVersion}} - wget https://github.com/dun/munge/releases/download/munge-{{ParallelClusterMungeVersion}}/munge-{{ParallelClusterMungeVersion}}.tar.xz - xz -d munge-{{ParallelClusterMungeVersion}}.tar.xz - tar -xf munge-{{ParallelClusterMungeVersion}}.tar - rm -f munge-{{ParallelClusterMungeVersion}}.tar - cd munge-{{ParallelClusterMungeVersion}} + rm -f munge-{{ ParallelClusterMungeVersion }}.tar.xz + rm -f munge-{{ ParallelClusterMungeVersion }}.tar + rm -rf munge-{{ ParallelClusterMungeVersion }} + wget https://github.com/dun/munge/releases/download/munge-{{ ParallelClusterMungeVersion }}/munge-{{ ParallelClusterMungeVersion }}.tar.xz + xz -d munge-{{ ParallelClusterMungeVersion }}.tar.xz + tar -xf munge-{{ ParallelClusterMungeVersion }}.tar + rm -f munge-{{ ParallelClusterMungeVersion }}.tar + cd munge-{{ ParallelClusterMungeVersion }} ./configure --prefix=/usr --sysconfdir=/etc --localstatedir=/var --libdir=/usr/lib64 &> configure.log make make install &> make_install.log @@ -80,6 +80,7 @@ path: /var/log/munge owner: munge group: munge + mode: 0700 state: directory - name: Create /etc/munge with correct permissions @@ -90,10 +91,10 @@ mode: 0700 state: directory -- name: Copy {{SlurmConfigDir}}/munge.key to 
/etc/munge/munge.key +- name: Copy {{ SlurmConfigDir }}/munge.key to /etc/munge/munge.key copy: dest: /etc/munge/munge.key - src: "{{SlurmConfigDir}}/munge.key" + src: "{{ SlurmConfigDir }}/munge.key" remote_src: true force: true # Has to be true or won't be copied when they are different. backup: true @@ -118,7 +119,7 @@ - name: Configure modules template: - dest: /etc/profile.d/slurm_{{ClusterName}}_modulefiles.sh + dest: /etc/profile.d/slurm_{{ ClusterName }}_modulefiles.sh src: etc/profile.d/slurm_modulefiles.sh owner: root group: root diff --git a/source/resources/playbooks/roles/ParallelClusterSubmitterDeconfigure/tasks/main.yml b/source/resources/playbooks/roles/ParallelClusterSubmitterDeconfigure/tasks/main.yml index 5dbfe947..52ec283e 100644 --- a/source/resources/playbooks/roles/ParallelClusterSubmitterDeconfigure/tasks/main.yml +++ b/source/resources/playbooks/roles/ParallelClusterSubmitterDeconfigure/tasks/main.yml @@ -3,37 +3,37 @@ - name: Show vars used in this playbook debug: msg: | - ClusterName: {{ClusterName}} + ClusterName: {{ ClusterName }} - name: Remove modulefile configuration file: - dest: /etc/profile.d/slurm_{{ClusterName}}_modulefiles.sh + dest: /etc/profile.d/slurm_{{ ClusterName }}_modulefiles.sh state: absent -- name: Unmount /opt/slurm/{{ClusterName}} +- name: Unmount /opt/slurm/{{ ClusterName }} shell: | set -ex # Handle case where cluster was already deleted so the mountpoint is hung - if ! timeout 1s /opt/slurm/{{ClusterName}}; then + if ! timeout 1s /opt/slurm/{{ ClusterName }}; then echo "Mount point is hung. Source has already been deleted." - umount -lf /opt/slurm/{{ClusterName}} + umount -lf /opt/slurm/{{ ClusterName }} fi - if ! mountpoint /opt/slurm/{{ClusterName}}; then - echo "/opt/slurm/{{ClusterName}} already unmounted." + if ! mountpoint /opt/slurm/{{ ClusterName }}; then + echo "/opt/slurm/{{ ClusterName }} already unmounted." 
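          # The path is not (or is no longer) a mountpoint, for example because the lazy
          # "umount -lf" above already detached a hung NFS mount, so there is nothing left
          # to unmount and the script can stop here.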
exit 0 fi - umount /opt/slurm/{{ClusterName}} || lsof /opt/slurm/{{ClusterName}} + umount /opt/slurm/{{ ClusterName }} || lsof /opt/slurm/{{ ClusterName }} register: umount_results - name: Show umount results debug: msg: | - umount_results: {{umount_results}} + umount_results: {{ umount_results }} -- name: Remove /opt/slurm/{{ClusterName}} from /etc/fstab +- name: Remove /opt/slurm/{{ ClusterName }} from /etc/fstab mount: - path: /opt/slurm/{{ClusterName}} + path: /opt/slurm/{{ ClusterName }} backup: true fstype: nfs state: absent diff --git a/source/resources/playbooks/roles/all/tasks/main.yml b/source/resources/playbooks/roles/all/tasks/main.yml index ad3648d8..8f87ab7e 100644 --- a/source/resources/playbooks/roles/all/tasks/main.yml +++ b/source/resources/playbooks/roles/all/tasks/main.yml @@ -6,51 +6,51 @@ - name: Show vars set in inventories/groupvars/all debug: msg: | - ansible_architecture: {{ansible_architecture}} - Architecture: {{Architecture}} - distribution: {{distribution}} - distribution_major_version: {{distribution_major_version}} - distribution_version: {{distribution_version}} - kernel: {{kernel}} - memtotal_mb: {{memtotal_mb}} - - amazonlinux2: {{amazonlinux2}} - alma: {{alma}} - alma8: {{alma8}} - centos: {{centos}} - centos7: {{centos7}} - rhel: {{rhel}} - rhel7: {{rhel7}} - rhel8: {{rhel8}} - rocky: {{rocky}} - rocky8: {{rocky8}} - rhelclone: {{rhelclone}} - rhel8clone: {{rhel8clone}} - centos7_5_to_6: {{centos7_5_to_6}} - centos7_5_to_9: {{centos7_5_to_9}} - centos7_7_to_9: {{centos7_7_to_9}} - - ansible_ssh_user: {{ansible_ssh_user}} - ansible_ssh_common_args: {{ansible_ssh_common_args}} - - SlurmBaseDir: {{SlurmBaseDir}} - SlurmSbinDir: {{SlurmSbinDir}} - SlurmBinDir: {{SlurmBinDir}} - SlurmScriptsDir: {{SlurmScriptsDir}} - SlurmRoot: {{SlurmRoot}} - ModulefilesBaseDir: {{ModulefilesBaseDir}} - SupportedDistributions: {{SupportedDistributions}} + ansible_architecture: {{ ansible_architecture }} + Architecture: {{ Architecture }} + distribution: {{ distribution }} + distribution_major_version: {{ distribution_major_version }} + distribution_version: {{ distribution_version }} + kernel: {{ kernel }} + memtotal_mb: {{ memtotal_mb }} + + amazonlinux2: {{ amazonlinux2 }} + alma: {{ alma }} + alma8: {{ alma8 }} + centos: {{ centos }} + centos7: {{ centos7 }} + rhel: {{ rhel }} + rhel7: {{ rhel7 }} + rhel8: {{ rhel8 }} + rocky: {{ rocky }} + rocky8: {{ rocky8 }} + rhelclone: {{ rhelclone }} + rhel8clone: {{ rhel8clone }} + centos7_5_to_6: {{ centos7_5_to_6 }} + centos7_5_to_9: {{ centos7_5_to_9 }} + centos7_7_to_9: {{ centos7_7_to_9 }} + + ansible_ssh_user: {{ ansible_ssh_user }} + ansible_ssh_common_args: {{ ansible_ssh_common_args }} + + SlurmBaseDir: {{ SlurmBaseDir }} + SlurmSbinDir: {{ SlurmSbinDir }} + SlurmBinDir: {{ SlurmBinDir }} + SlurmScriptsDir: {{ SlurmScriptsDir }} + SlurmRoot: {{ SlurmRoot }} + ModulefilesBaseDir: {{ ModulefilesBaseDir }} + SupportedDistributions: {{ SupportedDistributions }} Cluster Specific Vars - SlurmConfigDir: {{SlurmConfigDir}} - SlurmEtcDir: {{SlurmEtcDir}} - SlurmLogsDir: {{SlurmLogsDir}} - SlurmSpoolDir: {{SlurmSpoolDir}} - SlurmConf: {{SlurmConf}} + SlurmConfigDir: {{ SlurmConfigDir }} + SlurmEtcDir: {{ SlurmEtcDir }} + SlurmLogsDir: {{ SlurmLogsDir }} + SlurmSpoolDir: {{ SlurmSpoolDir }} + SlurmConf: {{ SlurmConf }} - name: Set timezone timezone: - name: "{{TimeZone}}" + name: "{{ TimeZone }}" # CentOS pip 19.3.1 is broken. 
Developers recommend using "python3 -m pip" but the ansible # pip task breaks when you use an executable with spaces. @@ -67,7 +67,7 @@ - name: Install python3 yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - python3 @@ -112,7 +112,7 @@ - name: Install packages required by mount_ssds.bash yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - jq - nvme-cli diff --git a/source/resources/playbooks/roles/bug_fixes/tasks/main.yml b/source/resources/playbooks/roles/bug_fixes/tasks/main.yml index 16127fb0..2abae416 100644 --- a/source/resources/playbooks/roles/bug_fixes/tasks/main.yml +++ b/source/resources/playbooks/roles/bug_fixes/tasks/main.yml @@ -3,6 +3,4 @@ - name: Install bug fixes yum: - state: latest - disablerepo: "{{yum_disablerepo|default(omit)}}" bugfix: yes diff --git a/source/resources/playbooks/roles/create_users_groups_json/tasks/main.yml b/source/resources/playbooks/roles/create_users_groups_json/tasks/main.yml index e731f7bf..95e01d6d 100644 --- a/source/resources/playbooks/roles/create_users_groups_json/tasks/main.yml +++ b/source/resources/playbooks/roles/create_users_groups_json/tasks/main.yml @@ -5,7 +5,7 @@ # Should only be installed on 1 instance. - name: Create /etc/cron.d/slurm_users_groups_json template: - dest: /etc/cron.d/slurm_{{ClusterName}}_create_users_groups_json + dest: /etc/cron.d/slurm_{{ ClusterName }}_create_users_groups_json src: etc/cron.d/slurm_users_groups_json owner: root group: root @@ -13,7 +13,7 @@ force: yes # The file gets created on slurmctl and has no users and groups so this needs to be run to update it. -- name: Create/update {{SlurmScriptsDir}}/users_groups.json +- name: Create/update {{ SlurmScriptsDir }}/users_groups.json shell: cmd: | - {{SlurmScriptsDir}}/create_users_groups_json.py -o {{SlurmConfigDir}}/users_groups.json + {{ SlurmScriptsDir }}/create_users_groups_json.py -o {{ SlurmConfigDir }}/users_groups.json diff --git a/source/resources/playbooks/roles/eda_tools/tasks/main.yml b/source/resources/playbooks/roles/eda_tools/tasks/main.yml index c97c63c5..57071f99 100644 --- a/source/resources/playbooks/roles/eda_tools/tasks/main.yml +++ b/source/resources/playbooks/roles/eda_tools/tasks/main.yml @@ -46,12 +46,12 @@ rm -rf /usr/local/aws-cli rm -f awscliv2.zip rm -rf aws - if [[ {{Architecture}} == 'x86_64' ]]; then + if [[ {{ Architecture }} == 'x86_64' ]]; then curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" - elif [[ {{Architecture}} == 'arm64' ]]; then + elif [[ {{ Architecture }} == 'arm64' ]]; then curl "https://awscli.amazonaws.com/awscli-exe-linux-aarch64.zip" -o "awscliv2.zip" else - echo "error: Unsupported {{Architecture}} architecture" + echo "error: Unsupported {{ Architecture }} architecture" exit 1 fi unzip awscliv2.zip @@ -79,7 +79,7 @@ - pip yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - python3 - python3-pip @@ -91,7 +91,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - gcc-c++ - platform-python-devel @@ -120,9 +120,10 @@ ignore_errors: true register: numpy_results -- debug: +- name: Show numpy_results + debug: msg: | - numpy_results: {{numpy_results}} + numpy_results: {{ numpy_results }} - name: Install pandas when: 
(numpy_results.skipped|default(False)) or not numpy_results.failed @@ -165,7 +166,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - http://mirror.centos.org/centos/7/os/x86_64/Packages/compat-libstdc++-33-3.2.3-72.el7.x86_64.rpm - http://mirror.centos.org/centos/7/os/x86_64/Packages/compat-libstdc++-33-3.2.3-72.el7.i686.rpm @@ -176,7 +177,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - compat-libstdc++-33.x86_64 - compat-libstdc++-33.i686 @@ -189,7 +190,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - pulseaudio-libs - name: Install pulseaudio on Amazon Linux @@ -199,7 +200,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - pulseaudio @@ -211,7 +212,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - libcrypt.x86_64 - libcrypt.i686 @@ -222,7 +223,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - libcrypt @@ -233,7 +234,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - perl-Switch @@ -245,7 +246,7 @@ # - packages # yum: # state: present -# disablerepo: "{{yum_disablerepo|default(omit)}}" +# disablerepo: "{{ yum_disablerepo|default(omit) }}" # enablerepo: "rhel-7-server-rhui-optional-rpms,rhui-rhel-7-server-rhui-optional-rpms" # name: # - perl-Switch @@ -256,7 +257,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - uuid-devel - xorg-x11-server-devel @@ -267,7 +268,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - xorg-x11-fonts-ISO8859-15-100dpi @@ -279,7 +280,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - ncurses-compat-libs @@ -316,7 +317,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - elfutils-libelf.x86_64 - elfutils-libelf.i686 @@ -327,7 +328,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - elfutils-libelf @@ -339,7 +340,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - system-lsb @@ -351,7 +352,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - redhat-lsb @@ -364,7 +365,7 @@ - eda_packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - gperf @@ -387,7 +388,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - bc - bzip2-devel @@ -420,7 +421,7 @@ - packages yum: 
state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - gpaste - gpaste-ui @@ -474,7 +475,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - libICE.x86_64 - libICE.i686 @@ -493,7 +494,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - libaio - libffi-devel @@ -518,7 +519,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - ncurses-libs.x86_64 - ncurses-libs.i686 @@ -530,7 +531,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - gstreamer - lzma-sdk-devel @@ -546,7 +547,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - lsof - make @@ -591,7 +592,7 @@ - packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - xkeyboard-config - xorg-x11-font-utils @@ -617,7 +618,7 @@ - eda_packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - apr-util - gnuplot @@ -634,7 +635,7 @@ - eda_packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" name: - compat-db47 @@ -699,27 +700,27 @@ - name: Set nodejs download URL when: Architecture == 'x86_64' set_fact: - nodejs_url: "https://nodejs.org/dist/v{{nodejs_version}}/node-v{{nodejs_version}}-linux-x64.tar.xz" - nodejs_tarball: "node-v{{nodejs_version}}-linux-x64.tar.xz" - nodejs_dir: "/opt/node-v{{nodejs_version}}-linux-x64" + nodejs_url: "https://nodejs.org/dist/v{{ nodejs_version }}/node-v{{ nodejs_version }}-linux-x64.tar.xz" + nodejs_tarball: "node-v{{ nodejs_version }}-linux-x64.tar.xz" + nodejs_dir: "/opt/node-v{{ nodejs_version }}-linux-x64" - name: Set nodejs download URL when: Architecture == 'arm64' set_fact: - nodejs_url: "https://nodejs.org/dist/v{{nodejs_version}}/node-v{{nodejs_version}}-linux-arm64.tar.xz" - nodejs_tarball: "node-v{{nodejs_version}}-linux-arm64.tar.xz" - nodejs_dir: "/opt/node-v{{nodejs_version}}-linux-arm64" + nodejs_url: "https://nodejs.org/dist/v{{ nodejs_version }}/node-v{{ nodejs_version }}-linux-arm64.tar.xz" + nodejs_tarball: "node-v{{ nodejs_version }}-linux-arm64.tar.xz" + nodejs_dir: "/opt/node-v{{ nodejs_version }}-linux-arm64" - name: Install nodejs shell: - creates: "{{nodejs_dir}}" + creates: "{{ nodejs_dir }}" cmd: | set -ex cd /tmp - wget {{nodejs_url}} + wget {{ nodejs_url }} cd /opt - tar -xf /tmp/{{nodejs_tarball}} - rm /tmp/{{nodejs_tarball}} + tar -xf /tmp/{{ nodejs_tarball }} + rm /tmp/{{ nodejs_tarball }} - name: Add nodejs to sh path template: @@ -741,7 +742,7 @@ - name: Install ejs shell: - creates: "{{nodejs_dir}}/lib/node_modules/ejs" + creates: "{{ nodejs_dir }}/lib/node_modules/ejs" cmd: | set -ex source /etc/profile.d/nodejs.sh @@ -749,7 +750,7 @@ - name: Install typescript shell: - creates: "{{nodejs_dir}}/lib/node_modules/typescript" + creates: "{{ nodejs_dir }}/lib/node_modules/typescript" cmd: | set -ex source /etc/profile.d/nodejs.sh @@ -759,10 +760,10 @@ # npm: # name: ejs # global: yes -# executable: 
"{{nodejs_dir}}/bin/npm" +# executable: "{{ nodejs_dir }}/bin/npm" # - name: Install typescript NPM package # npm: # name: typescript # global: yes -# executable: "{{nodejs_dir}}/bin/npm" +# executable: "{{ nodejs_dir }}/bin/npm" diff --git a/source/resources/playbooks/roles/install_slurm/tasks/main.yml b/source/resources/playbooks/roles/install_slurm/tasks/main.yml index 3c8cd178..a8db4de7 100644 --- a/source/resources/playbooks/roles/install_slurm/tasks/main.yml +++ b/source/resources/playbooks/roles/install_slurm/tasks/main.yml @@ -4,51 +4,51 @@ - name: Show variables used by this role debug: msg: | - ansible_architecture: {{ansible_architecture}} - Architecture: {{Architecture}} - distribution: {{distribution}} - distribution_major_version: {{distribution_major_version}} - distribution_version: {{distribution_version}} - - amazonlinux2: {{amazonlinux2}} - alma: {{alma}} - alma8: {{alma8}} - centos: {{centos}} - centos7: {{centos7}} - rhel: {{rhel}} - rhel7: {{rhel7}} - rhel8: {{rhel8}} - rocky: {{rocky}} - rocky8: {{rocky8}} - rhelclone: {{rhelclone}} - rhel8clone: {{rhel8clone}} - centos7_5_to_6: {{centos7_5_to_6}} - centos7_5_to_9: {{centos7_5_to_9}} - centos7_7_to_9: {{centos7_7_to_9}} + ansible_architecture: {{ ansible_architecture }} + Architecture: {{ Architecture }} + distribution: {{ distribution }} + distribution_major_version: {{ distribution_major_version }} + distribution_version: {{ distribution_version }} + + amazonlinux2: {{ amazonlinux2 }} + alma: {{ alma }} + alma8: {{ alma8 }} + centos: {{ centos }} + centos7: {{ centos7 }} + rhel: {{ rhel }} + rhel7: {{ rhel7 }} + rhel8: {{ rhel8 }} + rocky: {{ rocky }} + rocky8: {{ rocky8 }} + rhelclone: {{ rhelclone }} + rhel8clone: {{ rhel8clone }} + centos7_5_to_6: {{ centos7_5_to_6 }} + centos7_5_to_9: {{ centos7_5_to_9 }} + centos7_7_to_9: {{ centos7_7_to_9 }} - name: Set SlurmSrcDir set_fact: - SlurmSrcDir: "/opt/slurm/{{ClusterName}}/config/src/{{distribution}}/{{distribution_major_version}}/{{Architecture}}" + SlurmSrcDir: "/opt/slurm/{{ ClusterName }}/config/src/{{ distribution }}/{{ distribution_major_version }}/{{ Architecture }}" - name: Set SlurmOSDir set_fact: - SlurmOSDir: "/opt/slurm/{{ClusterName}}/config/os/{{distribution}}/{{distribution_major_version}}/{{Architecture}}" + SlurmOSDir: "/opt/slurm/{{ ClusterName }}/config/os/{{ distribution }}/{{ distribution_major_version }}/{{ Architecture }}" - name: Set SlurmBinDir set_fact: - SlurmBinDir: "{{SlurmOSDir}}/bin" + SlurmBinDir: "{{ SlurmOSDir }}/bin" - name: Show variables used by this role debug: msg: | - SlurmVersion: {{SlurmVersion}} - SlurmSrcDir: {{SlurmSrcDir}} - SlurmBinDir: {{SlurmBinDir}} - SlurmConfigDir: {{SlurmConfigDir}} - SlurmOSDir: {{SlurmOSDir}} - SlurmrestdPort: {{SlurmrestdPort}} - SlurmEtcDir: {{SlurmEtcDir}} - ModulefilesBaseDir: {{ModulefilesBaseDir}} + SlurmVersion: {{ SlurmVersion }} + SlurmSrcDir: {{ SlurmSrcDir }} + SlurmBinDir: {{ SlurmBinDir }} + SlurmConfigDir: {{ SlurmConfigDir }} + SlurmOSDir: {{ SlurmOSDir }} + SlurmrestdPort: {{ SlurmrestdPort }} + SlurmEtcDir: {{ SlurmEtcDir }} + ModulefilesBaseDir: {{ ModulefilesBaseDir }} - name: Install epel from amazon-linux-extras when: distribution == 'Amazon' @@ -108,9 +108,10 @@ - mailx - make - man2html - - munge - - munge-devel - - munge-libs + # munge is built from ParallelCluster version + # - munge + # - munge-devel + # - munge-libs - mysql-devel - numactl-devel - openmpi @@ -156,32 +157,34 @@ shell: | set -xe - mkdir -p {{SlurmSrcDir}} - cd {{SlurmSrcDir}} - wget 
https://download.schedmd.com/slurm/slurm-{{SlurmVersion}}.tar.bz2 - bzip2 -d -f slurm-{{SlurmVersion}}.tar.bz2 - tar -xf slurm-{{SlurmVersion}}.tar - rm slurm-{{SlurmVersion}}.tar + mkdir -p {{ SlurmSrcDir }} + cd {{ SlurmSrcDir }} + wget https://download.schedmd.com/slurm/slurm-{{ SlurmVersion }}.tar.bz2 + bzip2 -d -f slurm-{{ SlurmVersion }}.tar.bz2 + tar -xf slurm-{{ SlurmVersion }}.tar + rm slurm-{{ SlurmVersion }}.tar args: - creates: "{{SlurmSrcDir}}/slurm-{{SlurmVersion}}/INSTALL" + creates: "{{ SlurmSrcDir }}/slurm-{{ SlurmVersion }}/INSTALL" -- name: Create {{SlurmOSDir}} +- name: Create {{ SlurmOSDir }} file: - path: "{{SlurmOSDir}}" + path: "{{ SlurmOSDir }}" state: directory owner: root group: root mode: 0775 -- name: Build and install slurm on {{distribution}} {{distribution_major_version}} on {{Architecture}} +- name: Build and install slurm on {{ distribution }} {{ distribution_major_version }} on {{ Architecture }} args: - creates: "{{SlurmBinDir}}/srun" + creates: "{{ SlurmBinDir }}/srun" shell: | set -xe - cd {{SlurmSrcDir}}/slurm-{{SlurmVersion}} - ./configure --prefix {{SlurmOSDir}} --with-slurmrestd --enable-slurmrestd --with-slurmrestd-port={{SlurmrestdPort}} &> configure.log + set -o pipefail + + cd {{ SlurmSrcDir }}/slurm-{{ SlurmVersion }} + ./configure --prefix {{ SlurmOSDir }} --with-slurmrestd --enable-slurmrestd --with-slurmrestd-port={{ SlurmrestdPort }} &> configure.log CORES=$(grep processor /proc/cpuinfo | wc -l) make -j $CORES &> slurm-make.log make -j $CORES contrib &> slurm-make-contrib.log @@ -189,26 +192,26 @@ make install &> slurm-make-install.log make install-contrib &> slurm-make-install-contrib.log -- name: Create {{SlurmEtcDir}} +- name: Create {{ SlurmEtcDir }} file: - path: "{{SlurmEtcDir}}" + path: "{{ SlurmEtcDir }}" state: directory owner: root group: root mode: 0775 -- name: Create {{SlurmOSDir}}/etc +- name: Create {{ SlurmOSDir }}/etc file: state: link - src: "{{SlurmEtcDir}}" - path: "{{SlurmOSDir}}/etc" + src: "{{ SlurmEtcDir }}" + path: "{{ SlurmOSDir }}/etc" owner: root group: root -- name: Create {{SlurmOSDir}}/etc +- name: Create {{ SlurmOSDir }}/etc file: state: link - src: "{{SlurmOSDir}}/lib/libnss_slurm.so.2" + src: "{{ SlurmOSDir }}/lib/libnss_slurm.so.2" path: "/usr/lib64/libnss_slurm.so.2" owner: root group: root @@ -224,15 +227,15 @@ - name: Fix permissions on config dir so users can access it to get the modulefiles file: - path: "{{SlurmConfigDir}}" + path: "{{ SlurmConfigDir }}" state: directory owner: root group: root mode: '0755' -- name: Create {{ModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}} +- name: Create {{ ModulefilesBaseDir }}/{{ distribution }}/{{ distribution_major_version }}/{{ Architecture }}/{{ ClusterName }} file: - path: "{{ModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}}" + path: "{{ ModulefilesBaseDir }}/{{ distribution }}/{{ distribution_major_version }}/{{ Architecture }}/{{ ClusterName }}" state: directory owner: root group: root @@ -245,11 +248,11 @@ cmd: | set -ex - chmod -R 0755 {{SlurmConfigDir}} + chmod -R 0755 {{ SlurmConfigDir }} - name: Create slurm modulefile .template template: - dest: "{{ModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}}/.template" + dest: "{{ ModulefilesBaseDir }}/{{ distribution }}/{{ distribution_major_version }}/{{ Architecture }}/{{ ClusterName }}/.template" src: opt/slurm/config/modules/modulefiles/slurm/.template 
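    # The rendered .template, together with the .version file created below, follows the
    # environment-modules convention in which .version selects the default modulefile
    # version for this distribution/architecture/cluster directory.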
owner: root group: root @@ -258,7 +261,7 @@ - name: Create slurm modulefile file: - path: "{{ModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}}/{{ParallelClusterVersion}}" + path: "{{ ModulefilesBaseDir }}/{{ distribution }}/{{ distribution_major_version }}/{{ Architecture }}/{{ ClusterName }}/{{ ParallelClusterVersion }}" src: ".template" state: link owner: root @@ -267,7 +270,7 @@ - name: Create slurm modulefile .version template: - dest: "{{ModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}}/.version" + dest: "{{ ModulefilesBaseDir }}/{{ distribution }}/{{ distribution_major_version }}/{{ Architecture }}/{{ ClusterName }}/.version" src: opt/slurm/config/modules/modulefiles/slurm/.version owner: root group: root diff --git a/source/resources/playbooks/roles/lustre-client/tasks/main.yml b/source/resources/playbooks/roles/lustre-client/tasks/main.yml index 0281c9b0..9f620bd4 100644 --- a/source/resources/playbooks/roles/lustre-client/tasks/main.yml +++ b/source/resources/playbooks/roles/lustre-client/tasks/main.yml @@ -16,12 +16,12 @@ lustre_client_installed: "{{ lustre_client['stdout_lines'][0] == 'installed' }}" - debug: msg: | - lustre_client_installed: {{lustre_client_installed}} - amazonlinux2: {{amazonlinux2}} - centos7_5_to_6: {{centos7_5_to_6}} - centos7_5_to_9: {{centos7_5_to_9}} - centos7_7_to_9: {{centos7_7_to_9}} - kernel: {{kernel}} + lustre_client_installed: {{ lustre_client_installed }} + amazonlinux2: {{ amazonlinux2 }} + centos7_5_to_6: {{ centos7_5_to_6 }} + centos7_5_to_9: {{ centos7_5_to_9 }} + centos7_7_to_9: {{ centos7_7_to_9 }} + kernel: {{ kernel }} - name: Install lustre client on Amazon Linux 2 when: not lustre_client_installed and amazonlinux2 diff --git a/source/resources/playbooks/roles/mount_extra_fs/tasks/main.yml b/source/resources/playbooks/roles/mount_extra_fs/tasks/main.yml index e4c150b2..1d2c6c97 100644 --- a/source/resources/playbooks/roles/mount_extra_fs/tasks/main.yml +++ b/source/resources/playbooks/roles/mount_extra_fs/tasks/main.yml @@ -4,7 +4,7 @@ - name: Print vars debug: msg: | - ExtraMounts: {{ExtraMounts}} + ExtraMounts: {{ ExtraMounts }} - name: Install nfs-utils yum: @@ -14,7 +14,7 @@ - name: Mount ExtraMounts when: ExtraMounts|length > 0 - loop: "{{ExtraMounts}}" + loop: "{{ ExtraMounts }}" mount: backup: yes fstype: "{{item.type}}" diff --git a/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml b/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml index 249ba183..39d547f7 100644 --- a/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml +++ b/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml @@ -4,13 +4,13 @@ - name: Print vars debug: msg: | - FileSystemDns: {{FileSystemDns}} - FileSystemMountPath: {{FileSystemMountPath}} - FileSystemMountSrc: {{FileSystemMountSrc}} - FileSystemOptions: {{FileSystemOptions}} - FileSystemPort: {{FileSystemPort}} - FileSystemType: {{FileSystemType}} - ExtraMounts: {{ExtraMounts}} + FileSystemDns: {{ FileSystemDns }} + FileSystemMountPath: {{ FileSystemMountPath }} + FileSystemMountSrc: {{ FileSystemMountSrc }} + FileSystemOptions: {{ FileSystemOptions }} + FileSystemPort: {{ FileSystemPort }} + FileSystemType: {{ FileSystemType }} + ExtraMounts: {{ ExtraMounts }} - name: Install nfs-utils yum: @@ -18,9 +18,9 @@ name: - nfs-utils -- name: Create {{FileSystemMountPath}} +- name: Create {{ FileSystemMountPath }} file: - path: "{{FileSystemMountPath}}" + path: "{{ 
FileSystemMountPath }}" state: directory owner: root group: root @@ -28,15 +28,15 @@ - name: Wait for file system dns to exist wait_for: - host: "{{FileSystemDns}}" - port: "{{FileSystemPort}}" + host: "{{ FileSystemDns }}" + port: "{{ FileSystemPort }}" timeout: 1800 # 30 minutes - name: Mount SLURM file system mount: backup: yes - fstype: "{{FileSystemType}}" - opts: "{{FileSystemOptions}}" - path: "{{FileSystemMountPath}}" - src: "{{FileSystemMountSrc}}" + fstype: "{{ FileSystemType }}" + opts: "{{ FileSystemOptions }}" + path: "{{ FileSystemMountPath }}" + src: "{{ FileSystemMountSrc }}" state: mounted diff --git a/source/resources/playbooks/roles/rm_create_users_groups_json/tasks/main.yml b/source/resources/playbooks/roles/rm_create_users_groups_json/tasks/main.yml index 3edf37a0..233c63a4 100644 --- a/source/resources/playbooks/roles/rm_create_users_groups_json/tasks/main.yml +++ b/source/resources/playbooks/roles/rm_create_users_groups_json/tasks/main.yml @@ -3,5 +3,5 @@ # Remove the crontab - name: Remove /etc/cron.d/slurm_users_groups_json file: - dest: /etc/cron.d/slurm_{{ClusterName}}_create_users_groups_json + dest: /etc/cron.d/slurm_{{ ClusterName }}_create_users_groups_json state: absent diff --git a/source/resources/playbooks/roles/security_updates/tasks/main.yml b/source/resources/playbooks/roles/security_updates/tasks/main.yml index 2aaf8d3b..1b1129d0 100644 --- a/source/resources/playbooks/roles/security_updates/tasks/main.yml +++ b/source/resources/playbooks/roles/security_updates/tasks/main.yml @@ -3,6 +3,5 @@ - name: Install Security updates yum: - state: latest - disablerepo: "{{yum_disablerepo|default(omit)}}" + disablerepo: "{{ yum_disablerepo|default(omit) }}" security: yes diff --git a/source/slurm_installer/find_existing_resources.py b/source/slurm_installer/find_existing_resources.py index 1398e6c5..c03ea7f3 100644 --- a/source/slurm_installer/find_existing_resources.py +++ b/source/slurm_installer/find_existing_resources.py @@ -31,6 +31,11 @@ from prompt import get_input as get_input logger = logging.getLogger(__file__) +logger_formatter = logging.Formatter('%(levelname)s: %(message)s') +logger_streamHandler = logging.StreamHandler() +logger_streamHandler.setFormatter(logger_formatter) +logger.addHandler(logger_streamHandler) +logger.propagate = False logger.setLevel(logging.INFO) pp = pprint.PrettyPrinter(indent=4) @@ -130,6 +135,60 @@ def get_keypair(self, config_key, config_value, args_value, prompt): choice = get_input(f"Select a KeyPair:", None, allowed_choices, int) return options[choice] + def check_res_environment_name(self, config_key, res_environment_name, config) -> bool: + try: + res_stack_name = None + stacks = {} + for stack_dict in self.cloudformation.list_stacks( + StackStatusFilter=[ + 'CREATE_COMPLETE', + 'ROLLBACK_COMPLETE', + 'UPDATE_COMPLETE', + 'UPDATE_ROLLBACK_COMPLETE', + 'IMPORT_COMPLETE', + 'IMPORT_ROLLBACK_COMPLETE' + ] + )["StackSummaries"]: + stack_name = stack_dict['StackName'] + if stack_name == res_environment_name: + res_stack_name = stack_dict['StackName'] + # Don't break here so get all of the stack names + stack_status = stack_dict['StackStatus'] + stacks[stack_name] = stack_status + if not res_stack_name: + message = f"CloudFormation RES stack named {res_environment_name} not found. 
Existing stacks:" + for stack_name in sorted(stacks): + message += f"\n {stack_name:32}: status={stacks[stack_name]}" + raise ValueError(message) + + # Get VpcId, SubnetId from RES stack + stack_parameters = self.cloudformation.describe_stacks(StackName=res_stack_name)['Stacks'][0]['Parameters'] + vpc_id = None + subnet_ids = [] + for stack_parameter_dict in stack_parameters: + if stack_parameter_dict['ParameterKey'] == 'VpcId': + vpc_id = stack_parameter_dict['ParameterValue'] + elif stack_parameter_dict['ParameterKey'] == 'PrivateSubnets': + subnet_ids = stack_parameter_dict['ParameterValue'].split(',') + if not vpc_id: + raise ValueError(f"VpcId parameter not found in {res_environment_name} RES stack.") + if 'VpcId' in config and config['VpcId'] != vpc_id: + raise ValueError(f"Config file VpcId={config['VpcId']} is not the same as RESEnvironmentName VpcId={vpc_id}.") + logger.info(f"VpcId set to {vpc_id} by RESEnvironmentName.") + config['VpcId'] = vpc_id + if not subnet_ids: + raise ValueError(f"PrivateSubnets parameter not found in {res_environment_name} RES stack.") + if 'SubnetId' in config and config['SubnetId'] not in subnet_ids: + raise ValueError(f"Config file SubnetId={config['SubnetId']} is not a RES private subnet. RES private subnets: {subnet_ids}.") + if 'SubnetId' not in config: + config['SubnetId'] = subnet_ids[0] + logger.info(f"SubnetId set to {config['SubnetId']} by RESEnvironmentName.") + + return True + except: + raise + return False + def check_vpc_id(self, specified_vpc_id): try: vpcs = {} diff --git a/source/slurm_installer/installer.py b/source/slurm_installer/installer.py index 11c1ce8f..f869b530 100755 --- a/source/slurm_installer/installer.py +++ b/source/slurm_installer/installer.py @@ -75,8 +75,9 @@ def main(self): parser.add_argument("--prompt", action='store_true', help="Prompt for configuration values if not in config file or if invalid.") parser.add_argument("--stack-name", type=str, help="CloudFormation stack name.") parser.add_argument("--profile", "-p", type=str, help="AWS CLI profile to use.") - parser.add_argument("--region", "-r", type=str, help="AWS region where you want to deploy your SOCA environment.") + parser.add_argument("--region", "--Region", "-r", type=str, help="AWS region where you want to deploy your SOCA environment.") parser.add_argument("--SshKeyPair", "-ssh", type=str, help="SSH key to use") + parser.add_argument("--RESEnvironmentName", type=str, default=None, help="Research and Engineering Studio (RES) environment to build the cluster in. 
Will automatically set VpcId, SubnetId, and SubmitterSecurityGroupIds.") parser.add_argument("--VpcId", type=str, help="Id of VPC to use") parser.add_argument("--SubnetId", type=str, help="SubnetId to use") parser.add_argument("--ErrorSnsTopicArn", type=str, default='', help="SNS topic for error notifications.") @@ -192,27 +193,46 @@ def main(self): self.install_parameters[config_key] = self.config[config_key] logger.info(f"{config_key:30}: {self.install_parameters[config_key]}") - config_key = 'VpcId' - if config_key not in self.config and not args.VpcId: - if not args.prompt: - logger.error(f"{fg('red')}Must specify --prompt or --{config_key} on the command line or {config_key} in the config file.{attr('reset')}") + if args.RESEnvironmentName: + config_key = 'RESEnvironmentName' + logger.info(f"Checking {config_key}") + try: + if not resource_finder.check_res_environment_name(config_key, args.RESEnvironmentName, self.config): + exit(1) + except ValueError as e: + logger.error(e) sys.exit(1) - try: - checked_value = resource_finder.get_vpc_id(config_key, self.config.get(config_key, ''), args.VpcId, args.prompt) - except ValueError as e: - logger.error(e) - sys.exit(1) - if args.prompt: - if args.VpcId: - if args.VpcId != checked_value: - for arg_index, arg_name in enumerate(cmdline_args): - if arg_name == f'--{config_key}': - cmdline_args[arg_index + 1] = checked_value - else: - prompt_args += [f'--{config_key}', checked_value] - self.config[config_key] = checked_value - self.install_parameters[config_key] = self.config[config_key] - logger.info(f"{config_key:30}: {self.install_parameters[config_key]}") + self.config[config_key] = args.RESEnvironmentName + self.install_parameters[config_key] = self.config[config_key] + logger.info(f"{config_key:30}: {self.install_parameters[config_key]}") + config_key = 'VpcId' + self.install_parameters[config_key] = self.config[config_key] + logger.info(f"{config_key:30}: {self.install_parameters[config_key]}") + if args.VpcId and args.VpcId != self.config['VpcId']: + logger.error(f"--VpcId {args.VpcId} is different than VPC for --RESEnvironmentName {args.RESEnvironmentName}") + exit(1) + else: + config_key = 'VpcId' + if config_key not in self.config and not args.VpcId: + if not args.prompt: + logger.error(f"{fg('red')}Must specify --prompt or --{config_key} or --RESEnvironmentName on the command line or {config_key} in the config file.{attr('reset')}") + sys.exit(1) + try: + checked_value = resource_finder.get_vpc_id(config_key, self.config.get(config_key, ''), args.VpcId, args.prompt) + except ValueError as e: + logger.error(e) + sys.exit(1) + if args.prompt: + if args.VpcId: + if args.VpcId != checked_value: + for arg_index, arg_name in enumerate(cmdline_args): + if arg_name == f'--{config_key}': + cmdline_args[arg_index + 1] = checked_value + else: + prompt_args += [f'--{config_key}', checked_value] + self.config[config_key] = checked_value + self.install_parameters[config_key] = self.config[config_key] + logger.info(f"{config_key:30}: {self.install_parameters[config_key]}") # Get the CIDR block for the VPC. Used in multi-region deployments config_key = 'CIDR'
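# A minimal standalone sketch of the stack-parameter lookup that the RESEnvironmentName
# handling above relies on. The function name is hypothetical and error handling is
# omitted; it assumes default AWS credentials and the standard boto3 CloudFormation client.
import boto3

def get_res_vpc_and_subnets(res_environment_name, region):
    cfn = boto3.client('cloudformation', region_name=region)
    stack = cfn.describe_stacks(StackName=res_environment_name)['Stacks'][0]
    params = {p['ParameterKey']: p['ParameterValue'] for p in stack['Parameters']}
    # RES exposes its VPC and private subnets as the VpcId and PrivateSubnets stack parameters.
    return params['VpcId'], params['PrivateSubnets'].split(',')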