From 1c45cc711ccb2ee199185061c52a5b2f6149cbe9 Mon Sep 17 00:00:00 2001
From: Allan Carter
Date: Fri, 17 May 2024 17:23:28 +0000
Subject: [PATCH] Do not auto-prune instance types if there are too many

Previously, I only allowed 1 memory size/core count combination in order to
keep the number of compute resources down, and I also combined multiple
instance types into one compute resource when possible. This was an attempt
to maximize the number of instance types that could be configured, but it
prevented people from configuring the exact instance types they wanted.

The preference is to notify the user and let them choose which instance types
to exclude or to reduce the number of included types. So, I've reverted to my
original strategy of 1 instance type per compute resource and 1 compute
resource per queue. The compute resources can be combined into any queues that
the user wants using custom Slurm settings.

I had to exclude instance types in the default configuration in order to keep
from exceeding the ParallelCluster limits.

Resolves #220
---
 source/cdk/cdk_slurm_stack.py                 | 398 +++++++-----------
 source/cdk/config_schema.py                   |  89 +++-
 .../DeconfigureRESUsersGroupsJson.py          |   2 +-
 3 files changed, 242 insertions(+), 247 deletions(-)

diff --git a/source/cdk/cdk_slurm_stack.py b/source/cdk/cdk_slurm_stack.py
index 6a352066..dd56ac67 100644
--- a/source/cdk/cdk_slurm_stack.py
+++ b/source/cdk/cdk_slurm_stack.py
@@ -2443,7 +2443,35 @@ def create_parallel_cluster_config(self):
         else:
             compute_node_ami = None

-        # Create list of instance types by number of cores and amount of memory
+        MAX_NUMBER_OF_QUEUES = config_schema.MAX_NUMBER_OF_QUEUES(self.PARALLEL_CLUSTER_VERSION)
+        MAX_NUMBER_OF_COMPUTE_RESOURCES = config_schema.MAX_NUMBER_OF_COMPUTE_RESOURCES(self.PARALLEL_CLUSTER_VERSION)
+        MAX_NUMBER_OF_COMPUTE_RESOURCES_PER_QUEUE = config_schema.MAX_NUMBER_OF_COMPUTE_RESOURCES_PER_QUEUE(self.PARALLEL_CLUSTER_VERSION)
+
+        # Create queues and compute resources.
+        # We are limited to MAX_NUMBER_OF_QUEUES queues and MAX_NUMBER_OF_COMPUTE_RESOURCES compute resources.
+        # First analyze the selected instance types to make sure that these limits aren't exceeded.
+        # The fundamental limit is the limit on the number of compute resources.
+        # Each compute resource maps to a NodeName, and I want the instance type to be selected using a constraint.
+        # This means that each compute resource can only contain a single instance type.
+        # This limits the number of instance types to MAX_NUMBER_OF_COMPUTE_RESOURCES, or MAX_NUMBER_OF_COMPUTE_RESOURCES/2 if you configure spot instances.
+        #
+        # We could possibly support more instance types by putting instance types with the same number of cores and amount of memory into the same compute resource.
+        # The problem with doing this is that you can wind up with very different instance types in the same compute resource.
+        # For example, you could wind up with an m5zn and an r7a.medium, or an x2iedn.2xlarge and an x2iezn.2xlarge.
+        #
+        # Create 1 compute resource for each instance type and 1 queue for each compute resource.
+        #
+        # If the user configures too many instance types, then flag an error, print out the configured instance
+        # types, and suggest instance types to exclude.
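+        #
+        # For example, with the current limits in config_schema (50 compute resources from
+        # MAX_NUMBER_OF_COMPUTE_RESOURCES), this allows up to 50 instance types with on-demand only,
+        # or up to 25 instance types when UseSpot is enabled, because each instance type then gets
+        # both an on-demand and a spot compute resource.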
+
+        purchase_options = ['ONDEMAND']
+        if self.config['slurm']['InstanceConfig']['UseSpot']:
+            purchase_options.append('SPOT')
+            MAX_NUMBER_OF_INSTANCE_TYPES = int(MAX_NUMBER_OF_COMPUTE_RESOURCES / 2)
+        else:
+            MAX_NUMBER_OF_INSTANCE_TYPES = MAX_NUMBER_OF_COMPUTE_RESOURCES
+
+        # Create list of instance types by number of cores and amount of memory
         instance_types_by_core_memory = {}
         # Create list of instance types by amount of memory and number of cores
         instance_types_by_memory_core = {}
@@ -2481,259 +2509,77 @@ def create_parallel_cluster_config(self):
             for cores in sorted(instance_types_by_memory_core[mem_gb]):
                 logger.info(f"    {len(instance_types_by_memory_core[mem_gb][cores])} instance type with {cores:3} core(s): {instance_types_by_memory_core[mem_gb][cores]}")

-        purchase_options = ['ONDEMAND']
-        if self.config['slurm']['InstanceConfig']['UseSpot']:
-            purchase_options.append('SPOT')
+        if len(self.instance_types) > MAX_NUMBER_OF_INSTANCE_TYPES:
+            logger.error(f"Too many instance types configured: {len(self.instance_types)}. Max is {MAX_NUMBER_OF_INSTANCE_TYPES}. Consider selecting 1 instance type per memory size. Either reduce the number of included instance families and types or exclude instance families and types.")
+            exit(1)
+
         nodesets = {}
         number_of_queues = 0
         number_of_compute_resources = 0
         for purchase_option in purchase_options:
             nodesets[purchase_option] = []

-        if config_schema.PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_COMPUTE_RESOURCES_PER_QUEUE(self.PARALLEL_CLUSTER_VERSION) and config_schema.PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_COMPUTE_RESOURCES_PER_QUEUE(self.PARALLEL_CLUSTER_VERSION):
-            # Creating a queue for each memory size
-            # In each queue, create a CR for each permutation of memory and core count
-            for mem_gb in sorted(instance_types_by_memory_core.keys()):
-                for purchase_option in purchase_options:
-                    if purchase_option == 'ONDEMAND':
-                        queue_name_prefix = "od"
-                        allocation_strategy = 'lowest-price'
-                    else:
-                        queue_name_prefix = "sp"
-                        allocation_strategy = 'capacity-optimized'
-                    queue_name = f"{queue_name_prefix}-{mem_gb}-gb"
-                    if number_of_queues >= MAX_NUMBER_OF_QUEUES:
-                        logger.warning(f"Skipping {queue_name} queue because MAX_NUMBER_OF_QUEUES=={MAX_NUMBER_OF_QUEUES}")
-                        continue
-                    if number_of_compute_resources >= MAX_NUMBER_OF_COMPUTE_RESOURCES:
-                        logger.warning(f"Skipping {queue_name} queue because MAX_NUMBER_OF_COMPUTE_RESOURCES=={MAX_NUMBER_OF_COMPUTE_RESOURCES}")
-                        continue
-                    logger.info(f"Configuring {queue_name} queue:")
-                    nodeset = f"{queue_name}_nodes"
-                    nodesets[purchase_option].append(nodeset)
-                    parallel_cluster_queue = {
-                        'Name': queue_name,
-                        'AllocationStrategy': allocation_strategy,
-                        'CapacityType': purchase_option,
-                        'ComputeResources': [],
-                        'ComputeSettings': {
-                            'LocalStorage': {
-                                'RootVolume': {
-                                    'VolumeType': 'gp3'
-                                }
-                            }
-                        },
-                        'CustomActions': {
-                            'OnNodeStart': {
-                                'Sequence': [
-                                    {
-                                        'Script': self.custom_action_s3_urls['config/bin/on_compute_node_start.sh'],
-                                        'Args': []
-                                    }
-                                ]
-                            },
-                            'OnNodeConfigured': {
-                                'Sequence': [
-                                    {
-                                        'Script': self.custom_action_s3_urls['config/bin/on_compute_node_configured.sh'],
-                                        'Args': []
-                                    }
-                                ]
-                            },
-                        },
-                        'Iam': {
-                            'AdditionalIamPolicies': [
-                                {'Policy': 'arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore'},
-                                {'Policy': '{{ParallelClusterAssetReadPolicyArn}}'},
-                                {'Policy': '{{ParallelClusterSnsPublishPolicyArn}}'}
-                            ]
-                        },
-                        'Networking': {
-                            'SubnetIds': [self.config['SubnetId']],
-
'AdditionalSecurityGroups': ['{{SlurmNodeSecurityGroupId}}'], - 'PlacementGroup': {} - }, - } - if 'ComputeNodeAmi' in self.config['slurm']['ParallelClusterConfig']: - parallel_cluster_queue['Image'] = { - 'CustomAmi': self.config['slurm']['ParallelClusterConfig']['ComputeNodeAmi'] - } - if 'AdditionalSecurityGroups' in self.config['slurm']['InstanceConfig']: - for security_group_id in self.config['slurm']['InstanceConfig']['AdditionalSecurityGroups']: - parallel_cluster_queue['Networking']['AdditionalSecurityGroups'].append(security_group_id) - if 'AdditionalIamPolicies' in self.config['slurm']['InstanceConfig']: - for iam_policy_arn in self.config['slurm']['InstanceConfig']['AdditionalIamPolicies']: - parallel_cluster_queue['Iam']['AdditionalIamPolicies'].append({'Policy': iam_policy_arn}) - number_of_queues += 1 - - # Give the compute node access to extra mounts - for fs_type in self.extra_mount_security_groups.keys(): - for extra_mount_sg_name, extra_mount_sg in self.extra_mount_security_groups[fs_type].items(): - parallel_cluster_queue['Networking']['AdditionalSecurityGroups'].append(extra_mount_sg.security_group_id) - - for num_cores in sorted(instance_types_by_memory_core[mem_gb].keys()): - compute_resource_name = f"{queue_name_prefix}-{mem_gb}gb-{num_cores}-cores" - instance_types = sorted(instance_types_by_memory_core[mem_gb][num_cores]) - # If we do multiple CRs per queue then we hit the CR limit without being able to create queues for all memory sizes. - # Select the instance types with the lowest core count for higher memory/core ratio and lower cost. - if len(parallel_cluster_queue['ComputeResources']): - logger.info(f" Skipping {compute_resource_name:18} compute resource: {instance_types} to reduce number of CRs.") - continue - if number_of_compute_resources >= MAX_NUMBER_OF_COMPUTE_RESOURCES: - logger.warning(f" Skipping {compute_resource_name:18} compute resource: {instance_types} because MAX_NUMBER_OF_COMPUTE_RESOURCES=={MAX_NUMBER_OF_COMPUTE_RESOURCES}") - continue - logger.info(f" Adding {compute_resource_name:18} compute resource: {instance_types}") - if compute_resource_name in self.config['slurm']['InstanceConfig']['NodeCounts']['ComputeResourceCounts']: - min_count = self.config['slurm']['InstanceConfig']['NodeCounts']['ComputeResourceCounts'][compute_resource_name]['MinCount'] - max_count = self.config['slurm']['InstanceConfig']['NodeCounts']['ComputeResourceCounts'][compute_resource_name]['MaxCount'] - else: - min_count = self.config['slurm']['InstanceConfig']['NodeCounts']['DefaultMinCount'] - max_count = self.config['slurm']['InstanceConfig']['NodeCounts']['DefaultMaxCount'] - compute_resource = { - 'Name': compute_resource_name, - 'MinCount': min_count, - 'MaxCount': max_count, - 'DisableSimultaneousMultithreading': self.config['slurm']['ParallelClusterConfig']['DisableSimultaneousMultithreading'], - 'Instances': [], - 'Efa': {'Enabled': False}, - } - efa_supported = self.config['slurm']['ParallelClusterConfig']['EnableEfa'] - min_price = sys.maxsize - max_price = 0 - total_price = 0 - for instance_type in sorted(instance_types): - efa_supported = efa_supported and self.plugin.get_EfaSupported(self.cluster_region, instance_type) - if purchase_option == 'ONDEMAND': - price = self.plugin.instance_type_and_family_info[self.cluster_region]['instance_types'][instance_type]['pricing']['OnDemand'] - else: - price = self.plugin.instance_type_and_family_info[self.cluster_region]['instance_types'][instance_type]['pricing']['spot']['max'] - min_price = min(min_price, 
price) - max_price = max(max_price, price) - total_price += price - compute_resource['Instances'].append( - { - 'InstanceType': instance_type - } - ) - average_price = total_price / len(instance_types) - compute_resource['Efa']['Enabled'] = efa_supported - if self.PARALLEL_CLUSTER_VERSION >= parse_version('3.7.0'): - compute_resource['StaticNodePriority'] = int(average_price * 1000) - compute_resource['DynamicNodePriority'] = int(average_price * 10000) - compute_resource['Networking'] = { - 'PlacementGroup': { - 'Enabled': efa_supported - } + + # Create 1 queue and compute resource for each instance type and purchase option. + for purchase_option in purchase_options: + for instance_type in self.instance_types: + efa_supported = self.plugin.get_EfaSupported(self.cluster_region, instance_type) and self.config['slurm']['ParallelClusterConfig']['EnableEfa'] + if purchase_option == 'ONDEMAND': + queue_name_prefix = "od" + allocation_strategy = 'lowest-price' + price = self.plugin.instance_type_and_family_info[self.cluster_region]['instance_types'][instance_type]['pricing']['OnDemand'] + else: + queue_name_prefix = "sp" + allocation_strategy = 'capacity-optimized' + price = self.plugin.instance_type_and_family_info[self.cluster_region]['instance_types'][instance_type]['pricing']['spot']['max'] + queue_name = f"{queue_name_prefix}-{instance_type}" + queue_name = queue_name.replace('.', '-') + logger.info(f"Configuring {queue_name} queue:") + if number_of_queues >= MAX_NUMBER_OF_QUEUES: + logger.error(f"Can't create {queue_name} queue because MAX_NUMBER_OF_QUEUES=={MAX_NUMBER_OF_QUEUES} and have {number_of_queues} queues.") + exit(1) + nodeset = f"{queue_name}_nodes" + nodesets[purchase_option].append(nodeset) + parallel_cluster_queue = self.create_queue_config(queue_name, allocation_strategy, purchase_option) + number_of_queues += 1 + + compute_resource_name = f"{queue_name_prefix}-{instance_type}".replace('.', '-') + if number_of_compute_resources >= MAX_NUMBER_OF_COMPUTE_RESOURCES: + logger.error(f"Can't create {compute_resource_name} compute resource because MAX_NUMBER_OF_COMPUTE_RESOURCES=={MAX_NUMBER_OF_COMPUTE_RESOURCES} and have {number_of_compute_resources} compute resources") + exit(1) + logger.info(f" Adding {compute_resource_name:18} compute resource") + if compute_resource_name in self.config['slurm']['InstanceConfig']['NodeCounts']['ComputeResourceCounts']: + min_count = self.config['slurm']['InstanceConfig']['NodeCounts']['ComputeResourceCounts'][compute_resource_name]['MinCount'] + max_count = self.config['slurm']['InstanceConfig']['NodeCounts']['ComputeResourceCounts'][compute_resource_name]['MaxCount'] + else: + min_count = self.config['slurm']['InstanceConfig']['NodeCounts']['DefaultMinCount'] + max_count = self.config['slurm']['InstanceConfig']['NodeCounts']['DefaultMaxCount'] + compute_resource = { + 'Name': compute_resource_name, + 'MinCount': min_count, + 'MaxCount': max_count, + 'DisableSimultaneousMultithreading': self.config['slurm']['ParallelClusterConfig']['DisableSimultaneousMultithreading'], + 'Instances': [], + 'Efa': {'Enabled': efa_supported}, + 'Networking': { + 'PlacementGroup': { + 'Enabled': efa_supported } - parallel_cluster_queue['ComputeResources'].append(compute_resource) - number_of_compute_resources += 1 - self.parallel_cluster_config['Scheduling']['SlurmQueues'].append(parallel_cluster_queue) - else: - # ParallelCluster has a restriction where a queue can have only 1 instance type with memory based scheduling - # So, for now creating a queue for each 
instance type and purchase option - for purchase_option in purchase_options: - for instance_type in self.instance_types: - efa_supported = self.plugin.get_EfaSupported(self.cluster_region, instance_type) and self.config['slurm']['ParallelClusterConfig']['EnableEfa'] - if purchase_option == 'ONDEMAND': - queue_name_prefix = "od" - allocation_strategy = 'lowest-price' - price = self.plugin.instance_type_and_family_info[self.cluster_region]['instance_types'][instance_type]['pricing']['OnDemand'] - else: - queue_name_prefix = "sp" - allocation_strategy = 'capacity-optimized' - price = self.plugin.instance_type_and_family_info[self.cluster_region]['instance_types'][instance_type]['pricing']['spot']['max'] - queue_name = f"{queue_name_prefix}-{instance_type}" - queue_name = queue_name.replace('.', '-') - nodeset = f"{queue_name}_nodes" - nodesets[purchase_option].append(nodeset) - parallel_cluster_queue = { - 'Name': queue_name, - 'AllocationStrategy': allocation_strategy, - 'CapacityType': purchase_option, - 'ComputeResources': [], - 'ComputeSettings': { - 'LocalStorage': { - 'RootVolume': { - 'VolumeType': 'gp3' - } - } - }, - 'CustomActions': { - 'OnNodeStart': { - 'Sequence': [ - { - 'Script': self.custom_action_s3_urls['config/bin/on_compute_node_start.sh'], - 'Args': [] - } - ] - }, - 'OnNodeConfigured': { - 'Sequence': [ - { - 'Script': self.custom_action_s3_urls['config/bin/on_compute_node_configured.sh'], - 'Args': [] - } - ] - }, - }, - 'Iam': { - 'AdditionalIamPolicies': [ - {'Policy': 'arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore'}, - {'Policy': '{{ParallelClusterAssetReadPolicyArn}}'}, - {'Policy': '{{ParallelClusterSnsPublishPolicyArn}}'} - ] - }, - 'Networking': { - 'SubnetIds': [self.config['SubnetId']], - 'AdditionalSecurityGroups': ['{{SlurmNodeSecurityGroupId}}'], - }, } - if 'ComputeNodeAmi' in self.config['slurm']['ParallelClusterConfig']: - parallel_cluster_queue['Image'] = { - 'CustomAmi': self.config['slurm']['ParallelClusterConfig']['ComputeNodeAmi'] - } - if 'AdditionalSecurityGroups' in self.config['slurm']['InstanceConfig']: - for security_group_id in self.config['slurm']['InstanceConfig']['AdditionalSecurityGroups']: - parallel_cluster_queue['Networking']['AdditionalSecurityGroups'].append(security_group_id) - if 'AdditionalIamPolicies' in self.config['slurm']['InstanceConfig']: - for iam_policy_arn in self.config['slurm']['InstanceConfig']['AdditionalIamPolicies']: - parallel_cluster_queue['Iam']['AdditionalIamPolicies'].append({'Policy': iam_policy_arn}) - - # Give the compute node access to extra mounts - for fs_type in self.extra_mount_security_groups.keys(): - for extra_mount_sg_name, extra_mount_sg in self.extra_mount_security_groups[fs_type].items(): - parallel_cluster_queue['Networking']['AdditionalSecurityGroups'].append(extra_mount_sg.security_group_id) - - compute_resource_name = f"{queue_name_prefix}-{instance_type}-cr1".replace('.', '-') - if compute_resource_name in self.config['slurm']['InstanceConfig']['NodeCounts']['ComputeResourceCounts']: - min_count = self.config['slurm']['InstanceConfig']['NodeCounts']['ComputeResourceCounts'][compute_resource_name]['MinCount'] - max_count = self.config['slurm']['InstanceConfig']['NodeCounts']['ComputeResourceCounts'][compute_resource_name]['MaxCount'] - else: - min_count = self.config['slurm']['InstanceConfig']['NodeCounts']['DefaultMinCount'] - max_count = self.config['slurm']['InstanceConfig']['NodeCounts']['DefaultMaxCount'] - compute_resource = { - 'Name': compute_resource_name, - 'MinCount': 
min_count, - 'MaxCount': max_count, - 'DisableSimultaneousMultithreading': self.config['slurm']['ParallelClusterConfig']['DisableSimultaneousMultithreading'], - 'Instances': [], - 'Efa': {'Enabled': efa_supported}, - 'Networking': { - 'PlacementGroup': { - 'Enabled': efa_supported - } - } + } + compute_resource['Instances'].append( + { + 'InstanceType': instance_type } - compute_resource['Instances'].append( - { - 'InstanceType': instance_type - } - ) - if self.PARALLEL_CLUSTER_VERSION >= parse_version('3.7.0'): - compute_resource['StaticNodePriority'] = int(price * 1000) - compute_resource['DynamicNodePriority'] = int(price * 10000) - parallel_cluster_queue['ComputeResources'].append(compute_resource) - self.parallel_cluster_config['Scheduling']['SlurmQueues'].append(parallel_cluster_queue) + ) + if config_schema.PARALLEL_CLUSTER_SUPPORTS_NODE_WEIGHTS(self.PARALLEL_CLUSTER_VERSION): + compute_resource['StaticNodePriority'] = int(price * 1000) + compute_resource['DynamicNodePriority'] = int(price * 10000) + parallel_cluster_queue['ComputeResources'].append(compute_resource) + self.parallel_cluster_config['Scheduling']['SlurmQueues'].append(parallel_cluster_queue) logger.info(f"Created {number_of_queues} queues with {number_of_compute_resources} compute resources") @@ -3017,3 +2863,65 @@ def create_parallel_cluster_config(self): CfnOutput(self, "command11_SubmitterDeconfigure", value = f"sudo /opt/slurm/{cluster_name}/config/bin/submitter_deconfigure.sh && sudo umount /opt/slurm/{cluster_name}" ) + + def create_queue_config(self, queue_name, allocation_strategy, purchase_option): + parallel_cluster_queue = { + 'Name': queue_name, + 'AllocationStrategy': allocation_strategy, + 'CapacityType': purchase_option, + 'ComputeResources': [], + 'ComputeSettings': { + 'LocalStorage': { + 'RootVolume': { + 'VolumeType': 'gp3' + } + } + }, + 'CustomActions': { + 'OnNodeStart': { + 'Sequence': [ + { + 'Script': self.custom_action_s3_urls['config/bin/on_compute_node_start.sh'], + 'Args': [] + } + ] + }, + 'OnNodeConfigured': { + 'Sequence': [ + { + 'Script': self.custom_action_s3_urls['config/bin/on_compute_node_configured.sh'], + 'Args': [] + } + ] + }, + }, + 'Iam': { + 'AdditionalIamPolicies': [ + {'Policy': 'arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore'}, + {'Policy': '{{ParallelClusterAssetReadPolicyArn}}'}, + {'Policy': '{{ParallelClusterSnsPublishPolicyArn}}'} + ] + }, + 'Networking': { + 'SubnetIds': [self.config['SubnetId']], + 'AdditionalSecurityGroups': ['{{SlurmNodeSecurityGroupId}}'], + 'PlacementGroup': {} + }, + } + if 'ComputeNodeAmi' in self.config['slurm']['ParallelClusterConfig']: + parallel_cluster_queue['Image'] = { + 'CustomAmi': self.config['slurm']['ParallelClusterConfig']['ComputeNodeAmi'] + } + if 'AdditionalSecurityGroups' in self.config['slurm']['InstanceConfig']: + for security_group_id in self.config['slurm']['InstanceConfig']['AdditionalSecurityGroups']: + parallel_cluster_queue['Networking']['AdditionalSecurityGroups'].append(security_group_id) + if 'AdditionalIamPolicies' in self.config['slurm']['InstanceConfig']: + for iam_policy_arn in self.config['slurm']['InstanceConfig']['AdditionalIamPolicies']: + parallel_cluster_queue['Iam']['AdditionalIamPolicies'].append({'Policy': iam_policy_arn}) + + # Give the compute node access to extra mounts + for fs_type in self.extra_mount_security_groups.keys(): + for extra_mount_sg_name, extra_mount_sg in self.extra_mount_security_groups[fs_type].items(): + 
parallel_cluster_queue['Networking']['AdditionalSecurityGroups'].append(extra_mount_sg.security_group_id) + + return parallel_cluster_queue diff --git a/source/cdk/config_schema.py b/source/cdk/config_schema.py index f69bb32e..414fba36 100644 --- a/source/cdk/config_schema.py +++ b/source/cdk/config_schema.py @@ -181,6 +181,15 @@ def get_slurm_rest_api_version(config): # Feature support +def MAX_NUMBER_OF_QUEUES(parallel_cluster_version): + return 50 + +def MAX_NUMBER_OF_COMPUTE_RESOURCES(parallel_cluster_version): + return 50 + +def MAX_NUMBER_OF_COMPUTE_RESOURCES_PER_QUEUE(parallel_cluster_version): + return 50 + # Version 3.7.0: PARALLEL_CLUSTER_SUPPORTS_LOGIN_NODES_VERSION = parse_version('3.7.0') def PARALLEL_CLUSTER_SUPPORTS_LOGIN_NODES(parallel_cluster_version): @@ -194,6 +203,10 @@ def PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_COMPUTE_RESOURCES_PER_QUEUE(parallel_clus def PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_INSTANCE_TYPES_PER_COMPUTE_RESOURCE(parallel_cluster_version): return parallel_cluster_version >= PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_INSTANCE_TYPES_PER_COMPUTE_RESOURCE_VERSION +PARALLEL_CLUSTER_SUPPORTS_NODE_WEIGHTS_VERSION = parse_version('3.7.0') +def PARALLEL_CLUSTER_SUPPORTS_NODE_WEIGHTS(parallel_cluster_version): + return parallel_cluster_version >= PARALLEL_CLUSTER_SUPPORTS_NODE_WEIGHTS_VERSION + # Version 3.8.0 PARALLEL_CLUSTER_SUPPORTS_CUSTOM_ROCKY_8_VERSION = parse_version('3.8.0') @@ -297,6 +310,7 @@ def DEFAULT_OS(config): 'x2iezn', # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB + 'u', #'u-6tb1', # Intel Xeon Scalable (Skylake) 6 TB #'u-9tb1', # Intel Xeon Scalable (Skylake) 9 TB #'u-12tb1', # Intel Xeon Scalable (Skylake) 12 TB @@ -371,7 +385,80 @@ def DEFAULT_OS(config): default_excluded_instance_types = [ '.+\.(micro|nano)', # Not enough memory - '.*\.metal.*' + '.*\.metal.*', + + # Reduce the number of selected instance types to 25. 
+ # Exclude larger core counts for each memory size + # 2 GB: + 'c7a.medium', + 'c7g.medium', + # 4 GB: m7a.medium, m7g.medium + 'c7a.large', + 'c7g.large', + # 8 GB: r7a.medium, r7g.medium + 'm5zn.large', + 'm7a.large', + 'm7g.large', + 'c7a.xlarge', + 'c7g.xlarge', + # 16 GB: r7a.large, x2gd.medium, r7g.large + 'r7iz.large', + 'm5zn.xlarge', + 'm7a.xlarge', + 'm7g.xlarge', + 'c7a.2xlarge', + 'c7g.2xlarge', + # 32 GB: r7a.xlarge, x2gd.large, r7g.xlarge + 'r7iz.xlarge', + 'm5zn.2xlarge', + 'm7a.2xlarge', + 'm7g.2xlarge', + 'c7a.4xlarge', + 'c7g.4xlarge', + # 64 GB: r7a.2xlarge, x2gd.xlarge, r7g.2xlarge + 'r7iz.2xlarge', + 'm7a.4xlarge', + 'm7g.4xlarge', + 'c7a.8xlarge', + 'c7g.8xlarge', + # 96 GB: + 'm5zn.6xlarge', + 'c7a.12xlarge', + 'c7g.12xlarge', + # 128 GB: x2iedn.xlarge, r7iz.4xlarge, x2gd.2xlarge, r7g.4xlarge + 'r7a.4xlarge', + 'm7a.8xlarge', + 'm7g.8xlarge', + 'c7a.16xlarge', + 'c7g.8xlarge', + # 192 GB: m5zn.12xlarge, m7a.12xlarge, m7g.12xlarge + 'c7a.24xlarge', + # 256 GB: x2iedn.2xlarge, x2iezn.2xlarge, x2gd.4xlarge, r7g.8xlarge + 'r7iz.8xlarge', + 'r7a.8xlarge', + 'm7a.16xlarge', + 'm7g.16xlarge', + 'c7a.32xlarge', + # 384 GB: 'r7iz.12xlarge', r7g.12xlarge + 'r7a.12xlarge', + 'm7a.24xlarge', + 'c7a.48xlarge', + # 512 GB: x2iedn.4xlarge, x2iezn.4xlarge, x2gd.8xlarge, r7g.16xlarge + 'r7iz.16xlarge', + 'r7a.16xlarge', + 'm7a.32xlarge', + # 768 GB: r7a.24xlarge, x2gd.12xlarge + 'x2iezn.6xlarge', + 'm7a.48xlarge', + # 1024 GB: x2iedn.8xlarge, x2iezn.8xlarge, x2gd.16xlarge + 'r7iz.32xlarge', + 'r7a.32xlarge', + # 1536 GB: x2iezn.12xlarge, x2idn.24xlarge + 'r7a.48xlarge', + # 2048 GB: x2iedn.16xlarge + 'x2idn.32xlarge', + # 3072 GB: 'x2iedn.24xlarge', + # 4096 GB: x2iedn.32xlarge ] architectures = [ diff --git a/source/resources/lambdas/DeconfigureRESUsersGroupsJson/DeconfigureRESUsersGroupsJson.py b/source/resources/lambdas/DeconfigureRESUsersGroupsJson/DeconfigureRESUsersGroupsJson.py index 028e5983..21adfc54 100644 --- a/source/resources/lambdas/DeconfigureRESUsersGroupsJson/DeconfigureRESUsersGroupsJson.py +++ b/source/resources/lambdas/DeconfigureRESUsersGroupsJson/DeconfigureRESUsersGroupsJson.py @@ -137,7 +137,7 @@ def lambda_handler(event, context): sudo rmdir $mount_dest fi -pass +true """ logger.info(f"Submitting SSM command") send_command_response = ssm_client.send_command(
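The commit message notes that the per-instance-type compute resources created by this change can be recombined into whatever queues the user wants via custom Slurm settings. The following is a minimal, hypothetical sketch of what that could look like; it is not part of the patch. It assumes ParallelCluster's usual dynamic node naming of <queue>-dy-<compute-resource>-[1-N] and an illustrative MaxCount of 10, and it reuses the queue/compute-resource naming logic shown in the diff above.

# Hypothetical sketch: reproduce the per-instance-type queue/compute-resource names
# created by create_parallel_cluster_config() and group their nodes into one custom
# Slurm partition.
instance_types = ['c7a.large', 'r7a.xlarge', 'x2iedn.2xlarge']  # example selection
use_spot = True
max_count = 10  # assumed DefaultMaxCount

purchase_options = ['ONDEMAND'] + (['SPOT'] if use_spot else [])

node_ranges = []
for purchase_option in purchase_options:
    queue_name_prefix = 'od' if purchase_option == 'ONDEMAND' else 'sp'
    for instance_type in instance_types:
        # 1 queue and 1 compute resource per instance type; '.' is replaced with '-'
        queue_name = f"{queue_name_prefix}-{instance_type}".replace('.', '-')
        compute_resource_name = queue_name
        # Assumed ParallelCluster dynamic node naming: <queue>-dy-<compute-resource>-<index>
        node_ranges.append(f"{queue_name}-dy-{compute_resource_name}-[1-{max_count}]")

print(f"PartitionName=all Nodes={','.join(node_ranges)} MaxTime=INFINITE State=UP")

The generated PartitionName line could then be added through the cluster's custom Slurm settings so that jobs can target all of the single-instance-type compute resources as one partition.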