From 0c47a68903cfa118099a03f104f77c183797d4ec Mon Sep 17 00:00:00 2001 From: Allan Carter Date: Wed, 16 Oct 2024 15:21:23 +0000 Subject: [PATCH] Only use default EDA excludes if also using default Includes Default Exclude only works correctly with default includes because it excludes instance types to keep the total instance types down. If user specifies any includes, then the default EDA excludes may exclude instance types that they are trying to include. Only use the default EDA includes and excludes if no includes or excludes are configured. Resolves #262 Restore memory based partitions. Related to #235. Create partitions that include the purchase option (sp|od) and the amount of instance memory. This maintains backward compatibility for those using partitions to select the purchase option and amount of total memory. --- docs/config.md | 247 +++++++++++++++++++++------------- source/cdk/cdk_slurm_stack.py | 77 +++++++---- source/cdk/config_schema.py | 75 ++++------- source/requirements.txt | 2 +- 4 files changed, 227 insertions(+), 174 deletions(-) diff --git a/docs/config.md b/docs/config.md index 268a2ff3..6fb2d9c8 100644 --- a/docs/config.md +++ b/docs/config.md @@ -93,6 +93,7 @@ This project creates a ParallelCluster configuration file that is documented in Imds: Secured: bool InstanceConfig: + UseOnDemand: str UseSpot: str Exclude: InstanceFamilies: @@ -614,9 +615,154 @@ List of Amazon Resource Names (ARNs) of IAM policies for Amazon EC2 that will be ### InstanceConfig -Configure the instances used by the cluster. +Configure the instances used by the cluster for compute nodes. -A partition will be created for each combination of Base OS, Architecture, and Spot. +ParallelCluster is limited to a total of 50 compute resources and +we only put 1 instance type in each compute resource. +This limits you to a total of 50 instance types per cluster. +If you need more instance types than that, then you will need to create multiple clusters. 
+If you configure both on-demand and spot instances, then the limit is effectively 25 instance types because 2 compute resources will be created for each instance type. + +If you configure more than 50 instance types then the installer will fail with an error. +You will then need to modify your configuration to either include fewer instance types or +exclude instance types from the configuration. + +If no Include and Exclude parameters are specified then default EDA instance types +will be configured. +The defaults will include the latest generation instance families in the c, m, r, x, and u families. +Older instance families are excluded. +Metal instance types are also excluded. +Specific instance types are also excluded to keep the total number of instance types under 50. +If multiple instance types have the same amount of memory, then the instance types with the highest core counts are excluded. +This is because EDA workloads are typically memory limited, not core limited. + +If any Include or Exclude parameters are specified, then minimal defaults will be used for the parameters that +aren't specified. +By default, all instance families are included and no specific instance types are included. +By default, all instance types with less than 2 GiB of memory are excluded because they don't have enough memory for a Slurm compute node. + +If no includes or excludes are provided, the defaults are: + +``` +slurm: + InstanceConfig: + Exclude: + InstanceFamilies: + - 'a1' # Graviton 1 + - 'c4' # Replaced by c5 + - 'd2' # SSD optimized + - 'g3' # Replaced by g4 + - 'g3s' # Replaced by g4 + - 'h1' # SSD optimized + - 'i3' # SSD optimized + - 'i3en' # SSD optimized + - 'm4' # Replaced by m5 + - 'p2' # Replaced by p3 + - 'p3' + - 'p3dn' + - 'r4' # Replaced by r5 + - 't2' # Replaced by t3 + - 'x1' + - 'x1e' + InstanceTypes: + - '.*\.metal' + # Reduce the number of selected instance types to 25. 
+ # Exclude larger core counts for each memory size + # 2 GB: + - 'c7a.medium' + - 'c7g.medium' + # 4 GB: m7a.medium, m7g.medium + - 'c7a.large' + - 'c7g.large' + # 8 GB: r7a.medium, r7g.medium + - 'm5zn.large' + - 'm7a.large' + - 'm7g.large' + - 'c7a.xlarge' + - 'c7g.xlarge' + # 16 GB: r7a.large, x2gd.medium, r7g.large + - 'r7iz.large' + - 'm5zn.xlarge' + - 'm7a.xlarge' + - 'm7g.xlarge' + - 'c7a.2xlarge' + - 'c7g.2xlarge' + # 32 GB: r7a.xlarge, x2gd.large, r7g.xlarge + - 'r7iz.xlarge' + - 'm5zn.2xlarge' + - 'm7a.2xlarge' + - 'm7g.2xlarge' + - 'c7a.4xlarge' + - 'c7g.4xlarge' + # 64 GB: r7a.2xlarge, x2gd.xlarge, r7g.2xlarge + - 'r7iz.2xlarge' + - 'm7a.4xlarge' + - 'm7g.4xlarge' + - 'c7a.8xlarge' + - 'c7g.8xlarge' + # 96 GB: + - 'm5zn.6xlarge' + - 'c7a.12xlarge' + - 'c7g.12xlarge' + # 128 GB: x2iedn.xlarge, r7iz.4xlarge, x2gd.2xlarge, r7g.4xlarge + - 'r7a.4xlarge' + - 'm7a.8xlarge' + - 'm7g.8xlarge' + - 'c7a.16xlarge' + - 'c7g.8xlarge' + # 192 GB: m5zn.12xlarge, m7a.12xlarge, m7g.12xlarge + - 'c7a.24xlarge' + # 256 GB: x2iedn.2xlarge, x2iezn.2xlarge, x2gd.4xlarge, r7g.8xlarge + - 'r7iz.8xlarge' + - 'r7a.8xlarge' + - 'm7a.16xlarge' + - 'm7g.16xlarge' + - 'c7a.32xlarge' + # 384 GB: r7iz.12xlarge, r7g.12xlarge + - 'r7a.12xlarge' + - 'm7a.24xlarge' + - 'c7a.48xlarge' + # 512 GB: x2iedn.4xlarge, x2iezn.4xlarge, x2gd.8xlarge, r7g.16xlarge + - 'r7iz.16xlarge' + - 'r7a.16xlarge' + - 'm7a.32xlarge' + # 768 GB: r7a.24xlarge, x2gd.12xlarge + - 'x2iezn.6xlarge' + - 'm7a.48xlarge' + # 1024 GB: x2iedn.8xlarge, x2iezn.8xlarge, x2gd.16xlarge + - 'r7iz.32xlarge' + - 'r7a.32xlarge' + # 1536 GB: x2iezn.12xlarge, x2idn.24xlarge + - 'r7a.48xlarge' + # 2048 GB: x2iedn.16xlarge + - 'x2idn.32xlarge' + # 3072 GB: x2iedn.24xlarge + # 4096 GB: x2iedn.32xlarge + Include: + InstanceFamilies: + - 'c7a' # AMD EPYC 9R14 Processor 3.7 GHz + - 'c7g' # AWS Graviton3 Processor 2.6 GHz + - 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz + - 'm7a' # AMD EPYC 9R14 Processor 3.7 GHz + - 'm7g' # AWS Graviton3 
Processor 2.6 GHz + - 'r7a' # AMD EPYC 9R14 Processor 3.7 GHz + - 'r7g' # AWS Graviton3 Processor 2.6 GHz + - 'r7iz' # Intel Xeon Scalable (Sapphire Rapids) 3.2 GHz + - 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB + - 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB + - 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB + - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB + - 'u.*' + InstanceTypes: [] +``` + +#### UseOnDemand + +Configure on-demand instances. + +type: bool + +default: True #### UseSpot @@ -638,45 +784,13 @@ Instance families and types are regular expressions with implicit '^' and '$' at Regular expressions with implicit '^' and '$' at the begining and end. -An empty list is the same as '.*'. - -Default: - -``` -default_excluded_instance_families = [ - 'a1', # Graviton 1 - 'c4', # Replaced by c5 - 'd2', # SSD optimized - 'g3', # Replaced by g4 - 'g3s', # Replaced by g4 - 'h1', # SSD optimized - 'i3', # SSD optimized - 'i3en', # SSD optimized - 'm4', # Replaced by m5 - 'p2', # Replaced by p3 - 'p3', - 'p3dn', - 'r4', # Replaced by r5 - 't2', # Replaced by t3 - 'x1', - 'x1e', -] -``` +Default: [] ##### Exclude InstanceTypes Regular expressions with implicit '^' and '$' at the begining and end. -An empty list is the same as '.*'. - -Default: - -``` -default_excluded_instance_types = [ - '.+\.(micro|nano)', # Not enough memory - '.*\.metal.*' -] -``` +Default: [] #### Include @@ -698,70 +812,13 @@ If MaxSizeOnly is True then only the largest instance type in a family will be i Regular expressions with implicit '^' and '$' at the begining and end. -An empty list is the same as '.*'. 
- -Default: - -``` -default_eda_instance_families = [ - 'c7a', # AMD EPYC 9R14 Processor 3.7 GHz - - 'c7g', # AWS Graviton3 Processor 2.6 GHz - # 'c7gd', # AWS Graviton3 Processor 2.6 GHz - # 'c7gn', # AWS Graviton3 Processor 2.6 GHz - - # 'c7i', # Intel Xeon Scalable (Sapphire Rapids) 3.2 GHz - - #'f1', # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - - 'm5zn', # Intel Xeon Platinum 8252 4.5 GHz - - 'm7a', # AMD EPYC 9R14 Processor 3.7 GHz - - # 'm7i', # Intel Xeon Scalable (Sapphire Rapids) 3.2 GHz - - 'm7g', # AWS Graviton3 Processor 2.6 GHz - # 'm7gd', # AWS Graviton3 Processor 2.6 GHz - - 'r7a', # AMD EPYC 9R14 Processor 3.7 GHz - - 'r7g', # AWS Graviton3 Processor 2.6 GHz - # 'r7gd', # AWS Graviton3 Processor 2.6 GHz - - # 'r7i', # Intel Xeon Scalable (Sapphire Rapids) 3.2 GHz - - 'r7iz', # Intel Xeon Scalable (Sapphire Rapids) 3.2 GHz - - 'x2gd', # AWS Graviton2 Processor 2.5 GHz 1TB - - 'x2idn', # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB - - 'x2iedn', # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB - - 'x2iezn', # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB - - #'u-6tb1', # Intel Xeon Scalable (Skylake) 6 TB - #'u-9tb1', # Intel Xeon Scalable (Skylake) 9 TB - #'u-12tb1', # Intel Xeon Scalable (Skylake) 12 TB -] -``` +Default: ['.*'] ##### Include InstanceTypes Regular expressions with implicit '^' and '$' at the begining and end. -An empty list is the same as '.*'. 
- -Default: - -``` -default_eda_instance_types = [ - #'c5\.(l|x|2|4|9|18).*', # Intel Xeon Platinum 8124M 3.4 GHz - #'c5\.(12|24).*', # Intel Xeon Platinum 8275L 3.6 GHz - #'c5d\.(l|x|2|4|9|18).*', # Intel Xeon Platinum 8124M 3.4 GHz - #'c5d\.(12|24).*', # Intel Xeon Platinum 8275L 3.6 GHz -] -``` +Default: [] #### NodeCounts diff --git a/source/cdk/cdk_slurm_stack.py b/source/cdk/cdk_slurm_stack.py index 08ca3dad..3c2b77a8 100644 --- a/source/cdk/cdk_slurm_stack.py +++ b/source/cdk/cdk_slurm_stack.py @@ -460,6 +460,22 @@ def check_config(self): logger.error(f"ParallelCluster requires VolumeId for {mount_dir} in slurm/storage/ExtraMounts") config_errors += 1 + # If no instance config has been set then choose EDA defaults + if 'InstanceFamilies' not in self.config['slurm']['InstanceConfig']['Include'] and 'InstanceTypes' not in self.config['slurm']['InstanceConfig']['Include'] and 'InstanceFamilies' not in self.config['slurm']['InstanceConfig']['Exclude'] and 'InstanceTypes' not in self.config['slurm']['InstanceConfig']['Exclude']: + self.config['slurm']['InstanceConfig']['Include']['InstanceFamilies'] = config_schema.default_included_eda_instance_families + self.config['slurm']['InstanceConfig']['Include']['InstanceTypes'] = config_schema.default_included_eda_instance_types + self.config['slurm']['InstanceConfig']['Exclude']['InstanceFamilies'] = config_schema.default_excluded_eda_instance_families + self.config['slurm']['InstanceConfig']['Exclude']['InstanceTypes'] = config_schema.default_excluded_eda_instance_types + # Set non-eda defaults + if 'InstanceFamilies' not in self.config['slurm']['InstanceConfig']['Include']: + self.config['slurm']['InstanceConfig']['Include']['InstanceFamilies'] = config_schema.default_instance_families + if 'InstanceTypes' not in self.config['slurm']['InstanceConfig']['Include']: + self.config['slurm']['InstanceConfig']['Include']['InstanceTypes'] = config_schema.default_included_instance_types + if 'InstanceFamilies' not in 
self.config['slurm']['InstanceConfig']['Exclude']: + self.config['slurm']['InstanceConfig']['Exclude']['InstanceFamilies'] = config_schema.default_excluded_instance_families + if 'InstanceTypes' not in self.config['slurm']['InstanceConfig']['Exclude']: + self.config['slurm']['InstanceConfig']['Exclude']['InstanceTypes'] = config_schema.default_excluded_instance_types + # Check to make sure controller instance type has at least 4 GB of memmory. slurmctl_instance_type = self.config['slurm']['SlurmCtl']['instance_type'] slurmctl_memory_in_gb = int(self.get_instance_type_info(slurmctl_instance_type)['MemoryInMiB'] / 1024) @@ -1158,6 +1174,7 @@ def check_regions_config(self): self.instance_types = sorted(self.instance_types) # Filter the instance types by architecture due to PC limitation to 1 architecture + # Also require at least 2 GB of memory. cluster_architecture = self.config['slurm']['ParallelClusterConfig']['Architecture'] logger.info(f"ParallelCluster Architecture: {cluster_architecture}") filtered_instance_types = [] @@ -1166,6 +1183,10 @@ def check_regions_config(self): if instance_architecture != cluster_architecture: logger.warning(f"Excluding {instance_type} because architecture ({instance_architecture}) != {cluster_architecture}") continue + mem_gb = int(self.plugin.get_MemoryInMiB(self.cluster_region, instance_type) / 1024) + if mem_gb < 2: + logger.warning(f"Excluding {instance_type} because has less than 2 GiB of memory.") + continue filtered_instance_types.append(instance_type) self.instance_types = filtered_instance_types logger.info(f"ParallelCluster configured to use {len(self.instance_types)} instance types :\n{pp.pformat(self.instance_types)}") @@ -2402,9 +2423,9 @@ def create_parallel_cluster_config(self): # We are limited to MAX_NUMBER_OF_QUEUES queues and MAX_NUMBER_OF_COMPUTE_RESOURCES compute resources. # First analyze the selected instance types to make sure that these limits aren't exceeded. 
# The fundamental limit is the limit on the number of compute resources. - # Each compute resource maps to a NodeName and I want instance type to be selected using a constraint. + # Each compute resource maps to a NodeName and I want instance type to be able to be selected using a constraint. # This means that each compute resource can only contain a single instance type. - # This limits the number of instance type to MAX_NUMBER_OF_COMPUTE_RESOURCES or MAX_NUMBER_OF_COMPUTE_RESOURCES/2 if you configure spot instances. + # This limits the number of instance types to MAX_NUMBER_OF_COMPUTE_RESOURCES or MAX_NUMBER_OF_COMPUTE_RESOURCES/2 if you configure spot instances. # # We could possible support more instance types by putting instance types with the same amount of cores and memory into the same compute resource. # The problem with doing this is that you can wind up with very different instance types in the same compute node. @@ -2415,14 +2436,17 @@ def create_parallel_cluster_config(self): # If the user configures too many instance types, then flag an error and print out the configured instance # types and suggest instance types to exclude. 
- purchase_options = ['ONDEMAND'] + purchase_options = [] + if self.config['slurm']['InstanceConfig']['UseOnDemand']: + purchase_options.append('ONDEMAND') if self.config['slurm']['InstanceConfig']['UseSpot']: purchase_options.append('SPOT') - MAX_NUMBER_OF_INSTANCE_TYPES = int(MAX_NUMBER_OF_COMPUTE_RESOURCES / 2) - else: - MAX_NUMBER_OF_INSTANCE_TYPES = MAX_NUMBER_OF_COMPUTE_RESOURCES + if not len(purchase_options): + logger.error(f"Must specify either slurm/InstanceConfig/UseOnDemand or UseSpot.") + exit(1) + MAX_NUMBER_OF_INSTANCE_TYPES = int(MAX_NUMBER_OF_COMPUTE_RESOURCES / len(purchase_options)) - # Create list of instance types by number of cores and amount of memory + # Create list of instance types by number of cores and amount of memory instance_types_by_core_memory = {} # Create list of instance types by amount of memory and number of cores instance_types_by_memory_core = {} @@ -2468,34 +2492,46 @@ def create_parallel_cluster_config(self): exit(1) - nodesets = {} + # partition_nodesets is a dictionary indexed by partition name and containing a list of nodesets. + partition_nodesets = {} number_of_queues = 0 number_of_compute_resources = 0 - for purchase_option in purchase_options: - nodesets[purchase_option] = [] # Create 1 queue and compute resource for each instance type and purchase option. 
for purchase_option in purchase_options: for instance_type in self.instance_types: + logger.debug(f"Creating queue for {purchase_option} {instance_type}") efa_supported = self.plugin.get_EfaSupported(self.cluster_region, instance_type) and self.config['slurm']['ParallelClusterConfig']['EnableEfa'] + mem_gb = int(self.plugin.get_MemoryInMiB(self.cluster_region, instance_type) / 1024) if purchase_option == 'ONDEMAND': queue_name_prefix = "od" allocation_strategy = 'lowest-price' price = self.plugin.instance_type_and_family_info[self.cluster_region]['instance_types'][instance_type]['pricing']['OnDemand'] + purchase_option_partition = "on-demand" else: queue_name_prefix = "sp" allocation_strategy = 'capacity-optimized' - price = self.plugin.instance_type_and_family_info[self.cluster_region]['instance_types'][instance_type]['pricing']['spot']['max'] + price = self.plugin.instance_type_and_family_info[self.cluster_region]['instance_types'][instance_type]['pricing']['spot'].get('max', None) + purchase_option_partition = "spot" queue_name = f"{queue_name_prefix}-{instance_type}" queue_name = queue_name.replace('.', '-') queue_name = queue_name.replace('large', 'l') queue_name = queue_name.replace('medium', 'm') + if not price: + logger.warning(f"Skipping {queue_name} because {instance_type} doesn't have spot pricing") + continue logger.info(f"Configuring {queue_name} queue:") if number_of_queues >= MAX_NUMBER_OF_QUEUES: logger.error(f"Can't create {queue_name} queue because MAX_NUMBER_OF_QUEUES=={MAX_NUMBER_OF_QUEUES} and have {number_of_queues} queues.") exit(1) nodeset = f"{queue_name}_nodes" - nodesets[purchase_option].append(nodeset) + if purchase_option_partition not in partition_nodesets: + partition_nodesets[purchase_option_partition] = [] + partition_nodesets[purchase_option_partition].append(nodeset) + mem_partition = f"{queue_name_prefix}-{mem_gb}-gb" + if mem_partition not in partition_nodesets: + partition_nodesets[mem_partition] = [] + 
partition_nodesets[mem_partition].append(nodeset) parallel_cluster_queue = self.create_queue_config(queue_name, allocation_strategy, purchase_option) number_of_queues += 1 @@ -2556,25 +2592,14 @@ def create_parallel_cluster_config(self): self.parallel_cluster_config['Scheduling']['SlurmSettings']['CustomSlurmSettings'].append(slurm_settings_dict) # Create custom partitions based on those created by ParallelCluster - if 'ONDEMAND' in nodesets: + for partition in partition_nodesets: self.parallel_cluster_config['Scheduling']['SlurmSettings']['CustomSlurmSettings'].extend( [ { - 'PartitionName': 'on-demand', + 'PartitionName': partition, 'Default': 'NO', 'PriorityTier': '1', - 'Nodes': ','.join(nodesets['ONDEMAND']), - } - ] - ) - if 'SPOT' in nodesets: - self.parallel_cluster_config['Scheduling']['SlurmSettings']['CustomSlurmSettings'].extend( - [ - { - 'PartitionName': 'spot', - 'Default': 'NO', - 'PriorityTier': '10', - 'Nodes': ','.join(nodesets['SPOT']), + 'Nodes': ','.join(partition_nodesets[partition]), } ] ) diff --git a/source/cdk/config_schema.py b/source/cdk/config_schema.py index e8be3892..ac1eba2e 100644 --- a/source/cdk/config_schema.py +++ b/source/cdk/config_schema.py @@ -359,7 +359,7 @@ def DEFAULT_OS(config): ] # By default I've chosen to exclude *7i instance types because they have 50% of the cores as *7z instances with the same memory. 
-default_eda_instance_families = [ +default_included_eda_instance_families = [ 'c7a', # AMD EPYC 9R14 Processor 3.7 GHz 'c7g', # AWS Graviton3 Processor 2.6 GHz @@ -396,61 +396,28 @@ def DEFAULT_OS(config): 'x2iezn', # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB - 'u', + 'u.*', #'u-6tb1', # Intel Xeon Scalable (Skylake) 6 TB #'u-9tb1', # Intel Xeon Scalable (Skylake) 9 TB #'u-12tb1', # Intel Xeon Scalable (Skylake) 12 TB ] -old_eda_instance_families = [ - 'c5', # Mixed depending on size - 'c5a', # AMD EPYC 7R32 3.3 GHz - 'c5ad', # AMD EPYC 7R32 3.3 GHz - 'c6a', - 'c6ad', - 'c6i', # Intel Xeon 8375C (Ice Lake) 3.5 GHz - 'c6id', - 'c6g', # AWS Graviton2 Processor 2.5 GHz - 'c6gd', # AWS Graviton2 Processor 2.5 GHz - 'f1', # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - 'm5', # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - 'm5d', # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - 'm5a', # AMD EPYC 7571 2.5 GHz - 'm5ad', # AMD EPYC 7571 2.5 GHz - 'm5zn', # Intel Xeon Platinum 8252 4.5 GHz - 'm6a', # AMD EPYC 7R13 Processor 3.6 GHz - 'm6ad', - 'm6i', # Intel Xeon 8375C (Ice Lake) 3.5 GHz - 'm6id', - 'm6g', # AWS Graviton2 Processor 2.5 GHz - 'm6gd', # AWS Graviton2 Processor 2.5 GHz - 'r5', # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - 'r5d', # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - 'r5b', # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz - 'r5a', # AMD EPYC 7571 2.5 GHz - 'r5ad', # AMD EPYC 7571 2.5 GHz - 'r6a', - 'r6i', # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB - 'r6id', - 'r6g', # AWS Graviton2 Processor 2.5 GHz - 'r6gd', # AWS Graviton2 Processor 2.5 GHz - 'x1', # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB - 'x1e', # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB - 'x2gd', # AWS Graviton2 Processor 2.5 GHz 1TB - 'x2idn', # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB - 'x2iedn', # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB - 'x2iezn', # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB - 'z1d', # Intel Xeon Platinum 8151 4.0 GHz 
+default_included_instance_families = [ + '.*' ] -default_eda_instance_types = [ +default_included_eda_instance_types = [ #'c5\.(l|x|2|4|9|18).*', # Intel Xeon Platinum 8124M 3.4 GHz #'c5\.(12|24).*', # Intel Xeon Platinum 8275L 3.6 GHz #'c5d\.(l|x|2|4|9|18).*', # Intel Xeon Platinum 8124M 3.4 GHz #'c5d\.(12|24).*', # Intel Xeon Platinum 8275L 3.6 GHz ] -default_excluded_instance_families = [ +default_included_instance_types = [] + +default_excluded_instance_families = [] + +default_excluded_eda_instance_families = [ 'a1', # Graviton 1 'c4', # Replaced by c5 'd2', # SSD optimized @@ -469,8 +436,9 @@ def DEFAULT_OS(config): 'x1e', ] -default_excluded_instance_types = [ - '.+\.(micro|nano)', # Not enough memory +default_excluded_instance_types = [] + +default_excluded_eda_instance_types = [ '.*\.metal.*', # Reduce the number of selected instance types to 25. @@ -727,25 +695,28 @@ def get_config_schema(config): # Configure the instances used by the cluster # A partition will be created for each combination of Base OS, Architecture, and Spot 'InstanceConfig': { + # UseOnDemand: + # Configure on-demand instances + Optional('UseOnDemand', default=True): bool, # UseSpot: # Configure spot instances Optional('UseSpot', default=True): bool, # Include*/Exclude*: # Instance families and types are regular expressions with implicit '^' and '$' at the begining and end. - # Exclude patterns are processed first and take precesdence over any includes. + # Exclude patterns are processed first and take precedence over any includes. # An empty list is the same as '.*'. 
- Optional('Exclude', default={'InstanceFamilies': default_excluded_instance_families, 'InstanceTypes': default_excluded_instance_types}): { - Optional('InstanceFamilies', default=default_excluded_instance_families): [str], - Optional('InstanceTypes', default=default_excluded_instance_types): [str] + Optional('Exclude', default={}): { + Optional('InstanceFamilies'): [str], + Optional('InstanceTypes'): [str] }, - Optional('Include', default={'MaxSizeOnly': False, 'InstanceFamilies': default_eda_instance_families, 'InstanceTypes': default_eda_instance_types}): { + Optional('Include', default={'MaxSizeOnly': False}): { # MaxSizeOnly: # If MaxSizeOnly is True then only the largest instance type in # a family will be included unless specific instance types are included. # Default: false Optional('MaxSizeOnly', default=False): bool, - Optional('InstanceFamilies', default=default_eda_instance_families): [str], - Optional('InstanceTypes', default=default_eda_instance_types): [str] + Optional('InstanceFamilies'): [str], + Optional('InstanceTypes'): [str] }, 'NodeCounts': { Optional('DefaultMinCount', default=0): And(int, lambda s: s >= 0), diff --git a/source/requirements.txt b/source/requirements.txt index 60b01311..6692894a 100644 --- a/source/requirements.txt +++ b/source/requirements.txt @@ -10,5 +10,5 @@ pytest python-hostlist pip requests -PyYAML>=5.4.1 +PyYAML>5.4.1 schema