From e880fc86f4d9e5bb8294544ca161a2aca9e78575 Mon Sep 17 00:00:00 2001 From: Allan Carter Date: Mon, 23 May 2022 16:53:09 -0500 Subject: [PATCH] Update example configs and add comments to the config schema (#28) Resolves [Feature #25](https://github.com/aws-samples/aws-eda-slurm-cluster/issues/25) --- source/cdk/cdk_slurm_stack.py | 4 +- source/cdk/config_schema.py | 253 +++++++++++++++--- source/resources/config/default_config.yml | 4 +- .../config/slurm_all_instance_types.yml | 7 +- source/resources/config/slurm_all_os.yml | 8 +- source/resources/config/slurm_alma_linux.yml | 8 +- source/resources/config/slurm_eda.yml | 40 +-- source/resources/config/slurm_eda_az1.yml | 43 +-- source/resources/config/slurm_eda_az2.yml | 56 ++-- source/resources/config/slurm_eda_az3.yml | 54 ++-- .../resources/config/slurm_elasticsearch.yml | 58 ++-- source/resources/config/slurm_fpga_dev.yml | 61 ++--- source/resources/config/slurm_lustre.yml | 11 +- source/resources/config/slurm_ontap.yml | 36 +++ source/resources/config/slurm_rocky_linux.yml | 12 +- source/resources/config/slurm_zfs.yml | 37 +++ source/slurm_installer/installer.py | 2 +- 17 files changed, 481 insertions(+), 213 deletions(-) create mode 100644 source/resources/config/slurm_ontap.yml create mode 100644 source/resources/config/slurm_zfs.yml diff --git a/source/cdk/cdk_slurm_stack.py b/source/cdk/cdk_slurm_stack.py index ed95f4d5..3d6c59bb 100644 --- a/source/cdk/cdk_slurm_stack.py +++ b/source/cdk/cdk_slurm_stack.py @@ -168,7 +168,7 @@ def get_config(self, context_var, default_path): from schema import SchemaError region = self.node.try_get_context('region') try: - config_parameters = check_schema(config_parameters, [region]) + config_parameters = check_schema(config_parameters) except SchemaError: logger.exception(f"Invalid config file: {config_file_path}") exit(1) @@ -378,7 +378,7 @@ def check_config(self): from config_schema import check_schema from schema import SchemaError try: - validated_config = 
check_schema(self.config, [self.config['Region']]) + validated_config = check_schema(self.config) except SchemaError: logger.exception(f"Invalid config") exit(1) diff --git a/source/cdk/config_schema.py b/source/cdk/config_schema.py index 05c14c3d..963e2dd9 100644 --- a/source/cdk/config_schema.py +++ b/source/cdk/config_schema.py @@ -16,11 +16,23 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import boto3 +from botocore.client import ClientError +from os import environ import re from schema import Schema, And, Use, Optional, Regex, SchemaError +from sys import exit config = {} -valid_regions = [] + +# Determine all AWS regions available on the account. We do not display opt-out region +default_region = environ.get("AWS_DEFAULT_REGION", "us-east-1") +ec2_client = boto3.client("ec2", region_name=default_region) +try: + valid_regions = [region["RegionName"] for region in ec2_client.describe_regions()["Regions"]] +except ClientError as err: + logger.error(f"{fg('red')}Unable to list all AWS regions. Make sure you have set your IAM credentials. 
{err} {attr('reset')}") + exit(1) filesystem_lifecycle_policies = [ 'None', @@ -31,64 +43,155 @@ 'AFTER_90_DAYS' ] -default_storage = { - 'provider': 'efs', - 'removal_policy': 'RETAIN', - 'efs': { - 'enable_automatic_backups': False, - 'lifecycle_policy': 'AFTER_30_DAYS', - 'use_efs_helper': False, - 'throughput_mode': 'BURSTING', - 'performance_mode': 'GENERAL_PURPOSE', - 'encrypted': True - }, -} +eda_instance_families = [ + #'c5', # Mixed depending on size + 'c5a', # AMD EPYC 7R32 3.3 GHz + #'c5ad', # AMD EPYC 7R32 3.3 GHz + 'c6i', # Intel Xeon 8375C (Ice Lake) 3.5 GHz + 'c6g', # AWS Graviton2 Processor 2.5 GHz + #'c6gd', # AWS Graviton2 Processor 2.5 GHz + #'f1', # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz + 'm5', # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz + #'m5d', # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz + 'm5a', # AMD EPYC 7571 2.5 GHz + #'m5ad', # AMD EPYC 7571 2.5 GHz + 'm5zn', # Intel Xeon Platinum 8252 4.5 GHz + 'm6a', # AMD EPYC 7R13 Processor 3.6 GHz + 'm6i', # Intel Xeon 8375C (Ice Lake) 3.5 GHz + 'm6g', # AWS Graviton2 Processor 2.5 GHz + #'m6gd', # AWS Graviton2 Processor 2.5 GHz + 'r5', # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz + 'r5d', # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz + #'r5b', # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz + 'r5a', # AMD EPYC 7571 2.5 GHz + 'r5ad', # AMD EPYC 7571 2.5 GHz + 'r6i', # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB + 'r6g', # AWS Graviton2 Processor 2.5 GHz + #'r6gd', # AWS Graviton2 Processor 2.5 GHz + #'x1', # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB + #'x1e', # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB + 'x2gd', # AWS Graviton2 Processor 2.5 GHz 1TB + 'x2idn', # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB + 'x2iedn', # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB + 'x2iezn', # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB + 'z1d', # Intel Xeon Platinum 8151 4.0 GHz + #'u-6tb1', # Intel Xeon Scalable (Skylake) 6 TB + #'u-9tb1', # Intel Xeon Scalable (Skylake) 9 TB + 
#'u-12tb1', # Intel Xeon Scalable (Skylake) 12 TB +] + +eda_instance_types = [ + #'c5\.(l|x|2|4|9|18).*', # Intel Xeon Platinum 8124M 3.4 GHz + 'c5\.(12|24).*', # Intel Xeon Platinum 8275L 3.6 GHz + #'c5d\.(l|x|2|4|9|18).*', # Intel Xeon Platinum 8124M 3.4 GHz + #'c5d\.(12|24).*', # Intel Xeon Platinum 8275L 3.6 GHz +] +# The config file is used in the installer and the CDK app. +# Some configuration values are required in the CDK app but are optional so that they can be set by the installer. config_schema = Schema( { + # termination_protection: + # Enable Cloudformation Stack termination protection Optional('termination_protection', default=True): bool, - Optional('StackName'): str, + # Optional so can be specified on the command-line + Optional('StackName', default='slurm'): str, + # Optional so can be specified on the command-line Optional('Region'): And(str, lambda s: s in valid_regions), + # Optional so can be specified on the command-line Optional('SshKeyPair'): str, + # Optional so can be specified on the command-line Optional('VpcId'): And(str, lambda s: re.match('vpc-', s)), + # + # SubnetId + # Optional. If not specified then the first private subnet is chosen. Optional('SubnetId'): And(str, lambda s: re.match('subnet-', s)), + # Optional, but highly recommended Optional('ErrorSnsTopicArn'): str, + # + # Domain: + # Domain name for the Route 53 private hosted zone that will be used + # by the slurm cluster for DNS. + # By default will be {StackName}.local + # Alternately, provide HostedZoneId of an existing Route53 hosted zone to use. + # Cannot specify both Domain and HostedZoneId. Optional('Domain'): str, + # + # HostedZoneId: + # ID of an existing hosted zone that will be used by the slurm cluster for DNS. + # Alternately, provide Domain name to use for a new Route53 hosted zone to use. + # Cannot specify both Domain and HostedZoneId. 
Optional('HostedZoneId'): str, Optional('TimeZone', default='US/Central'): str, 'slurm': { + # SlurmVersion: + # Latest tested version + # Critical security fix released in 21.08.8. Must be later than that. Optional('SlurmVersion', default='21.08.8'): str, + # + # ClusterName: + # Default to the StackName Optional('ClusterName'): str, - Optional('MungeKeySsmParameter', default='/slurm/munge_key'): str, # Will be created if it doesn't exist. + # + # MungeKeySsmParameter: + # SSM String Parameter with a base64 encoded munge key to use for the cluster. + # Required if your submitters need to use more than 1 cluster. + # Will be created if it doesn't exist to save the value in Parameter Store. + Optional('MungeKeySsmParameter', default='/slurm/munge_key'): str, + # + # SlurmCtl: + # Required, but can be an empty dict to accept all of the defaults 'SlurmCtl': { + # NumberOfControllers + # For high availability configure multiple controllers Optional('NumberOfControllers', default=1): And(Use(int), lambda n: 1 <= n <= 3), Optional('BaseHostname', default='slurmctl'): str, - Optional('architecture', default='arm64'): str, + Optional('architecture', default='arm64'): And(str, lambda s: s in ['arm64', 'x86_64']), Optional('instance_type', default='c6g.large'): str, Optional('volume_size', default=200): int, - Optional('SuspendAction', default='stop'): str, + # + # SuspendAction: + # Set to stop or terminate. + # Stopped nodes will restart quicker, but you will continue to be charged for the EBS volumes + # attached to the instance. + Optional('SuspendAction', default='stop'): And(str, lambda s: s in ['stop', 'terminate']), + # + # MaxStoppedDuration: + # In ISO 8601 duration format: https://en.wikipedia.org/wiki/ISO_8601#Durations + # Default: 1 hour = P0Y0M0DT1H0M0S + # Evaluated at least hourly by cron job. 
Optional('MaxStoppedDuration', default='P0Y0M0DT1H0M0S'): str, Optional('CloudWatchPeriod', default=5): int, Optional('PreemptMode', default='REQUEUE'): And(str, lambda s: s in ['OFF', 'CANCEL', 'GANG', 'REQUEUE', 'SUSPEND']), Optional('PreemptType', default='preempt/partition_prio'): And(str, lambda s: s in ['preempt/none', 'preempt/partition_prio', 'preempt/qos']), Optional('PreemptExemptTime', default='0'): str, }, - Optional('ExistingSlurmDbd'): { - Optional('UseSlurmDbd', default=True): bool, - Optional('StackName'): str, - Optional('SecurityGroup'): {str: And(str, lambda s: re.match('sg-', s))}, - Optional('HostnameFQDN'): str, - }, + # + # The accounting database is required to enable fairshare scheduling + # It is managed by the Slurm Database Daemon (slurmdbd) instance + # The instance can be created as part of the cluster or can use an existing instance in a federation of clusters. + # + # SlurmDbd: + # It is recommended to get the basic cluster configured and working before enabling the accounting database Optional('SlurmDbd'): { Optional('UseSlurmDbd', default=True): bool, Optional('Hostname', default='slurmdbd'): str, - Optional('architecture', default='arm64'): str, + Optional('architecture', default='arm64'): And(str, lambda s: s in ['arm64', 'x86_64']), Optional('instance_type', default='m6g.large'): str, Optional('volume_size', default=200): int, Optional('database', default={'port': 3306}): { 'port': int, } }, + # + # ExistingSlurmDbd: + # Used for federated clusters that must share a common slurmdbd instance. + Optional('ExistingSlurmDbd'): { + Optional('UseSlurmDbd', default=True): bool, + Optional('StackName'): str, + Optional('SecurityGroup'): {str: And(str, lambda s: re.match('sg-', s))}, + Optional('HostnameFQDN'): str, + }, Optional('Federation'): { 'Name': str, Optional('FederatedClusterStackNames'): [str], @@ -104,10 +207,14 @@ 'x86_64': str, 'arm64': str, }, + # + # BaseAmis: + # Customized AMIs with file system mounts, packages, etc. 
configured. + # If these aren't defined then the generic base AMIs are used. Optional('BaseAmis'): { - str: { # region - str: { # distribution - int: { # distribution_major_version + And(str, lambda s: s in valid_regions): { # region + And(str, lambda s: s in ['AlmaLinux', 'Amazon', 'CentOS', 'RedHat', 'Rocky']): { # Distribution + And(int, lambda n: n in [2, 7, 8]): { # distribution_major_version str: { # architecture 'ImageId': And(str, lambda s: re.match('ami-', s)), Optional('RootDeviceSize'): str, @@ -117,20 +224,44 @@ } }, }, + # + # SubmitterSecurityGroupIds: + # External security groups that should be able to use the cluster Optional('SubmitterSecurityGroupIds'): {str: str}, Optional('SubmitterInstanceTags'): {str: [str]}, + # + # InstanceConfig: + # Configure the instances used by the cluster + # A partition will be created for each combination of Base OS, Architecture, and Spot 'InstanceConfig': { + # UseSpot: + # Configure spot instances Optional('UseSpot', default=True): bool, + # + # DefaultPartition: + # By default this will be the first OS/Architecture listed in BaseOsArchitecture. + # Add '_spot' to the end to make spot the default purchase option. 'DefaultPartition': str, + # + # NodesPerInstanceType: + # The number of nodes that will be defined for each instance type. 'NodesPerInstanceType': int, 'BaseOsArchitecture': { - str: { # distribution - int: [ # distribution_major_version - str # architecture + And(str, lambda s: s in ['AlmaLinux', 'Amazon', 'CentOS', 'RedHat', 'Rocky']): { # Distribution + And(int, lambda n: n in [2, 7, 8]): [ # distribution_major_version + And(str, lambda s: s in ['x86_64', 'arm64']) # architecture ] } }, + # Include*/Exclude*: + # Instance families and types are regular expressions with implicit '^' and '$' at the beginning and end. + # Exclude patterns are processed first and take precedence over any includes. + # An empty list is the same as '.*'. 
'Include': { + # MaxSizeOnly: + # If MaxSizeOnly is True then only the largest instance type in + # a family will be included unless specific instance types are included. + # Default: false Optional('MaxSizeOnly', default=False): bool, 'InstanceFamilies': [str], 'InstanceTypes': [str] @@ -151,6 +282,10 @@ Optional('Partition', default='onprem'): str, } }, + # + # ElasticSearch: + # Configure the ElasticSearch/OpenSearch domain used by the slurm cluster + # If not specified then won't be created or used by the cluster. Optional('ElasticSearch'): { Optional('ebs_volume_size', default=20): int, Optional('ebs_volume_type', default='GP2'): str, @@ -158,33 +293,75 @@ Optional('number_of_azs', default=2): int, Optional('master_nodes', default=2): int, Optional('master_node_instance_type', default='m5.large.search'): str, + # + # data_nodes: + # Must be a multiple of number_of_azs Optional('data_nodes', default=1): int, Optional('data_node_instance_type', default='m5.large.search'): str, Optional('warm_nodes', default=0): int, Optional('warm_instance_type', default='ultrawarm.medium.search'): str, }, + # + # JobCompType: + # Job completion database type. + # This is independent and separate from the slurmdbd results database and has less information. Optional('JobCompType', default='jobcomp/filetxt'): And(str, lambda s: s in ('jobcomp/none', 'jobcomp/elasticsearch', 'jobcomp/filetxt')), + # + # JobCompLoc: + # Used with jobcomp/elasticsearch + # A complete URL endpoint with format ://_doc + # http://{{EsDomain}}.{{Region}}.es.amazonaws.com/slurm/_doc Optional('JobCompLoc'): str, Optional('SlurmUid', default=900): int, 'storage': { + # + # mount_path: + # Default is /opt/slurm/{{cluster_name}} Optional('mount_path'): str, Optional('provider', default='efs'): And(str, lambda s: s in ('efs', 'lustre', 'ontap', 'zfs')), + # + # removal_policy: + # RETAIN will preserve the EFS even if you delete the stack. 
+ # Any other value will delete EFS if you delete the CFN stack Optional('removal_policy', default='RETAIN'): And(str, lambda s: s in ('DESTROY', 'RETAIN', 'SNAPSHOT')), Optional('kms_key_arn'): str, Optional('efs'): { Optional('enable_automatic_backups', default=False): bool, + # + # lifecycle_policy + # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-efs-filesystem-lifecyclepolicy.html Optional('lifecycle_policy', default='AFTER_30_DAYS'): And(str, lambda s: s in filesystem_lifecycle_policies), Optional('use_efs_helper', default=False): bool, Optional('throughput_mode', default='BURSTING'): And(str, lambda s: s in ('BURSTING', 'PROVISIONED')), + # + # provisioned_throughput_per_second: + # In MiB/s. Minimum value of 1 Optional('provisioned_throughput_per_second'): int, Optional('performance_mode', default='GENERAL_PURPOSE'): And(str, lambda s: s in ('GENERAL_PURPOSE', 'MAX_IO')), + # + # encrypted + # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-efs-filesystem.html#cfn-efs-filesystem-encrypted Optional('encrypted', default=True): bool, }, Optional('lustre'): { + # deployment_type + # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-fsx-filesystem-lustreconfiguration.html#cfn-fsx-filesystem-lustreconfiguration-deploymenttype Optional('deployment_type', default='SCRATCH_2'): And(str, lambda s: s in ('PERSISTENT_1', 'SCRATCH_1', 'SCRATCH_2')), + # + # drive_cache_type + # Required when storage_type is HDD. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-fsx-filesystem-lustreconfiguration.html#cfn-fsx-filesystem-lustreconfiguration-drivecachetype Optional('drive_cache_type', default='NONE'): And(str, lambda s: s in ('NONE', 'READ')), + # + # per_unit_storage_throughput + # Required for the PERSISTENT_1 deployment_type. 
https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-fsx-filesystem-lustreconfiguration.html#cfn-fsx-filesystem-lustreconfiguration-perunitstoragethroughput Optional('per_unit_storage_throughput', default=50): int, + # + # storage_capacity + # For SCRATCH_2 and PERSISTENT_1 types, valid values are 1,200, 2,400, then continuing in increments of 2,400 GiB. For SCRATCH_1 deployment types, valid values are 1,200, 2,400, 3,600, then continuing in increments of 3,600 GiB. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-fsx-filesystem.html#cfn-fsx-filesystem-storagecapacity Optional('storage_capacity', default=1200): int, + # + # storage_type + # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-fsx-filesystem.html#cfn-fsx-filesystem-storagetype Optional('storage_type'): And(str, lambda s: s in ('HDD', 'SSD')), }, Optional('ontap'): { @@ -201,6 +378,11 @@ Optional('throughput_capacity', default=64): And(int, lambda s: s in [64, 128, 256, 512, 1024, 2048, 3072, 4096]), Optional('data_compression_type', default='ZSTD'): And(str, lambda s: s in ('NONE', 'ZSTD', 'LZ4')), }, + # + # ExtraMounts + # Additional mounts for compute nodes + # This example shows SOCA EFS file systems. + # This is required so the compute node has the same file structure as the remote desktops. Optional('ExtraMounts', default=[]): [ { 'dest': str, @@ -212,9 +394,10 @@ }, }, Optional('AmiMap', default={}): { - str: { # Region - str: { # Distribution - int: { # distribution_major_version + #str: { # Region + And(str, lambda s: s in valid_regions, error=f"Invalid region. 
valid_regions={valid_regions}"): { # Region + And(str, lambda s: s in ['AlmaLinux', 'Amazon', 'CentOS', 'RedHat', 'Rocky']): { # Distribution + And(int, lambda n: n in [2, 7, 8]): { # distribution_major_version And(str, lambda s: s in ['x86_64', 'arm64']): { 'ImageId': str, 'RootDeviceName': str @@ -226,11 +409,9 @@ } ) -def check_schema(config_in, regions): +def check_schema(config_in): # Validate config against schema global config - global valid_regions config = config_in - valid_regions = regions validated_config = config_schema.validate(config) return validated_config diff --git a/source/resources/config/default_config.yml b/source/resources/config/default_config.yml index 622ad579..ade26c30 100644 --- a/source/resources/config/default_config.yml +++ b/source/resources/config/default_config.yml @@ -2,12 +2,10 @@ #==================================================================== # Sample configuraton that creates a minimal Slurm cluster # -# Shows all available configuration options -# Note that CentOS 8 has been discontinued and support has been removed. -# Uses arm64 architecture for SlurmCtl and SlurmDbd by default. # No SlurmDbd in this configuration. # # Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. #==================================================================== StackName: slurmminimal diff --git a/source/resources/config/slurm_all_instance_types.yml b/source/resources/config/slurm_all_instance_types.yml index 85667e3b..58c52f35 100644 --- a/source/resources/config/slurm_all_instance_types.yml +++ b/source/resources/config/slurm_all_instance_types.yml @@ -1,5 +1,10 @@ --- +#==================================================================== # Create a minimal cluster with all instance types +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. 
+#==================================================================== StackName: slurmalltypes @@ -41,4 +46,4 @@ slurm: # Use defaults from schema SlurmCtl: {} - storage: {'efs': {}} + storage: {'zfs': {}} diff --git a/source/resources/config/slurm_all_os.yml b/source/resources/config/slurm_all_os.yml index ae85ceb8..1856d158 100644 --- a/source/resources/config/slurm_all_os.yml +++ b/source/resources/config/slurm_all_os.yml @@ -1,6 +1,12 @@ --- +#==================================================================== # Slurm cluster with all supported OS distributions and versions. +# # Note that CentOS 8 has been discontinued and support has been removed. +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. +#==================================================================== StackName: slurmallos @@ -34,4 +40,4 @@ slurm: # Use defaults from schema SlurmCtl: {} - storage: {'efs': {}} + storage: {'zfs': {}} diff --git a/source/resources/config/slurm_alma_linux.yml b/source/resources/config/slurm_alma_linux.yml index cfdce47e..9d6e2210 100644 --- a/source/resources/config/slurm_alma_linux.yml +++ b/source/resources/config/slurm_alma_linux.yml @@ -1,5 +1,11 @@ --- +#==================================================================== # Slurm cluster to test Alma Linux support +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. 
+#==================================================================== + StackName: slurmalma @@ -26,4 +32,4 @@ slurm: # Use defaults from schema SlurmCtl: {} - storage: {'efs': {}} + storage: {'zfs': {}} diff --git a/source/resources/config/slurm_eda.yml b/source/resources/config/slurm_eda.yml index 2c07c782..5340da18 100644 --- a/source/resources/config/slurm_eda.yml +++ b/source/resources/config/slurm_eda.yml @@ -1,26 +1,26 @@ --- +#==================================================================== # Slurm cluster for EDA +# # Redundant controllers and typical instances used by EDA. # Uses CentOS 7 and AlmaLinux 8 and both x86_64 and arm64 architectures. +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. +#==================================================================== StackName: slurmeda slurm: - # High level configuration - - SlurmVersion: "21.08.5" - SlurmCtl: # For high availability configure multiple controllers NumberOfControllers: 2 # The accounting database is required to enable fairshare scheduling # It is managed by the Slurm Database Daemon (slurmdbd) instance - # This instance can be created as part of the cluster or can use an existing instance. 
SlurmDbd: {} - # InstanceConfig: - # Configure the instances used by the cluster + # Configure typical EDA instance types # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true @@ -33,20 +33,21 @@ slurm: Include: MaxSizeOnly: false InstanceFamilies: - - 'c5' # Mixed depending on size + #- 'c5' # Mixed depending on size #- 'c5a' # AMD EPYC 7R32 3.3 GHz #- 'c5ad' # AMD EPYC 7R32 3.3 GHz - #- 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz + - 'c6a' # AMD EPYC 7R13 Processor 3.6 GHz + - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - 'c6g' # AWS Graviton2 Processor 2.5 GHz #- 'c6gd' # AWS Graviton2 Processor 2.5 GHz #- 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - - 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz + #- 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz #- 'm5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz #- 'm5a' # AMD EPYC 7571 2.5 GHz #- 'm5ad' # AMD EPYC 7571 2.5 GHz - #- 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz - #- 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz - #- 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz + - 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz + - 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz + - 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - 'm6g' # AWS Graviton2 Processor 2.5 GHz #- 'm6gd' # AWS Graviton2 Processor 2.5 GHz #- 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz @@ -54,13 +55,16 @@ slurm: #- 'r5b' # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz #- 'r5a' # AMD EPYC 7571 2.5 GHz #- 'r5ad' # AMD EPYC 7571 2.5 GHz - #- 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB - #- 'r6g' # AWS Graviton2 Processor 2.5 GHz + - 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB + - 'r6g' # AWS Graviton2 Processor 2.5 GHz #- 'r6gd' # AWS Graviton2 Processor 2.5 GHz #- 'x1' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB #- 'x1e' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB - #- 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB - #- 'z1d' # Intel Xeon Platinum 8151 4.0 
GHz + - 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB + - 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB + - 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB + - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB + - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz #- 'u-6tb1' # Intel Xeon Scalable (Skylake) 6 TB #- 'u-9tb1' # Intel Xeon Scalable (Skylake) 9 TB #- 'u-12tb1' # Intel Xeon Scalable (Skylake) 12 TB @@ -75,4 +79,4 @@ slurm: - '.*\.metal' # Use defaults from schema - storage: {'efs': {}} + storage: {'zfs': {}} diff --git a/source/resources/config/slurm_eda_az1.yml b/source/resources/config/slurm_eda_az1.yml index 5bf27c53..33b1f4fe 100644 --- a/source/resources/config/slurm_eda_az1.yml +++ b/source/resources/config/slurm_eda_az1.yml @@ -1,33 +1,38 @@ --- +#==================================================================== # Federated Slurm cluster for EDA # # This is the first AZ that other AZs will reference. # Other federated clusters will share this cluster's SlurmDbd instance. # Redundant controllers and typical instances used by EDA. # Uses CentOS 7 and AlmaLinux 8 and both x86_64 and arm64 architectures. +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. +#==================================================================== StackName: slurmedaaz1 -#SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 +# Add your subnet id +SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 -slurm: - #MungeKeySsmParameter: "/slurm/munge_key" +# This is optional, but highly recommended +#ErrorSnsTopicArn: arn:aws:sns:{{region}}:{AccountId}:{TopicName} +slurm: SlurmCtl: # For high availability configure multiple controllers NumberOfControllers: 2 # The accounting database is required to enable fairshare scheduling # It is managed by the Slurm Database Daemon (slurmdbd) instance - # This instance can be created as part of the cluster or can use an existing instance. 
SlurmDbd: {} Federation: Name: slurmeda FederatedClusterStackNames: [] - # InstanceConfig: - # Configure the instances used by the cluster + # Configure typical EDA instance types # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true @@ -40,20 +45,21 @@ slurm: Include: MaxSizeOnly: false InstanceFamilies: - - 'c5' # Mixed depending on size + #- 'c5' # Mixed depending on size #- 'c5a' # AMD EPYC 7R32 3.3 GHz #- 'c5ad' # AMD EPYC 7R32 3.3 GHz - #- 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz + - 'c6a' # AMD EPYC 7R13 Processor 3.6 GHz + - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - 'c6g' # AWS Graviton2 Processor 2.5 GHz #- 'c6gd' # AWS Graviton2 Processor 2.5 GHz #- 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - - 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz + #- 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz #- 'm5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz #- 'm5a' # AMD EPYC 7571 2.5 GHz #- 'm5ad' # AMD EPYC 7571 2.5 GHz - #- 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz - #- 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz - #- 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz + - 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz + - 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz + - 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - 'm6g' # AWS Graviton2 Processor 2.5 GHz #- 'm6gd' # AWS Graviton2 Processor 2.5 GHz #- 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz @@ -61,13 +67,16 @@ slurm: #- 'r5b' # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz #- 'r5a' # AMD EPYC 7571 2.5 GHz #- 'r5ad' # AMD EPYC 7571 2.5 GHz - #- 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB - #- 'r6g' # AWS Graviton2 Processor 2.5 GHz + - 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB + - 'r6g' # AWS Graviton2 Processor 2.5 GHz #- 'r6gd' # AWS Graviton2 Processor 2.5 GHz #- 'x1' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB #- 'x1e' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB - #- 'x2gd' # AWS Graviton2 
Processor 2.5 GHz 1TB - #- 'z1d' # Intel Xeon Platinum 8151 4.0 GHz + - 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB + - 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB + - 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB + - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB + - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz #- 'u-6tb1' # Intel Xeon Scalable (Skylake) 6 TB #- 'u-9tb1' # Intel Xeon Scalable (Skylake) 9 TB #- 'u-12tb1' # Intel Xeon Scalable (Skylake) 12 TB @@ -82,4 +91,4 @@ slurm: - '.*\.metal' # Use defaults from schema - storage: {'efs': {}} + storage: {'zfs': {}} diff --git a/source/resources/config/slurm_eda_az2.yml b/source/resources/config/slurm_eda_az2.yml index 5fa3d011..4f888085 100644 --- a/source/resources/config/slurm_eda_az2.yml +++ b/source/resources/config/slurm_eda_az2.yml @@ -1,28 +1,30 @@ --- +#==================================================================== # Federated Slurm cluster for EDA # # This is the 2nd AZ that must be created after the 1st cluster. # Shares the SlurmDbd instance from the 1st AZ's cluster. # Redundant controllers and typical instances used by EDA. # Uses CentOS 7 and AlmaLinux 8 and both x86_64 and arm64 architectures. +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. 
+#==================================================================== StackName: slurmedaaz2 -#SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 +# Add your subnet id +SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 # This is optional, but highly recommended #ErrorSnsTopicArn: arn:aws:sns:{{region}}:{AccountId}:{TopicName} slurm: - #MungeKeySsmParameter: "/slurm/munge_key" - SlurmCtl: # For high availability configure multiple controllers NumberOfControllers: 2 - # The accounting database is required to enable fairshare scheduling - # It is managed by the Slurm Database Daemon (slurmdbd) instance - # This instance can be created as part of the cluster or can use an existing instance. + # Re-use the SlurmDbd instance from slurmedaaz1 ExistingSlurmDbd: StackName: slurmedaaz1 @@ -31,8 +33,7 @@ slurm: FederatedClusterStackNames: - slurmedaaz1 - # InstanceConfig: - # Configure the instances used by the cluster + # Configure typical EDA instance types # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true @@ -45,40 +46,43 @@ slurm: Include: MaxSizeOnly: false InstanceFamilies: - - 'c5' # Mixed depending on size - #- 'c5a' # AMD EPYC 7R32 3.3 GHz + #- 'c5' # Mixed depending on size + - 'c5a' # AMD EPYC 7R32 3.3 GHz #- 'c5ad' # AMD EPYC 7R32 3.3 GHz - #- 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz + - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - 'c6g' # AWS Graviton2 Processor 2.5 GHz #- 'c6gd' # AWS Graviton2 Processor 2.5 GHz #- 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz #- 'm5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5a' # AMD EPYC 7571 2.5 GHz + - 'm5a' # AMD EPYC 7571 2.5 GHz #- 'm5ad' # AMD EPYC 7571 2.5 GHz - #- 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz - #- 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz - #- 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz + - 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz + - 'm6a' # AMD EPYC 7R13 Processor 
3.6 GHz + - 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - 'm6g' # AWS Graviton2 Processor 2.5 GHz #- 'm6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz + - 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz + - 'r5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz #- 'r5b' # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz - #- 'r5a' # AMD EPYC 7571 2.5 GHz - #- 'r5ad' # AMD EPYC 7571 2.5 GHz - #- 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB - #- 'r6g' # AWS Graviton2 Processor 2.5 GHz + - 'r5a' # AMD EPYC 7571 2.5 GHz + - 'r5ad' # AMD EPYC 7571 2.5 GHz + - 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB + - 'r6g' # AWS Graviton2 Processor 2.5 GHz #- 'r6gd' # AWS Graviton2 Processor 2.5 GHz #- 'x1' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB #- 'x1e' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB - #- 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB - #- 'z1d' # Intel Xeon Platinum 8151 4.0 GHz + - 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB + - 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB + - 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB + - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB + - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz #- 'u-6tb1' # Intel Xeon Scalable (Skylake) 6 TB #- 'u-9tb1' # Intel Xeon Scalable (Skylake) 9 TB #- 'u-12tb1' # Intel Xeon Scalable (Skylake) 12 TB - InstanceTypes: [] + InstanceTypes: #- 'c5\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - #- 'c5\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz + - 'c5\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz #- 'c5d\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz #- 'c5d\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz Exclude: @@ -87,4 +91,4 @@ slurm: - '.*\.metal' # Use defaults from schema - storage: {'efs': {}} + storage: {'zfs': {}} diff --git a/source/resources/config/slurm_eda_az3.yml b/source/resources/config/slurm_eda_az3.yml 
index 18256328..ca0d2a3b 100644 --- a/source/resources/config/slurm_eda_az3.yml +++ b/source/resources/config/slurm_eda_az3.yml @@ -1,28 +1,30 @@ --- +#==================================================================== # Federated Slurm cluster for EDA # # This is the 3rd AZ that must be created after the 1st AZ and 2nd clusters. # Shares the SlurmDbd instance from the 1st AZ's cluster. # Redundant controllers and typical instances used by EDA. # Uses CentOS 7 and AlmaLinux 8 and both x86_64 and arm64 architectures. +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. +#==================================================================== StackName: slurmedaaz3 +# Add your subnet id SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 # This is optional, but highly recommended #ErrorSnsTopicArn: arn:aws:sns:{{region}}:{AccountId}:{TopicName} slurm: - #MungeKeySsmParameter: "/slurm/munge_key" - SlurmCtl: # For high availability configure multiple controllers NumberOfControllers: 2 - # The accounting database is required to enable fairshare scheduling - # It is managed by the Slurm Database Daemon (slurmdbd) instance - # This instance can be created as part of the cluster or can use an existing instance. 
+ # Re-use the SlurmDbd instance from slurmedaaz1 ExistingSlurmDbd: StackName: slurmedaaz1 @@ -32,8 +34,7 @@ slurm: - slurmedaaz1 - slurmedaaz2 - # InstanceConfig: - # Configure the instances used by the cluster + # Configure typical EDA instance types # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true @@ -46,40 +47,43 @@ slurm: Include: MaxSizeOnly: false InstanceFamilies: - - 'c5' # Mixed depending on size - #- 'c5a' # AMD EPYC 7R32 3.3 GHz + #- 'c5' # Mixed depending on size + - 'c5a' # AMD EPYC 7R32 3.3 GHz #- 'c5ad' # AMD EPYC 7R32 3.3 GHz - #- 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz + - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - 'c6g' # AWS Graviton2 Processor 2.5 GHz #- 'c6gd' # AWS Graviton2 Processor 2.5 GHz #- 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz #- 'm5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5a' # AMD EPYC 7571 2.5 GHz + - 'm5a' # AMD EPYC 7571 2.5 GHz #- 'm5ad' # AMD EPYC 7571 2.5 GHz - #- 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz - #- 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz - #- 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz + - 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz + - 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz + - 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - 'm6g' # AWS Graviton2 Processor 2.5 GHz #- 'm6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz + - 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz + - 'r5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz #- 'r5b' # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz - #- 'r5a' # AMD EPYC 7571 2.5 GHz - #- 'r5ad' # AMD EPYC 7571 2.5 GHz - #- 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB - #- 'r6g' # AWS Graviton2 Processor 2.5 GHz + - 'r5a' # AMD EPYC 7571 2.5 GHz + - 'r5ad' # AMD EPYC 7571 2.5 GHz + - 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB + - 
'r6g' # AWS Graviton2 Processor 2.5 GHz #- 'r6gd' # AWS Graviton2 Processor 2.5 GHz #- 'x1' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB #- 'x1e' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB - #- 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB - #- 'z1d' # Intel Xeon Platinum 8151 4.0 GHz + - 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB + - 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB + - 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB + - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB + - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz #- 'u-6tb1' # Intel Xeon Scalable (Skylake) 6 TB #- 'u-9tb1' # Intel Xeon Scalable (Skylake) 9 TB #- 'u-12tb1' # Intel Xeon Scalable (Skylake) 12 TB - InstanceTypes: [] + InstanceTypes: #- 'c5\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - #- 'c5\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz + - 'c5\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz #- 'c5d\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz #- 'c5d\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz Exclude: @@ -88,4 +92,4 @@ slurm: - '.*\.metal' # Use defaults from schema - storage: {'efs': {}} + storage: {'zfs': {}} diff --git a/source/resources/config/slurm_elasticsearch.yml b/source/resources/config/slurm_elasticsearch.yml index 06d06f17..2d8a0746 100644 --- a/source/resources/config/slurm_elasticsearch.yml +++ b/source/resources/config/slurm_elasticsearch.yml @@ -1,36 +1,17 @@ --- +#==================================================================== # Minimal Slurm cluster with an ElasticSearch domain +# # Creates a new domain and configures Slurm to write completed job information to the domain. +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. 
+#==================================================================== StackName: slurmes slurm: - # InstanceConfig: - # Configure the instances used by the cluster - # A partition will be created for each combination of Base OS, Architecture, and Spot - InstanceConfig: - UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot - NodesPerInstanceType: 10 - BaseOsArchitecture: - AlmaLinux: {8: [x86_64, arm64]} - CentOS: - 7: [x86_64] - Include: - MaxSizeOnly: false - InstanceFamilies: - - t3 - - t4g - InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' - # ElasticSearch: - # Configure the ElasticSearch/OpenSearch domain used by the slurm cluster - # If not specified then won't be created or used by the cluster. - # master_nodes: Defaults to 0 # data_nodes: Must be a multiple of number_of_azs ElasticSearch: ebs_volume_size: 20 @@ -50,12 +31,27 @@ slurm: # jobcomp/elasticsearch # jobcomp/filetxt JobCompType: jobcomp/elasticsearch - # - # JobCompLoc: - # Used with jobcomp/elasticsearch - # A complete URL endpoint with format ://_doc - #JobCompLoc: http://{{EsDomain}}.{{Region}}.es.amazonaws.com/slurm/_doc + + InstanceConfig: + UseSpot: true + DefaultPartition: CentOS_7_x86_64_spot + NodesPerInstanceType: 10 + BaseOsArchitecture: + AlmaLinux: {8: [x86_64, arm64]} + CentOS: + 7: [x86_64] + Include: + MaxSizeOnly: false + InstanceFamilies: + - t3 + - t4g + InstanceTypes: [] + Exclude: + InstanceFamilies: [] + InstanceTypes: + - '.+\.(micro|nano)' # Not enough memory + - '.*\.metal' # Use defaults from schema SlurmCtl: {} - storage: {'efs': {}} + storage: {'zfs': {}} diff --git a/source/resources/config/slurm_fpga_dev.yml b/source/resources/config/slurm_fpga_dev.yml index e9710d1d..a0407440 100644 --- a/source/resources/config/slurm_fpga_dev.yml +++ b/source/resources/config/slurm_fpga_dev.yml @@ -1,22 +1,23 @@ --- +#==================================================================== # Slurm cluster that uses the AWS FPGA Developer AMI as 
the base AMI for compute nodes. +# # Based on the EDA configuration. +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. +#==================================================================== StackName: slurmedafpga slurm: SlurmNodeAmis: - # Customized AMIs with file system mounts, packages, etc. configured. - # If these aren't defined then the generic base AMIs are used. - # Example in the comment below is the AWS FPGA Developer AMI + # AWS FPGA Developer AMIs BaseAmis: us-east-1: Amazon: {2: {x86_64: {ImageId: ami-0efdec76678df9a64, RootDeviceSize: '+5'}}} CentOS: {7: {x86_64: {ImageId: ami-02155c6289e76719a, RootDeviceSize: '+5'}}} - # InstanceConfig: - # Configure the instances used by the cluster - # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true DefaultPartition: CentOS_7_x86_64_spot @@ -29,42 +30,18 @@ slurm: Include: MaxSizeOnly: false InstanceFamilies: - - 'c5' # Mixed depending on size - #- 'c5a' # AMD EPYC 7R32 3.3 GHz - #- 'c5ad' # AMD EPYC 7R32 3.3 GHz - #- 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'c6g' # AWS Graviton2 Processor 2.5 GHz - #- 'c6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - - 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5a' # AMD EPYC 7571 2.5 GHz - #- 'm5ad' # AMD EPYC 7571 2.5 GHz - #- 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz - #- 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz - #- 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'm6g' # AWS Graviton2 Processor 2.5 GHz - #- 'm6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5b' # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz - #- 'r5a' # AMD EPYC 7571 2.5 GHz - #- 'r5ad' # AMD EPYC 7571 2.5 GHz - #- 'r6i' # Intel 
Xeon 8375C (Ice Lake) 3.5 GHz 1TB - #- 'r6g' # AWS Graviton2 Processor 2.5 GHz - #- 'r6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'x1' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB - #- 'x1e' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB - #- 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB - #- 'z1d' # Intel Xeon Platinum 8151 4.0 GHz - #- 'u-6tb1' # Intel Xeon Scalable (Skylake) 6 TB - #- 'u-9tb1' # Intel Xeon Scalable (Skylake) 9 TB - #- 'u-12tb1' # Intel Xeon Scalable (Skylake) 12 TB + - 'c6a' # AMD EPYC 7R13 Processor 3.6 GHz + - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz + - 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz + - 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz + - 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz + - 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz + - 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB + - 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB + - 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB + - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB + - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz InstanceTypes: [] - #- 'c5\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - #- 'c5\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - #- 'c5d\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - #- 'c5d\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz Exclude: InstanceFamilies: [] InstanceTypes: @@ -72,4 +49,4 @@ slurm: # Use defaults from schema SlurmCtl: {} - storage: {'efs': {}} + storage: {'zfs': {}} diff --git a/source/resources/config/slurm_lustre.yml b/source/resources/config/slurm_lustre.yml index c092f8a9..6ea6fd41 100644 --- a/source/resources/config/slurm_lustre.yml +++ b/source/resources/config/slurm_lustre.yml @@ -1,7 +1,13 @@ --- +#==================================================================== # Slurm cluster that uses Lustre for storing the Slurm configuration and tool files +# # EFS should be adequate and more cost effective for most uses. 
# You might consider Lustre for very large, dynamic clusters if they are putting a strain on EFS metadata. +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. +#==================================================================== StackName: slurmlustre @@ -11,13 +17,10 @@ slurm: provider: lustre lustre': {} - # InstanceConfig: - # Configure the instances used by the cluster - # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true DefaultPartition: CentOS_7_x86_64_spot - NodesPerInstanceType: 10 + NodesPerInstanceType: 5 BaseOsArchitecture: CentOS: 7: [x86_64] diff --git a/source/resources/config/slurm_ontap.yml b/source/resources/config/slurm_ontap.yml new file mode 100644 index 00000000..25e4b9ad --- /dev/null +++ b/source/resources/config/slurm_ontap.yml @@ -0,0 +1,36 @@ +--- +#==================================================================== +# Slurm cluster that uses FSx for NetApp ONTAP for storing the Slurm configuration and tool files +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. +#==================================================================== + +StackName: slurmontap +slurm: + storage: + provider: ontap + ontap: {} # This causes the defaults from the schema to be applied. 
+ + InstanceConfig: + UseSpot: true + DefaultPartition: AlmaLinux_8_arm64_spot + NodesPerInstanceType: 10 + BaseOsArchitecture: + AlmaLinux: {8: [x86_64, arm64]} + CentOS: + 7: [x86_64] + Include: + MaxSizeOnly: false + InstanceFamilies: + - t3 + - t4g + InstanceTypes: [] + Exclude: + InstanceFamilies: [] + InstanceTypes: + - '.+\.(micro|nano)' # Not enough memory + - '.*\.metal' + + # Use defaults from schema + SlurmCtl: {} diff --git a/source/resources/config/slurm_rocky_linux.yml b/source/resources/config/slurm_rocky_linux.yml index 91a0929a..8eac0d08 100644 --- a/source/resources/config/slurm_rocky_linux.yml +++ b/source/resources/config/slurm_rocky_linux.yml @@ -1,16 +1,18 @@ --- +#==================================================================== # Test Rocky linux support +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file. +#==================================================================== StackName: slurmrocky slurm: - # InstanceConfig: - # Configure the instances used by the cluster - # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true DefaultPartition: Rocky_8_x86_64_spot - NodesPerInstanceType: 10 + NodesPerInstanceType: 5 BaseOsArchitecture: Rocky: {8: [x86_64, arm64]} Include: @@ -26,4 +28,4 @@ slurm: # Use defaults from schema SlurmCtl: {} - storage: {'efs': {}} + storage: {'zfs': {}} diff --git a/source/resources/config/slurm_zfs.yml b/source/resources/config/slurm_zfs.yml new file mode 100644 index 00000000..2460fe02 --- /dev/null +++ b/source/resources/config/slurm_zfs.yml @@ -0,0 +1,37 @@ +--- +#==================================================================== +# Slurm cluster that uses FSx for OpenZFS for storing the Slurm configuration and tool files +# +# Defaults and valid configuration options are in source/config_schema.py. +# Command line values override values in the config file.
+#==================================================================== + +StackName: slurmzfs + +slurm: + storage: + provider: zfs + zfs: {} # This causes the defaults from the schema to be applied. + + InstanceConfig: + UseSpot: true + DefaultPartition: AlmaLinux_8_arm64_spot + NodesPerInstanceType: 10 + BaseOsArchitecture: + AlmaLinux: {8: [x86_64, arm64]} + CentOS: + 7: [x86_64] + Include: + MaxSizeOnly: false + InstanceFamilies: + - t3 + - t4g + InstanceTypes: [] + Exclude: + InstanceFamilies: [] + InstanceTypes: + - '.+\.(micro|nano)' # Not enough memory + - '.*\.metal' + + # Use defaults from schema + SlurmCtl: {} diff --git a/source/slurm_installer/installer.py b/source/slurm_installer/installer.py index 276b44c4..2a0048de 100755 --- a/source/slurm_installer/installer.py +++ b/source/slurm_installer/installer.py @@ -452,7 +452,7 @@ def get_config(self, config_file): from config_schema import check_schema from schema import SchemaError try: - validated_config = check_schema(config_parameters, self.accepted_regions) + validated_config = check_schema(config_parameters) except SchemaError: logger.exception(f"Invalid config file: {config_file_path}") sys.exit(1)