diff --git a/.gitignore b/.gitignore index deb1375f..e474b001 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,4 @@ -.mkdocs_venv/ -site/ -.vscode/ - -# Jekyll -Gemfile.lock -.jekyll-cache .mkdocs_venv/ _site site/ diff --git a/docs/multi-region.md b/docs/multi-region.md index 1f26ad5b..294394d0 100644 --- a/docs/multi-region.md +++ b/docs/multi-region.md @@ -289,3 +289,7 @@ slurm: type: nfs4 options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport ``` + +## Deployment + +After the configuration is complete, deployment is the same as documented on the [Deploy the Cluster](deploy.md) page. diff --git a/source/app.py b/source/app.py index 3878400c..ba73f869 100644 --- a/source/app.py +++ b/source/app.py @@ -23,10 +23,6 @@ app = App() -# TODO: Create a stack for each additional region to create resources needed to create instances in those regions. -# * Instance profile -# * Security group - cdk_env = Environment( account = app.node.try_get_context('account_id'), region = app.node.try_get_context('region') diff --git a/source/cdk/cdk_slurm_stack.py b/source/cdk/cdk_slurm_stack.py index d6141594..453feacb 100644 --- a/source/cdk/cdk_slurm_stack.py +++ b/source/cdk/cdk_slurm_stack.py @@ -382,7 +382,9 @@ def check_config(self): exit(1) if not self.config['slurm']['InstanceConfig']['Regions']: - default_region = { + self.config['slurm']['InstanceConfig']['Regions'] = {} + self.config['slurm']['InstanceConfig']['Regions'][self.config['Region']] = { + 'VpcId': self.config['VpcId'], 'CIDR': self.config['CIDR'], 'SshKeyPair': self.config['SshKeyPair'], 'AZs': [ @@ -392,7 +394,6 @@ def check_config(self): } ] } - self.config['slurm']['InstanceConfig']['Regions'][self.config['Region']] = default_region self.compute_regions = {} self.remote_compute_regions = {} @@ -648,7 +649,7 @@ def create_security_groups(self): Tags.of(self.zfs_sg).add("Name", f"{self.stack_name}-ZfsSG") self.suppress_cfn_nag(self.zfs_sg, 'W29', 'Egress port range used to 
block all egress') - # Compute nodes may use lustre file systems to create a security group with the required ports. + # Compute nodes may use lustre file systems so create a security group with the required ports. self.lustre_sg = ec2.SecurityGroup(self, "LustreSG", vpc=self.vpc, allow_all_outbound=False, description="Lustre Security Group") Tags.of(self.lustre_sg).add("Name", f"{self.stack_name}-LustreSG") self.suppress_cfn_nag(self.lustre_sg, 'W29', 'Egress port range used to block all egress') @@ -735,6 +736,7 @@ def create_security_groups(self): fs_client_sg.connections.allow_to(self.nfs_sg, ec2.Port.tcp(2049), f"{fs_client_sg_name} to Nfs") if self.onprem_cidr: self.nfs_sg.connections.allow_from(self.onprem_cidr, ec2.Port.tcp(2049), 'OnPremNodes to Nfs') + # Allow compute nodes in remote regions access to NFS for compute_region, compute_region_cidr in self.remote_compute_regions.items(): self.nfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(2049), f"{compute_region} to Nfs") @@ -759,6 +761,7 @@ def create_security_groups(self): self.zfs_sg.connections.allow_from(self.onprem_cidr, ec2.Port.udp_range(20001, 20003), 'OnPremNodes to Zfs') self.suppress_cfn_nag(self.zfs_sg, 'W27', 'Correct, restricted range for zfs: 20001-20003') self.suppress_cfn_nag(self.zfs_sg, 'W29', 'Correct, restricted range for zfs: 20001-20003') + # Allow compute nodes in remote regions access to ZFS for compute_region, compute_region_cidr in self.remote_compute_regions.items(): self.zfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(111), f"{compute_region} to Zfs") self.zfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.udp(111), f"{compute_region} to Zfs") @@ -785,6 +788,7 @@ def create_security_groups(self): self.lustre_sg.connections.allow_from(self.onprem_cidr, ec2.Port.tcp_range(1021, 1023), 'OnPremNodes to Lustre') self.lustre_sg.connections.allow_to(self.onprem_cidr, ec2.Port.tcp(988), f"Lustre to 
OnPremNodes") self.lustre_sg.connections.allow_to(self.onprem_cidr, ec2.Port.tcp_range(1021, 1023), f"Lustre to OnPremNodes") + # Allow compute nodes in remote regions access to Lustre for compute_region, compute_region_cidr in self.remote_compute_regions.items(): self.lustre_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(988), f"{compute_region} to Lustre") self.lustre_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp_range(1021, 1023), f"{compute_region} to Lustre") @@ -988,6 +992,8 @@ def create_elasticsearch(self): self.config['slurm']['JobCompLoc'] = f"http://{domain_endpoint}/slurm/_doc" def create_file_system(self): + self.slurmfs_fqdn = f"slurmfs.{self.config['Domain']}" + if 'kms_key_arn' in self.config['slurm']['storage']: kms_key = kms.Key.from_key_arn(self.config['slurm']['storage']['kms_key_arn']) else: @@ -1057,7 +1063,7 @@ def create_file_system(self): self.file_system_mount_name = "" - self.file_system_mount_source = f"{self.file_system_ip_address}:/" + self.file_system_mount_source = f"{self.slurmfs_fqdn}:/" if self.config['slurm']['storage']['efs']['use_efs_helper']: self.file_system_type = 'efs' @@ -1155,7 +1161,7 @@ def create_file_system(self): self.file_system_mount_name = "" - self.file_system_mount_source = f"{self.file_system_ip_address}:/slurm" + self.file_system_mount_source = f"{self.slurmfs_fqdn}:/slurm" self.file_system_options = 'nfsvers=4.1' @@ -1237,7 +1243,7 @@ def create_file_system(self): self.file_system_mount_name = "" - self.file_system_mount_source = f"{self.file_system_ip_address}:/fsx/slurm" + self.file_system_mount_source = f"{self.slurmfs_fqdn}:/fsx/slurm" self.file_system_options = 'nfsvers=4.1' @@ -1255,7 +1261,6 @@ def create_file_system(self): record_name = 'slurmfs', target = route53.RecordTarget.from_ip_addresses(self.file_system_ip_address) ) - CfnOutput(self, "FileSystemProvider", value = self.config['slurm']['storage']['provider'] ) @@ -1725,7 +1730,6 @@ def 
get_instance_template_vars(self, instance_role): "ERROR_SNS_TOPIC_ARN": self.config['ErrorSnsTopicArn'], "ExtraMounts": self.config['slurm']['storage']['ExtraMounts'], "FileSystemDns": self.file_system_dns, - "FileSystemIpAddress": self.file_system_ip_address, "FileSystemMountPath": self.config['slurm']['storage']['mount_path'], "FileSystemMountSrc": self.file_system_mount_source, "FileSystemOptions": self.file_system_options, @@ -1749,7 +1753,6 @@ def get_instance_template_vars(self, instance_role): else: instance_template_vars["AccountingStorageHost"] = '' instance_template_vars["CloudWatchPeriod"] = self.config['slurm']['SlurmCtl']['CloudWatchPeriod'] - instance_template_vars["CloudWatchPeriod"] = self.config['slurm']['SlurmCtl']['CloudWatchPeriod'] instance_template_vars["DefaultPartition"] = self.default_partition if 'Federation' in self.config['slurm']: instance_template_vars["Federation"] = self.config['slurm']['Federation']['Name'] diff --git a/source/cdk/config_schema.py b/source/cdk/config_schema.py index f92b6ae2..0963ae6d 100644 --- a/source/cdk/config_schema.py +++ b/source/cdk/config_schema.py @@ -45,17 +45,18 @@ 'AFTER_90_DAYS' ] -eda_instance_families = [ +default_eda_instance_families = [ #'c5', # Mixed depending on size - 'c5a', # AMD EPYC 7R32 3.3 GHz + #'c5a', # AMD EPYC 7R32 3.3 GHz #'c5ad', # AMD EPYC 7R32 3.3 GHz + 'c6a', 'c6i', # Intel Xeon 8375C (Ice Lake) 3.5 GHz 'c6g', # AWS Graviton2 Processor 2.5 GHz #'c6gd', # AWS Graviton2 Processor 2.5 GHz #'f1', # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz 'm5', # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz #'m5d', # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - 'm5a', # AMD EPYC 7571 2.5 GHz + #'m5a', # AMD EPYC 7571 2.5 GHz #'m5ad', # AMD EPYC 7571 2.5 GHz 'm5zn', # Intel Xeon Platinum 8252 4.5 GHz 'm6a', # AMD EPYC 7R13 Processor 3.6 GHz @@ -82,13 +83,37 @@ #'u-12tb1', # Intel Xeon Scalable (Skylake) 12 TB ] -eda_instance_types = [ +default_eda_instance_types = [ #'c5\.(l|x|2|4|9|18).*', # Intel 
Xeon Platinum 8124M 3.4 GHz - 'c5\.(12|24).*', # Intel Xeon Platinum 8275L 3.6 GHz + #'c5\.(12|24).*', # Intel Xeon Platinum 8275L 3.6 GHz #'c5d\.(l|x|2|4|9|18).*', # Intel Xeon Platinum 8124M 3.4 GHz #'c5d\.(12|24).*', # Intel Xeon Platinum 8275L 3.6 GHz ] +default_excluded_instance_families = [ + 'a1', # Graviton 1 + 'c4', # Replaced by c5 + 'd2', # SSD optimized + 'g3', # Replaced by g4 + 'g3s', # Replaced by g4 + 'h1', # SSD optimized + 'i3', # SSD optimized + 'i3en', # SSD optimized + 'm4', # Replaced by m5 + 'p2', # Replaced by p3 + 'p3', + 'p3dn', + 'r4', # Replaced by r5 + 't2', # Replaced by t3 + 'x1', + 'x1e', +] + +default_excluded_instance_types = [ + '.+\.(micro|nano)', # Not enough memory + '.*\.metal' +] + # The config file is used in the installer and the CDK app. # Some configuration values are required in the CDK app but are optional so that they can be set by the installer. config_schema = Schema( @@ -252,22 +277,22 @@ }, # Include*/Exclude*: # Instance families and types are regular expressions with implicit '^' and '$' at the begining and end. - # Exclude patterns are processed first and take precedence over any includes. + # Exclude patterns are processed first and take precedence over any includes. # An empty list is the same as '.*'. - 'Include': { + Optional('Exclude', default={'InstanceFamilies': default_excluded_instance_families, 'InstanceTypes': default_excluded_instance_types}): { + Optional('InstanceFamilies', default=default_excluded_instance_families): [str], + Optional('InstanceTypes', default=default_excluded_instance_types): [str] + }, + Optional('Include', default={'MaxSizeOnly': False, 'InstanceFamilies': default_eda_instance_families, 'InstanceTypes': default_eda_instance_types}): { # MaxSizeOnly: # If MaxSizeOnly is True then only the largest instance type in # a family will be included unless specific instance types are included. 
# Default: false Optional('MaxSizeOnly', default=False): bool, - 'InstanceFamilies': [str], - 'InstanceTypes': [str] - }, - Optional('Exclude', default={'InstanceFamilies': [], 'InstanceTypes': []}): { - 'InstanceFamilies': [str], - 'InstanceTypes': [str] + Optional('InstanceFamilies', default=default_eda_instance_families): [str], + Optional('InstanceTypes', default=default_eda_instance_types): [str] }, - Optional('Regions', default=[]): { + Optional('Regions', default={}): { str: { 'VpcId': And(str, lambda s: re.match('vpc-', s)), 'CIDR': str, diff --git a/source/resources/config/default_config.yml b/source/resources/config/default_config.yml index ade26c30..1a2f8018 100644 --- a/source/resources/config/default_config.yml +++ b/source/resources/config/default_config.yml @@ -18,23 +18,16 @@ slurm: # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: AlmaLinux_8_arm64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] Include: - MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.+\.(micro|nano)' # Not enough memory - - '.*\.metal' storage: provider: zfs diff --git a/source/resources/config/slurm_all_instance_types.yml b/source/resources/config/slurm_all_instance_types.yml index 58c52f35..5e88f29c 100644 --- a/source/resources/config/slurm_all_instance_types.yml +++ b/source/resources/config/slurm_all_instance_types.yml @@ -14,35 +14,12 @@ slurm: # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} Include: - MaxSizeOnly: false InstanceFamilies: [] InstanceTypes: [] - Exclude: - InstanceFamilies: - - a1 # Graviton 1 - - c4 # Replaced by c5 - - d2 # SSD optimized - - g3 # Replaced by g4 
- - g3s # Replaced by g4 - - h1 # SSD optimized - - i3 # SSD optimized - - i3en # SSD optimized - - m4 # Replaced by m5 - - p2 # Replaced by p3 - - p3 - - p3dn - - r4 # Replaced by r5 - - t2 # Replaced by t3 - - u - - x1 - - x1e - InstanceTypes: - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_all_os.yml b/source/resources/config/slurm_all_os.yml index 1856d158..6a08d427 100644 --- a/source/resources/config/slurm_all_os.yml +++ b/source/resources/config/slurm_all_os.yml @@ -16,7 +16,6 @@ slurm: # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} @@ -28,15 +27,10 @@ slurm: 8: [x86_64, arm64] Rocky: {8: [x86_64, arm64]} Include: - MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_alma_linux.yml b/source/resources/config/slurm_alma_linux.yml index 9d6e2210..d2e1ac5f 100644 --- a/source/resources/config/slurm_alma_linux.yml +++ b/source/resources/config/slurm_alma_linux.yml @@ -15,20 +15,14 @@ slurm: # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: AlmaLinux_8_x86_64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} Include: - MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_eda.yml b/source/resources/config/slurm_eda.yml index 5340da18..d45f9c52 100644 --- a/source/resources/config/slurm_eda.yml +++ b/source/resources/config/slurm_eda.yml @@ -24,59 +24,11 @@ slurm: # A partition will be created 
for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 5 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] - Include: - MaxSizeOnly: false - InstanceFamilies: - #- 'c5' # Mixed depending on size - #- 'c5a' # AMD EPYC 7R32 3.3 GHz - #- 'c5ad' # AMD EPYC 7R32 3.3 GHz - - 'c6a' # AMD EPYC 7R13 Processor 3.6 GHz - - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'c6g' # AWS Graviton2 Processor 2.5 GHz - #- 'c6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - #- 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5a' # AMD EPYC 7571 2.5 GHz - #- 'm5ad' # AMD EPYC 7571 2.5 GHz - - 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz - - 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz - - 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'm6g' # AWS Graviton2 Processor 2.5 GHz - #- 'm6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5b' # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz - #- 'r5a' # AMD EPYC 7571 2.5 GHz - #- 'r5ad' # AMD EPYC 7571 2.5 GHz - - 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB - - 'r6g' # AWS Graviton2 Processor 2.5 GHz - #- 'r6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'x1' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB - #- 'x1e' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB - - 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB - - 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB - - 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB - - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB - - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz - #- 'u-6tb1' # Intel Xeon Scalable (Skylake) 6 TB - #- 'u-9tb1' # Intel Xeon Scalable (Skylake) 9 TB - #- 'u-12tb1' # Intel Xeon Scalable (Skylake) 12 TB - InstanceTypes: [] - #- 
'c5\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - #- 'c5\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - #- 'c5d\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - #- 'c5d\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema storage: {'zfs': {}} diff --git a/source/resources/config/slurm_eda_az1.yml b/source/resources/config/slurm_eda_az1.yml index 33b1f4fe..5ec9f19d 100644 --- a/source/resources/config/slurm_eda_az1.yml +++ b/source/resources/config/slurm_eda_az1.yml @@ -36,59 +36,11 @@ slurm: # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 5 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] - Include: - MaxSizeOnly: false - InstanceFamilies: - #- 'c5' # Mixed depending on size - #- 'c5a' # AMD EPYC 7R32 3.3 GHz - #- 'c5ad' # AMD EPYC 7R32 3.3 GHz - - 'c6a' # AMD EPYC 7R13 Processor 3.6 GHz - - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'c6g' # AWS Graviton2 Processor 2.5 GHz - #- 'c6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - #- 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5a' # AMD EPYC 7571 2.5 GHz - #- 'm5ad' # AMD EPYC 7571 2.5 GHz - - 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz - - 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz - - 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'm6g' # AWS Graviton2 Processor 2.5 GHz - #- 'm6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5b' # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz - #- 'r5a' # AMD EPYC 7571 2.5 GHz - #- 'r5ad' # AMD EPYC 7571 2.5 GHz - - 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB - - 'r6g' # AWS Graviton2 Processor 
2.5 GHz - #- 'r6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'x1' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB - #- 'x1e' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB - - 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB - - 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB - - 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB - - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB - - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz - #- 'u-6tb1' # Intel Xeon Scalable (Skylake) 6 TB - #- 'u-9tb1' # Intel Xeon Scalable (Skylake) 9 TB - #- 'u-12tb1' # Intel Xeon Scalable (Skylake) 12 TB - InstanceTypes: [] - #- 'c5\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - #- 'c5\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - #- 'c5d\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - #- 'c5d\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema storage: {'zfs': {}} diff --git a/source/resources/config/slurm_eda_az2.yml b/source/resources/config/slurm_eda_az2.yml index 4f888085..09fd2fb4 100644 --- a/source/resources/config/slurm_eda_az2.yml +++ b/source/resources/config/slurm_eda_az2.yml @@ -37,58 +37,11 @@ slurm: # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 5 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] - Include: - MaxSizeOnly: false - InstanceFamilies: - #- 'c5' # Mixed depending on size - - 'c5a' # AMD EPYC 7R32 3.3 GHz - #- 'c5ad' # AMD EPYC 7R32 3.3 GHz - - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'c6g' # AWS Graviton2 Processor 2.5 GHz - #- 'c6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - - 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - - 'm5a' # AMD EPYC 7571 2.5 GHz - 
#- 'm5ad' # AMD EPYC 7571 2.5 GHz - - 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz - - 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz - - 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'm6g' # AWS Graviton2 Processor 2.5 GHz - #- 'm6gd' # AWS Graviton2 Processor 2.5 GHz - - 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - - 'r5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5b' # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz - - 'r5a' # AMD EPYC 7571 2.5 GHz - - 'r5ad' # AMD EPYC 7571 2.5 GHz - - 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB - - 'r6g' # AWS Graviton2 Processor 2.5 GHz - #- 'r6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'x1' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB - #- 'x1e' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB - - 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB - - 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB - - 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB - - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB - - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz - #- 'u-6tb1' # Intel Xeon Scalable (Skylake) 6 TB - #- 'u-9tb1' # Intel Xeon Scalable (Skylake) 9 TB - #- 'u-12tb1' # Intel Xeon Scalable (Skylake) 12 TB - InstanceTypes: - #- 'c5\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - - 'c5\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - #- 'c5d\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - #- 'c5d\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema storage: {'zfs': {}} diff --git a/source/resources/config/slurm_eda_az3.yml b/source/resources/config/slurm_eda_az3.yml index ca0d2a3b..b66421d6 100644 --- a/source/resources/config/slurm_eda_az3.yml +++ b/source/resources/config/slurm_eda_az3.yml @@ -38,58 +38,11 @@ slurm: # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot 
NodesPerInstanceType: 5 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] - Include: - MaxSizeOnly: false - InstanceFamilies: - #- 'c5' # Mixed depending on size - - 'c5a' # AMD EPYC 7R32 3.3 GHz - #- 'c5ad' # AMD EPYC 7R32 3.3 GHz - - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'c6g' # AWS Graviton2 Processor 2.5 GHz - #- 'c6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - - 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - - 'm5a' # AMD EPYC 7571 2.5 GHz - #- 'm5ad' # AMD EPYC 7571 2.5 GHz - - 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz - - 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz - - 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'm6g' # AWS Graviton2 Processor 2.5 GHz - #- 'm6gd' # AWS Graviton2 Processor 2.5 GHz - - 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - - 'r5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5b' # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz - - 'r5a' # AMD EPYC 7571 2.5 GHz - - 'r5ad' # AMD EPYC 7571 2.5 GHz - - 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB - - 'r6g' # AWS Graviton2 Processor 2.5 GHz - #- 'r6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'x1' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB - #- 'x1e' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB - - 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB - - 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB - - 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB - - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB - - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz - #- 'u-6tb1' # Intel Xeon Scalable (Skylake) 6 TB - #- 'u-9tb1' # Intel Xeon Scalable (Skylake) 9 TB - #- 'u-12tb1' # Intel Xeon Scalable (Skylake) 12 TB - InstanceTypes: - #- 'c5\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - - 'c5\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - #- 'c5d\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz 
- #- 'c5d\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema storage: {'zfs': {}} diff --git a/source/resources/config/slurm_elasticsearch.yml b/source/resources/config/slurm_elasticsearch.yml index 2d8a0746..95fd7c8e 100644 --- a/source/resources/config/slurm_elasticsearch.yml +++ b/source/resources/config/slurm_elasticsearch.yml @@ -34,23 +34,16 @@ slurm: InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] Include: - MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.+\.(micro|nano)' # Not enough memory - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_fpga_dev.yml b/source/resources/config/slurm_fpga_dev.yml index a0407440..05922465 100644 --- a/source/resources/config/slurm_fpga_dev.yml +++ b/source/resources/config/slurm_fpga_dev.yml @@ -20,7 +20,6 @@ slurm: InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 5 BaseOsArchitecture: Amazon: @@ -28,7 +27,6 @@ slurm: CentOS: 7: [x86_64] Include: - MaxSizeOnly: false InstanceFamilies: - 'c6a' # AMD EPYC 7R13 Processor 3.6 GHz - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz @@ -42,10 +40,6 @@ slurm: - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_lustre.yml b/source/resources/config/slurm_lustre.yml index 6ea6fd41..fa5a0d6e 100644 --- a/source/resources/config/slurm_lustre.yml +++ b/source/resources/config/slurm_lustre.yml @@ -19,20 +19,14 @@ slurm: InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 5 
BaseOsArchitecture: CentOS: 7: [x86_64] Include: - MaxSizeOnly: false InstanceFamilies: - t3 InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_multi_az.yml b/source/resources/config/slurm_multi_az.yml index dc42d493..72a3e62d 100644 --- a/source/resources/config/slurm_multi_az.yml +++ b/source/resources/config/slurm_multi_az.yml @@ -1,132 +1,30 @@ --- -# Sample configuraton that creates a minimal Slurm cluster -# Shows all available configuration options -# Note that CentOS 8 has been discontinued and support has been removed. -# Uses arm64 architecture for SlurmCtl and SlurmDbd by default. -# No SlurmDbd in this configuration. +# Multi-region Slurm cluster with Netapp Ontap -termination_protection: True # Enable (recommended) or Disable Cloudformation Stack termination protection +StackName: slurmmultiaz -#==================================================================== -# Parameters that must be in the config file or on the command line. -# Command line values override values in the config file. -#==================================================================== -StackName: slurmminimal #Region: us-east-1 + #SshKeyPair: name of your ec2 keypair + #VpcId: vpc-xxxxxxxxxxxxxxxxx -# SubnetId: -# Optional. If not specified then the first private subnet is chosen. 
#SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 -#SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 -#SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 + +#HostedZoneId: XXXXXXXXXXXXXXXXXXX # This is optional, but highly recommended #ErrorSnsTopicArn: arn:aws:sns:{{region}}:{AccountId}:{TopicName} -#==================================================================== -# Required Parameters -#==================================================================== - -# Domain: Optional -# Domain name for the Route 53 private hosted zone that will be used -# by the slurm cluster for DNS. -# By default will be {StackName}.local -# Alternately, provide HostedZoneId of an existing Route53 hosted zone to use. -# Cannot specify both Domain and HostedZoneId. -# Domain: "{{StackName}}.local" - -# HostedZoneId: Optional -# ID of an existing hosted zone that will be used by the slurm cluster for DNS. -# Alternately, provide Domain name to use for a new Route53 hosted zone to use. -# Cannot specify both Domain and HostedZoneId. -# HostedZoneId: - -TimeZone: 'US/Central' +#TimeZone: 'US/Central' slurm: - # High level configuration - - SlurmVersion: "21.08.5" - - # ClusterName: - # Optional - # Must be unique if multiple clusters deployed in the same VPC. - # Default: StackName - # ClusterName: slurm - - # MungeKeySsmParameter - # SSM String Parameter with a base64 encoded munge key to use for the cluster. - # Use this if your submitters need to use more than 1 cluster. - #MungeKeySsmParameter: "/slurm/munge_key" + MungeKeySsmParameter: "/slurm/munge_key" SlurmCtl: - # For high availability configure multiple controllers - NumberOfControllers: 1 - # The index will be appended to BaseHostname starting with 1. 
- BaseHostname: slurmctl - - # architecture: x86_64 or arm64 - #architecture: x86_64 - #instance_type: "c5.large" - architecture: arm64 - instance_type: "c6g.large" - volume_size: 200 # Size of the EBS root disk + NumberOfControllers: 2 - # SuspendAction - # Set to stop or terminate. - # Stopped nodes will restart quicker, but you will continue to be charged for the EBS volumes - # attached to the instance. - SuspendAction: stop - # - # MaxStoppedDuration - # In ISO 8601 duration format: https://en.wikipedia.org/wiki/ISO_8601#Durations - # Default: 1 hour = P0Y0M0DT1H0M0S - # Evaluated at least hourly - MaxStoppedDuration: P0Y0M0DT1H0M0S - - CloudWatchPeriod: 5 # Cloudwatch metric collection period in minutes. Default value is 5. Set to 1 for finer resolution. - # Also used in the dashboard widgets. - - # The accounting database is required to enable fairshare scheduling - # It is managed by the Slurm Database Daemon (slurmdbd) instance - # This instance can be created as part of the cluster or can use an existing instance. - # SlurmDbd: - # # It is recommended to get the basic cluster configured and working before enabling the accounting database - # UseSlurmDbd: False - - # # Hostname: - # # Hostname of the slurmdbd instance if CreateSlurmdbd is true. - # Hostname: slurmdbd - - # # architecture: x86_64 or arm64 - # #architecture: x86_64 - # #instance_type: "m5.large" - # architecture: arm64 - # instance_type: "m6g.large" - # volume_size: 200 # Size of the EBS root disk - - # database: - # port: 3306 - - # Federation: - # Name: slurmeda - # SlurmCtlSecurityGroups: - # SecurityGroupName: sg-xxxxxxxxxxxxxxxxx - - SlurmNodeAmis: - instance_type: - x86_64: m5.large - arm64: m6g.large - - # Customized AMIs with file system mounts, packages, etc. configured. - # If these aren't defined then the generic base AMIs are used. 
- # Example in the comment below is the AWS FPGA Developer AMI - #BaseAmis: - # us-east-1: - # Amazon: {2: {x86_64: {ImageId: ami-0efdec76678df9a64, RootDeviceSize: '+5'}}} - # CentOS: {7: {x86_64: {ImageId: ami-02155c6289e76719a, RootDeviceSize: 90}}} + SlurmDbd: {} # External security groups that should be able to use the cluster # SubmitterSecurityGroupIds: @@ -135,117 +33,59 @@ slurm: # SubmitterInstanceTags: # 'soca:ClusterId': ['soca-xyz'] - # InstanceConfig: - # Configure the instances used by the cluster - # A partition will be created for each combination of Base OS, Architecture, and Spot - # - # UseSpot: - # Create both on-demand and spot nodes - # Default: true - # DefaultPartition: - # By default this will be the first OS/Architecture listed in BaseOsArchitecture. - # Add '_spot' to the end to make spot the default purchase option. - # NodesPerInstanceType: - # The number of nodes that will be defined for each instance type. - # Include*/Exclude*: - # Instance families and types are regular expressions with implicit '^' and '$' at the begining and end. - # Exclude patterns are processed first and take precedence over any includes. - # A empty list is the same as '.*'. - # MaxSizeOnly: If MaxSizeOnly is True then only the largest instance type in - # a family will be included unless specific instance types are included. 
- # Default: false InstanceConfig: UseSpot: true - DefaultPartition: AlmaLinux_8_arm64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} - # Amazon: {2: [x86_64, arm64]} CentOS: 7: [x86_64] - # Amazon: {2: [x86_64, arm64]} - # RedHat: - # 7: [x86_64] - # 8: [x86_64, arm64] - # Rocky: {8: [x86_64, arm64]} Include: - MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.+\.(micro|nano)' # Not enough memory - - '.*\.metal' - AZs: - - Priority: 1 - #Region: us-east-1 - Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 - - Priority: 2 - #Region: us-east-1 - Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 - - Priority: 3 - #Region: us-east-1 - Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 - - # ElasticSearch: - # Configure the ElasticSearch/OpenSearch domain used by the slurm cluster - # If not specified then won't be created or used by the cluster. - # master_nodes: Defaults to 0 - # data_nodes: Must be a multiple of number_of_azs - # ElasticSearch: - # ebs_volume_size: 20 - # ebs_volume_type: GP2 - # enable_version_upgrade: False - # number_of_azs: 2 - # master_nodes: 3 - # master_node_instance_type: m5.large.search - # data_nodes: 2 - # data_node_instance_type: m5.large.search - # warm_nodes: 0 - # warm_instance_type: ultrawarm.medium.search - - # JobCompType: - # Values: - # jobcomp/none - # jobcomp/elasticsearch - # jobcomp/filetxt - JobCompType: jobcomp/filetxt - # - # JobCompLoc: - # Used with jobcomp/elasticsearch - # A complete URL endpoint with format ://_doc - #JobCompLoc: http://{{EsDomain}}.{{Region}}.es.amazonaws.com/slurm/_doc + Regions: + eu-west-1: + VpcId: vpc-xxxxxxxxxxxxxxxxx + CIDR: 10.1.0.0/16 + SshKeyPair: admin-eu-west-1 + AZs: + - Priority: 10 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 + - Priority: 9 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 + - Priority: 8 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 + us-east-1: 
+ VpcId: vpc-xxxxxxxxxxxxxxxxx + CIDR: 10.2.0.0/16 + SshKeyPair: admin-us-east-1 + AZs: + - Priority: 7 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 + - Priority: 6 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 + - Priority: 5 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 + us-west-2: + VpcId: vpc-xxxxxxxxxxxxxxxxx + CIDR: 10.3.0.0/16 + SshKeyPair: admin-us-west-2 + #SecurityGroupId: sg-0addccc8388e008fd + AZs: + - Priority: 4 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 + - Priority: 3 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 + - Priority: 2 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 - # Configure your Storage options below - # @todo support fsxn, test if efs will gate scaling of the cluster storage: - # mount_path: - # Default is /opt/slurm/{{cluster_name}} - #mount_path: "" - provider: "efs" # efs or lustre - #kms_key_arn: - removal_policy : "DESTROY" # DESTROY, RETAIN, SNAPSHOT. Choices: RETAIN will preserve the EFS even if you delete the stack. Any other value will delete EFS if you delete the CFN stack - efs: - use_efs_helper: false - throughput_mode: "BURSTING" # Choices: BURSTING, PROVISIONED - # provisioned_throughput_per_second: 1 # In MiB/s. Minimum value of 1 - performance_mode: "GENERAL_PURPOSE" # Choices: GENERAL_PURPOSE, MAX_IO - encrypted: True # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-efs-filesystem.html#cfn-efs-filesystem-encrypted - lifecycle_policy: "AFTER_30_DAYS" # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-efs-filesystem-lifecyclepolicy.html - lustre: - deployment_type: "SCRATCH_2" # Allowed values: PERSISTENT_1 | SCRATCH_1 | SCRATCH_2. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-fsx-filesystem-lustreconfiguration.html#cfn-fsx-filesystem-lustreconfiguration-deploymenttype - drive_cache_type: "NONE" # Allowed values: NONE | READ. Required when storage_type is HDD. 
https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-fsx-filesystem-lustreconfiguration.html#cfn-fsx-filesystem-lustreconfiguration-drivecachetype - per_unit_storage_throughput: 50 # Allowed values: 12, 40 for HDD, 50, 100, 200 for SSD. Required for the PERSISTENT_1 deployment_type. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-fsx-filesystem-lustreconfiguration.html#cfn-fsx-filesystem-lustreconfiguration-perunitstoragethroughput - storage_capacity: 1200 # For SCRATCH_2 and PERSISTENT_1 types, valid values are 1,200, 2,400, then continuing in increments of 2,400 GiB. For SCRATCH_1 deployment types, valid values are 1,200, 2,400, 3,600, then continuing in increments of 3,600 GiB. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-fsx-filesystem.html#cfn-fsx-filesystem-storagecapacity - storage_type: "SSD" # Allowed values: SSD or HDD. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-fsx-filesystem.html#cfn-fsx-filesystem-storagetype + provider: ontap + removal_policy: DESTROY + ontap: {} - # ExtraMounts - # Additional mounts for compute nodes - # This examle shows SOCA EFS file systems. - # This is required so the compute node as the same file structure as the remote desktops. 
#ExtraMounts: # - dest: /apps # src: fs-xxxxxxxx.efs.us-east-1.amazonaws.com:/ diff --git a/source/resources/config/slurm_ontap.yml b/source/resources/config/slurm_ontap.yml index 25e4b9ad..ccf02007 100644 --- a/source/resources/config/slurm_ontap.yml +++ b/source/resources/config/slurm_ontap.yml @@ -14,23 +14,16 @@ slurm: InstanceConfig: UseSpot: true - DefaultPartition: AlmaLinux_8_arm64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] Include: - MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.+\.(micro|nano)' # Not enough memory - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_rocky_linux.yml b/source/resources/config/slurm_rocky_linux.yml index 8eac0d08..7d4ba1ee 100644 --- a/source/resources/config/slurm_rocky_linux.yml +++ b/source/resources/config/slurm_rocky_linux.yml @@ -11,20 +11,14 @@ StackName: slurmrocky slurm: InstanceConfig: UseSpot: true - DefaultPartition: Rocky_8_x86_64_spot NodesPerInstanceType: 5 BaseOsArchitecture: Rocky: {8: [x86_64, arm64]} Include: - MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_zfs.yml b/source/resources/config/slurm_zfs.yml index 2460fe02..7f013467 100644 --- a/source/resources/config/slurm_zfs.yml +++ b/source/resources/config/slurm_zfs.yml @@ -15,23 +15,16 @@ slurm: InstanceConfig: UseSpot: true - DefaultPartition: AlmaLinux_8_arm64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] Include: - MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.+\.(micro|nano)' # Not enough memory - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git 
a/source/resources/lambdas/UpdateDns/UpdateDns.py b/source/resources/lambdas/UpdateDns/UpdateDns.py deleted file mode 100644 index cbd48f39..00000000 --- a/source/resources/lambdas/UpdateDns/UpdateDns.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -SPDX-License-Identifier: MIT-0 - -Permission is hereby granted, free of charge, to any person obtaining a copy of this -software and associated documentation files (the "Software"), to deal in the Software -without restriction, including without limitation the rights to use, copy, modify, -merge, publish, distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A -PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -""" - -''' -Create/delete DNS entry -''' -import cfnresponse -import boto3 -import logging -logging.getLogger().setLevel(logging.INFO) - -def lambda_handler(event, context): - try: - logging.info("event: {}".format(event)) - properties = event['ResourceProperties'] - required_properties = ['Hostname', 'Domain', 'HostedZoneId', 'Type', 'Value'] - error_message = "" - for property in required_properties: - try: - value = properties[property] - except: - error_message += "Missing {} property. 
".format(property) - if error_message: - raise KeyError(error_message) - route53_client = boto3.client('route53') - requestType = event['RequestType'] - if requestType in ['Create', 'Update']: - action = 'UPSERT' - elif requestType == 'Delete': - action = 'DELETE' - else: - raise ValueError('Invalid RequestType: {}'.format(event['RequestType'])) - hostname = properties['Hostname'] - domain = properties['Domain'] - type = properties['Type'] - value = properties['Value'] - logging.info("{} {}.{} {} record, value=".format(action, hostname, type, value)) - route53_client.change_resource_record_sets( - HostedZoneId=properties['HostedZoneId'], - ChangeBatch={ - 'Comment': '{} {} DNS record'.format(action, hostname), - 'Changes': [ - { - 'Action': action, - 'ResourceRecordSet': { - 'Name': "{}.{}".format(hostname, domain), - 'Type': type, - 'TTL': 60, - 'ResourceRecords': [{'Value': value}] - } - } - ] - } - ) - except Exception as e: - logging.exception(str(e)) - cfnresponse.send(event, context, cfnresponse.FAILED, {'error': str(e)}, str(e)) - raise - - cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, "{} {}.{} {}".format(properties['Type'], properties['Hostname'], properties['Domain'], properties['Value'])) diff --git a/source/resources/lambdas/UpdateDns/cfnresponse.py b/source/resources/lambdas/UpdateDns/cfnresponse.py deleted file mode 120000 index 09400dfc..00000000 --- a/source/resources/lambdas/UpdateDns/cfnresponse.py +++ /dev/null @@ -1 +0,0 @@ -../cfnresponse.py \ No newline at end of file diff --git a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py index e87bd965..7fbb1c73 100755 --- a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py +++ 
b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py @@ -113,9 +113,9 @@ def get_instance_type_info(self, region): else: instance_type_info[instanceType]['ThreadsPerCore'] = 1 if 'ValidCores' in instanceTypeDict['VCpuInfo']: - instance_type_info[instanceType]['CoreCount'] = max(instanceTypeDict['VCpuInfo']['ValidCores']) + instance_type_info[instanceType]['CoreCount'] = int(max(instanceTypeDict['VCpuInfo']['ValidCores'])) else: - instance_type_info[instanceType]['CoreCount'] = instanceTypeDict['VCpuInfo']['DefaultVCpus']/instance_type_info[instanceType]['ThreadsPerCore'] + instance_type_info[instanceType]['CoreCount'] = int(instanceTypeDict['VCpuInfo']['DefaultVCpus']/instance_type_info[instanceType]['ThreadsPerCore']) instance_type_info[instanceType]['MemoryInMiB'] = instanceTypeDict['MemoryInfo']['SizeInMiB'] instance_type_info[instanceType]['SSDCount'] = instanceTypeDict.get('InstanceStorageInfo', {'Disks': [{'Count': 0}]})['Disks'][0]['Count'] instance_type_info[instanceType]['SSDTotalSizeGB'] = instanceTypeDict.get('InstanceStorageInfo', {'TotalSizeInGB': 0})['TotalSizeInGB'] diff --git a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmPlugin.py b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmPlugin.py index 6eb7d150..d2332313 100755 --- a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmPlugin.py +++ b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmPlugin.py @@ -1498,7 +1498,7 @@ def create_node_conf(self): logger.propagate = False self.parser = argparse.ArgumentParser("Create SLURM node config from EC2 instance metadata") - self.parser.add_argument('--config-file', default=False, help="YAML file with instance families and types to include/exclude") + self.parser.add_argument('--config-file', required=True, help="YAML file with instance families and types to include/exclude") 
self.parser.add_argument('--output-file', '-o', required=True, help="Output file") self.parser.add_argument('--az-info-file', required=True, help="JSON file where AZ info will be saved") self.parser.add_argument('--instance-type-info-json', default=False, help="JSON file with cached instance type info.") @@ -1509,72 +1509,8 @@ def create_node_conf(self): logger.setLevel(logging.DEBUG) logger.debug(f"Debugging level {self.args.debug}") - if self.args.config_file: - logger.info(f"Loading config from {self.args.config_file}") - instance_config = yaml.load(open(self.args.config_file, 'r').read(), Loader=yaml.SafeLoader) - else: - instance_config = { - 'UseSpot': True, - 'NodesPerInstanceType': 10, - 'BaseOsArchitecture': { - 'AlmaLinux': {8: ['x86_64', 'arm64']}, - 'CentOS': { - '7': ['x86_64'], - '8': ['x86_64', 'arm64'] - }, - 'Amazon': {'2': ['x86_64', 'arm64']}, - 'RedHat': { - '7': ['x86_64'], - '8': ['x86_64', 'arm64'] - }, - 'Rocky': {8: ['x86_64', 'arm64']}, - }, - 'Include': { - 'MaxSizeOnly': False, - 'InstanceFamilies': [ - 't3', - 't3a', - 't4g', - ], - 'InstanceTypes': [] - }, - 'Exclude': { - 'InstanceFamilies': [ - 'a1', # Graviton 1 - 'c4', # Replaced by c5 - 'd2', # SSD optimized - 'g3', # Replaced by g4 - 'g3s', # Replaced by g4 - 'h1', # SSD optimized - 'i3', # SSD optimized - 'i3en', # SSD optimized - 'm4', # Replaced by m5 - 'p2', # Replaced by p3 - 'p3', - 'p3dn', - 'r4', # Replaced by r5 - 't2', # Replaced by t3 - 'u', - 'x1', - 'x1e' - ], - 'InstanceTypes': [] - }, - 'Regions': [ - { - 'Region': environ['AWS_DEFAULT_REGION'], - 'AZs': [ - { - 'Priority': 1, - 'Region': environ['AWS_DEFAULT_REGION'], - 'Subnet': environ['GridSubnet1'] - } - ], - }, - ], - 'AlwaysOnNodes': [], - 'AlwaysOnPartitions': [] - } + logger.info(f"Loading config from {self.args.config_file}") + instance_config = yaml.load(open(self.args.config_file, 'r').read(), Loader=yaml.SafeLoader) # Check for required fields if 'BaseOsArchitecture' not in instance_config: @@ -1582,13 
+1518,15 @@ def create_node_conf(self): # Set defaults for missing fields if 'UseSpot' not in instance_config: - instance_config['UseSpot'] = True + raise ValueError(f"InstanceConfig missing UseSpot") if 'NodesPerInstanceType' not in instance_config: - instance_config['NodesPerInstanceType'] = 10 + raise ValueError(f"InstanceConfig missing NodesPerInstanceType") + if 'Exclude' not in instance_config: + raise ValueError(f"InstanceConfig missing Exclude") if 'Include' not in instance_config: - instance_config['Include'] = {} + raise ValueError(f"InstanceConfig missing Include") if 'MaxSizeOnly' not in instance_config['Include']: - instance_config['Include']['MaxSizeOnly'] = 10 + raise ValueError(f"InstanceConfig missing Include.MaxSizeOnly") compute_regions = sorted(instance_config['Regions'].keys()) az_info = self.get_az_info_from_instance_config(instance_config) @@ -1666,7 +1604,7 @@ def create_node_conf(self): ondemand_featureList = base_featureList + ',ondemand' price = instance_type_info[instanceType]['pricing']['OnDemand'] weight = int(float(price) * 10000) - node_name = "NodeName={:39s} CPUs={:2s} RealMemory={:7s} Feature={:89s} Weight={}".format( + node_name = "NodeName={:39s} CPUs={:2s} RealMemory={:7s} Feature={:103s} Weight={}".format( node, str(coreCount), str(realMemory), ondemand_featureList, weight) node_sets[node_set]['node_names'].append(node_name) @@ -1676,7 +1614,7 @@ def create_node_conf(self): spot_feature_list = f"{base_featureList},spot" spot_price = instance_type_info[instanceType]['pricing']['spot'][az] spot_weight = int(float(spot_price) * 10000) - spot_node_name = "NodeName={:39s} CPUs={:2s} RealMemory={:7s} Feature={:89s} Weight={}".format( + spot_node_name = "NodeName={:39s} CPUs={:2s} RealMemory={:7s} Feature={:103s} Weight={}".format( spot_node, str(coreCount), str(realMemory), spot_feature_list, spot_weight) node_sets[spot_node_set]['node_names'].append(spot_node_name) diff --git 
a/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/modules/modulefiles/slurm/.template b/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/modules/modulefiles/slurm/.template index 5435c427..a8a005ef 100644 --- a/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/modules/modulefiles/slurm/.template +++ b/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/modules/modulefiles/slurm/.template @@ -57,7 +57,7 @@ if { [ module-info mode load ] || [ module-info mode display ] } { unsetenv SBATCH_TIMELIMIT unsetenv SBATCH_TIMELIMIT_SET } - if { [ info exists ::env(SBATCH_TIMELIMIT_SET) ] } { + if { [ info exists ::env(SBATCH_PARTITION_SET) ] } { unsetenv SBATCH_PARTITION unsetenv SBATCH_PARTITION_SET } @@ -90,7 +90,7 @@ if { [ module-info mode load ] || [ module-info mode display ] } { unsetenv SLURM_MEM_PER_NODE unsetenv SLURM_MEM_PER_NODE_SET } - if { ! [ info exists ::env(SLURM_PARTITION) ] } { + if { ! [ info exists ::env(SLURM_PARTITION_SET) ] } { unsetenv SLURM_PARTITION unsetenv SLURM_PARTITION_SET } diff --git a/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml b/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml index 03607661..249ba183 100644 --- a/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml +++ b/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml @@ -5,7 +5,6 @@ debug: msg: | FileSystemDns: {{FileSystemDns}} - FileSystemIpAddress: {{FileSystemIpAddress}} FileSystemMountPath: {{FileSystemMountPath}} FileSystemMountSrc: {{FileSystemMountSrc}} FileSystemOptions: {{FileSystemOptions}} diff --git a/source/resources/playbooks/roles/unmount_slurm_fs/tasks/main.yml b/source/resources/playbooks/roles/unmount_slurm_fs/tasks/main.yml index 2e47fb00..75743249 100644 --- a/source/resources/playbooks/roles/unmount_slurm_fs/tasks/main.yml +++ b/source/resources/playbooks/roles/unmount_slurm_fs/tasks/main.yml @@ -5,7 +5,6 @@ debug: msg: | 
FileSystemDns: {{FileSystemDns}} - FileSystemIpAddress: {{FileSystemIpAddress}} FileSystemMountPath: {{FileSystemMountPath}} FileSystemMountSrc: {{FileSystemMountSrc}} FileSystemOptions: {{FileSystemOptions}} diff --git a/source/resources/user_data/WaitForAmi.py b/source/resources/user_data/WaitForAmi.py index 5c649209..dba9b221 100644 --- a/source/resources/user_data/WaitForAmi.py +++ b/source/resources/user_data/WaitForAmi.py @@ -25,6 +25,7 @@ import logging from logging import handlers from os import environ +from sys import exit from time import sleep logger = logging.getLogger(__file__) @@ -59,7 +60,11 @@ def main(): ec2_client = boto3.client('ec2') logger.info(f"Waiting for {args.ami_id} to be available.") while True: - ami_info = ec2_client.describe_images(ImageIds=[args.ami_id])['Images'][0] + try: + ami_info = ec2_client.describe_images(ImageIds=[args.ami_id])['Images'][0] + except IndexError: + logger.error(f"{args.ami_id} not found") + exit(2) state = ami_info['State'] ami_name = ami_info['Name'] logger.info(f"state={state}") diff --git a/source/resources/user_data/slurm_node_ami_config.sh b/source/resources/user_data/slurm_node_ami_config.sh index d2a1ee55..cd2c1bdd 100644 --- a/source/resources/user_data/slurm_node_ami_config.sh +++ b/source/resources/user_data/slurm_node_ami_config.sh @@ -27,10 +27,14 @@ if [ -e /var/lib/cloud/instance/sem/ami.txt ]; then ami=$(cat /var/lib/cloud/instance/sem/ami.txt) echo "First reboot after ami ($ami) created." chmod +x /root/WaitForAmi.py - /root/WaitForAmi.py --ami-id $ami --base-ssm-parameter $SlurmNodeAmiSsmParameterBaseName --instance-id $instance_id --compute-regions $ComputeRegions - # Delete the semaphore so that if the instance reboots because of template changes then a new AMI will be created - mv /var/lib/cloud/instance/sem/ami.txt /var/lib/cloud/instance/sem/$ami.txt - exit 0 + if ! 
/root/WaitForAmi.py --ami-id $ami --base-ssm-parameter $SlurmNodeAmiSsmParameterBaseName --instance-id $instance_id --compute-regions $ComputeRegions; then + echo "Could not wait for AMI. Assume it is bad and create a new one." + rm -f /var/lib/cloud/instance/sem/ami.txt + else + # Delete the semaphore so that if the instance reboots because of template changes then a new AMI will be created + mv /var/lib/cloud/instance/sem/ami.txt /var/lib/cloud/instance/sem/$ami.txt + exit 0 + fi fi # Install security updates first.