From be9823cd87c0db7e078744a43710966c3881fd4f Mon Sep 17 00:00:00 2001 From: Allan Carter Date: Tue, 3 Dec 2024 17:37:49 +0000 Subject: [PATCH] Fix xio resume script Miscellaneous Exostellar Infrastructure Optimizer integration fixes. Updated documentation. Add DefaultImageName to example config. Rename some of the XIO config parameters. * Replace ControllerSecurityGroupIds with Controllers/SecurityGroupIds * Replace WorkerSecurityGroupIds with Workers/SecurityGroupIds Fix a bug where referencing unset variable if AdditionalSecurityGroupsStackName not set. Change error to warning if an instance type doesn't exist in the current region. Fix configure_xio.py script to create new resources if they don't already exist. Fix hard-coded SLURM_CONF_PATH in resume_xspot.sh script. Check that XIO profile name is alphanumeric. If XIO pool's MinMemory not set, set it to the same value as MaxMemory. --- .../create_slurm_security_groups_stack.py | 4 + docs/exostellar-infrastructure-optimizer.md | 111 +++-- source/cdk/cdk_slurm_stack.py | 61 ++- source/cdk/config_schema.py | 14 +- .../opt/slurm/etc/exostellar/configure_xio.py | 373 ++++++++++----- .../opt/slurm/etc/exostellar/resume_xspot.sh | 2 +- .../opt/slurm/etc/exostellar/test_createVm.sh | 4 +- xio/xio-ems-2.3.2.yaml | 424 ++++++++++++++++++ 8 files changed, 814 insertions(+), 179 deletions(-) create mode 100644 xio/xio-ems-2.3.2.yaml diff --git a/create-slurm-security-groups/create_slurm_security_groups/create_slurm_security_groups_stack.py b/create-slurm-security-groups/create_slurm_security_groups/create_slurm_security_groups_stack.py index 993ccbd9..05b09ee4 100644 --- a/create-slurm-security-groups/create_slurm_security_groups/create_slurm_security_groups_stack.py +++ b/create-slurm-security-groups/create_slurm_security_groups/create_slurm_security_groups_stack.py @@ -105,6 +105,10 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: ) security_groups['SlurmdbdSG'] = slurmdbd_sg + # Rules for compute nodes + # Allow mounting of /opt/slurm and from head node + slurm_compute_node_sg.connections.allow_to(slurm_head_node_sg, ec2.Port.tcp(2049), f"SlurmComputeNodeSG to SlurmHeadNodeSG NFS") + # Rules for login nodes slurm_login_node_sg.connections.allow_from(slurm_head_node_sg, ec2.Port.tcp_range(1024, 65535), f"SlurmHeadNodeSG to SlurmLoginNodeSG ephemeral") slurm_login_node_sg.connections.allow_from(slurm_compute_node_sg, ec2.Port.tcp_range(1024, 65535), f"SlurmComputeNodeSG to SlurmLoginNodeSG ephemeral") diff --git a/docs/exostellar-infrastructure-optimizer.md b/docs/exostellar-infrastructure-optimizer.md index 67af6f7f..8cc53fee 100644 --- a/docs/exostellar-infrastructure-optimizer.md +++ b/docs/exostellar-infrastructure-optimizer.md @@ -49,11 +49,16 @@ Refer to [Exostellar's documentation](https://docs.exostellar.io/latest/Latest/H First deploy your cluster without configuring XIO. The cluster deploys ansible playbooks that will be used to create the XIO ParallelCluster AMI. -### Install the Exostellar Management Server (EMS) +### Deploy the Exostellar Management Server (EMS) The next step is to [install the Exostellar management server](https://docs.exostellar.io/latest/Latest/HPC-User/installing-management-server). -Exostellar will provide a link to a CloudFormation template that -will deploy the server in your account and will share 3 AMIs that are used by the template to create the EMS, controllers, and workers. +You must first subscribe to the three Exostellar Infrastructure AMIs in the AWS Marketplace. 
+ +* [Exostellar Management Server](https://aws.amazon.com/marketplace/server/procurement?productId=prod-crdnafbqnbnm2) +* [Exostellar Controller](https://aws.amazon.com/marketplace/server/procurement?productId=prod-d4lifqwlw4kja) +* [Exostellar Worker](https://aws.amazon.com/marketplace/server/procurement?productId=prod-2smeyk5fuxt7q) + +Then follow the [directions to deploy the CloudFormation template](https://docs.exostellar.io/latest/Latest/HPC-User/installing-management-server#v2.4.0.0InstallingwithCloudFormationTemplate(AWS)-Step3:CreateaNewStack). ### Create XIO Configuration @@ -80,12 +85,15 @@ available capacity pools and increase the likelihood of running on spot. **Note**: The Intel instance families contain more configurations and higher memory instances. They also have high frequency instance types such as m5zn, r7iz, and z1d. They also tend to have more capacity. The AMD instance families include HPC instance types, however, they do not support spot pricing and can only be used for on-demand. +**Note**: This is only an example configuration. You should customize it for your requirements. + ``` slurm: Xio: ManagementServerStackName: exostellar-management-server PartitionName: xio AvailabilityZone: us-east-2b + DefaultImageName: Profiles: - ProfileName: amd NodeGroupName: amd @@ -191,38 +199,6 @@ slurm: - xiezn - z1d EnableHyperthreading: false - - ProfileName: intel24core350g - NodeGroupName: intel24core350g - MaxControllers: 10 - InstanceTypes: - - r5.12xlarge:1 - - r5d.12xlarge:2 - - r6i.12xlarge:3 - - r6id.12xlarge:4 - - r7i.12xlarge:5 - - r7iz.12xlarge:6 - SpotFleetTypes: - - r5.12xlarge:1 - - r5d.12xlarge:2 - - r6i.12xlarge:3 - - r6id.12xlarge:4 - - r7i.12xlarge:5 - - r7iz.12xlarge:6 - EnableHyperthreading: false - - ProfileName: amd24core350g - NodeGroupName: amd24core350g - MaxControllers: 10 - InstanceTypes: - - r5a.12xlarge:1 - - r5ad.12xlarge:2 - - r6a.12xlarge:3 - - r7a.12xlarge:5 - SpotFleetTypes: - - r5a.12xlarge:1 - - r5ad.12xlarge:2 - - r6a.12xlarge:3 - - r7a.12xlarge:5 - EnableHyperthreading: false Pools: - PoolName: amd-8-gb-1-cores ProfileName: amd @@ -261,18 +237,12 @@ slurm: MaxMemory: 350000 ``` -### Create XIO Profiles - -In the EMS GUI copy the existing az1 profile to the profiles that you configured. -The name is all that matters. -The deployment will update the profile automatically from your configuration. - +### Verify that the "az1" profile exists -### Create the Application Environment +In the EMS GUI go to Profiles and make sure that the "az1" profile exists. +I use that as a template to create your new profiles. -In the EMS GUI copy the **slurm** Application Environment to a new environment that is the same -name as your ParallelCluster cluster. -The deployment will update the application environment from your configuration. +If it doesn't exist, there was a problem with the EMS deployment and you should contact Exostellar support. ### Create an XIO ParallelCluster AMI @@ -292,13 +262,18 @@ packages. Create an AMI from the instance and wait for it to become available. -### Update the cluster with the XIO Iconfiguration +After the AMI has been successfully created you can either stop or terminated the instance to save costs. +If you may need to do additional customization, then stop it, otherwise terminate it. + +### Update the cluster with the XIO configuration Update the cluster with the XIO configuration. This will update the profiles and environment on the EMS server and configure the cluster for XIO. 
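+
+As an example, the update is just a re-run of the deployment command against the config file that now contains the `slurm/Xio` section. The flags below assume this repository's `install.sh` wrapper and may differ in your checkout, so treat this as a sketch and verify against `install.sh --help`:
+
+```
+# Hypothetical example: re-run the installer so CDK applies the Xio settings to the existing cluster
+./install.sh --config-file <path-to-your-config.yml> --cdk-cmd update
+```
+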
The only remaining step before you can submit jobs is to create the XIO VM image.
+The cluster update is done before creating the image because it is the step that deploys the XIO scripts.
+
### Create an XIO Image from the XIO ParallelCluster AMI

Connect to the head node and create the XIO Image from the AMI you created.

The pool, profile, and image_name should be from your configuration.
The host name doesn't matter.

```
-/opt/slurm/etc/exostellar/teste_creasteVm.sh --pool <pool_name> --profile <profile_name> -i <image_name> -h <hostname>
+/opt/slurm/etc/exostellar/test_createVm.sh --pool <pool_name> --profile <profile_name> -i <image_name> -h <hostname>
```

+When this is done, the VM, worker, and controller should all terminate on their own.
+If they do not, connect to the EMS and cancel the job that started the controller.
+
+Use `squeue` to list the controller jobs. Use `scancel` to terminate them.
+
### Run a test job using Slurm

```
srun --pty -p xio-<pool_name>
```
+
+## Debug
+
+### UpdateHeadNode resource failed
+
+If the UpdateHeadNode resource fails, it is usually because a task in the ansible script failed.
+Connect to the head node and look for errors in:
+
+```/var/log/ansible.log```
+
+Usually it will be a problem with the `/opt/slurm/etc/exostellar/configure_xio.py` script.
+
+When this happens the CloudFormation stack will usually be in UPDATE_ROLLBACK_FAILED status.
+Before you can update it again you will need to complete the rollback.
+Go to Stack Actions, select `Continue update rollback`, expand `Advanced troubleshooting`, check the UpdateHeadNode resource, and click `Continue update rollback`.
+
+### XIO Controller not starting
+
+On the EMS, check that a job is running to create the controller.
+
+`squeue`
+
+On the EMS, check the autoscaling log to see if there are errors starting the instance.
+
+`less /var/log/slurm/autoscaling.log`
+
+The EMS Slurm partitions are defined in:
+
+`/xcompute/slurm/bin/partitions.json`
+
+They are derived from the partition and pool names.
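+
+For example, a quick triage session on the EMS might look like the following. This is a sketch: it assumes `jq` is installed on the EMS, and the paths are the ones listed above.
+
+```
+# List the jobs that launch XIO controllers for your cluster's partitions
+squeue
+
+# Cancel a stuck controller job by its job ID
+scancel <job_id>
+
+# Look for instance-launch errors in the EMS autoscaling log
+tail -n 100 /var/log/slurm/autoscaling.log
+
+# Review the partition/pool definitions generated from your configuration
+jq . /xcompute/slurm/bin/partitions.json
+```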
+ +### Worker instance not starting + +### VM not starting on worker + +### VM not starting Slurm job diff --git a/source/cdk/cdk_slurm_stack.py b/source/cdk/cdk_slurm_stack.py index be870e23..9037017e 100644 --- a/source/cdk/cdk_slurm_stack.py +++ b/source/cdk/cdk_slurm_stack.py @@ -892,21 +892,26 @@ def update_config_for_exostellar(self): if not exostellar_security_group: logger.error(f"ExostellarSecurityGroup resource not found in {ems_stack_name} EMS stack") exit(1) - if 'ControllerSecurityGroupIds' not in self.config['slurm']['Xio']: - self.config['slurm']['Xio']['ControllerSecurityGroupIds'] = [] - if 'WorkerSecurityGroupIds' not in self.config['slurm']['Xio']: - self.config['slurm']['Xio']['WorkerSecurityGroupIds'] = [] - if exostellar_security_group not in self.config['slurm']['Xio']['ControllerSecurityGroupIds']: - self.config['slurm']['Xio']['ControllerSecurityGroupIds'].append(exostellar_security_group) - if exostellar_security_group not in self.config['slurm']['Xio']['WorkerSecurityGroupIds']: - self.config['slurm']['Xio']['WorkerSecurityGroupIds'].append(exostellar_security_group) - if self.slurm_compute_node_sg_id: - if self.slurm_compute_node_sg_id not in self.config['slurm']['Xio']['WorkerSecurityGroupIds']: - self.config['slurm']['Xio']['WorkerSecurityGroupIds'].append(self.slurm_compute_node_sg_id) + if 'Controllers' not in self.config['slurm']['Xio']: + self.config['slurm']['Xio']['Controllers'] = {} + if 'SecurityGroupIds' not in self.config['slurm']['Xio']['Controllers']: + self.config['slurm']['Xio']['Controllers']['SecurityGroupIds'] = [] + if 'Workers' not in self.config['slurm']['Xio']: + self.config['slurm']['Xio']['Workers'] = {} + if 'SecurityGroupIds' not in self.config['slurm']['Xio']['Workers']: + self.config['slurm']['Xio']['Workers']['SecurityGroupIds'] = [] + if exostellar_security_group not in self.config['slurm']['Xio']['Controllers']['SecurityGroupIds']: + self.config['slurm']['Xio']['Controllers']['SecurityGroupIds'].append(exostellar_security_group) + if exostellar_security_group not in self.config['slurm']['Xio']['Workers']['SecurityGroupIds']: + self.config['slurm']['Xio']['Workers']['SecurityGroupIds'].append(exostellar_security_group) + if 'AdditionalSecurityGroupsStackName' in self.config: + if self.slurm_compute_node_sg_id: + if self.slurm_compute_node_sg_id not in self.config['slurm']['Xio']['Workers']['SecurityGroupIds']: + self.config['slurm']['Xio']['Workers']['SecurityGroupIds'].append(self.slurm_compute_node_sg_id) if 'RESStackName' in self.config: if self.res_dcv_security_group_id: - if self.res_dcv_security_group_id not in self.config['slurm']['Xio']['WorkerSecurityGroupIds']: - self.config['slurm']['Xio']['WorkerSecurityGroupIds'].append(self.res_dcv_security_group_id) + if self.res_dcv_security_group_id not in self.config['slurm']['Xio']['Workers']['SecurityGroupIds']: + self.config['slurm']['Xio']['Workers']['SecurityGroupIds'].append(self.res_dcv_security_group_id) # Get values from stack outputs ems_ip_address = None @@ -920,6 +925,7 @@ def update_config_for_exostellar(self): self.config['slurm']['Xio']['ManagementServerIp'] = ems_ip_address # Check that all of the profiles used by the pools are defined + logger.debug(f"Xio config:\n{json.dumps(self.config['slurm']['Xio'], indent=4)}") WEIGHT_PER_CORE = { 'amd': 45, 'intel': 78 @@ -928,12 +934,18 @@ def update_config_for_exostellar(self): 'amd': 3, 'intel': 3 } + number_of_warnings = 0 number_of_errors = 0 xio_profile_configs = {} self.instance_type_info = 
self.plugin.get_instance_types_info(self.cluster_region) self.instance_family_info = self.plugin.get_instance_families_info(self.cluster_region) for profile_config in self.config['slurm']['Xio']['Profiles']: profile_name = profile_config['ProfileName'] + # Check that profile name is alphanumeric + if not re.compile('^[a-zA-z0-9]+$').fullmatch(profile_name): + logger.error(f"Invalid XIO profile name: {profile_name}. Name must be alphanumeric.") + number_of_errors += 1 + continue if profile_name in xio_profile_configs: logger.error(f"{profile_config['ProfileNmae']} XIO profile already defined") number_of_errors += 1 @@ -941,22 +953,28 @@ def update_config_for_exostellar(self): xio_profile_configs[profile_name] = profile_config # Check that all instance types and families are from the correct CPU vendor profile_cpu_vendor = profile_config['CpuVendor'] + invalid_instance_types = [] for instance_type_or_family_with_weight in profile_config['InstanceTypes']: (instance_type, instance_family) = self.get_instance_type_and_family_from_xio_config(instance_type_or_family_with_weight) if not instance_type or not instance_family: - logger.error(f"XIO InstanceType {instance_type_or_family_with_weight} is not a valid instance type or family in the {self.cluster_region} region") - number_of_errors += 1 + logger.warning(f"XIO InstanceType {instance_type_or_family_with_weight} is not a valid instance type or family in the {self.cluster_region} region") + number_of_warnings += 1 + invalid_instance_types.append(instance_type_or_family_with_weight) continue instance_type_cpu_vendor = self.plugin.get_cpu_vendor(self.cluster_region, instance_type) if instance_type_cpu_vendor != profile_cpu_vendor: logger.error(f"Xio InstanceType {instance_type_or_family_with_weight} is from {instance_type_cpu_vendor} and must be from {profile_cpu_vendor}") number_of_errors += 1 + for invalid_instance_type in invalid_instance_types: + profile_config['InstanceTypes'].remove(invalid_instance_type) + invalid_instance_types = [] for instance_type_or_family_with_weight in profile_config['SpotFleetTypes']: (instance_type, instance_family) = self.get_instance_type_and_family_from_xio_config(instance_type_or_family_with_weight) if not instance_type or not instance_family: - logger.error(f"Xio SpotFleetType {instance_type_or_family_with_weight} is not a valid instance type or family in the {self.cluster_region} region") - number_of_errors += 1 + logger.warning(f"Xio SpotFleetType {instance_type_or_family_with_weight} is not a valid instance type or family in the {self.cluster_region} region") + number_of_warnings += 1 + invalid_instance_types.append(instance_type_or_family_with_weight) continue # Check that spot pricing is available for spot pools. 
price = self.plugin.instance_type_and_family_info[self.cluster_region]['instance_types'][instance_type]['pricing']['spot'].get('max', None) @@ -967,6 +985,9 @@ def update_config_for_exostellar(self): if instance_type_cpu_vendor != profile_cpu_vendor: logger.error(f"Xio InstanceType {instance_type_or_family_with_weight} is from {instance_type_cpu_vendor} and must be from {profile_cpu_vendor}") number_of_errors += 1 + for invalid_instance_type in invalid_instance_types: + profile_config['SpotFleetTypes'].remove(invalid_instance_type) + xio_pool_names = {} for pool_config in self.config['slurm']['Xio']['Pools']: pool_name = pool_config['PoolName'] @@ -985,6 +1006,8 @@ def update_config_for_exostellar(self): number_of_errors += 1 else: pool_config['ImageName'] = self.config['slurm']['Xio']['DefaultImageName'] + if 'MinMemory' not in pool_config: + pool_config['MinMemory'] = pool_config['MaxMemory'] if 'Weight' not in pool_config: profile_config = xio_profile_configs[profile_name] cpu_vendor = profile_config['CpuVendor'] @@ -2226,9 +2249,9 @@ def get_instance_template_vars(self, instance_role): if 'Xio' in self.config['slurm']: instance_template_vars['xio_mgt_ip'] = self.config['slurm']['Xio']['ManagementServerIp'] instance_template_vars['xio_availability_zone'] = self.config['slurm']['Xio']['AvailabilityZone'] - instance_template_vars['xio_controller_security_group_ids'] = self.config['slurm']['Xio']['ControllerSecurityGroupIds'] + instance_template_vars['xio_controller_security_group_ids'] = self.config['slurm']['Xio']['Controllers']['SecurityGroupIds'] instance_template_vars['subnet_id'] = self.config['SubnetId'] - instance_template_vars['xio_worker_security_group_ids'] = self.config['slurm']['Xio']['WorkerSecurityGroupIds'] + instance_template_vars['xio_worker_security_group_ids'] = self.config['slurm']['Xio']['Workers']['SecurityGroupIds'] instance_template_vars['xio_config'] = self.config['slurm']['Xio'] elif instance_role == 'ParallelClusterExternalLoginNode': instance_template_vars['slurm_version'] = get_SLURM_VERSION(self.config) diff --git a/source/cdk/config_schema.py b/source/cdk/config_schema.py index 5299b4e6..e3e6437e 100644 --- a/source/cdk/config_schema.py +++ b/source/cdk/config_schema.py @@ -1408,11 +1408,17 @@ def get_config_schema(config): Optional('Weight'): int } ], - Optional('ManagementServerImageId'): str, Optional('AvailabilityZone'): str, - Optional('ControllerSecurityGroupIds'): [ str ], - Optional('ControllerImageId'): str, - Optional('WorkerSecurityGroupIds'): [ str ], + Optional('Controllers'): { + Optional('ImageId'): str, + Optional('SecurityGroupIds'): [str], + Optional('IdentityRole'): str, + }, + Optional('Workers'): { + Optional('ImageId'): str, + Optional('SecurityGroupIds'): [ str ], + Optional('IdentityRole'): str + }, Optional('WorkerImageId'): str, }, Optional('SlurmUid', default=401): int, diff --git a/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/files/opt/slurm/etc/exostellar/configure_xio.py b/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/files/opt/slurm/etc/exostellar/configure_xio.py index f0f7166a..1d7566c0 100755 --- a/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/files/opt/slurm/etc/exostellar/configure_xio.py +++ b/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/files/opt/slurm/etc/exostellar/configure_xio.py @@ -48,129 +48,289 @@ def __init__(self): self.configure_environment() + if self.num_errors: + logger.error(f"Failed with {self.num_errors} 
errors") + exit(1) + def configure_profiles(self): - logger.info(f"Getting profile az1 to use as a template for new profiles.") - response = requests.get(f"{self.ems_url}/v1/profile/az1", verify=False) - if response.status_code != 200: + template_profile_config = self.get_template_profile_config() + if not template_profile_config: self.num_errors += 1 - logger.error(f"Failed to get az1 profile. code={response.status_code} content={response.content.decode('utf8')}") - az1_profile = None - else: - # logger.info(f"response content:\n{response.content.decode('utf8')}") - az1_profile = json.loads(response.content.decode('utf8')) - # logger.info(f"az1 profile:\n{json.dumps(az1_profile, indent=4)}") + logger.error(f"Failed to get template profile.") + exit(1) for profile_config in self.xio_config['Profiles']: - self.configure_profile(profile_config) - - def configure_profile(self, profile_config): - profile_name = profile_config['ProfileName'] - response = requests.get(f"{self.ems_url}/v1/profile/{profile_name}", verify=False) - profile = json.loads(response.content.decode('utf8')) - profile.pop('Arbiter', None) - profile['ProfileName'] = profile_name - profile['NodeGroupName'] = profile_name - name_tag = f"xspot-controller-{profile_name}" - name_tag_found = False - for tag_dict in profile['Controller']['InstanceTags']: - if tag_dict['Key'] == 'Name': - name_tag_found = True - tag_dict['Value'] = name_tag - if not name_tag_found: - profile['Controller']['InstanceTags'].append({ - 'Key': 'Name', - 'Value': name_tag - }) - if not profile['Controller']['ImageId']: - profile['Controller']['ImageId'] = self.xio_config['ControllerImageId'] - profile['MaxControllers'] = profile_config['MaxControllers'] - profile['Controller']['SecurityGroupIds'] = [] - for security_group_id in self.xio_config['ControllerSecurityGroupIds']: - profile['Controller']['SecurityGroupIds'].append(security_group_id) - profile['Worker']['InstanceTypes'] = [] - for instance_type in profile_config['InstanceTypes']: - profile['Worker']['InstanceTypes'].append(instance_type) - profile['Worker']['SpotFleetTypes'] = [] - for spot_fleet_type in profile_config['SpotFleetTypes']: - profile['Worker']['SpotFleetTypes'].append(spot_fleet_type) - name_tag_found = False - name_tag = f"xspot-worker-{profile_name}" - for tag_dict in profile['Worker']['InstanceTags']: - if tag_dict['Key'] == 'Name': - name_tag_found = True - tag_dict['Value'] = name_tag - if not name_tag_found: - profile['Worker']['InstanceTags'].append({ - 'Key': 'Name', - 'Value': name_tag - }) - profile['Worker']['SecurityGroupIds'] = [] - for security_group_id in self.xio_config['WorkerSecurityGroupIds']: - profile['Worker']['SecurityGroupIds'].append(security_group_id) - profile['Xspot']['EnableHyperthreading'] = profile_config['EnableHyperthreading'] - logger.info(f"{profile_name}:\n{json.dumps(profile, indent=4)}") + self.configure_profile(profile_config, template_profile_config) - logger.info(f"Updating profile {profile_name}") + def get_template_profile_config(self): + profile_name = 'az1' + logger.info(f"Checking if profile {profile_name} exists.") + response = requests.get(f"{self.ems_url}/v1/profile/{profile_name}", verify=False) + if response.status_code != 200: + if response.status_code != 404: + self.num_errors += 1 + logger.error(f"Unknown error getting {profile_name} profile. 
code={response.status_code} content:\n{response.content.decode('utf8')}") + return None + logger.info(f"Profile {profile_name} doesn't exist so creating it.") headers = {'Content-type': 'application/json'} - response = requests.put(f"{self.ems_url}/v1/profile", data=json.dumps(profile), headers=headers) - if response.status_code == 200: - logger.info(f"Succeeded: {response.content.decode('utf8')}") - else: - logger.error(f"{profile_name} update failed with code=={response.status_code}\n{response.content.decode('utf8')}") + template_profile_config = { + "AvailabilityZone": self.xio_config['AvailabilityZone'], + "Controller": { + "IdentityRole": self.xio_config['Controllers']['IdentityRole'], #"arn:aws:iam::415233562408:instance-profile/xio-ems-2-3-2-ExostellarInstanceProfile-KZxDGhXRFKJj" + "InstanceTags": [ + { + "Key": "exostellar.xspot-role", + "Value": "xspot-controller" + } + ], + "InstanceType": "c5.xlarge", + "SecurityGroupIds": self.xio_config['Controllers']['SecurityGroupIds'], + "SubnetID": self.ansible_head_node_vars['subnet_id'], + "VolumeSize": 100, + "ImageId": self.xio_config['Controllers']['ImageId'] + }, + "EnableIO": True, + "LogPath": "/xcompute/logs", + "Manufacturer": "Intel", + "MaxControllers": 10, + "ProfileName": "az1", + "Region": self.ansible_head_node_vars['region'], + "Worker": { + "IdentityRole": self.xio_config['Workers']['IdentityRole'], # "arn:aws:iam::415233562408:instance-profile/xio-ems-2-3-2-ExostellarInstanceProfile-KZxDGhXRFKJj" + "InstanceTags": [ + { + "Key": "exostellar.xspot-role", + "Value": "xspot-worker" + } + ], + "InstanceTypes": [ + "m5:0", + "m6i:1" + ], + "SecurityGroupIds": self.xio_config['Workers']['SecurityGroupIds'], + "SpotFleetTypes": [ + "m5:1", + "m5d:0", + "m6i:2" + ], + "SubnetID": self.ansible_head_node_vars['subnet_id'], + "ImageId": self.xio_config['Workers']['ImageId'] + }, + "Xspot": { + "EnableHyperthreading": False, + "EnableBalloon": True + }, + "XspotVersion": "xspot-3.0.3", + "NodeGroupName": "az1", + } + logger.debug(f"{profile_name} profile config:\n{json.dumps(template_profile_config, indent=4)}") - def configure_environment(self): - logger.info(f"Getting slurm environment to use as a template for new environments.") - response = requests.get(f"{self.ems_url}/v1/env/slurm", verify=False) + response = requests.post(f"{self.ems_url}/v1/profile", data=json.dumps(template_profile_config), headers=headers) + if response.status_code != 200: + logger.error(f"Failed to create {profile_name} profile. code=={response.status_code} content:\n{response.content.decode('utf8')}") + self.num_errors += 1 + return None + logger.info(f"Created {profile_name} profile: {response.content.decode('utf8')}") + + template_profile_config = None + logger.info(f"Getting profile {profile_name} to use as a template for new profiles.") + response = requests.get(f"{self.ems_url}/v1/profile/{profile_name}", verify=False) if response.status_code != 200: + logger.error(f"Failed to get {profile_name} profile. 
code={response.status_code} content={response.content.decode('utf8')}") + self.num_errors += 1 + return None + + template_profile_config = json.loads(response.content.decode('utf8')) + logger.info(f"{profile_name} profile:\n{json.dumps(template_profile_config, indent=4)}") + + # Remove the Id which is unique to each + template_profile_config.pop('Id', None) + if 'ImageId' in self.xio_config.get('Controllers', {}): + if template_profile_config['Controller']['ImageId'] != self.xio_config['Controllers']['ImageId']: + logger.info(f"Changing default Controller ImageId from {template_profile_config['Controller']['ImageId']} to {self.xio_config['Controllers']['ImageId']}") + template_profile_config['Controller']['ImageId'] = self.xio_config['Controllers']['ImageId'] + if 'ImageId' in self.xio_config.get('Workers', {}): + if template_profile_config['Worker']['ImageId'] != self.xio_config['Workers']['ImageId']: + logger.info(f"Changing default Worker ImageId from {template_profile_config['Worker']['ImageId']} to {self.xio_config['Workers']['ImageId']}") + template_profile_config['Worker']['ImageId'] = self.xio_config['Workers']['ImageId'] + for security_group_id in self.xio_config.get('Controllers', {}).get('SecurityGroupIds', []): + if security_group_id not in template_profile_config['Controller']['SecurityGroupIds']: + logger.info(f"Adding {security_group_id} to default Controller SecurityGroupIds") + template_profile_config['Controller']['SecurityGroupIds'].append(security_group_id) + for security_group_id in self.xio_config.get('Workers', {}).get('SecurityGroupIds', []): + if security_group_id not in template_profile_config['Worker']['SecurityGroupIds']: + logger.info(f"Adding {security_group_id} to default Worker SecurityGroupIds") + template_profile_config['Worker']['SecurityGroupIds'].append(security_group_id) + logger.debug(f"Modified {profile_name} profile:\n{json.dumps(template_profile_config, indent=4)}") + + return template_profile_config + + def configure_profile(self, profile_config, template_profile_config): + profile_name = profile_config['ProfileName'] + logger.info(f"Configuring {profile_name} profile") + profile_exists = False + response = requests.get(f"{self.ems_url}/v1/profile/{profile_name}", verify=False) + logger.debug(f"response:\n{response}") + if response.status_code == 404: + logger.info(f"{profile_name} profile doesn't exist so creating it.") + profile = deepcopy(template_profile_config) + elif response.status_code != 200: self.num_errors += 1 - logger.error(f"Failed to get slurm environment. code={response.status_code} content={response.content.decode('utf8')}") - slurm_env = None + logger.error(f"Failed to get {profile_name} profile. 
code={response.status_code} content={response.content.decode('utf8')}") + else: + logger.info(f"{profile_name} profile exists so updating it.") + profile_exists = True + try: + profile = json.loads(response.content.decode('utf8')) + except Exception as e: + logger.error(f"Invalid json config returned by server: {response.content.decode('utf8')}") + self.num_errors += 1 + return + profile.pop('Arbiter', None) + profile.pop('MeteringList', None) + profile.pop('Manufacturer', None) + profile.pop('Status', None) + + if profile_exists: + # Check fields against the template + if profile['Controller'].get('ImageId', '') != template_profile_config['Controller']['ImageId']: + logger.warning(f" Changing Controller.Imageid from '{profile['Controller'].get('ImageId', '')} to {template_profile_config['Controller']['ImageId']}") + profile['Controller']['ImageId'] = template_profile_config['Controller']['ImageId'] + if profile['Worker'].get('ImageId', '') != template_profile_config['Worker']['ImageId']: + logger.warning(f" Changing Worker.Imageid from '{profile['Worker'].get('ImageId', '')} to {template_profile_config['Worker']['ImageId']}") + profile['Worker']['ImageId'] = template_profile_config['Worker']['ImageId'] + for security_group_id in template_profile_config['Controller']['SecurityGroupIds']: + if security_group_id not in profile['Controller']['SecurityGroupIds']: + logger.info(f" Adding {security_group_id} to Controller.SecurityGroupIds") + profile['Controller']['SecurityGroupIds'].append(security_group_id) + for security_group_id in template_profile_config['Worker']['SecurityGroupIds']: + if security_group_id not in profile['Worker']['SecurityGroupIds']: + logger.warning(f" Adding {security_group_id} to Worker.SecurityGroupIds") + profile['Worker']['SecurityGroupIds'].append(security_group_id) + + # Set profile specific fields from the config + profile['ProfileName'] = profile_name + profile['NodeGroupName'] = profile_name + name_tag = f"xspot-controller-{profile_name}" + name_tag_found = False + for tag_dict in profile['Controller']['InstanceTags']: + if tag_dict['Key'] == 'Name': + name_tag_found = True + tag_dict['Value'] = name_tag + if not name_tag_found: + profile['Controller']['InstanceTags'].append({ + 'Key': 'Name', + 'Value': name_tag + }) + profile['MaxControllers'] = profile_config['MaxControllers'] + profile['Worker']['InstanceTypes'] = [] + for instance_type in profile_config['InstanceTypes']: + profile['Worker']['InstanceTypes'].append(instance_type) + profile['Worker']['SpotFleetTypes'] = [] + for spot_fleet_type in profile_config['SpotFleetTypes']: + profile['Worker']['SpotFleetTypes'].append(spot_fleet_type) + name_tag_found = False + name_tag = f"xspot-worker-{profile_name}" + for tag_dict in profile['Worker']['InstanceTags']: + if tag_dict['Key'] == 'Name': + name_tag_found = True + tag_dict['Value'] = name_tag + if not name_tag_found: + profile['Worker']['InstanceTags'].append({ + 'Key': 'Name', + 'Value': name_tag + }) + profile['Xspot']['EnableHyperthreading'] = profile_config['EnableHyperthreading'] + logger.info(f"{profile_name} profile config:\n{json.dumps(profile, indent=4)}") + + headers = {'Content-type': 'application/json'} + if profile_exists: + logger.info(f"Updating profile {profile_name}") + response = requests.put(f"{self.ems_url}/v1/profile", data=json.dumps(profile), headers=headers) + else: + logger.info(f"Creating profile {profile_name}") + response = requests.post(f"{self.ems_url}/v1/profile", data=json.dumps(profile), headers=headers) + if 
response.status_code == 200: + logger.info(f"Succeeded: {response.content.decode('utf8')}") else: - # logger.info(f"response:\n{response}") - # logger.info(f"response content:\n{response.content.decode('utf8')}") - slurm_env = json.loads(response.content.decode('utf8')) - # logger.info(f"slurm env:\n{json.dumps(slurm_env, indent=4)}") + logger.error(f"{profile_name} update failed with code=={response.status_code}\n{response.content.decode('utf8')}") + self.num_errors += 1 + def configure_environment(self): env_name = self.ansible_head_node_vars['cluster_name'] logger.info(f"Getting {env_name} environment.") + env_exists = False response = requests.get(f"{self.ems_url}/v1/env/{env_name}", verify=False) if response.status_code != 200: - self.num_errors += 1 - logger.error(f"Failed to get {env_name} environment. code={response.status_code} content={response.content.decode('utf8')}") - env = None + logger.info(f"{env_name} environment doesn't exist. code={response.status_code} content={response.content.decode('utf8')}") + env = {} else: - env = json.loads(response.content.decode('utf8')) - env['HeadAddress'] = f"head_node.{env_name}.pcluster" - env['Pools'] = [] - for pool_config in self.xio_config['Pools']: - env['Pools'].append({ - 'PoolName': pool_config['PoolName'], - 'PoolSize': pool_config['PoolSize'], - 'ProfileName': pool_config['ProfileName'], - 'VM': { - 'CPUs': pool_config['CPUs'], - 'ImageName': pool_config.get('ImageName', self.xio_config['DefaultImageName']), - 'MinMemory': pool_config['MinMemory'], - 'MaxMemory': pool_config['MaxMemory'], - 'VolumeSize': pool_config['VolumeSize'], - 'PrefixCount': 0, - 'UserData': '' - } - }) - env['Slurm'] = { - 'BinPath': f"/opt/slurm/bin", - 'ConfPath': f"/opt/slurm/etc", - 'PartitionName': self.xio_config['PartitionName'] - } - logger.info(f"{env_name} application environment:\n{json.dumps(env, indent=4)}") + logger.info(f"{env_name} environment exists. code={response.status_code} content={response.content.decode('utf8')}") + env_exists = True + try: + env = json.loads(response.content.decode('utf8')) + except Exception as e: + # Need the id from the existing environment or we can't update it so this is an error. + self.num_errors += 1 + logger.error(f"Invalid environment configuration returned by server:\n{response.content.decode('utf8')}") + return + if not env: + logger.info(f"Getting 'slurm' environment to use as a template for new environment.") + response = requests.get(f"{self.ems_url}/v1/env/slurm", verify=False) + if response.status_code != 200: + self.num_errors += 1 + logger.error(f"Failed to get 'slurm' environment. 
code={response.status_code} content={response.content.decode('utf8')}") + return + else: + logger.debug(f"response:\n{response}") + logger.debug(f"response content:\n{response.content.decode('utf8')}") + try: + template_env = json.loads(response.content.decode('utf8')) + logger.debug(f"template_env:\n{json.dumps(template_env, indent=4)}") + except Exception as e: + self.num_errors += 1 + logger.error(f"Invalid environment configuration returned by server:\n{response.content.decode('utf8')}") + return + + env['EnvName'] = env_name + env['Type'] = 'slurm' + env['HeadAddress'] = f"head_node.{env_name}.pcluster" + env['Pools'] = [] + for pool_config in self.xio_config['Pools']: + env['Pools'].append({ + 'PoolName': pool_config['PoolName'], + 'PoolSize': pool_config['PoolSize'], + 'ProfileName': pool_config['ProfileName'], + 'VM': { + 'CPUs': pool_config['CPUs'], + 'ImageName': pool_config.get('ImageName', self.xio_config['DefaultImageName']), + 'MinMemory': pool_config['MinMemory'], + 'MaxMemory': pool_config['MaxMemory'], + 'VolumeSize': pool_config['VolumeSize'], + 'PrefixCount': 0, + 'UserData': '' + } + }) + env['Slurm'] = { + 'BinPath': f"/opt/slurm/bin", + 'ConfPath': f"/opt/slurm/etc", + 'PartitionName': self.xio_config['PartitionName'] + } + logger.info(f"{env_name} application environment:\n{json.dumps(env, indent=4)}") + + headers = {'Content-type': 'application/json'} + if env_exists: logger.info(f"Updating environment {env_name}") - headers = {'Content-type': 'application/json'} response = requests.put(f"{self.ems_url}/v1/env", data=json.dumps(env), headers=headers) - if response.status_code == 200: - logger.info(f"Succeeded: {response.content.decode('utf8')}") - else: - logger.error(f"{env} environment update failed with code=={response.status_code}\n{response.content.decode('utf8')}") + else: + logger.info(f"Creating environment {env_name}") + response = requests.post(f"{self.ems_url}/v1/env", data=json.dumps(env), headers=headers) + if response.status_code == 200: + logger.info(f"Succeeded: {response.content.decode('utf8')}") + else: + self.num_errors += 1 + logger.error(f"{env_name} environment update failed with code=={response.status_code}\n{response.content.decode('utf8')}") return @@ -190,11 +350,10 @@ def configure_environment(self): if args.debug: logger.setLevel(logging.DEBUG) - stream_handler = logging.StreamHandler() - stream_handler.setFormatter(logger_formatter) - logger.addHandler(stream_handler) app = ConfigureXio() + except SystemExit as e: + exit(e) except: logging.exception(f"Unhandled exception in {__file__}") raise diff --git a/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/templates/opt/slurm/etc/exostellar/resume_xspot.sh b/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/templates/opt/slurm/etc/exostellar/resume_xspot.sh index 557dd4ed..6e0f428c 100755 --- a/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/templates/opt/slurm/etc/exostellar/resume_xspot.sh +++ b/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/templates/opt/slurm/etc/exostellar/resume_xspot.sh @@ -70,7 +70,7 @@ function resume_xspot() echo "MaxMemory=$mem" TMP_USER_DATA_FILE=$(mktemp).sh - cp /opt/slurm/res-demo-pc-3-10-1-rhel8-x86/etc/exostellar/xspot-vm_user_data.sh $TMP_USER_DATA_FILE + cp ${SLURM_CONF_PATH}/exostellar/xspot-vm_user_data.sh $TMP_USER_DATA_FILE sed -i "s/XSPOT_NODENAME/$host/g" $TMP_USER_DATA_FILE cat $TMP_USER_DATA_FILE user_data=$(cat $TMP_USER_DATA_FILE | base64 -w 0) diff --git 
a/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/templates/opt/slurm/etc/exostellar/test_createVm.sh b/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/templates/opt/slurm/etc/exostellar/test_createVm.sh index 7f5197d7..e3815294 100755 --- a/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/templates/opt/slurm/etc/exostellar/test_createVm.sh +++ b/source/resources/playbooks/roles/exostellar_infrastructure_optimizer/templates/opt/slurm/etc/exostellar/test_createVm.sh @@ -4,7 +4,7 @@ ## Email: support@exostellar.io ## ############################################################################### -XCOMPUTE_HEAD_IP=10.3.134.102 +XCOMPUTE_HEAD_IP={{ xio_mgt_ip }} pool="" profile="" @@ -87,6 +87,8 @@ echo -e "** OUT: JobId = $id\n" for i in {0..59}; do echo -ne "Waiting for $host... $((i * 10))s\033[0K\r" http_code=$(curl -s -w "%{http_code}" -X GET http://$XCOMPUTE_HEAD_IP:5000/v1/xcompute/vm/$host?detailedInfo=true -o $OUT_FILE) + echo + jq -r '' $OUT_FILE if [ $http_code -eq 200 ]; then echo "NodeName: `jq -r '.NodeName' $OUT_FILE`" echo "Controller: `jq -r '.Controller.NodeName' $OUT_FILE`" diff --git a/xio/xio-ems-2.3.2.yaml b/xio/xio-ems-2.3.2.yaml new file mode 100644 index 00000000..930b6f4b --- /dev/null +++ b/xio/xio-ems-2.3.2.yaml @@ -0,0 +1,424 @@ +AWSTemplateFormatVersion: '2010-09-09' +Description: Installer for the Exostellar Management Server. + +Metadata: + AWS::CloudFormation::Interface: + ParameterGroups: + - Label: + default: "Network Configuration" + Parameters: + - VPCId + - SubnetId + - DomainName + - SubnetIsPublic + - VPCCidr + - SharedSecurityGroupId + - Label: + default: "Instance Configuration" + Parameters: + - InstanceType + - KeyName + - TerminationProtection + - VolumeSize + - Label: + default: "HA Integration" + Parameters: + - NFSDNSName + - NFSSecurityGroupId + ParameterLabels: + VPCId: + default: "VPC ID" + VPCCidr: + default: "VPC CIDR" + SubnetId: + default: "Subnet ID" + SubnetIsPublic: + default: "Is Subnet Public?" + SharedSecurityGroupId: + default: "Shared security group ID (Optional)" + InstanceType: + default: "EC2 Instance Type" + KeyName: + default: "SSH Key Pair" + NFSDNSName: + default: "NFS DNS name (Optional). If not using remote NFS, leave empty." + NFSSecurityGroupId: + default: "NFS security group ID (Required if remote NFS like EFS is used)" + +Parameters: + VPCId: + Type: AWS::EC2::VPC::Id + Description: Select the VPC where the Mgmt Server will be deployed. + + SubnetId: + Type: AWS::EC2::Subnet::Id + Description: Select the subnet where the Mgmt Server will be deployed. + + DomainName: + Type: String + Default: '' + Description: Enter the domain name intended to be used to reach the Exostellar Management Server (leave empty if not using) + + SubnetIsPublic: + Type: String + Description: > + Specify 'true' if you want a public IP and this is a public subnet. + Note: This choice should reflect the actual configuration of the subnet; + it does not auto-detect if the subnet is public. + AllowedValues: + - 'true' + - 'false' + Default: 'true' + + VPCCidr: + Type: String + Description: Enter the CIDR block for the VPC. + Default: "0.0.0.0/0" + + SharedSecurityGroupId: + Type: String + Description: Enter the ID of the shared security group to attach to the Mgmt Server. Leave empty if none. + Default: "" + + InstanceType: + Type: String + Description: Select the EC2 instance type for the Mgmt Server. 
+ AllowedValues: + - c5.xlarge + - c5.2xlarge + - c5.4xlarge + - c5.9xlarge + - c5.12xlarge + - c5.18xlarge + - c5.24xlarge + - c5d.xlarge + - c5d.2xlarge + - c5d.4xlarge + - c5d.9xlarge + - c5d.12xlarge + - c5d.18xlarge + - c5d.24xlarge + - c6i.xlarge + - c6i.2xlarge + - c6i.4xlarge + - c6i.8xlarge + - c6i.12xlarge + - c6i.16xlarge + - c6i.24xlarge + - c6i.32xlarge + - c7i.xlarge + - c7i.2xlarge + - c7i.4xlarge + - c7i.8xlarge + - c7i.12xlarge + - c7i.16xlarge + - c7i.24xlarge + - c7i.48xlarge + - m5.large + - m5.xlarge + - m5.2xlarge + - m5.4xlarge + - m5.12xlarge + - m5.24xlarge + - m5d.large + - m5d.xlarge + - m5d.2xlarge + - m5d.4xlarge + - m5d.12xlarge + - m5d.24xlarge + - m6i.large + - m6i.xlarge + - m6i.2xlarge + - m6i.4xlarge + - m6i.12xlarge + - m6i.24xlarge + - m7i.large + - m7i.xlarge + - m7i.2xlarge + - m7i.4xlarge + - m7i.8xlarge + - m7i.12xlarge + - m7i.16xlarge + - m7i.24xlarge + - m7i.48xlarge + - r6i.large + - r6i.xlarge + - r6i.2xlarge + - r6i.4xlarge + - r6i.12xlarge + - r6i.24xlarge + - r7i.xlarge + - r7i.2xlarge + - r7i.4xlarge + - r7i.8xlarge + - r7i.12xlarge + - r7i.16xlarge + - r7i.24xlarge + - r7i.48xlarge + Default: m5d.xlarge + + VolumeSize: + Type: Number + Description: The size of the management server's EBS volume in GB. + Default: 100 + MinValue: 100 + + KeyName: + Type: AWS::EC2::KeyPair::KeyName + Description: Select the SSH key pair for Mgmt Server access. + + TerminationProtection: + Type: String + Description: Enable termination protection for the instance. + AllowedValues: + - 'yes' + - 'no' + Default: 'yes' + + NFSDNSName: + Type: String + Description: Enter the NFS DNS name. E.g., fs-123456789.efs.us-east-1.amazonaws.com. + Default: "" + + NFSSecurityGroupId: + Type: String + Description: Enter the ID of the security group that enables traffic between the NFS server and the management server. 
+ Default: "" + +Conditions: + TerminationProtectionCondition: !Equals [ !Ref TerminationProtection, 'yes' ] + SubnetIsPublic: !Equals [ !Ref SubnetIsPublic, 'true' ] + HasSharedSecurityGroup: !Not [!Equals [!Ref SharedSecurityGroupId, ""]] + HasSharedNFSSecurityGroup: !Not [!Equals [!Ref NFSSecurityGroupId, ""]] + +Mappings: + RegionAMIMap: + us-east-1: + AMIID: "ami-068ee583ca9f08b7a" + us-east-2: + AMIID: "ami-068e2edacd108e779" + us-west-1: + AMIID: "ami-008543102d9a2bb2b" + us-west-2: + AMIID: "ami-02796527adf4d9b7a" + eu-central-1: + AMIID: "" + eu-west-1: + AMIID: "" + eu-west-2: + AMIID: "" + eu-west-3: + AMIID: "" + ca-central-1: + AMIID: "" + ap-south-1: + AMIID: "" + ap-southeast-1: + AMIID: "" + ap-southeast-2: + AMIID: "" + +Resources: + ExostellarSecurityGroup: + Type: 'AWS::EC2::SecurityGroup' + Properties: + GroupDescription: 'Security Group for the Mgmt Server' + VpcId: !Ref VPCId + SecurityGroupIngress: + - IpProtocol: tcp + FromPort: '22' + ToPort: '22' + CidrIp: 0.0.0.0/0 + - IpProtocol: tcp + FromPort: '443' + ToPort: '443' + CidrIp: 0.0.0.0/0 + - IpProtocol: tcp + FromPort: '5000' + ToPort: '5000' + CidrIp: !Ref VPCCidr + Tags: + - Key: "Name" + Value: !Sub "${AWS::StackName}-SecurityGroup" + + ExostellarSecurityGroupSelfIngress: + Type: 'AWS::EC2::SecurityGroupIngress' + Properties: + GroupId: !Ref ExostellarSecurityGroup + SourceSecurityGroupId: !Ref ExostellarSecurityGroup + IpProtocol: -1 + FromPort: 0 + ToPort: 65535 + + ExostellarRole: + Type: 'AWS::IAM::Role' + Properties: + AssumeRolePolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Principal: + Service: ec2.amazonaws.com + Action: sts:AssumeRole + Policies: + - PolicyName: ExostellarPolicy + PolicyDocument: | + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "ec2:RunInstances", + "ec2:StopInstances", + "ec2:DescribeSpotPriceHistory", + "ec2:DescribeInstances", + "ec2:DescribeInstanceTypes", + "ec2:DescribeTags", + "ec2:CreateTags", + "ec2:CreateFleet", + "ec2:CreateLaunchTemplate", + "ec2:DeleteLaunchTemplate", + "ec2:TerminateInstances", + "ec2:AssignPrivateIpAddresses", + "ec2:UnassignPrivateIpAddresses", + "ec2:AttachNetworkInterface", + "ec2:DetachNetworkInterface", + "ec2:CreateNetworkInterface", + "ec2:DeleteNetworkInterface", + "ec2:ModifyNetworkInterfaceAttribute", + "ec2:DescribeRegions" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "iam:CreateServiceLinkedRole", + "iam:ListRoles", + "iam:ListInstanceProfiles", + "iam:PassRole", + "iam:GetRole" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:DescribeSubnets", + "ec2:DescribeSecurityGroups", + "ec2:DescribeImages", + "ec2:DescribeImageAttribute", + "ec2:DescribeKeyPairs", + "ec2:DescribeInstanceTypeOfferings", + "iam:GetInstanceProfile", + "iam:SimulatePrincipalPolicy", + "sns:Publish", + "ssm:GetParameters", + "ssm:GetParametersByPath" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:CreateVolume", + "ec2:DescribeVolumes", + "ec2:AttachVolume", + "ec2:ModifyInstanceAttribute", + "ec2:DetachVolume", + "ec2:DeleteVolume" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:CreateInstanceExportTask", + "ec2:DescribeExportTasks", + "ec2:RebootInstances", + "ec2:CreateSnapshot", + "ec2:DescribeSnapshots", + "ec2:LockSnapshot" + ], + "Resource": "*" + } + ] + } + Tags: + - Key: "Name" + Value: !Sub "${AWS::StackName}-IAMRole" + + ExostellarInstanceProfile: + Type: 'AWS::IAM::InstanceProfile' + 
    Properties:
      Roles:
        - !Ref ExostellarRole

  ExostellarInstanceLaunchTemplate:
    Type: 'AWS::EC2::LaunchTemplate'
    Properties:
      LaunchTemplateName: !Sub "${AWS::StackName}-ExostellarInstance"
      LaunchTemplateData:
        InstanceType: !Ref InstanceType
        KeyName: !Ref KeyName
        ImageId: !FindInMap [RegionAMIMap, !Ref 'AWS::Region', AMIID]
        NetworkInterfaces:
          - DeviceIndex: '0'
            SubnetId: !Ref SubnetId
            AssociatePublicIpAddress: !If [SubnetIsPublic, 'true', 'false']
            Groups:
              - !Ref ExostellarSecurityGroup
              - !If [HasSharedSecurityGroup, !Ref SharedSecurityGroupId, !Ref "AWS::NoValue"]
              - !If [HasSharedNFSSecurityGroup, !Ref NFSSecurityGroupId, !Ref "AWS::NoValue"]
        IamInstanceProfile:
          Name: !Ref ExostellarInstanceProfile
        BlockDeviceMappings:
          - DeviceName: "/dev/sda1"
            Ebs:
              VolumeSize: !Ref VolumeSize
              DeleteOnTermination: true
              VolumeType: "gp2"
        MetadataOptions:
          HttpEndpoint: enabled
          HttpPutResponseHopLimit: 1
          HttpTokens: optional

  ExostellarInstance:
    Type: 'AWS::EC2::Instance'
    Properties:
      LaunchTemplate:
        LaunchTemplateId: !Ref ExostellarInstanceLaunchTemplate
        Version: !GetAtt ExostellarInstanceLaunchTemplate.LatestVersionNumber
      Tags:
        - Key: Name
          Value: !Sub '${AWS::StackName}-MgmtServer'
      DisableApiTermination: !If [TerminationProtectionCondition, 'true', 'false']
      UserData:
        Fn::Base64:
          !Sub |
            #!/bin/bash
            cat <<EOF >> /xcompute/ems-options
            DOMAIN_NAME=${DomainName}
            NFS_REMOTE_HOST=${NFSDNSName}
            EOF

Outputs:
  1ExostellarMgmtServerURL:
    Description: The URL of the Mgmt Server
    Value: !If
      - SubnetIsPublic
      - !Sub 'https://${ExostellarInstance.PublicIp}'
      - !Sub 'https://${ExostellarInstance.PrivateIp}'

  2ExostellarMgmtServerPrivateIP:
    Description: The private IP of the Mgmt Server
    Value: !GetAtt ExostellarInstance.PrivateIp

  3ExostellarAdminUsername:
    Description: Initial admin username
    Value: 'admin@exostellar.io'

  4ExostellarAdminPassword:
    Description: Initial admin password (change on first login)
    Value: 'password'