Update config files and fix errors found in testing new configs
Add --RESEnvironmentName to the installer

Ease initial integration with Research and Engineering Studio (RES).

Automatically add the correct submitter security groups and configure
the /home directory.

Automatically choose the subnets based on the RES subnets when none are specified.

Resolves #207
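A minimal sketch of how subnet discovery like this could look with boto3, assuming RES tags its networking resources with the environment name (the tag key and filter below are assumptions, not taken from this commit):

    # Hypothetical sketch: find subnets for a RES environment by tag.
    # The tag key 'res:EnvironmentName' is an assumption for illustration.
    import boto3

    def get_res_subnet_ids(res_environment_name: str, region: str) -> list:
        ec2 = boto3.client('ec2', region_name=region)
        paginator = ec2.get_paginator('describe_subnets')
        subnet_ids = []
        for page in paginator.paginate(
                Filters=[{'Name': 'tag:res:EnvironmentName', 'Values': [res_environment_name]}]):
            for subnet in page['Subnets']:
                subnet_ids.append(subnet['SubnetId'])
        return sorted(subnet_ids)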

============================

Update template config files

Added more comments to clarify that these are examples that should be copied
and customized by users.

Added comments for typical configuration options.

Deleted obsolete configs that were from v1.

Resolves #203

=============================

Set default head node instance type based on architecture.

Resolves #206
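As a rough sketch, the default can come from a small per-architecture map; the instance types shown are illustrative assumptions, not necessarily the installer's actual defaults:

    # Illustrative defaults keyed by architecture; actual values may differ.
    DEFAULT_HEAD_NODE_INSTANCE_TYPE = {
        'x86_64': 'm6a.large',
        'arm64': 'm7g.large',
    }

    def default_head_node_instance_type(architecture: str) -> str:
        return DEFAULT_HEAD_NODE_INSTANCE_TYPE[architecture]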

==============================

Clean up ansible-lint errors and warnings.
Arm architecture clusters were failing because of an incorrect condition in the Ansible playbook that lint flags.

==============================

Use the VDI controller instead of the cluster manager for users and groups info.

The cluster manager stopped being domain joined for some reason.

==============================

Paginate describe_instances when creating the head node A record.

Otherwise, the cluster head node instance may not be found.
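A sketch of the paginated lookup, assuming the head node carries the standard ParallelCluster tags (the tag names here are assumptions):

    # Paginate so the head node is found even when describe_instances
    # returns more than one page of results.
    import boto3

    def find_head_node_instance(cluster_name: str, region: str):
        ec2 = boto3.client('ec2', region_name=region)
        paginator = ec2.get_paginator('describe_instances')
        filters = [
            {'Name': 'tag:parallelcluster:cluster-name', 'Values': [cluster_name]},
            {'Name': 'tag:parallelcluster:node-type', 'Values': ['HeadNode']},
            {'Name': 'instance-state-name', 'Values': ['running']},
        ]
        for page in paginator.paginate(Filters=filters):
            for reservation in page['Reservations']:
                for instance in reservation['Instances']:
                    return instance
        return None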

==============================

Add default MungeKeySecret.

This needs to be the default; otherwise, the same server cannot access multiple clusters.
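A hypothetical sketch of creating one shared munge key secret that every cluster can then reference by name (the secret name and key size are assumptions):

    # Create a shared munge key secret once; clusters that reference the same
    # secret can then authenticate to each other's Slurm daemons.
    import base64
    import os

    import boto3

    def create_shared_munge_key_secret(secret_name: str = 'SlurmMungeKey') -> str:
        secretsmanager = boto3.client('secretsmanager')
        munge_key = base64.b64encode(os.urandom(1024)).decode()  # 1024 random bytes, base64-encoded
        response = secretsmanager.create_secret(Name=secret_name, SecretString=munge_key)
        return response['ARN']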

==============================

Increase the timeout for the SSM command that configures submitters.

The extra time is needed to compile Slurm.
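A sketch of the kind of SSM call involved, with a longer execution timeout so the Slurm build can finish; the document parameters and the three-hour value are illustrative assumptions:

    # Run the submitter configuration script with a generous execution timeout.
    import boto3

    def configure_submitter(instance_id: str, commands: list, region: str) -> str:
        ssm = boto3.client('ssm', region_name=region)
        response = ssm.send_command(
            InstanceIds=[instance_id],
            DocumentName='AWS-RunShellScript',
            Parameters={
                'commands': commands,
                'executionTimeout': ['10800'],  # seconds; allow time for the Slurm build
            },
        )
        return response['Command']['CommandId']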

==============================

Force Slurm to be rebuilt for submitters of all OS distributions, even if they match the OS of the cluster.

Otherwise, errors occur because PluginDir cannot be found in the same location as when Slurm was compiled.

==============================

Paginate describe_instances in the UpdateHeadNode Lambda.

==============================

Add a check for a minimum of 4 GB of memory for the Slurm controller.
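A minimal sketch of such a guard using the MemoryInMiB field that EC2InstanceTypeInfo collects (the function name and error wording are illustrative):

    # Reject head node instance types with less than 4 GB (4096 MiB) of memory.
    MIN_CONTROLLER_MEMORY_MIB = 4096

    def check_head_node_memory(instance_type: str, instance_type_info: dict) -> None:
        memory_mib = instance_type_info[instance_type]['MemoryInMiB']
        if memory_mib < MIN_CONTROLLER_MEMORY_MIB:
            raise ValueError(
                f"{instance_type} has {memory_mib} MiB of memory; the Slurm controller "
                f"requires at least {MIN_CONTROLLER_MEMORY_MIB} MiB (4 GB).")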
cartalla committed Mar 20, 2024
1 parent a8b6555 commit b330437
Showing 57 changed files with 1,572 additions and 1,215 deletions.
165 changes: 109 additions & 56 deletions source/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py
@@ -45,52 +45,93 @@
class EC2InstanceTypeInfo:

def __init__(self, regions, get_savings_plans=True, json_filename=None, debug=False):
self.missing_regions = []

if debug:
logger.setLevel(logging.DEBUG)

# Get the list of valid regions to test for valid credentials
# and to validate regions arg.
# Valid credentials shouldn't be required.
# If they don't exist, then require json_filename with cached results.
self.valid_credentials = False
self.valid_regions = []
self.ec2_client = boto3.client('ec2', region_name='us-east-1')
try:
self.valid_regions = sorted([region["RegionName"] for region in self.describe_regions()["Regions"]])
self.valid_credentials = True
except ClientError as e:
logger.debug(f"{e.response['Error']['Message']}({e.response['Error']['Code']})")
logger.info(f"Valid AWS CLI credentials not found. Must specify json_filename or configure or update your AWS CLI credentials.")

# Valid credentials shouldn't be required.
# If they don't exist, then require json_filename with cached results.
if not self.valid_credentials:
if not json_filename:
raise ValueError(f"Valid AWS CLI credentials must be provided or json_filename must be set.")
if not path.exists(json_filename):
raise ValueError(f"{json_filename} doesn't exist and no valid AWS CLI credentials exist to create it. Configure or update your AWS CLI credentials.")

self.instance_type_and_family_info = {}
if json_filename:
if path.exists(json_filename):
logger.info(f"Reading cached info from {json_filename}")
try:
self.instance_type_and_family_info = json.loads(open(json_filename, 'r').read())
except Exception as e:
if not self.valid_credentials:
logger.exception(f"Error reading {json_filename} and no valid AWS CLI credentials to create a new version. Configure or update your AWS CLI credentials.")
raise
logger.exception(f"Error reading {json_filename}: {e}. Creating new version with latest format and data.")
os.rename(json_filename, json_filename + '.back')
self.instance_type_and_family_info = {}
try:
self.check_instance_type_and_family_info()
except:
if not self.valid_credentials:
logger.exception(f"Incorrect data in {json_filename} and no valid AWS CLI credentials to create a new version. Configure or update your AWS CLI credentials.")
raise
logger.exception(f"Incorrect data in {json_filename}. Creating new version with latest format and data.")
os.rename(json_filename, json_filename + '.back')
self.instance_type_and_family_info = {}
if not self.valid_regions and self.instance_type_and_family_info:
self.valid_regions = sorted(self.instance_type_and_family_info.keys())
else:
logger.info(f"{json_filename} doesn't exist so cannot be read and will be created.")
if not json_filename or not path.exists(json_filename):
self.instance_type_and_family_info = {}

if regions:
# Check that specified regions are valid
invalid_regions = []
for region in regions:
if region not in self.valid_regions:
logger.error(f"Invalid region: {region}")
invalid_regions.append(region)
if invalid_regions:
raise ValueError(f"Invalid regions specified. Valid regions: {json.dumps(self.valid_regions)}")
self.regions = regions
else:
self.regions = self.valid_regions

self.get_savings_plans = get_savings_plans

logger.info(f"Getting EC2 pricing info for following regions:\n{pp.pformat(self.regions)}")

# Endpoints only supported in 2 regions: https://docs.aws.amazon.com/cli/latest/reference/pricing/index.html
self.pricing_client = boto3.client('pricing', region_name='us-east-1')

for region in sorted(self.regions):
if region in self.instance_type_and_family_info and json_filename:
logger.info(f'Using EC2 instance info from {json_filename} for {region}')
continue
region_name = self.get_region_name(region)
if not region_name:
logger.error(f"Could not find region name for {region}. Is this a new region or does it need to be enabled for your account?")
continue
logger.info(f'Getting EC2 instance info for {region} ({region_name})')
assert(self.valid_credentials)
self.ec2_client = boto3.client('ec2', region_name=region)
if not self.get_instance_type_and_family_info(region):
continue

# Save json after each successful region to speed up reruns
if json_filename:
@@ -99,12 +140,21 @@ def __init__(self, regions, get_savings_plans=True, json_filename=None, debug=False):
print(json.dumps(self.instance_type_and_family_info, indent=4, sort_keys=True), file=fh)
fh.close()

if self.missing_regions:
logger.error(f"{len(self.missing_regions)} regions without names. May be new or not enabled in the account.\n{json.dumps(self.missing_regions, indent=4)}")
return

def get_instance_type_and_family_info(self, region):
region_name = self.get_region_name(region)
logger.debug(f"region_name={region_name}")
azs = []
try:
# If opt-in region isn't available then this can fail.
# Should have already checked that we have valid AWS CLI credentials in constructor.
self.describe_availability_zones(region)
except ClientError as e:
logger.error(f"Can't list availability zones for {region}. If {region} is a special region, make sure it is enabled for your account. {e}")
return None
for az_info in self.describe_availability_zones(region)['AvailabilityZones']:
if az_info['ZoneType'] != 'availability-zone':
continue
@@ -308,42 +358,43 @@ def get_instance_type_and_family_info(self, region):
instance_type_info[instanceType]['pricing']['ComputeSavingsPlan_min_discount'] = min_compute_sp_discount
instance_type_info[instanceType]['pricing']['ComputeSavingsPlan_max_discount'] = max_compute_sp_discount
logger.debug(f" instance_type_info:\n{json.dumps(instance_type_info[instanceType], indent=4, sort_keys=True)}")
return self.instance_type_and_family_info[region]

def check_instance_type_and_family_info(self):
'''
Raises KeyError
'''
for region, region_dict in self.instance_type_and_family_info.items():
for instance_type, instance_type_dict in region_dict['instance_types'].items():
try:
assert 'architecture' in instance_type_dict
assert 'SustainedClockSpeedInGhz' in instance_type_dict
assert 'DefaultVCpus' in instance_type_dict
assert 'DefaultCores' in instance_type_dict
assert 'DefaultThreadsPerCore' in instance_type_dict
assert 'ValidThreadsPerCore' in instance_type_dict
assert 'MemoryInMiB' in instance_type_dict
assert 'SSDCount' in instance_type_dict
assert 'SSDTotalSizeGB' in instance_type_dict
assert 'Hypervisor' in instance_type_dict
assert 'NetworkPerformance' in instance_type_dict
if 'pricing' in instance_type_dict:
assert 'ComputeSavingsPlan' in instance_type_dict['pricing']
except:
logger.error(f"{instance_type} instance type missing data:\n{json.dumps(instance_type_dict, indent=4)}")
raise
for instance_family, instance_family_dict in region_dict['instance_families'].items():
try:
assert 'instance_types' in instance_family_dict
assert 'architecture' in instance_family_dict
assert 'MaxCoreCount' in instance_family_dict
assert 'MaxInstanceType' in instance_family_dict
assert 'MaxInstanceSize' in instance_family_dict
except:
logger.error(f"{instance_family} family missing data:\n{json.dumps(instance_family_dict, indent=4)}")
raise

def print_csv(self, filename=""):
if filename:
@@ -473,19 +524,21 @@ def get_instance_size(instanceType):

# Translate region code to region name
def get_region_name(self, region_code):
endpoint_file = resource_filename('botocore', 'data/endpoints.json')
with open(endpoint_file, 'r') as f:
data = json.load(f)
# Patch in regions that are missing from the installed botocore endpoints data.
missing_region_names = {
'ca-west-1': {'description': 'Canada (Calgary)'}
}
for missing_region in missing_region_names:
if missing_region not in data['partitions'][0]['regions']:
data['partitions'][0]['regions'][missing_region] = missing_region_names[missing_region]
try:
region_name = data['partitions'][0]['regions'][region_code]['description']
except KeyError:
self.missing_regions.append(region_code)
logger.error(f"Couldn't get region name for {region_code}\nendpoint_file: {endpoint_file}\ndata:\n{pp.pformat(data['partitions'][0]['regions'])}")
return None
region_name = region_name.replace('Europe', 'EU')
return region_name
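
For reference, a hedged usage sketch of the updated constructor (the import path and file names are assumptions): with valid AWS credentials, instance info is fetched and cached per region; without credentials, an existing cached JSON file is required.

    # Hypothetical usage; module path and cache file name are assumptions.
    from EC2InstanceTypeInfoPkg.EC2InstanceTypeInfo import EC2InstanceTypeInfo

    ec2_info = EC2InstanceTypeInfo(
        regions=['us-east-1', 'eu-west-1'],       # an empty list means all valid regions
        get_savings_plans=False,
        json_filename='instance_type_info.json',  # regions already cached here are reused
    )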
