Skip to content

Commit

Permalink
Create slurmfs ARecord for use in other regions.
Browse files Browse the repository at this point in the history
This required adding a lambda to do DNS lookups.
  • Loading branch information
cartalla committed Jun 22, 2022
1 parent 6d5231b commit 28fa8cc
Show file tree
Hide file tree
Showing 6 changed files with 180 additions and 66 deletions.
28 changes: 15 additions & 13 deletions install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,37 +36,39 @@ fi
python_version=$(python3 --version 2>&1 | awk '{print $2}')
python_major_version=$(echo $python_version | cut -d '.' -f 1)
python_minor_version=$(echo $python_version | cut -d '.' -f 2)
if [[ $python_minor_version -lt 6 ]]; then
echo "error: CDK requires python 3.6 or later. You have $python_version. Update your python3 version."
if [[ $python_minor_version -lt 7 ]]; then
echo "error: CDK requires python 3.7 or later. You have $python_version. Update your python3 version."
exit 1
fi

# Check nodejs version
required_nodejs_version=16.15.0
if ! node -v &> /dev/null; then
echo -e "\nnode not found in your path."
echo "Installing nodejs in your home dir. Hit ctrl-c to abort"
pushd $HOME
wget https://nodejs.org/dist/v16.13.1/node-v16.13.1-linux-x64.tar.xz
tar -xf node-v16.13.1-linux-x64.tar.xz
rm node-v16.13.1-linux-x64.tar.xz
wget https://nodejs.org/dist/v${required_nodejs_version}/node-v${required_nodejs_version}-linux-x64.tar.xz
tar -xf node-v${required_nodejs_version}-linux-x64.tar.xz
rm node-v${required_nodejs_version}-linux-x64.tar.xz
cat >> ~/.bashrc << EOF
# Nodejs
export PATH=$HOME/node-v16.13.1-linux-x64/bin:\$PATH
export PATH=$HOME/node-v${required_nodejs_version}-linux-x64/bin:\$PATH
EOF
source ~/.bashrc
popd
fi
# Check node version
node_version=$(node -v 2>&1 | awk '{print $1}')
node_version=${node_version:1}
node_major_version=$(echo $node_version | cut -d '.' -f 1)
node_minor_version=$(echo $node_version | cut -d '.' -f 2)

nodejs_version=$(node -v 2>&1 | awk '{print $1}')
nodejs_version=${nodejs_version:1}
node_major_version=$(echo $nodejs_version | cut -d '.' -f 1)
node_minor_version=$(echo $nodejs_version | cut -d '.' -f 2)
if [[ $node_major_version -lt 14 ]]; then
echo "error: CDK requires node 14.15.0 or later. You have $node_version. Update your node version."
echo "error: CDK requires node 14.15.0 or later. You have $nodejs_version. Update your node version."
exit 1
fi
if [[ $node_major_version -eq 14 ]] && [[ $node_minor_version -lt 6 ]]; then
echo "error: CDK requires node 14.15.0 or later. You have $node_version. Update your node version."
echo "error: CDK requires node 14.15.0 or later. You have $nodejs_version. Update your node version."
exit 1
fi

Expand Down
159 changes: 106 additions & 53 deletions source/cdk/cdk_slurm_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,9 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
# Assets must be created before setting instance_template_vars so the playbooks URL exists
self.create_assets()

self.create_lambdas()
# Create VPC before lambdas so that lambdas can access the VPC.
self.create_vpc()
self.create_lambdas()
self.create_security_groups()
if 'ElasticSearch' not in self.config['slurm']:
self.create_elasticsearch()
Expand Down Expand Up @@ -465,7 +466,68 @@ def create_assets(self):
self.on_prem_compute_nodes_config_file_asset = None
self.onprem_cidr = None

def create_vpc(self):
self.vpc = ec2.Vpc.from_lookup(self, "Vpc", vpc_id = self.config['VpcId'])

self.subnets = self.vpc.private_subnets
valid_subnet_ids = []
if 'SubnetId' in self.config:
self.subnet = None
for subnet in self.subnets:
valid_subnet_ids.append(subnet.subnet_id)
if subnet.subnet_id == self.config['SubnetId']:
self.subnet = subnet
break
if not self.subnet:
# If this is a new VPC then the cdk.context.json will not have the VPC and will be refreshed after the bootstrap phase. Until then the subnet ids will be placeholders so just pick the first subnet. After the bootstrap finishes the vpc lookup will be done and then the info will be correct.
if valid_subnet_ids[0] == 'p-12345':
logger.warning(f"VPC {self.config['VpcId']} not in cdk.context.json and will be refresshed before synth.")
self.subnet = self.vpc.private_subnets[0]
else:
logger.error(f"SubnetId {self.config['SubnetId']} not found in VPC {self.config['VpcId']}\nValid subnet ids:\n{pp.pformat(valid_subnet_ids)}")
exit(1)
else:
self.subnet = self.vpc.private_subnets[0]
self.config['SubnetId'] = self.subnet.subnet_id
logger.info(f"Subnet set to {self.config['SubnetId']}")
logger.info(f"availability zone: {self.subnet.availability_zone}")

# Can't create query logging for private hosted zone.
if 'HostedZoneId' in self.config:
self.hosted_zone = route53.HostedZone.from_hosted_zone_id(self, "PrivateDns", hosted_zone_id=self.config['HostedZoneId'])
self.config['Domain'] = self.hosted_zone.zone_name
else:
self.hosted_zone = route53.HostedZone(self, "PrivateDns",
vpcs = [self.vpc],
zone_name = self.config['Domain']
)
remote_vpcs = {}
for region_dict in self.config['slurm']['InstanceConfig']['Regions']:
if region_dict['Region'] == self.config['Region']:
continue
remote_vpcs[region_dict['Region']] = ec2.Vpc.from_lookup(
self, f"Vpc{region_dict['Region']}",
region = region_dict['Region'],
vpc_id = region_dict['VpcId'])
# BUG: CDK isn't creating the correct region for the vpcs even though cdk_context.json has it right.
#self.hosted_zone.add_vpc(remote_vpcs[region_dict['Region']])

def create_lambdas(self):
dnsLookupLambdaAsset = s3_assets.Asset(self, "DnsLookupLambdaAsset", path="resources/lambdas/DnsLookup")
self.dns_lookup_lambda = aws_lambda.Function(
self, "DnsLookupLambda",
function_name=f"{self.stack_name}-DnsLookup",
description="Lookup up FQDN in DNS",
memory_size=128,
runtime=aws_lambda.Runtime.PYTHON_3_7,
timeout=Duration.minutes(3),
log_retention=logs.RetentionDays.INFINITE,
handler="DnsLookup.lambda_handler",
code=aws_lambda.Code.from_bucket(dnsLookupLambdaAsset.bucket, dnsLookupLambdaAsset.s3_object_key),
vpc = self.vpc,
allow_all_outbound = True
)

createComputeNodeSGLambdaAsset = s3_assets.Asset(self, "CreateComputeNodeSGLambdaAsset", path="resources/lambdas/CreateComputeNodeSG")
self.create_compute_node_sg_lambda = aws_lambda.Function(
self, "CreateComputeNodeSGLambda",
Expand Down Expand Up @@ -555,52 +617,6 @@ def create_lambdas(self):
)
)

def create_vpc(self):
self.vpc = ec2.Vpc.from_lookup(self, "Vpc", vpc_id = self.config['VpcId'])

self.subnets = self.vpc.private_subnets
valid_subnet_ids = []
if 'SubnetId' in self.config:
self.subnet = None
for subnet in self.subnets:
valid_subnet_ids.append(subnet.subnet_id)
if subnet.subnet_id == self.config['SubnetId']:
self.subnet = subnet
break
if not self.subnet:
# If this is a new VPC then the cdk.context.json will not have the VPC and will be refreshed after the bootstrap phase. Until then the subnet ids will be placeholders so just pick the first subnet. After the bootstrap finishes the vpc lookup will be done and then the info will be correct.
if valid_subnet_ids[0] == 'p-12345':
logger.warning(f"VPC {self.config['VpcId']} not in cdk.context.json and will be refresshed before synth.")
self.subnet = self.vpc.private_subnets[0]
else:
logger.error(f"SubnetId {self.config['SubnetId']} not found in VPC {self.config['VpcId']}\nValid subnet ids:\n{pp.pformat(valid_subnet_ids)}")
exit(1)
else:
self.subnet = self.vpc.private_subnets[0]
self.config['SubnetId'] = self.subnet.subnet_id
logger.info(f"Subnet set to {self.config['SubnetId']}")
logger.info(f"availability zone: {self.subnet.availability_zone}")

# Can't create query logging for private hosted zone.
if 'HostedZoneId' in self.config:
self.hosted_zone = route53.HostedZone.from_hosted_zone_id(self, "PrivateDns", hosted_zone_id=self.config['HostedZoneId'])
self.config['Domain'] = self.hosted_zone.zone_name
else:
self.hosted_zone = route53.HostedZone(self, "PrivateDns",
vpcs = [self.vpc],
zone_name = self.config['Domain']
)
remote_vpcs = {}
for region_dict in self.config['slurm']['InstanceConfig']['Regions']:
if region_dict['Region'] == self.config['Region']:
continue
remote_vpcs[region_dict['Region']] = ec2.Vpc.from_lookup(
self, f"Vpc{region_dict['Region']}",
region = region_dict['Region'],
vpc_id = region_dict['VpcId'])
# BUG: CDK isn't creating the correct region for the vpcs even though cdk_context.json has it right.
#self.hosted_zone.add_vpc(remote_vpcs[region_dict['Region']])

def create_security_groups(self):
self.nfs_sg = ec2.SecurityGroup(self, "NfsSG", vpc=self.vpc, allow_all_outbound=False, description="Nfs Security Group")
Tags.of(self.nfs_sg).add("Name", f"{self.stack_name}-NfsSG")
Expand Down Expand Up @@ -1005,14 +1021,21 @@ def create_file_system(self):
self.file_system_dependency = self.file_system

self.file_system_dns = f"{self.file_system.file_system_id}.efs.{self.region}.amazonaws.com"
self.file_system_dns = self.file_system_dns

# Get IpAddress of file system
self.file_system_ip_address = CustomResource(
self, f"ZfsIpAddress",
service_token = self.dns_lookup_lambda.function_arn,
properties={
"FQDN": self.file_system_dns
}
).get_att_string('IpAddress')

self.file_system_port = 2049

self.file_system_mount_name = ""

self.file_system_mount_src = f"{self.file_system_dns}:/"
self.file_system_mount_source = self.file_system_mount_src
self.file_system_mount_source = f"{self.file_system_ip_address}:/"

if self.config['slurm']['storage']['efs']['use_efs_helper']:
self.file_system_type = 'efs'
Expand All @@ -1021,7 +1044,7 @@ def create_file_system(self):
self.file_system_type = 'nfs4'
self.file_system_options = 'nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport'

self.file_system_mount_command = f"sudo mkdir -p {self.config['slurm']['storage']['mount_path']} && sudo yum -y install nfs-utils && sudo mount -t {self.file_system_type} -o {self.file_system_options} {self.file_system_mount_src} {self.config['slurm']['storage']['mount_path']}"
self.file_system_mount_command = f"sudo mkdir -p {self.config['slurm']['storage']['mount_path']} && sudo yum -y install nfs-utils && sudo mount -t {self.file_system_type} -o {self.file_system_options} {self.file_system_mount_source} {self.config['slurm']['storage']['mount_path']}"

elif self.config['slurm']['storage']['provider'] == "ontap":
if 'iops' in self.config['slurm']['storage']['ontap']:
Expand Down Expand Up @@ -1075,6 +1098,15 @@ def create_file_system(self):
}
).get_att_string('DNSName')

# Get IpAddress of SVM
self.file_system_ip_address = CustomResource(
self, f"OntapSvmIpAddress",
service_token = self.dns_lookup_lambda.function_arn,
properties={
"FQDN": self.file_system_dns
}
).get_att_string('IpAddress')

# Add a volume
self.volume = fsx.CfnVolume(
self, 'OntapVolume',
Expand All @@ -1101,7 +1133,7 @@ def create_file_system(self):

self.file_system_mount_name = ""

self.file_system_mount_source = f"{self.file_system_dns}:/slurm"
self.file_system_mount_source = f"{self.file_system_ip_address}:/slurm"

self.file_system_options = 'nfsvers=4.1'

Expand Down Expand Up @@ -1172,9 +1204,18 @@ def create_file_system(self):
self.file_system_type = 'nfs'
self.file_system_dns = self.file_system.attr_dns_name

# Get IpAddress of file system
self.file_system_ip_address = CustomResource(
self, f"ZfsIpAddress",
service_token = self.dns_lookup_lambda.function_arn,
properties={
"FQDN": self.file_system_dns
}
).get_att_string('IpAddress')

self.file_system_mount_name = ""

self.file_system_mount_source = f"{self.file_system_dns}:/fsx/slurm"
self.file_system_mount_source = f"{self.file_system_ip_address}:/fsx/slurm"

self.file_system_options = 'nfsvers=4.1'

Expand All @@ -1185,6 +1226,14 @@ def create_file_system(self):

Tags.of(self.file_system).add("Name", f"{self.stack_name}-Slurm")

# Create DNS entry for file system that can be used in remote VPCs
route53.ARecord(
self, f"SlurmFileSystemDnsRecord",
zone = self.hosted_zone,
record_name = 'slurmfs',
target = route53.RecordTarget.from_ip_addresses(self.file_system_ip_address)
)

CfnOutput(self, "FileSystemProvider",
value = self.config['slurm']['storage']['provider']
)
Expand All @@ -1197,6 +1246,9 @@ def create_file_system(self):
CfnOutput(self, "FileSystemDnsName",
value = self.file_system_dns
)
CfnOutput(self, "FileSystemIpAddress",
value = self.file_system_ip_address
)
CfnOutput(self, "MountCommand",
value = self.file_system_mount_command
)
Expand Down Expand Up @@ -1651,6 +1703,7 @@ def get_instance_template_vars(self, instance_role):
"ERROR_SNS_TOPIC_ARN": self.config['ErrorSnsTopicArn'],
"ExtraMounts": self.config['slurm']['storage']['ExtraMounts'],
"FileSystemDns": self.file_system_dns,
"FileSystemIpAddress": self.file_system_ip_address,
"FileSystemMountPath": self.config['slurm']['storage']['mount_path'],
"FileSystemMountSrc": self.file_system_mount_source,
"FileSystemOptions": self.file_system_options,
Expand Down
56 changes: 56 additions & 0 deletions source/resources/lambdas/DnsLookup/DnsLookup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: MIT-0
Permission is hereby granted, free of charge, to any person obtaining a copy of this
software and associated documentation files (the "Software"), to deal in the Software
without restriction, including without limitation the rights to use, copy, modify,
merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""

'''
Do DNS lookup and return the IP address.
'''
import cfnresponse
import json
import logging
from socket import getaddrinfo, SOCK_STREAM

logging.getLogger().setLevel(logging.INFO)

def lambda_handler(event, context):
try:
logging.info(f"event:\n{json.dumps(event, indent=4)}")
properties = event['ResourceProperties']
required_properties = ['FQDN']
error_message = ""
for property in required_properties:
try:
value = properties[property]
except:
error_message += f"Missing {property} property. "
if error_message:
raise KeyError(error_message)

fqdn = properties['FQDN']
ip_address_tuples = getaddrinfo(host=fqdn, port=None, type=SOCK_STREAM)
logging.info(f"Found {len(ip_address_tuples)} ip addresses")
for ip_address_tuple in ip_address_tuples:
logging.info(f"ip_address_tuple: {ip_address_tuple}")
ip_address = ip_address_tuples[0][4][0]
logging.info(f"ip_address: {ip_address}")

except Exception as e:
logging.exception(str(e))
cfnresponse.send(event, context, cfnresponse.FAILED, {'error': str(e)}, str(e))
raise

cfnresponse.send(event, context, cfnresponse.SUCCESS, {'IpAddress': ip_address}, f"{ip_address}")
1 change: 1 addition & 0 deletions source/resources/lambdas/DnsLookup/cfnresponse.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
debug:
msg: |
FileSystemDns: {{FileSystemDns}}
FileSystemIpAddress: {{FileSystemIpAddress}}
FileSystemMountPath: {{FileSystemMountPath}}
FileSystemMountSrc: {{FileSystemMountSrc}}
FileSystemOptions: {{FileSystemOptions}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
debug:
msg: |
FileSystemDns: {{FileSystemDns}}
FileSystemIpAddress: {{FileSystemIpAddress}}
FileSystemMountPath: {{FileSystemMountPath}}
FileSystemMountSrc: {{FileSystemMountSrc}}
FileSystemOptions: {{FileSystemOptions}}
Expand Down

0 comments on commit 28fa8cc

Please sign in to comment.