diff --git a/.travis.yml b/.travis.yml index e59bdae8c..0b5080264 100644 --- a/.travis.yml +++ b/.travis.yml @@ -27,9 +27,15 @@ before_deploy: - bash _scripts/configure-deploy.sh deploy: - provider: script - script: bash _scripts/deploy.sh - skip_cleanup: true - on: - repo: aws-samples/aws-genomics-workflows - branch: master \ No newline at end of file + - provider: script + script: bash _scripts/deploy.sh production + skip_cleanup: true + on: + repo: aws-samples/aws-genomics-workflows + branch: release + - provider: script + script: bash _scripts/deploy.sh test + skip_cleanup: true + on: + repo: aws-samples/aws-genomics-workflows + branch: master \ No newline at end of file diff --git a/README.md b/README.md index 630f472df..d88105812 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ The documentation is built using mkdocs. Install dependencies: ```bash -$ conda env create --file enviroment.yaml +$ conda env create --file environment.yaml ``` This will create a `conda` environment called `mkdocs` diff --git a/_scripts/deploy.sh b/_scripts/deploy.sh index ee7691ba2..9555d7998 100644 --- a/_scripts/deploy.sh +++ b/_scripts/deploy.sh @@ -5,30 +5,75 @@ set -e bash _scripts/make-artifacts.sh mkdocs build +ASSET_BUCKET=s3://aws-genomics-workflows +ASSET_STAGE=${1:-production} -echo "publishing artifacts:" -aws s3 sync \ - --profile asset-publisher \ - --acl public-read \ - --delete \ - ./artifacts \ - s3://aws-genomics-workflows/artifacts +function s3_uri() { + BUCKET=$1 + shift -echo "publishing templates:" -aws s3 sync \ - --profile asset-publisher \ - --acl public-read \ - --delete \ - --metadata commit=$(git rev-parse HEAD) \ - ./src/templates \ - s3://aws-genomics-workflows/templates + IFS="" + PREFIX_PARTS=("$@") + PREFIX_PARTS=(${PREFIX_PARTS[@]}) + PREFIX=$(printf '/%s' "${PREFIX_PARTS[@]%/}") + + echo "${BUCKET%/}/${PREFIX:1}" +} -echo "publishing site" -aws s3 sync \ - --acl public-read \ - --delete \ - ./site \ - s3://docs.opendata.aws/genomics-workflows +function artifacts() { + S3_URI=$(s3_uri $ASSET_BUCKET $ASSET_STAGE_PATH "artifacts") + echo "publishing artifacts: $S3_URI" + aws s3 sync \ + --profile asset-publisher \ + --acl public-read \ + --delete \ + ./artifacts \ + $S3_URI +} + +function templates() { + S3_URI=$(s3_uri $ASSET_BUCKET $ASSET_STAGE_PATH "templates") + + echo "publishing templates: $S3_URI" + aws s3 sync \ + --profile asset-publisher \ + --acl public-read \ + --delete \ + --metadata commit=$(git rev-parse HEAD) \ + ./src/templates \ + $S3_URI +} + +function site() { + echo "publishing site" + aws s3 sync \ + --acl public-read \ + --delete \ + ./site \ + s3://docs.opendata.aws/genomics-workflows +} + +function all() { + artifacts + templates + site +} + +echo "DEPLOYMENT STAGE: $ASSET_STAGE" +case $ASSET_STAGE in + production) + ASSET_STAGE_PATH="" + all + ;; + test) + ASSET_STAGE_PATH="test" + artifacts + templates + ;; + *) + echo "unsupported staging level - $ASSET_STAGE" + exit 1 +esac diff --git a/docs/core-env/create-custom-compute-resources.md b/docs/core-env/create-custom-compute-resources.md index 44bdcbee3..ed9e2bca1 100644 --- a/docs/core-env/create-custom-compute-resources.md +++ b/docs/core-env/create-custom-compute-resources.md @@ -1,20 +1,17 @@ -# Creating Custom Compute Resources +# Custom Compute Resources Genomics is a data-heavy workload and requires some modification to the defaults -used for batch job processing. In particular, instances running the Tasks/Jobs -need scalable storage to meet unpredictable runtime demands. 
+used by AWS Batch for job processing. To efficiently use resources, AWS Batch places multiple jobs on a worker instance. The data requirements for individual jobs can range from a few MB to 100s of GB. Instances running workflow jobs will not know beforehand how much space is required, and need scalable storage to meet unpredictable runtime demands. -By default, AWS Batch relies upon the [Amazon ECS-Optimized AMI](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-optimized_AMI.html) -to launch container instances for running jobs. This is sufficient in most cases, but specialized needs, such as the large -storage requirements noted above, require customization of the base AMI. - -This section provides two methods for customizing the base ECS-Optimized AMI -that adds an expandable working directory for jobs to write data. -A process will monitor the directory and add more EBS volumes on the fly to expand the free space -based on the capacity threshold, like so: +To handle this use case, we can use a process that monitors a scratch directory on an instance and expands free space as needed based on capacity thresholds. This can be done using logical volume management and attaching EBS volumes as needed to the instance like so: ![Autoscaling EBS storage](images/ebs-autoscale.png) +The above process - "EBS autoscaling" - requires a few small dependencies and a simple daemon installed on the host instance. + +By default, AWS Batch uses the [Amazon ECS-Optimized AMI](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-optimized_AMI.html) +to launch instances for running jobs. This is sufficient in most cases, but specialized needs, such as the large storage requirements noted above, require customization of the base AMI. Because the provisioning requirements for EBS autoscaling are fairly simple and lightweight, one can use an EC2 Launch Template to customize instances. + ## EC2 Launch Template The simplest method for customizing an instance is to use an EC2 Launch Template. @@ -43,11 +40,13 @@ packages: - python27-pip - sed - wget +# add more package names here if you need them runcmd: - pip install -U awscli boto3 - cd /opt && wget https://aws-genomics-workflows.s3.amazonaws.com/artifacts/aws-ebs-autoscale.tgz && tar -xzf aws-ebs-autoscale.tgz - sh /opt/ebs-autoscale/bin/init-ebs-autoscale.sh /scratch /dev/sdc 2>&1 > /var/log/init-ebs-autoscale.log +# you can add more commands here if you have additional provisioning steps --==BOUNDARY==-- ``` @@ -58,23 +57,113 @@ If you want this volume to be larger initially, you can specify a bigger one mapped to `/dev/sdc` the Launch Template. !!! note - The mount point is specific to what orchestration method / engine you intend - to use. `/scratch` is considered the default for AWS Step Functions. If you - are using a 3rd party workflow orchestration engine this mount point will need - to be adjusted to fit that engine's expectations. + The mount point is specific to what orchestration method / engine you intend to use. `/scratch` is considered a generic default. If you are using a 3rd party workflow orchestration engine this mount point will need to be adjusted to fit that engine's expectations. + +Also note that the script has MIME multi-part boundaries. This is because AWS Batch will combine this script with others that it uses to provision instances. + +## Creating an EC2 Launch Template + +Instructions on how to create a launch template are below.
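If you create the launch template with the AWS CLI (described later in this section), the user data script above must be supplied as a single base64 encoded string. A minimal sketch, assuming the script has been saved locally as `userdata.txt` (a hypothetical file name):

```bash
# encode the MIME multi-part user data script as a single base64 string
# (-w 0 disables line wrapping so the output stays on one line; GNU coreutils shown)
base64 -w 0 userdata.txt > userdata.b64

# the contents of userdata.b64 can then be pasted into the "UserData" field
# of the launch template data JSON shown further below
```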
Once your Launch Template is created, you can reference it when you setup resources in AWS Batch to ensure that jobs run therein have your customizations available +to them. + +### Automated via CloudFormation You can use the following CloudFormation template to create a Launch Template suitable for your needs. | Name | Description | Source | Launch Stack | | -- | -- | :--: | :--: | -{{ cfn_stack_row("EC2 Launch Template", "GenomicsWorkflow-LT", "aws-genomics-launch-template.template.yaml", "Creates an EC2 Launch Template that provisions instances on first boot for processing genomics workflow tasks.") }} +{{ cfn_stack_row("EC2 Launch Template", "GWFCore-LT", "aws-genomics-launch-template.template.yaml", "Creates an EC2 Launch Template that provisions instances on first boot for processing genomics workflow tasks.") }} + +### Manually via the AWS CLI + +In most cases, EC2 Launch Templates can be created using the AWS EC2 Console. +For this case, we need to use the AWS CLI. + +Create a file named `launch-template-data.json` with the following contents: + +```json +{ + "TagSpecifications": [ + { + "ResourceType": "instance", + "Tags": [ + { + "Key": "architecture", + "Value": "genomics-workflow" + }, + { + "Key": "solution", + "Value": "nextflow" + } + ] + } + ], + "BlockDeviceMappings": [ + { + "Ebs": { + "DeleteOnTermination": true, + "VolumeSize": 50, + "VolumeType": "gp2" + }, + "DeviceName": "/dev/xvda" + }, + { + "Ebs": { + "Encrypted": true, + "DeleteOnTermination": true, + "VolumeSize": 75, + "VolumeType": "gp2" + }, + "DeviceName": "/dev/xvdcz" + }, + { + "Ebs": { + "Encrypted": true, + "DeleteOnTermination": true, + "VolumeSize": 20, + "VolumeType": "gp2" + }, + "DeviceName": "/dev/sdc" + } + ], + "UserData": "...base64-encoded-string..." +} +``` -Once your Launch Template is created, you can reference it when you setup resources -in AWS Batch to ensure that jobs run therein have your customizations available -to them. +The above template will create an instance with three attached EBS volumes. + +* `/dev/xvda`: will be used for the root volume +* `/dev/xvdcz`: will be used for the docker metadata volume +* `/dev/sdc`: will be the initial volume use for scratch space (more on this below) -## Custom AMI +The `UserData` value should be the `base64` encoded version of the UserData script used to provision instances. + +Use the command below to create the corresponding launch template: + +```bash +aws ec2 \ + create-launch-template \ + --launch-template-name genomics-workflow-template \ + --launch-template-data file://launch-template-data.json +``` + +You should get something like the following as a response: + +```json +{ + "LaunchTemplate": { + "LatestVersionNumber": 1, + "LaunchTemplateId": "lt-0123456789abcdef0", + "LaunchTemplateName": "genomics-workflow-template", + "DefaultVersionNumber": 1, + "CreatedBy": "arn:aws:iam::123456789012:user/alice", + "CreateTime": "2019-01-01T00:00:00.000Z" + } +} +``` + +## Custom AMIs A slightly more involved method for customizing an instance is to create a new AMI based on the ECS Optimized AMI. This is good if you have @@ -83,14 +172,5 @@ datasets preloaded that will be needed by all your jobs. You can learn more about how to [create your own AMIs in the EC2 userguide](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html). -The CloudFormation template below automates the tasks needed to create an AMI and should take about 10-15min to complete. 
- -| Name | Description | Source | Launch Stack | -| -- | -- | :--: | :--: | -{{ cfn_stack_row("Custom AMI (Existing VPC)", "GenomicsWorkflow-AMI", "deprecated/aws-genomics-ami.template.yaml", "Creates a custom AMI that EC2 instances can be based on for processing genomics workflow tasks. The creation process will happen in a VPC you specify") }} - -Once your AMI is created, you will need to jot down its unique AMI Id. You will -need this when creating compute resources in AWS Batch. - !!! note This is considered advanced use. All documentation and CloudFormation templates hereon assumes use of EC2 Launch Templates. diff --git a/docs/core-env/create-iam-roles.md b/docs/core-env/create-iam-roles.md index c372e3e4d..ee9356d8c 100644 --- a/docs/core-env/create-iam-roles.md +++ b/docs/core-env/create-iam-roles.md @@ -1,6 +1,75 @@ # Permissions -## Create IAM Roles +IAM is used to control access to your AWS resources. This includes access by users and groups in your account, as well as access by AWS services such as AWS Batch operating on your behalf. + +Services use IAM Roles which provide temporary access to AWS resources when needed. + +!!! danger "IMPORTANT" + You need to have Administrative access to your AWS account to make changes in IAM. + + A recommended way to do this is to create a user and add that user to a group with the `AdministratorAccess` managed policy attached. This makes it easier to revoke these privileges if necessary. + +## Create IAM Resources + +### IAM Policies + +For the EC2 instance role described in the next section, it is recommended to restrict access to just the resources and permissions it needs to use. In this case, it will be: + +* Access to the specific buckets used for input and output data +* The ability to create and add EBS volumes to the instance (more on this later) + +These policies could be used by other roles, so it will be easier to manage them if each are stand alone documents. + +* **Bucket Access Policy (required)**: + +This policy specifies full access to a single S3 bucket named `` which physically resides in ``. + +```json +{ + "PolicyName": "s3bucket-access-", + "PolicyDocument": { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "s3:*", + "Resource": [ + "arn:aws:s3:::", + "arn:aws:s3:::/*" + ] + } + ] + } +} +``` + +If needed, the policy can be made more granular - i.e. only allowing access to a prefix within the bucket - by modifying the second `Resource` item to include the prefix path before the `*`. + +* **EBS Autoscale Policy (required)**: + +This policy allows job instance to attach EBS volumes to create extra scratch space for genomic data. + +```json +{ + "PolicyName": "ebs-autoscale-", + "PolicyDocument": { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "ec2:*Volume", + "ec2:describeVolumes", + "ec2:modifyInstanceAttribute" + ], + "Resource": "*" + } + ] + } +} +``` + +### IAM Roles IAM roles that your job execution environment in AWS Batch will use include: @@ -13,56 +82,165 @@ IAM roles that your job execution environment in AWS Batch will use include: * **Batch Instance Profile (required)**: Role that defines service permissions for EC2 instances launched by AWS Batch. - For example, this is used to specify policies that allow access to specific S3 buckets and modify storage on the instance (shown below). + This role should also have attached policies (see above) that allow access to specific S3 buckets and the ability to modify storage (e.g. 
EBS volumes) on the instance. [(Learn More)](https://docs.aws.amazon.com/batch/latest/userguide/instance_IAM_role.html) -```yaml - -# this inline policy specifies access to a single S3 bucket -- PolicyName: GenomicsEnv-S3Bucket-Access-us-east-1 - PolicyDocument: - Version: 2012-10-17 - Statement: - Effect: Allow - Resource: - - "arn:aws:s3:::" - - "arn:aws:s3:::/*" - Action: - - "s3:*" - -# this inline policy allows the job instance to attach EBS volumes to create -# extra scratch space for genomic data -- PolicyName: GenomicsEnv-Autoscale-EBS-us-east-1 - PolicyDocument: - Version: 2012-10-17 - Statement: - Effect: Allow - Action: - - "ec2:createVolume" - - "ec2:attachVolume" - - "ec2:deleteVolume" - - "ec2:modifyInstanceAttribute" - - "ec2:describeVolumes" - Resource: "*" -``` - * **Batch SpotFleet Role (depends)**: This role is needed if you intend to launch spot instances from AWS Batch. - If you create a managed compute environment that uses Amazon EC2 Spot Fleet Instances, you must create a role that grants the Spot Fleet permission to bid on, launch, tag, and terminate instances on your behalf. + If you create a managed compute environment that uses Amazon EC2 Spot Fleet Instances, you must create a role that grants the Spot Fleet permission to set a cost threshold, launch, tag, and terminate instances on your behalf. [(Learn More)](https://docs.aws.amazon.com/batch/latest/userguide/spot_fleet_IAM_role.html) * **Batch Job Role (optional)**: - Role used to provide service permissions to individual jobs. + Role used to provide specific service permissions to individual jobs. Jobs can run without an IAM role. In that case, they inherit the - permissions of the instance they run on. + permissions of the instance they run on. Job roles are useful if you have jobs that utilize additional AWS resources such as buckets with supplementary data or need to interact with other AWS services like databases. + +### Automated via CloudFormation -The CloudFormation template below creates all of the above roles. +The CloudFormation template below creates all of the above roles and policies. | Name | Description | Source | Launch Stack | | -- | -- | :--: | :--: | -{{ cfn_stack_row("Amazon IAM Roles", "GenomicsWorkflow-IAM", "aws-genomics-iam.template.yaml", "Create the necessary IAM Roles. This is useful to hand to someone with the right permissions to create these on your behalf. _You will need to provide a S3 bucket name_.") }} +{{ cfn_stack_row("Amazon IAM Roles", "GWFCore-IAM", "aws-genomics-iam.template.yaml", "Create the necessary IAM Roles. This is useful to hand to someone with the right permissions to create these on your behalf. _You will need to provide a S3 bucket name_.") }} + +!!! danger "Administrative Access Required" + In order to run this CloudFormation template you will need privileged access to your account either through an IAM user, STS assumed role, or CloudFormation Stack role. + +### Manually via the AWS Console + +#### Create a bucket access policy + +* Go to the IAM Console +* Click on "Policies" +* Click on "Create Policy" +* Repeat the following for as many buckets as you will use (e.g.
if you have one bucket for nextflow logs and another for nextflow workDir, you will need to do this twice) + * Select "S3" as the service + * Select "All Actions" + * Under Resources select "Specific" + * Under Resources > bucket, click "Add ARN" + * Type in the name of the bucket + * Click "Add" + * Under Resources > object, click "Add ARN" + * For "Bucket Name", type in the name of the bucket + * For "Object Name", select "Any" + * Click "Add additional permissions" if you have additional buckets you are using +* Click "Review Policy" +* Name the policy "bucket-access-policy" +* Click "Create Policy" + +#### Create an EBS autoscale policy + +* Go to the IAM Console +* Click on "Policies" +* Click on "Create Policy" +* Switch to the "JSON" tab +* Paste the following into the editor: + +```json +{ + "Version": "2012-10-17", + "Statement": { + "Action": [ + "ec2:*Volume", + "ec2:modifyInstanceAttribute", + "ec2:describeVolumes" + ], + "Resource": "*", + "Effect": "Allow" + } +} +``` + +* Click "Review Policy" +* Name the policy "ebs-autoscale-policy" +* Click "Create Policy" + +#### Create a Batch Service Role + +This is a role used by AWS Batch to launch EC2 instances on your behalf. + +* Go to the IAM Console +* Click on "Roles" +* Click on "Create role" +* Select "AWS service" as the trusted entity +* Choose "Batch" as the service to use the role +* Click "Next: Permissions" + +In Attached permissions policies, the "AWSBatchServiceRole" will already be attached + +* Click "Next: Tags". (adding tags is optional) +* Click "Next: Review" +* Set the Role Name to "AWSBatchServiceRole" +* Click "Create role" + +#### Create an EC2 Instance Role + +This is a role that controls what AWS Resources EC2 instances launched by AWS Batch have access to. +In this case, you will limit S3 access to just the bucket you created earlier. + +* Go to the IAM Console +* Click on "Roles" +* Click on "Create role" +* Select "AWS service" as the trusted entity +* Choose EC2 from the larger services list +* Choose "EC2 - Allows EC2 instances to call AWS services on your behalf" as the use case. +* Click "Next: Permissions" + +* Type "ContainerService" in the search field for policies +* Click the checkbox next to "AmazonEC2ContainerServiceforEC2Role" to attach the policy + +* Type "S3" in the search field for policies +* Click the checkbox next to "AmazonS3ReadOnlyAccess" to attach the policy !!! note - In order to create these roles you will need privileged access to your account. + Enabling Read-Only access to all S3 resources is required if you use publicly available datasets such as the [1000 Genomes dataset](https://registry.opendata.aws/1000-genomes/), and others, available in the [AWS Registry of Open Datasets](https://registry.opendata.aws) + +* Type "bucket-access-policy" in the search field for policies +* Click the checkbox next to "bucket-access-policy" to attach the policy + +* Type "ebs-autoscale-policy" in the search field for policies +* Click the checkbox next to "ebs-autoscale-policy" to attach the policy + +* Click "Next: Tags". (adding tags is optional) +* Click "Next: Review" +* Set the Role Name to "ecsInstanceRole" +* Click "Create role" + +#### Create an EC2 SpotFleet Role + +This is a role that allows creation and launch of Spot fleets - Spot instances with similar compute capabilities (i.e. vCPUs and RAM). This is for using Spot instances when running jobs in AWS Batch. 
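If you prefer to script this step, an equivalent role can be sketched with the AWS CLI; the file name below is illustrative, and the trust policy simply allows the Spot Fleet service to assume the role:

```bash
# trust policy allowing the Spot Fleet service to assume the role
cat > spotfleet-trust.json << 'EOF'
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": { "Service": "spotfleet.amazonaws.com" },
      "Action": "sts:AssumeRole"
    }
  ]
}
EOF

# create the role and attach the AWS managed Spot Fleet tagging policy
aws iam create-role \
    --role-name AWSSpotFleetTaggingRole \
    --assume-role-policy-document file://spotfleet-trust.json

aws iam attach-role-policy \
    --role-name AWSSpotFleetTaggingRole \
    --policy-arn arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole
```

To create the role via the console instead: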
+ +* Go to the IAM Console +* Click on "Roles" +* Click on "Create role" +* Select "AWS service" as the trusted entity +* Choose EC2 from the larger services list +* Choose "EC2 - Spot Fleet Tagging" as the use case + +In Attached permissions policies, the "AmazonEC2SpotFleetTaggingRole" will already be attached + +* Click "Next: Tags". (adding tags is optional) +* Click "Next: Review" +* Set the Role Name to "AWSSpotFleetTaggingRole" +* Click "Create role" + +#### Create a Job Role + +This is a role used by individual Batch Jobs to specify permissions to AWS resources in addition to permissions allowed by the Instance Role above. + +* Go to the IAM Console +* Click on "Roles" +* Click on "Create role" +* Select "AWS service" as the trusted entity +* Choose Elastic Container Service from the larger services list +* Choose "Elastic Container Service Task" as the use case. +* Click "Next: Permissions" + +* Attach AWS managed and user defined policies as needed. + +* Click "Next: Tags". (adding tags is optional) +* Click "Next: Review" +* Set the Role Name to "BatchJobRole" +* Click "Create role" diff --git a/docs/core-env/create-s3-bucket.md b/docs/core-env/create-s3-bucket.md index 571667232..609927a0b 100644 --- a/docs/core-env/create-s3-bucket.md +++ b/docs/core-env/create-s3-bucket.md @@ -1,9 +1,6 @@ # Data Storage -You will need a robust location to store your input and output data. As mentioned -previously, genomics data files are fairly large. In addition to input sample -files, genomics data processing typically relies on additional items like -reference sequences or annotation databases that can be equally large. +You will need a robust location to store your input and output data. Genomics data files often equal or exceed 100GB per file. In addition to input sample files, genomics data processing typically relies on additional items like reference sequences or annotation databases that can be equally large. The following are key criteria for storing data for genomics workflows @@ -12,12 +9,31 @@ The following are key criteria for storing data for genomics workflows * durable * capable of handling large files +Amazon S3 buckets meet all of the above conditions. S3 also makes it easy to collaboratively work on such large datasets because buckets and the data stored in them are globally available. + +You can use an S3 bucket to store both your input data and workflow results. + ## Create an S3 Bucket -Amazon S3 buckets meet all of the above conditions. +You can use an existing bucket for your workflows, or you can create a new one using the methods below. -You can use an existing bucket for your workflows, or you can create a new one using the CloudFormation template below. +### Automated via Cloudformation | Name | Description | Source | Launch Stack | | -- | -- | :--: | :--: | -{{ cfn_stack_row("Amazon S3 Bucket", "GenomicsWorkflow-S3", "aws-genomics-s3.template.yaml", "Creates a secure Amazon S3 bucket to read from and write results to.") }} +{{ cfn_stack_row("Amazon S3 Bucket", "GWFCore-S3", "aws-genomics-s3.template.yaml", "Creates a secure Amazon S3 bucket to read from and write results to.") }} + +### Manually via the AWS Console + +* Go to the S3 Console +* Click on the "Create Bucket" button + +In the dialog that opens: + +* Provide a "Bucket Name". This needs to be globally unique. + +* Select the region for the bucket. Buckets are globally accessible, but the data resides on physical hardware within a specific region. 
It is best to choose a region that is closest to where you are and where you will launch compute resources to reduce network latency and avoid inter-region transfer costs. + +The default options for bucket configuration are sufficient for the marjority of use cases. + +* Click the "Create" button to accept defaults and create the bucket. diff --git a/docs/core-env/setup-aws-batch.md b/docs/core-env/setup-aws-batch.md index e8b4fb519..edbed27bb 100644 --- a/docs/core-env/setup-aws-batch.md +++ b/docs/core-env/setup-aws-batch.md @@ -1,6 +1,4 @@ -# AWS Batch for Genomics Workflows - -## What is AWS Batch? +# AWS Batch [AWS Batch](https://aws.amazon.com/batch/) is a managed service that helps you efficiently run batch computing workloads on the AWS Cloud. Users submit jobs to job queues, specifying the application to be run and the compute resources (CPU and memory) required by the job. AWS Batch is responsible for launching the appropriate quantity and types of instances needed to run your jobs. @@ -14,7 +12,7 @@ A [job definition](http://docs.aws.amazon.com/batch/latest/userguide/job_definit Jobs are submitted to [job queues](http://docs.aws.amazon.com/batch/latest/userguide/job_queues.html) where they reside until they can be scheduled to run on Amazon EC2 instances within a compute environment. An AWS account can have multiple job queues, each with varying priority. This gives you the ability to closely align the consumption of compute resources with your organizational requirements. -[Compute environments](http://docs.aws.amazon.com/batch/latest/userguide/compute_environments.html) provision and manage your EC2 instances and other compute resources that are used to run your AWS Batch jobs. Job queues are mapped to one or more compute environments and a given environment can also be mapped to one or more job queues. This many-to-many relationship is defined by the compute environment order and job queue priority properties. +[Compute environments](http://docs.aws.amazon.com/batch/latest/userguide/compute_environments.html) are effectively autoscaling clusters of EC2 instances that are launched to run your jobs. Unlike traditional HPC clusters, compute environments can be configured to use a variety of instance types and sizes. The AWS Batch job scheduler will do the heavy lifting of placing jobs on the most appropriate instance type based on the jobs resource requirements. Compute environments can also use either On-demand instances, or Spot instances for maximum cost savings. Job queues are mapped to one or more compute environments and a given environment can also be mapped to one or more job queues. This many-to-many relationship is defined by the compute environment order and job queue priority properties. The following diagram shows a general overview of how the AWS Batch resources interact. @@ -22,17 +20,17 @@ The following diagram shows a general overview of how the AWS Batch resources in For more information, watch the [How AWS Batch Works](https://www.youtube.com/watch?v=T4aAWrGHmxQ) video. -## AWS Batch Jobs Requirements +## Job Requirements AWS Batch does not make assumptions on the structure and requirements that Jobs take with respect to inputs and outputs. Batch Jobs may take data streams, files, or only parameters as input, and produce the same variety for output, inclusive of files, metadata changes, updates to databases, etc. Batch assumes that each application handles their own input/output requirements. 
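For example, a common approach - used by the container examples later in this repository - is a thin wrapper script as the container entrypoint that stages inputs from S3, runs the tool against local scratch space, and copies results back. A minimal sketch with hypothetical bucket paths and a placeholder tool name:

```bash
#!/bin/bash
set -e

# hypothetical locations - substitute your own bucket, prefixes, and sample ID
# SAMPLE_ID is assumed to be passed in as a job environment variable
INPUT_PREFIX=s3://<bucket>/inputs
OUTPUT_PREFIX=s3://<bucket>/outputs
SCRATCH=/scratch/${AWS_BATCH_JOB_ID}
mkdir -p $SCRATCH

# stage input data from S3 onto instance scratch space
aws s3 cp --no-progress $INPUT_PREFIX/${SAMPLE_ID}.bam $SCRATCH/

# run the tool against local files ("some-tool" is a placeholder)
some-tool $SCRATCH/${SAMPLE_ID}.bam > $SCRATCH/${SAMPLE_ID}.out

# stage results back to S3 and clean up so the instance can be reused
aws s3 cp --no-progress $SCRATCH/${SAMPLE_ID}.out $OUTPUT_PREFIX/
rm -rf $SCRATCH
```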
A common pattern for bioinformatics tooling is that files such as genomic sequence data are both inputs and outputs to/from a process. Many bioinformatics tools have also been developed to run in traditional Linux-based compute clusters with shared filesystems and are not necessarily optimized for cloud computing. -The set of common requirements for genomics on AWS Batch are: +When using AWS Batch for genomics workflows, there are a couple key considerations: * Independent execution: - To make your workflow as flexible as possible, each job should run independently. As a result, you cannot necessarily guarantee that different jobs in the same overall workflow run on the same instance. Using S3 as the location to exchange data between containers enables you to decouple storage of your intermediate files from compute. + To make your workflow as flexible as possible, each job should run independently. As a result, you cannot necessarily guarantee that different jobs in the same overall workflow will run on the same instance. Using S3 as the location to exchange data between containers enables you to decouple storage of your intermediate files from compute. * Multitenancy: @@ -40,9 +38,9 @@ The set of common requirements for genomics on AWS Batch are: * Data cleanup: - As your jobs complete and write the output back to S3, it is a good idea to delete the scratch data generated by that job on your instance. This allows you to optimize for cost by reusing EC2 instances if there are jobs remaining in the queue, rather than terminating the EC2 instances. + As your jobs complete and write the output back to S3, it is a good idea to delete the scratch data generated by that job on the instance. This allows you to optimize for cost by reusing EC2 instances if there are jobs remaining in the queue, rather than terminating the EC2 instances. -## AWS Batch Environment +## Creating an AWS Batch Environment A complete AWS Batch environment consists of the following: @@ -51,8 +49,132 @@ A complete AWS Batch environment consists of the following: 3. A default Job Queue that utilizes the Spot compute environment first, but falls back to the on-demand compute environment if there is spare capacity available. 4. A high-priority Job Queue that leverages the on-demand and Spot CE's (in that order) and has higher priority than the default queue. +### Automated via CloudFormation + The CloudFormation template below will create all of the above. | Name | Description | Source | Launch Stack | | -- | -- | :--: | :--: | -{{ cfn_stack_row("AWS Batch", "GenomicsWorkflow-Batch", "aws-genomics-batch.template.yaml", "Creates AWS Batch Job Queues and Compute Environments. You will need to provide the details for your Launch Template ID, IAM roles and instance profiles, and the IDs for a VPC and subnets.") }} +{{ cfn_stack_row("AWS Batch", "GWFCore-Batch", "aws-genomics-batch.template.yaml", "Creates AWS Batch Job Queues and Compute Environments. _You will need to provide the details for your Launch Template ID, IAM roles and instance profiles, and the IDs for a VPC and subnets._") }} + +### Manually via the AWS Console + +#### Compute Environments + +You can create several compute environments to suit your needs. Below we'll create the following: + +* An "optimal" compute environment using on-demand instances +* An "optimal" compute environment using spot instances + +"Optimal" is a default grouping of EC2 instance types used for compute environments. 
It includes M4 (general purpose), C4 (compute-optimized), and R4 (memory-optimized) instance families which should be suitable for a wide range of computing cases. + +##### Create an "optimal" on-demand compute environment + +1. Go to the AWS Batch Console +2. Click on "Compute environments" +3. Click on "Create environment" +4. Select "Managed" as the "Compute environment type" +5. For "Compute environment name" type: "ondemand" +6. In the "Service role" drop down, select the `AWSBatchServiceRole` you created previously +7. In the "Instance role" drop down, select the `ecsInstanceRole` you created previously +8. For "Provisioning model" select "On-Demand" +9. "Allowed instance types" will be already populated with "optimal" - which is a mixture of M4, C4, and R4 instances. +10. In the "Launch template" drop down, select the `genomics-workflow-template` you created previously +11. Set Minimum and Desired vCPUs to 0. + +!!! info + **Minimum vCPUs** is the lowest number of active vCPUs (i.e. instances) your compute environment will keep running and available for placing jobs when there are no jobs queued. Setting this to 0 means that AWS Batch will terminate all instances when all queued jobs are complete. + + **Desired vCPUs** is the number of active vCPUs (i.e. instances) that are currently needed in the compute environment to process queued jobs. Setting this to 0 implies that there are currently no queued jobs. AWS Batch will adjust this number based on the number of jobs queued and their resource requirements. + + **Maximum vCPUs** is the highest number of active vCPUs (i.e. instances) your compute environment will launch. This places a limit on the number of jobs the compute environment can process in parallel. + +For networking, the options are populated with your account's default VPC, public subnets, and security group. This should be sufficient for the purposes of this workshop. In a production setting, it is recommended to use a separate VPC, private subnets therein, and associated security groups. + +Optional: (Recommended) Add EC2 tags. These will help identify which EC2 instances were launched by AWS Batch. At minimum: + +* Key: "Name" +* Value: "batch-ondemand-worker" + +Click on "Create" + +##### Create an "optimal" spot compute environment + +1. Go to the AWS Batch Console +2. Click on "Compute environments" +3. Click on "Create environment" +4. Select "Managed" as the "Compute environment type" +5. For "Compute environment name" type: "spot" +6. In the "Service role" drop down, select the `AWSBatchServiceRole` you created previously +7. In the "Instance role" drop down, select the `ecsInstanceRole` you created previously +8. For "Provisioning model" select "Spot" +9. In the "Spot fleet role" drop down, select the `AWSSpotFleetTaggingRole` you created previously +10. "Allowed instance types" will be already populated with "optimal" - which is a mixture of M4, C4, and R4 instances. +11. In the "Launch template" drop down, select the `genomics-workflow-template` you created previously +12. Set Minimum and Desired vCPUs to 0. + +For networking, the options are populated with your account's default VPC, public subnets, and security group. This should be sufficient for the purposes of this workshop. In a production setting, it is recommended to use a separate VPC, private subnets therein, and associated security groups. + +Optional: (Recommended) Add EC2 tags. These will help identify which EC2 instances were launched by AWS Batch. 
At minimum: + +* Key: "Name" +* Value: "batch-spot-worker" + +Click on "Create" + +#### Job Queues + +AWS Batch job queues, are where you submit and monitor the status of jobs. + +Job queues can be associated with one or more compute environments in a preferred order. Multiple job queues can be associated with the same compute environment. Thus to handle scheduling, job queues also have a priority weight as well. + +Below we'll create two job queues: + + * A "Default" job queue + * A "High Priority" job queue + +Both job queues will use both compute environments you created previously. + +##### Create a "default" job queue + +This queue is intended for jobs that do not require urgent completion, and can handle potential interruption. This queue will schedule jobs to: + +1. The "spot" compute environment +2. The "ondemand" compute environment + +in that order. + +Because it primarily leverages Spot instances, it will also be the most cost effective job queue. + +* Go to the AWS Batch Console +* Click on "Job queues" +* Click on "Create queue" +* For "Queue name" use "default" +* Set "Priority" to 1 +* Under "Connected compute environments for this queue", using the drop down menu: + + 1. Select the "spot" compute environment you created previously, then + 2. Select the "ondemand" compute environment you created previously + +* Click on "Create Job Queue" + +##### Create a "high-priority" job queue + +This queue is intended for jobs that are urgent and **cannot** handle potential interruption. This queue will schedule jobs to: + +1. The "ondemand" compute environment +2. The "spot" compute environment + +in that order. + +* Go to the AWS Batch Console +* Click on "Job queues" +* Click on "Create queue" +* For "Queue name" use "highpriority" +* Set "Priority" to 100 (higher values mean higher priority) +* Under "Connected compute environments for this queue", using the drop down menu: + + 1. Select the "ondemand" compute environment you created previously, then + 2. Select the "spot" compute environment you created previously + +* Click on "Create Job Queue" diff --git a/docs/orchestration/nextflow/nextflow-overview.md b/docs/orchestration/nextflow/nextflow-overview.md index 853e3b8db..b6e4330dc 100644 --- a/docs/orchestration/nextflow/nextflow-overview.md +++ b/docs/orchestration/nextflow/nextflow-overview.md @@ -65,25 +65,28 @@ ENTRYPOINT ["/opt/bin/nextflow.aws.sh"] !!! note If you are trying to keep your container image as small as possible, keep in mind that Nextflow relies on basic linux tools such as `awk`, `bash`, `ps`, `date`, `sed`, `grep`, `egrep`, and `tail` which may need to be installed on extra minimalist base images like `alpine`. -The script used for the entrypoint is shown below. The first parameter is the folder in S3 where you have staged your Nextflow scripts and supporting files (like additional config files). Any additional parameters are passed along to the Nextflow executable. This is important to remember when submiting the head node job. Notice that it automatically configures some Nextflow values based on environment variables set by AWS Batch. +The script used for the entrypoint is shown below. The first parameter should be a Nextflow "project". Nextflow supports pulling projects directly from Git repositories. This script also allows for projects to be specified as an S3 URI - a bucket and folder therein where you have staged your Nextflow scripts and supporting files (like additional config files). 
Any additional parameters are passed along to the Nextflow executable. Also, the script automatically configures some Nextflow values based on environment variables set by AWS Batch. ```bash -#!/bin/bash -echo $@ -NEXTFLOW_SCRIPT=$1 +echo "=== ENVIRONMENT ===" +echo `env` + +echo "=== RUN COMMAND ===" +echo "$@" + +NEXTFLOW_PROJECT=$1 shift -NEXTFLOW_PARAMS=$@ +NEXTFLOW_PARAMS="$@" # Create the default config using environment variables # passed into the container -mkdir -p /opt/config NF_CONFIG=~/.nextflow/config cat << EOF > $NF_CONFIG workDir = "$NF_WORKDIR" process.executor = "awsbatch" process.queue = "$NF_JOB_QUEUE" -executor.awscli = "/home/ec2-user/miniconda/bin/aws" +aws.batch.cliPath = "/home/ec2-user/miniconda/bin/aws" EOF # AWS Batch places multiple jobs on an instance @@ -91,20 +94,41 @@ EOF # to create a unique path GUID="$AWS_BATCH_JOB_ID/$AWS_BATCH_JOB_ATTEMPT" +if [ "$GUID" = "/" ]; then + GUID=`date | md5sum | cut -d " " -f 1` +fi + mkdir -p /opt/work/$GUID cd /opt/work/$GUID -# stage workflow definition -aws s3 sync --only-show-errors --exclude '.*' $NEXTFLOW_SCRIPT . +# stage in session cache +# .nextflow directory holds all session information for the current and past runs. +# it should be `sync`'d with an s3 uri, so that runs from previous sessions can be +# resumed +aws s3 sync --only-show-errors $NF_LOGSDIR/.nextflow .nextflow -NF_FILE=$(find . -name "*.nf" -maxdepth 1) +# stage workflow definition +if [[ "$NEXTFLOW_PROJECT" =~ "^s3://.*" ]]; then + aws s3 sync --only-show-errors --exclude 'runs/*' --exclude '.*' $NEXTFLOW_PROJECT ./project + NEXTFLOW_PROJECT=./project +fi echo "== Running Workflow ==" -echo "nextflow run $NF_FILE $NEXTFLOW_PARAMS" -nextflow run $NF_FILE $NEXTFLOW_PARAMS +echo "nextflow run $NEXTFLOW_PROJECT $NEXTFLOW_PARAMS" +nextflow run $NEXTFLOW_PROJECT $NEXTFLOW_PARAMS + +# stage out session cache +aws s3 sync --only-show-errors .nextflow $NF_LOGSDIR/.nextflow + +# .nextflow.log file has more detailed logging from the workflow run and is +# nominally unique per run. +# +# when run locally, .nextflow.logs are automatically rotated +# when syncing to S3 uniquely identify logs by the batch GUID +aws s3 cp --only-show-errors .nextflow.log $NF_LOGSDIR/.nextflow.log.${GUID/\//.} ``` -The `AWS_BATCH_JOB_ID` and `AWS_BATCH_JOB_ATTEMPT` are [environment variables that are automatically provided](https://docs.aws.amazon.com/batch/latest/userguide/job_env_vars.html) to all AWS Batch jobs. The `NF_WORKDIR` and `NF_JOB_QUEUE` variables are ones set by the Batch Job Definition ([see below](#batch-job-definition)). +The `AWS_BATCH_JOB_ID` and `AWS_BATCH_JOB_ATTEMPT` are [environment variables that are automatically provided](https://docs.aws.amazon.com/batch/latest/userguide/job_env_vars.html) to all AWS Batch jobs. The `NF_WORKDIR`, `NF_LOGSDIR`, and `NF_JOB_QUEUE` variables are ones set by the Batch Job Definition ([see below](#batch-job-definition)). 
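As an illustration (the values here are hypothetical, not part of the docs), if the Job Definition set `NF_WORKDIR=s3://my-nf-bucket/_nextflow/runs` and `NF_JOB_QUEUE=default`, the entrypoint script above would generate a config like the following:

```bash
# inspect the config generated by the entrypoint script
cat ~/.nextflow/config
# workDir = "s3://my-nf-bucket/_nextflow/runs"
# process.executor = "awsbatch"
# process.queue = "default"
# aws.batch.cliPath = "/home/ec2-user/miniconda/bin/aws"
```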
### Job instance AWS CLI @@ -146,51 +170,41 @@ An AWS Batch Job Definition for the containerized Nextflow described above is sh { "jobDefinitionName": "nextflow", "jobDefinitionArn": "arn:aws:batch:::job-definition/nextflow:1", - "revision": 1, - "status": "ACTIVE", "type": "container", - "parameters": { - "NextflowScript": "s3:///nextflow/workflow.nf" - }, + "parameters": {}, "containerProperties": { - "image": "/nextflow:latest", + "image": ".dkr.ecr..amazonaws.com/nextflow:latest", "vcpus": 2, "memory": 1024, - "command": [ - "Ref::NextflowScript" - ], - "volumes": [ - { - "host": { - "sourcePath": "/scratch" - }, - "name": "scratch" - } - ], + "command": [], + "jobRoleArn": "", + "volumes": [], "environment": [ + { + "name": "NF_LOGSDIR", + "value": "s3:///_nextflow/logs" + }, { "name": "NF_JOB_QUEUE", "value": "" }, { "name": "NF_WORKDIR", - "value": "s3:///runs" + "value": "s3:///_nextflow/runs" } ], - "mountPoints": [ - { - "containerPath": "/opt/work", - "sourceVolume": "scratch" - } - ], - "ulimits": [] + "mountPoints": [], + "ulimits": [], + "resourceRequirements": [] } } ``` +The `` is described below. + ### Nextflow IAM Role -Nextflow needs to be able to create and submit Batch Job Defintions and Batch Jobs, and read workflow script files in an S3 bucket. These permissions are provided via a Job Role associated with the Job Definition. Policies for this role would look like the following: +Nextflow needs to be able to create and submit Batch Job Defintions and Batch Jobs, and read workflow logs and session information from an S3 bucket. These permissions are provided via a Job Role associated with the Job Definition. Policies for this role would look like the following: #### Nextflow-Batch-Access @@ -213,7 +227,7 @@ This policy gives **full** access to AWS Batch. #### Nextflow-S3Bucket-Access -This policy gives **full** access to the buckets used to store data and workflow scripts. +This policy gives **full** access to the buckets used to store workflow data and Nextflow session metadata. ```json { @@ -224,8 +238,8 @@ This policy gives **full** access to the buckets used to store data and workflow "s3:*" ], "Resource": [ - "arn:aws:s3:::", - "arn:aws:s3:::/*", + "arn:aws:s3:::", + "arn:aws:s3:::/*", "arn:aws:s3:::", "arn:aws:s3:::/*" ], @@ -237,7 +251,9 @@ This policy gives **full** access to the buckets used to store data and workflow ## A Nextflow S3 Bucket -The containerized version of `nextflow` above reads a `*.nf` script from an S3 bucket and writes workflow logs and outputs back to it. This bucket can either be the same one that your workflow inputs and outputs are stored (e.g. in a separate folder therein) or it can be another bucket entirely. +Because running as a container will be an ephemeral process, the containerized version of `nextflow` stores workflow session information in S3 using paths described by `NF_WORKDIR` and `NF_LOGSDIR` environment variables. These allow you to use Nextflow's `-resume` flag to restart a workflow that was previously interrupted at the step it left off at. + +This bucket can be independent of the S3 bucket used to store workflow input and output data if necessary. ## Running a workflow @@ -333,86 +349,11 @@ For each process in your workflow, Nextflow will create a corresponding Batch Jo You can customize these job definitions to incorporate additional environment variables or volumes/mount points as needed. -!!! 
important - In order to take advantage of automatically [expandable scratch space](../../../core-env/create-custom-compute-resources/) in the host instance, you will need to modify Nextflow created job definitions to map a container volume from `/scratch` on the host to `/tmp` in the container. - -For example, a customized job definition for the process above that maps `/scratch` on the host to `/scratch` in the container and still work with Nextflow would be: - -```json -{ - "jobDefinitionName": "nf-ubuntu-latest", - "jobDefinitionArn": "arn:aws:batch:::job-definition/nf-ubuntu-latest:2", - "revision": 2, - "status": "ACTIVE", - "type": "container", - "parameters": { - "nf-token": "43869867b5fbae16fa7cfeb5ea2c3522" - }, - "containerProperties": { - "image": "ubuntu:latest", - "vcpus": 1, - "memory": 1024, - "command": [ - "true" - ], - "volumes": [ - { - "host": { - "sourcePath": "/home/ec2-user/miniconda" - }, - "name": "aws-cli" - }, - { - "host": { - "sourcePath": "/scratch" - }, - "name": "scratch" - } - ], - "environment": [], - "mountPoints": [ - { - "containerPath": "/home/ec2-user/miniconda", - "readOnly": true, - "sourceVolume": "aws-cli" - }, - { - "containerPath": "/scratch", - "sourceVolume": "scratch" - } - ], - "ulimits": [] - } -} -``` - -Nextflow will use the most recent revision of a Job Definition. - -You can also predefine Job Definitions that leverage extra volume mappings and refer to them in the process definition. Assuming you had an existing Job Definition named `say-hello`, a process definition that utilized it would look like: - -```groovy -texts = Channel.from("AWS", "Nextflow") - -process hello { - // directives - // substitute the container image reference with a job-definition reference - container "job-definition://say-hello" - - // compute resources for the Batch Job - cpus 1 - memory '512 MB' - - input: - val text from texts - - output: - file 'hello.txt' +!!! note + As of Nextflow 19.07 you can use the `aws.batch.volumes` config option to define additional volumes and mount points. - """ - echo "Hello $text" > hello.txt - """ -} -``` +!!! important + Instances provisioned using the Nextflow specific EC2 Launch Template configure `/var/lib/docker` in the host instance to use automatically [expandable scratch space](../../../core-env/create-custom-compute-resources/), allowing containerized jobs to stage as much data as needed without running into disk space limits. ### Running the workflow @@ -421,25 +362,30 @@ To run a workflow you submit a `nextflow` Batch job to the appropriate Batch Job * the AWS Batch Console * or the command line with the AWS CLI -This is what starting a workflow via the AWS CLI would look like: +This is what starting a workflow via the AWS CLI would look like using Nextflow's built-in "hello-world" workflow: ```bash +aws batch submit-job \ + --job-name nf-hello \ + --job-queue \ + --job-definition nextflow \ + --container-overrides command=hello +``` -git clone https://github.com/nf-core/rnaseq.git -aws s3 sync rnaseq s3://path/to/workflow/folder +After submitting a workflow, you can monitor the progress of tasks via the AWS Batch console. +For the "Hello World" workflow above you will see five jobs run in Batch - one for the head node, and one for each `Channel` text as it goes through the `hello` process. 
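You can also check on jobs from the command line with the AWS CLI, substituting your own queue name and job IDs (shown here as placeholders):

```bash
# list jobs currently running in a queue
aws batch list-jobs --job-queue <your-queue-name> --job-status RUNNING

# get detailed status for a specific job (e.g. the nextflow head node job)
aws batch describe-jobs --jobs <job-id>
```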
+ +For a more complex example, you can try the following, which will run the [RNASeq workflow](https://nf-co.re/rnaseq) developed by the [NF-Core project](https://nf-co.re/) against data in the [1000 Genomes AWS Public Dataset](https://registry.opendata.aws/1000-genomes/): +```bash aws batch submit-job \ - --job-name run-workflow-nf \ + --job-name nf-core-rnaseq \ --job-queue \ --job-definition nextflow \ - --container-overrides command=s3://path/to/workflow/folder,\ + --container-overrides command=nf-core/rnaseq,\ "--reads","'s3://1000genomes/phase3/data/HG00243/sequence_read/SRR*_{1,2}.filt.fastq.gz'",\ "--genome","GRCh37",\ "--skip_qc" ``` -After submitting a workflow, you can monitor the progress of tasks via the AWS Batch console. - -For the "Hello World" workflow above you will see three jobs run in Batch - one for the head node, and one for each `Channel` text as it goes through the `hello` process. - For the nf-core example "rnaseq" workflow you will see 11 jobs run in Batch over the course of a couple hours - the head node will last the whole duration of the pipeline while the others will stop once their step is complete. You can look at the CloudWatch logs for the head node job to monitor workflow progress. Note the additional single quotes wrapping the 1000genomes path. diff --git a/docs/quick-start.md b/docs/quick-start.md index 140d31a0c..d1630e9d2 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -23,15 +23,15 @@ The above template uses the AWS Quickstart reference for a [Modular and Scalable For architectural details, best practices, step-by-step instructions, and customization options, see the [deployment guide](https://fwd.aws/9VdxN). -## Step 1: Compute Environment +## Step 1: Core Environment ### Option A: Full stack -The "Full Stack" CloudFormation template below will create all of the AWS resources required - S3 Bucket, EC2 Launch Templates, IAM Roles, Batch Compute Environments, Batch Job Queues - for your genomics workflow environment into an existing VPC. +The "Full Stack" CloudFormation template below will create all of the AWS resources required - S3 Bucket, EC2 Launch Templates, IAM Roles, Batch Compute Environments, Batch Job Queues - you will need for a genomics workflow environment into an existing VPC. | Name | Description | Source | Launch Stack | | -- | -- | :--: | :--: | -{{ cfn_stack_row("Full Stack (Existing VPC)", "GenomicsEnv-Full", "aws-genomics-root-novpc.template.yaml", "Create EC2 Launch Templates, AWS Batch Job Queues and Compute Environments, a secure Amazon S3 bucket, and IAM policies and roles within an **existing** VPC. _NOTE: You must provide VPC ID, and subnet IDs_.") }} +{{ cfn_stack_row("Full Stack (Existing VPC)", "GWFCore-Full", "aws-genomics-root-novpc.template.yaml", "Create EC2 Launch Templates, AWS Batch Job Queues and Compute Environments, a secure Amazon S3 bucket, and IAM policies and roles within an **existing** VPC. _NOTE: You must provide VPC ID, and subnet IDs_.") }} Prior to the final create button, be sure to acknowledge "IAM CAPABILITIES". @@ -51,10 +51,10 @@ Below are the stand-alone CloudFormation templates for each of the sub-stacks. T | Name | Description | Source | Launch Stack | | -- | -- | :--: | :--: | -{{ cfn_stack_row("Amazon IAM Roles", "GenomicsWorkflow-IAM", "aws-genomics-iam.template.yaml", "Create the necessary IAM Roles. This is useful to hand to someone with the right permissions to create these on your behalf. 
_You will need to provide an Amazon S3 bucket name_.") }} -{{ cfn_stack_row("EC2 Launch Template", "GenomicsWorkflow-LT", "aws-genomics-launch-template.template.yaml", "Creates an EC2 Launch Template that provisions instances on first boot for processing genomics workflow tasks.") }} -{{ cfn_stack_row("Amazon S3 Bucket", "GenomicsWorkflow-S3", "aws-genomics-s3.template.yaml", "Creates a secure Amazon S3 bucket to read inputs and write results.") }} -{{ cfn_stack_row("AWS Batch", "GenomicsWorkflow-Batch", "aws-genomics-batch.template.yaml", "Creates AWS Batch Job Queues and Compute Environments. You will need to provide the details on IAM roles and instance profiles, and the IDs for a VPC and subnets.") }} +{{ cfn_stack_row("Amazon S3 Bucket", "GWFCore-S3", "aws-genomics-s3.template.yaml", "Creates a secure Amazon S3 bucket to read inputs and write results.") }} +{{ cfn_stack_row("Amazon IAM Roles", "GWFCore-IAM", "aws-genomics-iam.template.yaml", "Create the necessary IAM Roles. This is useful to hand to someone with the right permissions to create these on your behalf. _You will need to provide an Amazon S3 bucket name_.") }} +{{ cfn_stack_row("EC2 Launch Template", "GWFCore-LT", "aws-genomics-launch-template.template.yaml", "Creates an EC2 Launch Template that provisions instances on first boot for processing genomics workflow tasks.") }} +{{ cfn_stack_row("AWS Batch", "GWFCore-Batch", "aws-genomics-batch.template.yaml", "Creates AWS Batch Job Queues and Compute Environments. _You will need to provide the details for your Launch Template ID, IAM roles and instance profiles, and the IDs for a VPC and subnets._") }} ## Step 2: Worklow Orchestrators diff --git a/mkdocs.yml b/mkdocs.yml index 692f48d6f..c62c188ac 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -3,12 +3,12 @@ nav: - Overview: index.md - Disclaimer: disclaimer.md - Quick Start: quick-start.md - - Getting Started: + - Core Environment: - Introduction: core-env/introduction.md - Data Storage: core-env/create-s3-bucket.md - - Compute Resources: core-env/create-custom-compute-resources.md - Permissions: core-env/create-iam-roles.md - - Batch: core-env/setup-aws-batch.md + - Compute Resources: core-env/create-custom-compute-resources.md + - AWS Batch: core-env/setup-aws-batch.md # - Containerized Tooling: # - Introduction: containers/container-introduction.md # - Examples: containers/container-examples.md diff --git a/src/containers/bcftools/aws.dockerfile b/src/containers/bcftools/aws.dockerfile index 4a708db1b..93431cdab 100644 --- a/src/containers/bcftools/aws.dockerfile +++ b/src/containers/bcftools/aws.dockerfile @@ -1,5 +1,6 @@ FROM bcftools:latest +RUN apt-get update RUN apt-get install -y awscli RUN apt-get clean diff --git a/src/containers/bcftools/bcftools.aws.sh b/src/containers/bcftools/bcftools.aws.sh index bfcf141a8..d18df631e 100644 --- a/src/containers/bcftools/bcftools.aws.sh +++ b/src/containers/bcftools/bcftools.aws.sh @@ -29,7 +29,8 @@ function call() { bcftools call \ -m \ - --threads 8 \ + --threads 16 \ + -t chr21 \ -o $OUTPUT_PATH/${SAMPLE_ID}.vcf \ $INPUT_PATH/${SAMPLE_ID}.mpileup.vcf.gz @@ -48,10 +49,14 @@ function mpileup() { aws s3 cp \ --no-progress \ - ${INPUT_PREFIX}/${SAMPLE_ID}.bam $INPUT_PATH + --recursive \ + --exclude "*" \ + --include "${SAMPLE_ID}.bam*"\ + ${INPUT_PREFIX}/ $INPUT_PATH bcftools mpileup \ - --threads 8 \ + --threads 16 \ + -r chr21 \ -Oz \ -f $REFERENCE_PATH/${REFERENCE_NAME}.fasta \ $INPUT_PATH/${SAMPLE_ID}.bam \ diff --git a/src/containers/bwa/aws.dockerfile 
b/src/containers/bwa/aws.dockerfile index 61c840c54..8d4dc910d 100644 --- a/src/containers/bwa/aws.dockerfile +++ b/src/containers/bwa/aws.dockerfile @@ -1,5 +1,6 @@ FROM bwa:latest +RUN apt-get update RUN apt-get install -y awscli RUN apt-get clean diff --git a/src/containers/bwa/bwa.aws.sh b/src/containers/bwa/bwa.aws.sh index 861da3516..0872ed4df 100644 --- a/src/containers/bwa/bwa.aws.sh +++ b/src/containers/bwa/bwa.aws.sh @@ -39,9 +39,9 @@ function mem() { ${INPUT_PREFIX} $INPUT_PATH # command - bwa mem -p \ + bwa mem -t 16 -p \ $REFERENCE_PATH/${REFERENCE_NAME}.fasta \ - $INPUT_PATH/${SAMPLE_ID}_1.fastq.gz \ + $INPUT_PATH/${SAMPLE_ID}_*1*.fastq.gz \ > $OUTPUT_PATH/${SAMPLE_ID}.sam # data staging diff --git a/src/containers/nextflow/nextflow.aws.sh b/src/containers/nextflow/nextflow.aws.sh index cd8ad96a3..8e10c7496 100644 --- a/src/containers/nextflow/nextflow.aws.sh +++ b/src/containers/nextflow/nextflow.aws.sh @@ -1,8 +1,19 @@ #!/bin/bash -# $1 S3 URI to Nextflow project files. If not using S3 set to "". +# $1 Nextflow project. Can be an S3 URI, or git repo name. # $2.. Additional parameters passed on to the nextflow cli +# using nextflow needs the following locations/directories provided as +# environment variables to the container +# * NF_LOGSDIR: where caching and logging data are stored +# * NF_WORKDIR: where intermmediate results are stored + + +echo "=== ENVIRONMENT ===" +echo `env` + +echo "=== RUN COMMAND ===" echo "$@" + NEXTFLOW_PROJECT=$1 shift NEXTFLOW_PARAMS="$@" @@ -15,9 +26,12 @@ cat << EOF > $NF_CONFIG workDir = "$NF_WORKDIR" process.executor = "awsbatch" process.queue = "$NF_JOB_QUEUE" -executor.awscli = "/home/ec2-user/miniconda/bin/aws" +aws.batch.cliPath = "/home/ec2-user/miniconda/bin/aws" EOF +echo "=== CONFIGURATION ===" +cat ~/.nextflow/config + # AWS Batch places multiple jobs on an instance # To avoid file path clobbering use the JobID and JobAttempt # to create a unique path @@ -30,13 +44,28 @@ fi mkdir -p /opt/work/$GUID cd /opt/work/$GUID +# stage in session cache +# .nextflow directory holds all session information for the current and past runs. +# it should be `sync`'d with an s3 uri, so that runs from previous sessions can be +# resumed +aws s3 sync --only-show-errors $NF_LOGSDIR/.nextflow .nextflow + # stage workflow definition -NF_FILE="" -if [ ! -z "$NEXTFLOW_PROJECT" ]; then - aws s3 sync --only-show-errors --exclude 'runs/*' --exclude '.*' $NEXTFLOW_PROJECT . - NF_FILE=$(find . -maxdepth 1 -name "*.nf") +if [[ "$NEXTFLOW_PROJECT" =~ "^s3://.*" ]]; then + aws s3 sync --only-show-errors --exclude 'runs/*' --exclude '.*' $NEXTFLOW_PROJECT ./project + NEXTFLOW_PROJECT=./project fi echo "== Running Workflow ==" -echo "nextflow run $NF_FILE $NEXTFLOW_PARAMS" -nextflow run $NF_FILE $NEXTFLOW_PARAMS \ No newline at end of file +echo "nextflow run $NEXTFLOW_PROJECT $NEXTFLOW_PARAMS" +nextflow run $NEXTFLOW_PROJECT $NEXTFLOW_PARAMS + +# stage out session cache +aws s3 sync --only-show-errors .nextflow $NF_LOGSDIR/.nextflow + +# .nextflow.log file has more detailed logging from the workflow run and is +# nominally unique per run. 
+# +# when run locally, .nextflow.logs are automatically rotated +# when syncing to S3 uniquely identify logs by the batch GUID +aws s3 cp --only-show-errors .nextflow.log $NF_LOGSDIR/.nextflow.log.${GUID/\//.} \ No newline at end of file diff --git a/src/containers/samtools/aws.dockerfile b/src/containers/samtools/aws.dockerfile index 0cda32e45..ea5668637 100644 --- a/src/containers/samtools/aws.dockerfile +++ b/src/containers/samtools/aws.dockerfile @@ -1,5 +1,6 @@ FROM samtools:latest +RUN apt-get update RUN apt-get install -y awscli RUN apt-get clean diff --git a/src/containers/samtools/samtools.aws.sh b/src/containers/samtools/samtools.aws.sh index 403e8617f..d9f006c07 100644 --- a/src/containers/samtools/samtools.aws.sh +++ b/src/containers/samtools/samtools.aws.sh @@ -41,7 +41,7 @@ function sort() { ${INPUT_PREFIX}/${SAMPLE_ID}.sam $INPUT_PATH samtools sort \ - -@ 8 \ + -@ 16 \ -o $OUTPUT_PATH/${SAMPLE_ID}.bam \ $INPUT_PATH/${SAMPLE_ID}.sam diff --git a/src/templates/README.md b/src/templates/README.md index 5d5e14f7f..e4091f66a 100644 --- a/src/templates/README.md +++ b/src/templates/README.md @@ -1,54 +1,36 @@ -# Genomics on AWS CloudFormation templates - -This directory contains example CloudFormation templates for setting up the resources for working with genomics and other large-scale biomedical research data. - - -root = to do -* inputs: - - stack name root - - az - - tags - - key pair name - - s3 bucket name -* outputs: - - job queue names - - s3 bucket name - - -vpc = https://raw.githubusercontent.com/aws-quickstart/quickstart-aws-vpc/master/templates/aws-vpc.template -* inputs: - * stack name - * Availability Zones - * tag for public & private subnets - * key pair name -* outputs: - - az - - sg - - -s3 = to do -* input: - - stack name - - s3 bucket name - -iam = to do -* inputs: - - stack name - - s3 bucket name -* outputs - - iam instance profile - - iam ecs service role - - iam ecs task roles - - iam batch service role - -batch = -* inputs: - - stack name - - azs - - key pair name - - iam instance profile - - iam ecs role - - iam ecs task roles - - iam batch service role - - iam batch spot fleet role -- outputs: - - job Queue names +# Genomics Workflows on AWS CloudFormation templates + +Contained herein are CloudFormation templates for creating AWS resources for working with large-scale biomedical data - e.g. genomics. + +## Core Stack + +Templates at the root level represent the "core" stack. The root template is: + +| File | Description | +| :--- | :---------- | +| `aws-genomics-root-novpc.template.yaml` | Root stack that invokes nested stacks (see below) | + +Nested stacks are as follows and listed in order of creation: + +| File | Description | +| :--- | :---------- | +| `aws-genomics-s3.template.yaml` | Creates an S3 bucket for storing workflow input and output data | +| `aws-genomics-launch-template.template.yaml` | Creates an EC2 Launch Template used in AWS Batch Compute Environments | +| `aws-genomics-iam.template.yaml` | Creates IAM roles for AWS Batch resources | +| `aws-genomics-batch.template.yaml` | Creates AWS Batch Job Queues and Compute Environments for job execution | + +## All-in-One ("AIO") Stacks + +All-in-One stacks are provided for solutions that utilize: + +* AWS Step-Functions +* Cromwell +* Nextflow + +and build atop the Core Stackk above. 
They also include additional stacks specific to the solution: + +| File | Description | +| :--- | :---------- | +| `step-functions/sfn-example.template.yaml` | Creates an example AWS Step Functions state-machine and containers for an example genomics workflow using BWA, samtools, and bcftools. | +| `cromwell/cromwell-server.template.yaml` | Creates an EC2 instance with Cromwell pre-installed and launched in "server" mode | +| `nextflow/nextflow-resources.template.yaml` | Creates a nextflow container and AWS Batch Job Definition for running nextflow | diff --git a/src/templates/aws-genomics-batch.template.yaml b/src/templates/aws-genomics-batch.template.yaml index b8f9a7a75..3817a79c4 100644 --- a/src/templates/aws-genomics-batch.template.yaml +++ b/src/templates/aws-genomics-batch.template.yaml @@ -11,7 +11,6 @@ Metadata: default: "AWS Batch Environment Config" Parameters: - LaunchTemplateId - - Ec2KeyPairName - VpcId - SubnetIds - SpotBidPercentage @@ -19,15 +18,12 @@ Metadata: - DefaultCEMaxvCpus - HighPriorityCEMinvCpus - HighPriorityCEMaxvCpus - - DefaultRetryNumber - BatchServiceRoleArn - Ec2InstanceProfileArn - SpotFleetRoleArn ParameterLabels: LaunchTemplateId: default: Launch Template ID - Ec2KeyPairName: - default: EC2 Key Pair Name VpcId: default: VPC ID SubnetIds: @@ -66,9 +62,6 @@ Parameters: LaunchTemplateId: Type: String Description: Launch Template you want your AWS Batch Compute Environments to use - Ec2KeyPairName: - Type: AWS::EC2::KeyPair::KeyName - Description: Name of the EC2 Key Pair for connecting to EC2 instances launched in your compute environment HighPriorityCEMinvCpus: Type: Number Description: Minimum number of CPUs in the high-priority compute environment. Default 0. @@ -109,7 +102,6 @@ Resources: Ref: VpcId SGSSHIngress: Type: AWS::EC2::SecurityGroupIngress - DependsOn: GenomicsBatchSecurityGroup Properties: GroupId: !Ref GenomicsBatchSecurityGroup IpProtocol: tcp @@ -118,7 +110,6 @@ Resources: CidrIp: 0.0.0.0/0 SGAllTcpEgress: Type: AWS::EC2::SecurityGroupEgress - DependsOn: GenomicsBatchSecurityGroup Properties: GroupId: !Ref GenomicsBatchSecurityGroup IpProtocol: tcp @@ -127,7 +118,6 @@ Resources: CidrIp: 0.0.0.0/0 SGAllTcpSelfIngress: Type: AWS::EC2::SecurityGroupIngress - DependsOn: GenomicsBatchSecurityGroup Properties: GroupId: !Ref GenomicsBatchSecurityGroup IpProtocol: tcp @@ -137,7 +127,6 @@ Resources: GenomicsDefaultComputeEnv: Type: AWS::Batch::ComputeEnvironment - DependsOn: GenomicsBatchSecurityGroup Properties: ComputeEnvironmentName: !Sub - spot-${StackGuid} @@ -147,7 +136,7 @@ Resources: State: ENABLED ComputeResources: BidPercentage: !Ref SpotBidPercentage - Ec2KeyPair: !Ref Ec2KeyPairName + # Ec2KeyPair: !Ref Ec2KeyPairName LaunchTemplate: LaunchTemplateId: !Ref LaunchTemplateId InstanceRole: !Ref Ec2InstanceProfileArn @@ -167,7 +156,6 @@ Resources: GenomicsHighPriorityComputeEnv: Type: AWS::Batch::ComputeEnvironment - DependsOn: GenomicsBatchSecurityGroup Properties: ComputeEnvironmentName: !Sub - ondemand-${StackGuid} @@ -176,7 +164,6 @@ Resources: Type: MANAGED State: ENABLED ComputeResources: - Ec2KeyPair: !Ref Ec2KeyPairName LaunchTemplate: LaunchTemplateId: !Ref LaunchTemplateId InstanceRole: !Ref Ec2InstanceProfileArn diff --git a/src/templates/aws-genomics-launch-template.template.yaml b/src/templates/aws-genomics-launch-template.template.yaml index 0e1df9cee..8a0eed757 100644 --- a/src/templates/aws-genomics-launch-template.template.yaml +++ b/src/templates/aws-genomics-launch-template.template.yaml @@ -6,7 +6,7 @@ Description: >- 
Mappings: ScratchMountPointMap: step-functions: - mountpoint: "/scratch" + mountpoint: "/var/lib/docker" cromwell: mountpoint: "/cromwell_root" nextflow: @@ -23,25 +23,33 @@ Mappings: ECSAdditionsMap: step-functions: additions: |- + - service docker stop + - cp -au /var/lib/docker /var/lib/docker.bk - cd /opt && wget $artifactRootUrl/aws-ebs-autoscale.tgz && tar -xzf aws-ebs-autoscale.tgz - sh /opt/ebs-autoscale/bin/init-ebs-autoscale.sh $scratchPath /dev/sdc 2>&1 > /var/log/init-ebs-autoscale.log + - sed -i 's+OPTIONS=.*+OPTIONS="--storage-driver btrfs"+g' /etc/sysconfig/docker-storage + - service docker start + - start ecs + cromwell: additions: |- - cd /opt && wget $artifactRootUrl/aws-ebs-autoscale.tgz && tar -xzf aws-ebs-autoscale.tgz - sh /opt/ebs-autoscale/bin/init-ebs-autoscale.sh $scratchPath /dev/sdc 2>&1 > /var/log/init-ebs-autoscale.log - cd /opt && wget $artifactRootUrl/aws-ecs-additions.tgz && tar -xzf aws-ecs-additions.tgz - sh /opt/ecs-additions/ecs-additions-cromwell.sh + nextflow: additions: |- - service docker stop - cp -au /var/lib/docker /var/lib/docker.bk - cd /opt && wget $artifactRootUrl/aws-ebs-autoscale.tgz && tar -xzf aws-ebs-autoscale.tgz - sh /opt/ebs-autoscale/bin/init-ebs-autoscale.sh $scratchPath /dev/sdc 2>&1 > /var/log/init-ebs-autoscale.log - - cd /opt && wget $artifactRootUrl/aws-ecs-additions.tgz && tar -xzf aws-ecs-additions.tgz - - sh /opt/ecs-additions/ecs-additions-nextflow.sh - sed -i 's+OPTIONS=.*+OPTIONS="--storage-driver btrfs"+g' /etc/sysconfig/docker-storage - service docker start - start ecs + - cd /opt && wget $artifactRootUrl/aws-ecs-additions.tgz && tar -xzf aws-ecs-additions.tgz + - sh /opt/ecs-additions/ecs-additions-nextflow.sh + Parameters: LaunchTemplateNamePrefix: Type: String diff --git a/src/templates/aws-genomics-root-novpc.template.yaml b/src/templates/aws-genomics-root-novpc.template.yaml index 3412aeb72..473f99f90 100644 --- a/src/templates/aws-genomics-root-novpc.template.yaml +++ b/src/templates/aws-genomics-root-novpc.template.yaml @@ -15,7 +15,6 @@ Metadata: - WorkflowOrchestrator - VpcId - SubnetIds - - KeyPairName - Label: default: Optional Parameters: @@ -37,8 +36,6 @@ Metadata: default: VPC ID SubnetIds: default: VPC Subnet IDs - KeyPairName: - default: EC2 Key Pair Name SpotBidPercentage: default: Spot Bid % DefaultCEMinvCpus: @@ -85,9 +82,6 @@ Parameters: - cromwell - nextflow Default: step-functions - KeyPairName: - Description: Key Pair name - Type: AWS::EC2::KeyPair::KeyName SpotBidPercentage: Description: The percent of on-demand pricing for max bid for Spot intances Type: Number @@ -160,7 +154,6 @@ Resources: TimeoutInMinutes: 10 Parameters: LaunchTemplateId: !Sub ${LaunchTplStack.Outputs.LaunchTemplateId} - Ec2KeyPairName: !Ref KeyPairName VpcId: !Ref VpcId SubnetIds: !Join [",", !Ref SubnetIds ] SpotBidPercentage: !Ref SpotBidPercentage diff --git a/src/templates/aws-genomics-s3.template.yaml b/src/templates/aws-genomics-s3.template.yaml index 370586efb..924c28dfa 100644 --- a/src/templates/aws-genomics-s3.template.yaml +++ b/src/templates/aws-genomics-s3.template.yaml @@ -60,8 +60,6 @@ Resources: - ServerSideEncryptionByDefault: SSEAlgorithm: AES256 Tags: - - Key: Name - Value: genomics-base-ami - Key: architecture Value: !FindInMap ["TagMap", "default", "architecture"] diff --git a/src/templates/cromwell/cromwell-aio.template.yaml b/src/templates/cromwell/cromwell-aio.template.yaml index 1bcde7b68..4fb58b219 100644 --- a/src/templates/cromwell/cromwell-aio.template.yaml +++ 
b/src/templates/cromwell/cromwell-aio.template.yaml @@ -1,3 +1,4 @@ +--- AWSTemplateFormatVersion: "2010-09-09" Description: >- (WWPS-GLS-WF-CROMWELL-AIO) Creates the complete set of resources needed to run @@ -20,11 +21,13 @@ Metadata: ParameterGroups: - Label: default: "Required" + Parameters: + - KeyPairName + - Label: + default: "Data Storage" Parameters: - S3BucketName - ExistingBucket - - KeyPairName - - AvailabilityZones - Label: default: "AWS Batch" Parameters: @@ -65,9 +68,6 @@ Metadata: # Parameters Parameters: - AvailabilityZones: - Description: "Choose the two Availability Zones to deploy instances for AWS Batch." - Type: List S3BucketName: Description: >- A S3 bucket name for storing analysis results. @@ -86,7 +86,7 @@ Parameters: - No Default: No KeyPairName: - Description: Key Pair name + Description: Key Pair name used for SSH access to the Cromwell Server. Type: AWS::EC2::KeyPair::KeyName SpotBidPercentage: Description: The percent of on-demand pricing for max bid for Spot intances @@ -159,7 +159,7 @@ Parameters: ArtifactRootUrl: Type: String - Default: https://aws-genomics-workflows.s3.amazonaws.com/artifacts + Default: https://s3.amazonaws.com/aws-genomics-workflows/artifacts Description: >- Root URL for where artifacts / additions scripts are stored @@ -185,9 +185,12 @@ Resources: TemplateURL: https://aws-quickstart.s3.amazonaws.com/quickstart-aws-vpc/templates/aws-vpc.template TimeoutInMinutes: 15 Parameters: - AvailabilityZones: !Join ["," , !Ref AvailabilityZones] + AvailabilityZones: + Fn::Join: + - "," + - - !Sub "${AWS::Region}a" + - !Sub "${AWS::Region}b" NumberOfAZs: "2" - KeyPairName: !Ref KeyPairName Tags: !FindInMap ["TagMap", "default", "tags"] GenomicsWorkflowStack: @@ -199,7 +202,6 @@ Resources: SubnetIds: !Sub "${VpcStack.Outputs.PrivateSubnet1AID}, ${VpcStack.Outputs.PrivateSubnet2AID}" S3BucketName: !Ref S3BucketName ExistingBucket: !Ref ExistingBucket - KeyPairName: !Ref 'KeyPairName' WorkflowOrchestrator: cromwell SpotBidPercentage: !Ref 'SpotBidPercentage' DefaultCEMinvCpus: !Ref 'DefaultCEMinvCpus' @@ -233,6 +235,8 @@ Resources: Outputs: CromwellServerHostName: Value: !GetAtt 'CromwellServerStack.Outputs.HostName' + Export: + Name: !Sub "${AWS::StackName}-CromwellServerHostName" Description: >- Cromwell server public DNS name. Use this URL in a web browser or via curl to access Cromwell and submit workflows. @@ -251,14 +255,21 @@ Outputs: Value: !GetAtt 'VpcStack.Outputs.VPCID' S3Bucket: Value: !GetAtt 'GenomicsWorkflowStack.Outputs.GenomicsEnvS3Bucket' + Export: + Name: !Sub "${AWS::StackName}-S3Bucket" Description: >- S3 bucket for storing genomics workflow input and output data BatchDefaultQueue: Value: !GetAtt 'GenomicsWorkflowStack.Outputs.GenomicsEnvDefaultJobQueueArn' + Export: + Name: !Sub "${AWS::StackName}-DefaultJobQueue" Description: >- The default AWS Batch job queue for workflow jobs, based on EC2 SPOT instances BatchHighPriorityQueue: Value: !GetAtt 'GenomicsWorkflowStack.Outputs.GenomicsEnvHighPriorityJobQueueArn' + Export: + Name: !Sub "${AWS::StackName}-HighPriorityJobQueue" Description: >- AWS Batch job queue for high priority workflow jobs, based on EC2 On-Demand instances +... 
\ No newline at end of file diff --git a/src/templates/nextflow/nextflow-aio.template.yaml b/src/templates/nextflow/nextflow-aio.template.yaml index e1b144e39..ddb757895 100644 --- a/src/templates/nextflow/nextflow-aio.template.yaml +++ b/src/templates/nextflow/nextflow-aio.template.yaml @@ -21,12 +21,10 @@ Metadata: AWS::CloudFormation::Interface: ParameterGroups: - Label: - default: "Required" + default: "Data Storage" Parameters: - S3DataBucketName - ExistingDataBucket - - KeyPairName - - AvailabilityZones - Label: default: "AWS Batch" Parameters: @@ -41,8 +39,6 @@ Metadata: - NextflowContainerImage - S3NextflowBucketName - ExistingNextflowBucket - - S3NextflowScriptPrefix - - S3NextflowWorkDirPrefix ParameterLabels: S3DataBucketName: @@ -53,8 +49,6 @@ Metadata: default: S3 Nextflow Bucket Name ExistingNextflowBucket: default: Existing Nextflow Bucket? - KeyPairName: - default: EC2 Key Pair Name SpotBidPercentage: default: Spot Bid % DefaultCEMinvCpus: @@ -70,9 +64,6 @@ Metadata: Parameters: - AvailabilityZones: - Description: "Choose the two Availability Zones to deploy instances for AWS Batch." - Type: List S3DataBucketName: Description: >- A S3 bucket name for storing analysis results @@ -81,8 +72,10 @@ Parameters: If left blank a unique bucket name will be generated. Type: String + Default: "" AllowedPattern: "((?=^.{3,63}$)(?!^(\\d+\\.)+\\d+$)(^(([a-z0-9]|[a-z0-9][a-z0-9\\-]*[a-z0-9])\\.)*([a-z0-9]|[a-z0-9][a-z0-9\\-]*[a-z0-9])$)|(^.{0}$))" ConstraintDescription: "Must respect S3 bucket naming conventions" + ExistingDataBucket: Description: Does this bucket already exist? Type: String @@ -90,9 +83,6 @@ Parameters: - Yes - No Default: No - KeyPairName: - Description: Key Pair name - Type: AWS::EC2::KeyPair::KeyName SpotBidPercentage: Description: The percent of on-demand pricing for max bid for Spot intances Type: Number @@ -116,22 +106,13 @@ Parameters: S3NextflowBucketName: Type: String + Default: "" Description: >- - (Optional) S3 Bucket used to store *.nf scripts. + (Optional) S3 Bucket used to store Nextflow metadata (session cache, logs, and intermediate results). Defaults to the S3 Bucket used for data. AllowedPattern: "(^$|(?=^.{3,63}$)(?!^(\\d+\\.)+\\d+$)(^(([a-z0-9]|[a-z0-9][a-z0-9\\-]*[a-z0-9])\\.)*([a-z0-9]|[a-z0-9][a-z0-9\\-]*[a-z0-9])$))" ConstraintDescription: "Must respect S3 bucket naming conventions" - - S3NextflowWorkDirPrefix: - Type: String - Description: >- - (Optional) Parent folder in the S3 bucket that contains workflow execution logs - - S3NextflowScriptPrefix: - Type: String - Description: >- - (Optional) Parent folder in the S3 bucket that contains *.nf workflow scripts - + ExistingNextflowBucket: Type: String Description: >- @@ -143,6 +124,7 @@ Parameters: NextflowContainerImage: Type: String + Default: "" Description: >- (Optional) Container image for nextflow with custom entrypoint for config and workflow script staging. (Example, "/nextflow:latest"). 
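The parameters above are all the Nextflow all-in-one stack needs now that key pairs and explicit Availability Zones have been dropped. A hedged sketch of launching it from the CLI follows; the stack name, template URL, and parameter values are placeholders, only the `ParameterKey` names come from the template, and `CAPABILITY_NAMED_IAM` may be required instead of `CAPABILITY_IAM` depending on how the nested IAM roles are named.

```bash
# Illustrative only: stack name, template URL, and values are placeholders;
# the ParameterKey names come from nextflow-aio.template.yaml.
aws cloudformation create-stack \
    --stack-name nextflow-aio \
    --capabilities CAPABILITY_IAM \
    --template-url https://s3.amazonaws.com/aws-genomics-workflows/templates/nextflow/nextflow-aio.template.yaml \
    --parameters \
        ParameterKey=S3DataBucketName,ParameterValue=my-genomics-data-bucket \
        ParameterKey=ExistingDataBucket,ParameterValue=No
```

Because the diff adds `Default: ""` to `S3DataBucketName`, `S3NextflowBucketName`, and `NextflowContainerImage`, those optional parameters can simply be omitted and the template's defaults apply.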
@@ -151,7 +133,7 @@ Parameters: ArtifactRootUrl: Type: String - Default: https://aws-genomics-workflows.s3.amazonaws.com/artifacts + Default: https://s3.amazonaws.com/aws-genomics-workflows/artifacts Description: >- Root URL for where artifacts / additions scripts are stored @@ -179,9 +161,12 @@ Resources: TemplateURL: https://aws-quickstart.s3.amazonaws.com/quickstart-aws-vpc/templates/aws-vpc.template TimeoutInMinutes: 15 Parameters: - AvailabilityZones: !Join ["," , !Ref AvailabilityZones] + AvailabilityZones: + Fn::Join: + - "," + - - !Sub "${AWS::Region}a" + - !Sub "${AWS::Region}b" NumberOfAZs: "2" - KeyPairName: !Ref KeyPairName Tags: !FindInMap ["TagMap", "default", "tags"] GenomicsWorkflowStack: @@ -193,7 +178,6 @@ Resources: SubnetIds: !Sub "${VpcStack.Outputs.PrivateSubnet1AID}, ${VpcStack.Outputs.PrivateSubnet2AID}" S3BucketName: !Ref S3DataBucketName ExistingBucket: !Ref ExistingDataBucket - KeyPairName: !Ref 'KeyPairName' WorkflowOrchestrator: nextflow SpotBidPercentage: !Ref 'SpotBidPercentage' DefaultCEMinvCpus: !Ref 'DefaultCEMinvCpus' @@ -222,8 +206,6 @@ Resources: - UseOneBucket - True # by the time this stack is created, the data bucket should exist - !Ref ExistingNextflowBucket - S3WorkDirPrefix: !Ref S3NextflowWorkDirPrefix - S3ScriptPrefix: !Ref S3NextflowScriptPrefix NextflowContainerImage: !Ref NextflowContainerImage BatchDefaultJobQueue: !GetAtt GenomicsWorkflowStack.Outputs.GenomicsEnvDefaultJobQueueArn Tags: !FindInMap ["TagMap", "default", "tags"] @@ -232,21 +214,34 @@ Resources: Outputs: NextflowContainerImage: Value: !GetAtt NextflowStack.Outputs.NextflowContainerImage + Export: + Name: !Sub "${AWS::StackName}-NextflowJobDefinition" NextflowJobDefinition: Value: !GetAtt NextflowStack.Outputs.NextflowJobDefinition + Export: + Name: !Sub "${AWS::StackName}-NextflowJobDefinition" Description: >- Batch Job Definition that creates a nextflow head node for running workflows S3NextFlowBucket: - Value: !GetAtt NextflowStack.Outputs.BucketName + Value: !GetAtt NextflowStack.Outputs.NextflowBucket + Export: + Name: !Sub "${AWS::StackName}-NextflowBucket" + Description: >- + S3 Bucket used to store Nextflow metadata (session cache, logs, and intermediate results) + S3NextflowLogsDir: + Value: !GetAtt NextflowStack.Outputs.LogsDir + Export: + Name: !Sub "${AWS::StackName}-NextflowLogsDir" Description: >- - S3 Bucket used to store *.nf scripts - S3NextflowScriptPrefix: - Value: !GetAtt NextflowStack.Outputs.ScriptPrefix + S3 URI where nextflow session cache and logs are stored. + S3NextflowWorkDir: + Value: !GetAtt NextflowStack.Outputs.WorkDir + Export: + Name: !Sub "${AWS::StackName}-NextflowWorkDir" Description: >- - Path in the S3 bucket where *.nf script files are located. If blank, - then they are located at the root level of the "nextflow" bucket. + S3 URI where workflow intermediate results are stored. 
VpcId: Description: >- @@ -254,14 +249,21 @@ Outputs: Value: !GetAtt 'VpcStack.Outputs.VPCID' S3DataBucket: Value: !GetAtt 'GenomicsWorkflowStack.Outputs.GenomicsEnvS3Bucket' + Export: + Name: !Sub "${AWS::StackName}-DataBucket" Description: >- S3 bucket for storing genomics workflow input and output data BatchDefaultQueue: Value: !GetAtt 'GenomicsWorkflowStack.Outputs.GenomicsEnvDefaultJobQueueArn' + Export: + Name: !Sub "${AWS::StackName}-DefaultJobQueue" Description: >- The default AWS Batch job queue for workflow jobs, based on EC2 SPOT instances BatchHighPriorityQueue: Value: !GetAtt 'GenomicsWorkflowStack.Outputs.GenomicsEnvHighPriorityJobQueueArn' + Export: + Name: !Sub "${AWS::StackName}-HighPriorityJobQueue" Description: >- AWS Batch job queue for high priority workflow jobs, based on EC2 On-Demand - instances \ No newline at end of file + instances +... \ No newline at end of file diff --git a/src/templates/nextflow/nextflow-resources.template.yaml b/src/templates/nextflow/nextflow-resources.template.yaml index d6987c49d..24d8ed16c 100644 --- a/src/templates/nextflow/nextflow-resources.template.yaml +++ b/src/templates/nextflow/nextflow-resources.template.yaml @@ -30,7 +30,8 @@ Metadata: default: "Optional" Parameters: - NextflowContainerImage - - S3ScriptPrefix + - S3NextflowPrefix + - S3LogsDirPrefix - S3WorkDirPrefix @@ -44,32 +45,43 @@ Parameters: S3NextflowBucketName: Type: String Description: >- - S3 Bucket used to store *.nf scripts. + S3 Bucket used to store Nextflow metadata (session cache, logs, and intermediate results) - S3ScriptPrefix: + ExistingBucket: Type: String Description: >- - (Optional) Parent folder in the S3 bucket that contains *.nf workflow scripts + Does the S3 Bucket for Nextflow metadata already exist? If not, it will be created. + AllowedValues: + - Yes + - No + Default: No - S3WorkDirPrefix: + S3NextflowPrefix: + Type: String + Description: >- + (Optional) Parent folder in the Nextflow metadata bucket for metadata folders. + Used only if the Nextflow metadata bucket is the same as the Data bucket. + Default: _nextflow + + S3LogsDirPrefix: Type: String Description: >- - (Optional) Parent folder in the S3 bucket that contains workflow execution logs + (Optional) Folder in the Nextflow metadata bucket (under the {Nextflow Prefix} if needed) + for session cache and logs. + Default: logs - ExistingBucket: + S3WorkDirPrefix: Type: String Description: >- - Does the S3 Bucket for *.nf scripts already exist? If not, it will be created. - AllowedValues: - - Yes - - No - Default: No + (Optional) Folder in the Nextflow metadata bucket (under the {Nextflow Prefix} if needed) + that contains workflow intermediate results + Default: runs NextflowContainerImage: Type: String Description: >- (Optional) Container image for nextflow with custom entrypoint for config and workflow - script staging. (Example, "/nextflow:latest"). + script staging. (Example, "/nextflow:latest"). Provide this if you have a specific version of nextflow you want to use, otherwise a container will be built using the latest version. 
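The three prefix parameters above default to `_nextflow`, `logs`, and `runs`; together with the `DataBucketIsNextflowBucket` condition used further down in the template they determine where the job definition points `NF_LOGSDIR` and `NF_WORKDIR`. A small sketch of that resolution (bucket names are assumptions for illustration):

```bash
# Mirror of the Fn::Join/Fn::If logic used to build NF_LOGSDIR and NF_WORKDIR.
# Bucket names below are assumptions for illustration.
NEXTFLOW_BUCKET="my-genomics-data-bucket"   # S3NextflowBucketName
DATA_BUCKET="my-genomics-data-bucket"       # data bucket from the core stack
NEXTFLOW_PREFIX="_nextflow"                 # S3NextflowPrefix (default)
LOGSDIR_PREFIX="logs"                       # S3LogsDirPrefix (default)
WORKDIR_PREFIX="runs"                       # S3WorkDirPrefix (default)

if [[ "$NEXTFLOW_BUCKET" == "$DATA_BUCKET" ]]; then
    # data and metadata share a bucket: nest metadata under the prefix
    BASE="s3://${NEXTFLOW_BUCKET}/${NEXTFLOW_PREFIX}"
else
    # dedicated metadata bucket: prefixes sit at the bucket root
    BASE="s3://${NEXTFLOW_BUCKET}"
fi

NF_LOGSDIR="${BASE}/${LOGSDIR_PREFIX}"
NF_WORKDIR="${BASE}/${WORKDIR_PREFIX}"
echo "NF_LOGSDIR=$NF_LOGSDIR"
echo "NF_WORKDIR=$NF_WORKDIR"
```

With the defaults and a shared bucket this yields `s3://<bucket>/_nextflow/logs` for the session cache and logs and `s3://<bucket>/_nextflow/runs` for intermediate results, matching the new `LogsDir` and `WorkDir` outputs.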
@@ -93,16 +105,6 @@ Conditions: Fn::Equals: - !Ref ExistingBucket - No - - NoS3ScriptPrefix: - Fn::Equals: - - !Ref S3ScriptPrefix - - "" - - NoS3WorkDirPrefix: - Fn::Equals: - - !Ref S3WorkDirPrefix - - "" Resources: @@ -317,15 +319,6 @@ Resources: Type: AWS::Batch::JobDefinition Properties: Type: container - Parameters: - NextflowScript: - Fn::Join: - - "/" - - - Fn::If: - - NoS3ScriptPrefix - - !Sub "s3://${S3NextflowBucketName}" - - !Join ["/", [!Sub "s3://${S3NextflowBucketName}", !Ref S3ScriptPrefix]] - - workflow.nf ContainerProperties: MountPoints: - ContainerPath: /opt/work @@ -347,41 +340,61 @@ Resources: Environment: - Name: "NF_JOB_QUEUE" Value: !Ref BatchDefaultJobQueue + - Name: "NF_LOGSDIR" + Value: + Fn::Join: + - "/" + - - Fn::If: + - DataBucketIsNextflowBucket + - !Join ["/", [!Sub "s3://${S3NextflowBucketName}", !Ref S3NextflowPrefix]] + - !Sub "s3://${S3NextflowBucketName}" + - !Ref S3LogsDirPrefix - Name: "NF_WORKDIR" Value: Fn::Join: - - "/" - - - Fn::If: - - NoS3WorkDirPrefix - - !Sub "s3://${S3NextflowBucketName}" - - !Join ["/", [!Sub "s3://${S3NextflowBucketName}", !Ref S3WorkDirPrefix]] - - runs - + - "/" + - - Fn::If: + - DataBucketIsNextflowBucket + - !Join ["/", [!Sub "s3://${S3NextflowBucketName}", !Ref S3NextflowPrefix]] + - !Sub "s3://${S3NextflowBucketName}" + - !Ref S3WorkDirPrefix + JobDefinitionName: nextflow Outputs: - BucketName: + NextflowBucket: Description: >- - S3 Bucket used to store *.nf scripts + S3 Bucket used to store Nextflow metadata (session cache, logs, and intermediate results) Value: Fn::If: - NextflowBucketDoesNotExist - !Ref S3NextflowBucket - !Ref S3NextflowBucketName - - ScriptPrefix: + + LogsDir: Description: >- - Path in the S3 bucket where *.nf script files are located. If blank, - then they are located at the root level of the bucket. - Value: !Ref S3ScriptPrefix + S3 URI where nextflow session cache and logs are stored. + Value: + Fn::Join: + - "/" + - - Fn::If: + - DataBucketIsNextflowBucket + - !Join ["/", [!Sub "s3://${S3NextflowBucketName}", !Ref S3NextflowPrefix]] + - !Sub "s3://${S3NextflowBucketName}" + - !Ref S3LogsDirPrefix - WorkDirPrefix: + WorkDir: Description: >- - Path in the S3 bucket where "runs" folder with workflow logs and final - outputs is located. If blank, then they are located at the root level of - the bucket. - Value: !Ref S3WorkDirPrefix + S3 URI where workflow intermediate results are stored. + Value: + Fn::Join: + - "/" + - - Fn::If: + - DataBucketIsNextflowBucket + - !Join ["/", [!Sub "s3://${S3NextflowBucketName}", !Ref S3NextflowPrefix]] + - !Sub "s3://${S3NextflowBucketName}" + - !Ref S3WorkDirPrefix NextflowContainerImage: Description: >- @@ -400,4 +413,5 @@ Outputs: NextflowJobRole: Description: >- IAM Role that allows the nextflow head node job access to S3 and Batch - Value: !GetAtt IAMNextflowJobRole.Arn \ No newline at end of file + Value: !GetAtt IAMNextflowJobRole.Arn +... 
\ No newline at end of file diff --git a/src/templates/step-functions/sfn-aio.template.yaml b/src/templates/step-functions/sfn-aio.template.yaml index c36bee930..966c6af64 100644 --- a/src/templates/step-functions/sfn-aio.template.yaml +++ b/src/templates/step-functions/sfn-aio.template.yaml @@ -19,12 +19,10 @@ Metadata: AWS::CloudFormation::Interface: ParameterGroups: - Label: - default: "Required" + default: "Data Storage" Parameters: - S3BucketName - ExistingBucket - - KeyPairName - - AvailabilityZones - Label: default: "AWS Batch" Parameters: @@ -43,8 +41,6 @@ Metadata: default: S3 Bucket Name ExistingBucket: default: Existing Bucket? - KeyPairName: - default: EC2 Key Pair Name SpotBidPercentage: default: Spot Bid % DefaultCEMinvCpus: @@ -60,9 +56,6 @@ Metadata: # Parameters Parameters: - AvailabilityZones: - Description: "Choose the two Availability Zones to deploy instances for AWS Batch." - Type: List S3BucketName: Description: >- A S3 bucket name for storing analysis results. @@ -79,9 +72,6 @@ Parameters: - Yes - No Default: No - KeyPairName: - Description: Key Pair name - Type: AWS::EC2::KeyPair::KeyName SpotBidPercentage: Description: The percent of on-demand pricing for max bid for Spot intances Type: Number @@ -113,7 +103,7 @@ Parameters: ArtifactRootUrl: Type: String - Default: https://aws-genomics-workflows.s3.amazonaws.com/artifacts + Default: https://s3.amazonaws.com/aws-genomics-workflows/artifacts Description: >- Root URL for where artifacts / additions scripts are stored @@ -134,9 +124,12 @@ Resources: TemplateURL: https://aws-quickstart.s3.amazonaws.com/quickstart-aws-vpc/templates/aws-vpc.template TimeoutInMinutes: 15 Parameters: - AvailabilityZones: !Join ["," , !Ref AvailabilityZones] + AvailabilityZones: + Fn::Join: + - "," + - - !Sub "${AWS::Region}a" + - !Sub "${AWS::Region}b" NumberOfAZs: "2" - KeyPairName: !Ref KeyPairName Tags: !FindInMap ["TagMap", "default", "tags"] GenomicsWorkflowStack: @@ -148,7 +141,6 @@ Resources: SubnetIds: !Sub "${VpcStack.Outputs.PrivateSubnet1AID}, ${VpcStack.Outputs.PrivateSubnet2AID}" S3BucketName: !Ref S3BucketName ExistingBucket: !Ref ExistingBucket - KeyPairName: !Ref 'KeyPairName' WorkflowOrchestrator: step-functions SpotBidPercentage: !Ref 'SpotBidPercentage' DefaultCEMinvCpus: !Ref 'DefaultCEMinvCpus' @@ -173,22 +165,34 @@ Resources: Outputs: StateMachine: Value: !GetAtt 'SfnStack.Outputs.StateMachine' + Export: + Name: !Sub ${AWS::StackName}-StateMachine Description: >- Example AWS Step Functions state machine StateMachineInput: Value: !GetAtt 'SfnStack.Outputs.StateMachineInput' Description: >- Example input for the state machine. Use this when executing your workflow. + VpcId: + Description: >- + The VPC created for your Nextflow stack. 
+ Value: !GetAtt 'VpcStack.Outputs.VPCID' S3Bucket: Value: !GetAtt 'GenomicsWorkflowStack.Outputs.GenomicsEnvS3Bucket' + Export: + Name: !Sub ${AWS::StackName}-S3Bucket Description: >- S3 bucket for storing genomics workflow input and output data BatchDefaultQueue: Value: !GetAtt 'GenomicsWorkflowStack.Outputs.GenomicsEnvDefaultJobQueueArn' + Export: + Name: !Sub ${AWS::StackName}-DefaultJobQueue Description: >- The default AWS Batch job queue for workflow jobs, based on EC2 SPOT instances BatchHighPriorityQueue: Value: !GetAtt 'GenomicsWorkflowStack.Outputs.GenomicsEnvHighPriorityJobQueueArn' + Export: + Name: !Sub ${AWS::StackName}-HighPrioirityJobQueue Description: >- AWS Batch job queue for high priority workflow jobs, based on EC2 On-Demand instances diff --git a/src/templates/step-functions/sfn-example.template.yaml b/src/templates/step-functions/sfn-example.template.yaml index 59c80727a..b1f586b7e 100644 --- a/src/templates/step-functions/sfn-example.template.yaml +++ b/src/templates/step-functions/sfn-example.template.yaml @@ -130,7 +130,7 @@ Resources: Environment: Type: LINUX_CONTAINER Image: aws/codebuild/standard:1.0 - ComputeType: BUILD_GENERAL1_SMALL + ComputeType: BUILD_GENERAL1_LARGE PrivilegedMode: True ServiceRole: !GetAtt IAMCodeBuildRole.Arn @@ -155,9 +155,12 @@ Resources: commands: - echo "Building containers" - ROOT="$(pwd)/src/containers" - - cd $ROOT/bwa && ./build.sh - - cd $ROOT/samtools && ./build.sh - - cd $ROOT/bcftools && ./build.sh + # run container builds in parallel to avoid hitting Lambda time limit + # output is captured in logs and `cat`d below to ease debugging + - (cd $ROOT/bwa && ./build.sh > ./build.log) & (cd $ROOT/samtools && ./build.sh > ./build.log) & (cd $ROOT/bcftools && ./build.sh > ./build.log) & wait + - cat $ROOT/bwa/build.log + - cat $ROOT/samtools/build.log + - cat $ROOT/bcftools/build.log post_build: commands: - echo "Tagging container images" @@ -165,9 +168,8 @@ Resources: - docker tag samtools:aws ${REGISTRY}/samtools:aws - docker tag bcftools:aws ${REGISTRY}/bcftools:aws - echo "Pushing container images to ECR" - - docker push ${REGISTRY}/bwa:aws - - docker push ${REGISTRY}/samtools:aws - - docker push ${REGISTRY}/bcftools:aws + # push containers in parallel to avoid hitting Lambda time limit + - docker push ${REGISTRY}/bwa:aws & docker push ${REGISTRY}/samtools:aws & docker push ${REGISTRY}/bcftools:aws & wait - REGISTRY: !Sub ${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com Tags: !FindInMap ["TagMap", "default", "tags"] @@ -184,7 +186,7 @@ Resources: Handler: index.handler Role: !GetAtt IAMLambdaExecutionRole.Arn Runtime: python3.7 - Timeout: 600 + Timeout: 900 Code: ZipFile: | from time import sleep @@ -221,7 +223,7 @@ Resources: command: mem reference_prefix: "s3://broad-references/hg38/v0" reference_name: "Homo_sapiens_assembly38" - sample_id: "SRR1919605" + sample_id: "NIST7035" input_prefix: "s3://aws-batch-genomics-resources/fastq" output_prefix: !Sub "s3://${S3BucketName}/genomics-workflow" @@ -253,7 +255,7 @@ Resources: command: sort reference_prefix: "s3://broad-references/hg38/v0" reference_name: "Homo_sapiens_assembly38" - sample_id: "SRR1919605" + sample_id: "NIST7035" input_prefix: !Sub s3://${S3BucketName}/genomics-workflow" output_prefix: !Sub "s3://${S3BucketName}/genomics-workflow" @@ -285,7 +287,7 @@ Resources: command: mpileup reference_prefix: "s3://broad-references/hg38/v0" reference_name: "Homo_sapiens_assembly38" - sample_id: "SRR1919605" + sample_id: "NIST7035" input_prefix: !Sub 
s3://${S3BucketName}/genomics-workflow" output_prefix: !Sub "s3://${S3BucketName}/genomics-workflow" @@ -419,86 +421,89 @@ Outputs: StateMachineInput: Description: Example input for the state machine - Value: !Sub | - { - "defaults": { - "queue": "${BatchJobQueue}" - }, - "bwa": { - "mem": { - "parameters": { - "reference_prefix": "s3://broad-references/hg38/v0", - "reference_name": "Homo_sapiens_assembly38", - "sample_id": "SRR1919605", - "input_prefix": "s3://aws-batch-genomics-resources/fastq", - "output_prefix": "s3://${S3BucketName}/genomics-workflow", - "command": "mem" - }, - "resources": { - "Vcpus": 8, - "Memory": 64000 - } - } - }, - "samtools": { - "sort": { - "parameters": { - "reference_prefix": "s3://broad-references/hg38/v0", - "reference_name": "Homo_sapiens_assembly38", - "sample_id": "SRR1919605", - "input_prefix": "s3://${S3BucketName}/genomics-workflow", - "output_prefix": "s3://${S3BucketName}/genomics-workflow", - "command": "sort" - }, - "resources": { - "Vcpus": 8, - "Memory": 32000 - } + Value: + Fn::Sub: + - | + { + "defaults": { + "queue": "${BatchJobQueue}" }, - "index": { - "parameters": { - "reference_prefix": "s3://broad-references/hg38/v0", - "reference_name": "Homo_sapiens_assembly38", - "sample_id": "SRR1919605", - "input_prefix": "s3://${S3BucketName}/genomics-workflow", - "output_prefix": "s3://${S3BucketName}/genomics-workflow", - "command": "index" - }, - "resources": { - "Vcpus": 8, - "Memory": 32000 + "bwa": { + "mem": { + "parameters": { + "reference_prefix": "s3://broad-references/hg38/v0", + "reference_name": "Homo_sapiens_assembly38", + "sample_id": "${SampleID}", + "input_prefix": "s3://aws-batch-genomics-shared/secondary-analysis/example-files/fastq/", + "output_prefix": "s3://${S3BucketName}/genomics-workflow", + "command": "mem" + }, + "resources": { + "Vcpus": 8, + "Memory": 64000 + } } - } - }, - "bcftools": { - "mpileup": { - "parameters": { - "reference_prefix": "s3://broad-references/hg38/v0", - "reference_name": "Homo_sapiens_assembly38", - "sample_id": "SRR1919605", - "input_prefix": "s3://${S3BucketName}/genomics-workflow", - "output_prefix": "s3://${S3BucketName}/genomics-workflow", - "command": "mpileup" + }, + "samtools": { + "sort": { + "parameters": { + "reference_prefix": "s3://broad-references/hg38/v0", + "reference_name": "Homo_sapiens_assembly38", + "sample_id": "${SampleID}", + "input_prefix": "s3://${S3BucketName}/genomics-workflow", + "output_prefix": "s3://${S3BucketName}/genomics-workflow", + "command": "sort" + }, + "resources": { + "Vcpus": 8, + "Memory": 32000 + } }, - "resources": { - "Vcpus": 8, - "Memory": 32000 + "index": { + "parameters": { + "reference_prefix": "s3://broad-references/hg38/v0", + "reference_name": "Homo_sapiens_assembly38", + "sample_id": "${SampleID}", + "input_prefix": "s3://${S3BucketName}/genomics-workflow", + "output_prefix": "s3://${S3BucketName}/genomics-workflow", + "command": "index" + }, + "resources": { + "Vcpus": 8, + "Memory": 32000 + } } }, - "call": { - "parameters": { - "reference_prefix": "s3://broad-references/hg38/v0", - "reference_name": "Homo_sapiens_assembly38", - "sample_id": "SRR1919605", - "input_prefix": "s3://${S3BucketName}/genomics-workflow", - "output_prefix": "s3://${S3BucketName}/genomics-workflow", - "command": "call" + "bcftools": { + "mpileup": { + "parameters": { + "reference_prefix": "s3://broad-references/hg38/v0", + "reference_name": "Homo_sapiens_assembly38", + "sample_id": "${SampleID}", + "input_prefix": "s3://${S3BucketName}/genomics-workflow", + 
"output_prefix": "s3://${S3BucketName}/genomics-workflow", + "command": "mpileup" + }, + "resources": { + "Vcpus": 8, + "Memory": 32000 + } }, - "resources": { - "Vcpus": 8, - "Memory": 32000 + "call": { + "parameters": { + "reference_prefix": "s3://broad-references/hg38/v0", + "reference_name": "Homo_sapiens_assembly38", + "sample_id": "${SampleID}", + "input_prefix": "s3://${S3BucketName}/genomics-workflow", + "output_prefix": "s3://${S3BucketName}/genomics-workflow", + "command": "call" + }, + "resources": { + "Vcpus": 8, + "Memory": 32000 + } } } } - } + - SampleID: "NIST7035" ... \ No newline at end of file