From 8395859bad1c44565f79f9494ad7d6c2301dc890 Mon Sep 17 00:00:00 2001 From: Henrique Ribeiro Date: Mon, 18 Sep 2023 13:41:03 +0100 Subject: [PATCH] Develop AWS (#35) * 85 release (#28) * Update cromwell version from 83 to 84 * BW-1255 Implement POST /runs endpoint (#6779) * Adding route * Fixing HTTP method error * All formFields made optional * A compliling state * Saving * Saving * All three endpoints functioning as expected; updated RESTAPI.md * Updated response for submission from 200 to 201 to pass tests * Test submission response * Moved updated submission response to askSubmit * test * updating RESTAPI.md * saving * Adding utility file for submitRequest * cleanup * Update awssdkv from 2.17.152 to 2.17.194 (#6814) * BW-1305 Swagger Update (#6818) * Properly documenting metadataArchiveStatus in WorkflowQueryResult model * Update docs * BT-710 Add configs for BlobPathBuilderFactory (#6817) BT-710 Add configs for BlobPathBuilderFactory * BW-1305 Make "name" optional in workflow query response (#6821) * BT-724 Fix BlobPathBuilder failing on retrieving existing filesystem (#6816) Modify blobPathBuilder to fallback to creating a filesystem if one is not found * Logging updates: (#6813) * [BT-698] first pass on BlobTokenGenerator with E2E test (#6824) * first pass on BlobTokenGenerator with E2E test * update BlobPathBuilder constructor args in test * account -> container level client * [BT-687] specify correct types (#6829) * specify correct types * fix test with new type * remove type declarations in function call * remove unnecessary sas-token config * BW-1206 - Combine all Wes Endpoints & add Tests (#6833) * Add tests, getting frid of WesRunRoutes.scala * wesWorkflowId fix, ec implicits errors gone * Refactoring path for GET /runs * Indentation fix * Commit to rollback * Revert "Indentation fix" This reverts commit 63fc4842c9d4eff68ec9cb7c3ef19e110696598b. 
* PR trigger * Optimize imports * Missed import * BW-1354 - Porting CBAS preliminary step (#6837) * Getting rid of shared utility file; Adding/Updating WES version of submit. * Edit spec file * Adding Wes-like error * BW-1378 Addl CromIAM user enablement checks (#6826) * Update cromwell version from 84 to 85 * BW-1393 Release doc updates (#6839) * BT-732 Checksum validation for blobs read by engine (#6838) * Draft support for optional FileHash * Draft getMd5 for BlobPath * Resolve non-parallel IO to fix tests * Checksum validation for BlobPath * Nicer error message * Test for missing Blob hash * Break attr acquisition into separate method * Cleanup, comments * In-progress tests of blob hash command * Remove test * Remove unused import * BT-711 Refresh SAS token for filesystem on expiry (#6831) * BT-711 Refresh SAS token for filesystem on expiry * Rough cut of token refresh using exceptions * Ignore tests, and minor cleanup * Remove stray line * Draft of manager class for handling expiring file systems * Style fixes * Refactor of blobfilesystemManager and tests covering its functionality * Refined tests to validate close filesystem as separate unit * Ignore connected tests * Clean up of some things * Refactor BlobFileSystemManager to separate file, and some other cleanup * Some additional scala-ifying * Small cleanup * Correcting imports * trigger tests * trigger tests * Batch 1 of scala steward updates (#6903) * Batch 1 of scala steward updates * Rollback snakeYAML * Attempt 3, with only the passing dependancies * Revert google API and Big Query udpates * Winding back other google deps * rollback remaining google updates * trigger tests * trigger tests * [BW-1398] Migrate PKs to BIGINT (#6907) * BT-745 Batch 2 of scala steward updates (#6906) * Update SBT to 2.0.0 * Fix sbt-git import * Update mouse to 1.0.11 * Update rhino 1.7.14 * SUP-692 Retry with more memory after RC 137 (#6912) * Reorder execution result checks so 137 can retry with more memory * Test for 
memory retry after 137 RC * Fix test expectations * Make memory retry checks consistent * Revert changes to existing test * Rename retryWithMoreMemory to outOfMemoryDetected * Scala steward updates batch 3 (#6913) * Scala steward updates batch 3 * WX-745 Batch 4 scala steward updates (#6916) * WX-746 Localize all DRS inputs in a single Action (#6914) Co-authored-by: Janet Gainer-Dewar * WX-755 Build all images instead of just Cromwell (#6919) * WX-755 Add `isRelease` option for Docker builds (#6923) * WX-755 Cromwell/CromIAM automatically board train (#6924) * WX-755 Fix environment variable syntax (#6926) * WX-743 Enable TES task creation with BlobPaths (#6921) * Give blob SAS tokens write permission * Case class wrapper for subscription id * Resolve duplicate container name in absolute BlobPath * Ignored test demonstrating correct absolute path generation * Update filesystems/blob/src/test/scala/cromwell/filesystems/blob/BlobPathBuilderSpec.scala Co-authored-by: Brian Reilly * PR feedback Co-authored-by: Brian Reilly * [WX-765] Update snakeyaml to 1.33 (#6927) * update snakeyaml to 1.33 * Don't use deprecated no-arg Constructor constructor Co-authored-by: Janet Gainer-Dewar * WM-1414 Refactoring WesRunLog to omit Cromwell's "workflowLog" object (#6925) * Upgrade Postgres to 42.4.1 (#6932) * WX-735 Fix incorrect and/or nondeterministic filesystem ordering (#6930) * WX-772 Update Scala to 2.13.9 (#6928) * Update Scala to 2.13.9 * Try updating sbt-scoverage * Does this version exist anywhere we can see? * This version actually exists * Update library version to remove conflict * Codegen version * Fix fun new 2.13.9 compiler errors * Resolve warnings * Newest Scala? * I guess not * Does this please Travis? * force ci * Back out changes to generated code Co-authored-by: Adam Nichols * WX-781 Bump jackson-databind in /CromwellRefdiskManifestCreator (#6935) Bumps [jackson-databind](https://github.com/FasterXML/jackson) from 2.13.2.2 to 2.13.4.1. 
- [Release notes](https://github.com/FasterXML/jackson/releases) - [Commits](https://github.com/FasterXML/jackson/commits) --- updated-dependencies: - dependency-name: com.fasterxml.jackson.core:jackson-databind dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * WX-808 Host allowlist for HTTP imports (#6938) * `hostAllowlist` that allows everything * Refactor * Stick allow list in HttpResolver * Better default config * Allow list tests * Make it build Co-authored-by: Janet Gainer-Dewar * Update commons text to 1.10.0 (#6937) * WX-751 Token refresh signal for monitoring (#6939) * Log messages * `DEBUG` -> `INFO` * WX-744 Optionally rewrite blob paths to appear as local paths (#6941) * Modify blob paths for TES * Make blob transformation configurable * Update supportedBackends/tes/src/main/scala/cromwell/backend/impl/tes/TesTask.scala Co-authored-by: Adam Nichols * Apply PR feedback in second place Co-authored-by: Adam Nichols * Update changelog for wdl http allow list (#6944) * WM-1491 Fixing Cromwell-client (#6943) * More updated client for use in cbas * Removing excess code * Fix client build script (#6945) * WX-837: Remove CWL references from documentation (#6949) * wx-837 removed cwl references in markdown doc files * wx-837 removed cwlParsingOverview.md, updated mkdocs.yml * wx-837 updated cromwell.yaml, generated new RESTAPI file * WX-728 Add configurable WSM client to Cromwell (#6948) * Dependencies * Compiles but no tests * Formatting * Moar exclusions * Update to latest WSM * Add additional dependency * We need some UUID here to make the request * Formatting * Clarify what is fake * Formatting * Use our own version of Jersey and Jackson stuff * Port-in Khalid's changes (thank you!) 
Co-authored-by: Khalid Shakir * Test longevity Don't break the test if someone decides to add a cert to `ws.org` * Cleanup * Cleanup * Cleanup * Adjust TES config file for CI Co-authored-by: Janet Gainer-Dewar Co-authored-by: Khalid Shakir * CROM-6554: Removed PAPIv1 references from doc (#6950) * crom-6554 removed references to PAPI v1 from doc * crom-6554 pr feedback, reworded doc to use example conf as a starting point * WX-833 Real Azure DRS Credentials (#6952) * Remove B2C reference from name * Get token for current user rather than getting from KeyVault * Remove KeyVault config for engine * Remove KeyVault config for DRSLocalizer * Remove KeyVault dependency * Remove KeyVault support from localizer repo template * Cleaned up and working Azure token acquisition for engine * Collapse localizer's AccessTokenStrategy into DrsCredentials * Cleanup * WX-853 Remove most CWL (#6955) * WX-696 Enable getting SAS token from WSM (#6954) * WX-696 Enable getting SAS token from WSM * Wire container resource id from config * Move resource-container-id config path * First pass at config for WSM * Remove unused singleton config * Tests for new config * Fix config parsing * Modified b2c token to be provided each time * Remove singletonConfig arg from factory * Restore types to factory configs * Clean up comments and empty token default * Default to config b2c before searching environment * Fix token default on api client * Fix test * Refactor error handling for when there is no token * Remove token constructor arg for clientProvider * Move configs to global singleton config * Update filesystems/blob/src/main/scala/cromwell/filesystems/blob/BlobFileSystemManager.scala * default -> override * Add override token to test * Update filesystems/blob/src/main/scala/cromwell/filesystems/blob/BlobFileSystemManager.scala Co-authored-by: Adam Nichols * Parentheses * Reduce token timeout * Move AzureCredentials to separate file * Make AzureCredentials an object * WSM token cleanup * Config 
refactor (#6960) Co-authored-by: Janet Gainer-Dewar * Initial blob token documentation * Refine language in BlobSasTokenGenerator * Update comment and formatting Co-authored-by: Janet Gainer-Dewar Co-authored-by: Adam Nichols * WX-853 Remove CWL language factory, Centaur runner (#6961) * WX-842 Add Pact Dependency for Cromwell (#6962) * WX-842 Add Pact Dependency for Cromwell * Remove incomplete test spec * Initial Pact Test * Fix pact so it compiles * Add breadcrumb comment and clean up * ID-125 Add support for drshub, rename all the things (#6959) * Add support for drshub, rename all the things * fallback to martha if resolver is not in config * WX-867 Translate crc32c hashes to b64 for getm (#6970) * Translate crc32c hashes to b64 for getm * Update tests * Remove obsolete b64 handling for md5, centralize hex validation * Restore old test, fix other test * WX-843 Workflow failure reason should accurately indicate issues opening blob filesystem (#6965) * WX-859 Accept workflow execution identity in config (#6967) * WX-892 Trim down `ValueStore` logging to prevent OOMs (#6981) * Add Nirvana 3.18.1 reference image test, minor cleanup [VS-705] (#6975) * WX-863 Turn off Azure NIO logging (#6982) * Turn off Azure NIO logging * Poke Travis * WM-1616: Allow repeating attempts at initialization (take 2) (#6985) * WX-878 Single shared BlobFileSystemManager (#6986) * Make BlobFileSystemManager shared across all BlobPathBuilders * Update TES conf file to reflect new singleton config * Shell escape reference image files [VS-796] [WX-910] (#6989) * WX-769 `disks` compatibility for TES backend (#6991) * Update FiveMinuteIntro.md (#6994) * WX-906 Sbt Unit Tests as Github Actions (#6992) * WX-926 Support falling back to OCI Manifest Format (#7003) * WX-926 Support falling back to OCI Manifest Forma * Only mount reference disks if requested [WX-925] (#7001) * [WM-1646] Add missing fields for `WorkflowDescription` for WomTool /describe endpoint to Swagger (#7004) * WX-876 Surface 
TES System Logs to Cromwell when TES backend returns task error status (#6980) * WX-876 Surface TES System Logs to Cromwell when TES backend returns task error status * Address feedback * Address feedback (#6997) * Address additional feedback (#7000) * Fix copy/paste error (#7005) * Address additional feedback * Fix copy/paste error * Trigger CI --------- Co-authored-by: Blair Murri Co-authored-by: Janet Gainer-Dewar * Centaur reference image test should validate symlinks [VS-796] (#6996) * WX-903 Pre-GHA test suite disablement * WX-877 Update CHANGELOG for release 85 (#7011) --------- Signed-off-by: dependabot[bot] Co-authored-by: Janet Gainer-Dewar Co-authored-by: Katrina P <68349264+kpierre13@users.noreply.github.com> Co-authored-by: Chris Llanwarne Co-authored-by: Christian Freitas Co-authored-by: Saloni Shah Co-authored-by: kshakir Co-authored-by: mspector Co-authored-by: Adam Nichols Co-authored-by: Brian Reilly Co-authored-by: Adam Nichols Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Justin Variath Thomas Co-authored-by: Christian Freitas Co-authored-by: Trevyn Langsford Co-authored-by: Miguel Covarrubias Co-authored-by: ekiernan <55763654+ekiernan@users.noreply.github.com> Co-authored-by: Tom Wiseman Co-authored-by: Blair Murri * Develop aws (#29) * stuck on globbing * efs works, no callcaching * update readme * extended EFS support * fix for globbing in nested scatters * updated config for globbing, to prevent issues with empty folders * efs fixes : support paths with over 127 characters, fix delocalization of efs-based globs (#32) * Develop aws (#34) * efs fixes : support paths with over 127 characters, fix delocalization of efs-based globs * add deployment manual, fix issue with empty disks * update documentation * update documentation * update documentation --------- Signed-off-by: dependabot[bot] Co-authored-by: Janet Gainer-Dewar Co-authored-by: Katrina P 
<68349264+kpierre13@users.noreply.github.com> Co-authored-by: Chris Llanwarne Co-authored-by: Christian Freitas Co-authored-by: Saloni Shah Co-authored-by: kshakir Co-authored-by: mspector Co-authored-by: Adam Nichols Co-authored-by: Brian Reilly Co-authored-by: Adam Nichols Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Justin Variath Thomas Co-authored-by: Christian Freitas Co-authored-by: Trevyn Langsford Co-authored-by: Miguel Covarrubias Co-authored-by: ekiernan <55763654+ekiernan@users.noreply.github.com> Co-authored-by: Tom Wiseman Co-authored-by: Blair Murri Co-authored-by: geertvandeweyer --- README.md | 10 +- .../StandardCacheHitCopyingActor.scala | 7 +- ...wsBatchAsyncBackendJobExecutionActor.scala | 7 +- .../backend/impl/aws/AwsBatchJob.scala | 17 +- .../impl/aws/AwsBatchRuntimeAttributes.scala | 7 +- .../scala/cromwell/backend/impl/aws/DEPLOY.md | 243 ++++++++++++++++++ .../scala/cromwell/backend/impl/aws/README.md | 37 ++- .../AwsBatchBackendFileHashingActor.scala | 11 +- .../impl/aws/io/AwsBatchGlobFunctions.scala | 5 +- .../backend/impl/aws/io/AwsBatchVolume.scala | 4 +- 10 files changed, 319 insertions(+), 29 deletions(-) create mode 100644 supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/DEPLOY.md diff --git a/README.md b/README.md index bbb38795420..d0d78091356 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,15 @@ ## Welcome to the "AWS-friendly" Cromwell -More information regarding AWS features can be found [here](https://github.com/henriqueribeiro/cromwell/tree/master/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws) +The AWS-friendly Cromwell is an optimized fork of the main cromwell release. We try to keep it up-to-date with new releases, while keeping our additions functional. 
-Contact: henrique [at] loka [dot] com +* Information regarding AWS features can be found [here](supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/README.md) +* Information regarding deployment can be found [here](supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/DEPLOY.md) + +Contact: +* henrique [at] loka [dot] com +* geert [dot] vandeweyer [at] uza [dot] be +* Join the #AWS channel at the [Cromwell Slack workspace](https://join.slack.com/t/cromwellhq/shared_invite/zt-dxmmrtye-JHxwKE53rfKE_ZWdOHIB4g). Cromwell is an open-source Workflow Management System for bioinformatics. Licensing is [BSD 3-Clause](LICENSE.txt). diff --git a/backend/src/main/scala/cromwell/backend/standard/callcaching/StandardCacheHitCopyingActor.scala b/backend/src/main/scala/cromwell/backend/standard/callcaching/StandardCacheHitCopyingActor.scala index e23ecd277af..4b9eecc68a7 100644 --- a/backend/src/main/scala/cromwell/backend/standard/callcaching/StandardCacheHitCopyingActor.scala +++ b/backend/src/main/scala/cromwell/backend/standard/callcaching/StandardCacheHitCopyingActor.scala @@ -230,6 +230,8 @@ abstract class StandardCacheHitCopyingActor(val standardParams: StandardCacheHit logCacheHitCopyCommand(command) case Some(command: IoTouchCommand) => logCacheHitTouchCommand(command) + case Some(command: IoWriteCommand) => + logCacheHitWriteCommand(command) case huh => log.warning(s"BT-322 {} unexpected commandsToWaitFor: {}", jobTag, huh) } @@ -310,7 +312,10 @@ abstract class StandardCacheHitCopyingActor(val standardParams: StandardCacheHit } private def logCacheHitTouchCommand(command: IoTouchCommand): Unit = - log.info(s"BT-322 {} cache hit for file : {}", jobTag, command.toString) + log.info(s"BT-322 {} cache touch hit for file : {}", jobTag, command.toString) + + private def logCacheHitWriteCommand(command: IoWriteCommand): Unit = + log.info(s"BT-322 {} cache write hit for file : {}", jobTag, command.toString) def succeedAndStop(returnCode: Option[Int], 
copiedJobOutputs: CallOutputs, detritusMap: DetritusMap): State = { import cromwell.services.metadata.MetadataService.implicits.MetadataAutoPutter diff --git a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchAsyncBackendJobExecutionActor.scala b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchAsyncBackendJobExecutionActor.scala index 87ef19e80cb..498bd2ec4fe 100755 --- a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchAsyncBackendJobExecutionActor.scala +++ b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchAsyncBackendJobExecutionActor.scala @@ -417,8 +417,10 @@ class AwsBatchAsyncBackendJobExecutionActor(override val standardParams: Standar val output = if (configuration.efsMntPoint.isDefined && configuration.efsMntPoint.getOrElse("").equals(disk.toString.split(" ")(1)) && ! runtimeAttributes.efsDelocalize) { - AwsBatchFileOutput(makeSafeAwsBatchReferenceName(womFile.value), makeSafeAwsBatchReferenceName(womFile.value), relpath, disk) + // name: String, s3key: String, local: Path, mount: AwsBatchVolume + AwsBatchFileOutput(makeSafeAwsBatchReferenceName(womFile.value), womFile.value, relpath, disk) } else { + // if efs is not enabled, OR efs delocalization IS enabled, keep the s3 path as destination. AwsBatchFileOutput(makeSafeAwsBatchReferenceName(womFile.value), destination, relpath, disk) } List(output) @@ -448,7 +450,8 @@ class AwsBatchAsyncBackendJobExecutionActor(override val standardParams: Standar ! 
runtimeAttributes.efsDelocalize) { (globDirectory, globListFile) } else { - (callRootPath.resolve(globDirectory).pathAsString, callRootPath.resolve(globListFile).pathAsString) + // cannot resolve absolute paths : strip the leading '/' + (callRootPath.resolve(globDirectory.toString.stripPrefix("/")).pathAsString, callRootPath.resolve(globListFile.toString.stripPrefix("/")).pathAsString) } // return results return ( diff --git a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchJob.scala b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchJob.scala index 48f286dc436..97302357175 100755 --- a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchJob.scala +++ b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchJob.scala @@ -259,7 +259,8 @@ final case class AwsBatchJob(jobDescriptor: BackendJobDescriptor, // WDL/CWL | b=$$(( 5 * 1024 * 1024 )) | chunk_size=$$(( a > b ? a : b )) | echo $$chunk_size - } + |} + | |function _check_data_integrity() { | local local_path=$$1 | local s3_path=$$2 @@ -276,7 +277,7 @@ final case class AwsBatchJob(jobDescriptor: BackendJobDescriptor, // WDL/CWL | s3_content_length=$$($awsCmd s3api head-object --bucket "$$bucket" --key "$$key" --query 'ContentLength') || | { echo "Attempt to get head of object failed for $$s3_path." && return 1 ; } | # local - | local_content_length=$$(LC_ALL=C ls -dn -- "$$local_path" | awk '{print $$5; exit}' ) || + | local_content_length=$$(LC_ALL=C ls -dnL -- "$$local_path" | awk '{print $$5; exit}' ) || | { echo "Attempt to get local content length failed for $$_local_path." 
&& return 1; } | # compare | if [[ "$$s3_content_length" -eq "$$local_content_length" ]]; then @@ -303,10 +304,12 @@ final case class AwsBatchJob(jobDescriptor: BackendJobDescriptor, // WDL/CWL //generate a series of s3 commands to delocalize artifacts from the container to storage at the end of the task val outputCopyCommand = outputs.map { + // local is relative path, no mountpoint disk in front. case output: AwsBatchFileOutput if output.local.pathAsString.contains("*") => "" //filter out globs - case output: AwsBatchFileOutput if output.name.endsWith(".list") && output.name.contains("glob-") => - Log.debug("Globbing : check for EFS settings.") + case output: AwsBatchFileOutput if output.s3key.endsWith(".list") && output.s3key.contains("glob-") => + Log.debug("Globbing : check for EFS settings.") val s3GlobOutDirectory = output.s3key.replace(".list", "") + // glob paths are not generated with 127 char limit, using generateGlobPaths(). name can be used safely val globDirectory = output.name.replace(".list", "") /* * Need to process this list and de-localize each file if the list file actually exists @@ -317,9 +320,10 @@ final case class AwsBatchJob(jobDescriptor: BackendJobDescriptor, // WDL/CWL Log.debug("EFS glob output file detected: "+ output.s3key + s" / ${output.mount.mountPoint.pathAsString}/${output.local.pathAsString}") val test_cmd = if (efsDelocalize.isDefined && efsDelocalize.getOrElse(false)) { Log.debug("delocalization on EFS is enabled") + Log.debug(s"Delocalizing $globDirectory to $s3GlobOutDirectory\n") s""" - |touch ${output.name} - |_s3_delocalize_with_retry ${output.name} ${output.s3key} + |touch ${output.mount.mountPoint.pathAsString}/${output.local.pathAsString} + |_s3_delocalize_with_retry ${output.mount.mountPoint.pathAsString}/${output.local.pathAsString} ${output.s3key} |if [ -e $globDirectory ]; then _s3_delocalize_with_retry $globDirectory $s3GlobOutDirectory ; fi |""".stripMargin } else { @@ -357,6 +361,7 @@ final case class 
AwsBatchJob(jobDescriptor: BackendJobDescriptor, // WDL/CWL | """.stripMargin } else { // default delocalization command. + Log.debug(s"Delocalize from ${output.name} to ${output.s3key}\n") s""" |touch ${output.name} |_s3_delocalize_with_retry ${output.name} ${output.s3key} diff --git a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchRuntimeAttributes.scala b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchRuntimeAttributes.scala index ceca53a7c85..235927ab07c 100755 --- a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchRuntimeAttributes.scala +++ b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchRuntimeAttributes.scala @@ -188,10 +188,11 @@ object AwsBatchRuntimeAttributes { rtc.getAnyRef(AwsBatchRuntimeAttributes.DisksKey).asInstanceOf[String] // just to prevent complaints about var/val } catch { case _: ConfigException.Missing => - "" + "local-disk" } - // combine - val disks = s"${efs_disks},${rtc_disks}".split(",").toSet.mkString(",") + // combine and remove empty values + val disks = s"${efs_disks},${rtc_disks}".split(",").toSet.filterNot(_.isEmpty).mkString(",") + Log.debug(s"Disks: ${disks}") val runtimeConfig = Some(rtc.withValue(AwsBatchRuntimeAttributes.DisksKey, ConfigValueFactory.fromAnyRef(disks))) return runtimeConfig } diff --git a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/DEPLOY.md b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/DEPLOY.md new file mode 100644 index 00000000000..856dd46af99 --- /dev/null +++ b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/DEPLOY.md @@ -0,0 +1,243 @@ +AWS/CROMWELL DEPLOYMENT +======================= + +Overview +-------- + +AWS Cloudformation is used to deploy Cromwell-AWS. As resources are created, costs might be generated along the way. To make sure all generated data is cleaned, remove the stacks again using the CloudFormation Console. 
+ + Deployment + ---------- + + Deployment of the cromwell/AWS environment can be performed using the three cloudformation stacks: + + 1. VPC : setup of the networks + 2. Resources : setup of the compute environment, job queues and storage solutions + 3. Cromwell : setup of an EC2 instance and RDS, hosting the cromwell server and submission tools. + + Along the way, all necessary IAM roles are generated. + + *Note:* This tutorial uses the eu-west-2 (London) region. Change where appropriate for your setup + + *Note:* Select "Preserve successfully provisioned resources" as failure option to review problems in depth. + + ### STEP 1 : SETUP VPC + + The default VPC settings create both private and public subnets. Private subnets are used for the compute environment and database setup. Public subnets are used to host the cromwell server instance. + + + Used Template : + + ``` + https://cromwell-aws-cloudformation-templates.s3.eu-west-1.amazonaws.com/root-templates/aws-vpc.template.yaml + ``` + + Steps: + + 1. Go to the CloudFormation console in your region : https://eu-west-2.console.aws.amazon.com/cloudformation/home?region=eu-west-2#/ + + 2. Select "Stacks" in the left menu, then "Create Stack" on the top right. Choose "With new resources" :[link](https://eu-west-2.console.aws.amazon.com/cloudformation/home?region=eu-west-2#/stacks/create) + + 3. Select "Template is ready", and use "Amazon S3 URL" as template source. Provide the yaml file URL provided above. + + 4. Enter a stack name. + + 5. Select availability zones, more is better. This gives Batch more zones to find suitable instances. + + 6. Match "Number of availability zones" to your selection. + + 7. Other settings can be left as default. + + 8. Review, confirm and submit on final page. + + + + ### STEP 2 : SETUP COMPUTE RESOURCES + + The default setup configures a spot and on-demand queue, a cromwell-specific bucket and the necessary IAM roles to run BATCH jobs. See below for further configuration to extend this setup. 
+ +Used Template : + +``` +https://cromwell-aws-cloudformation-templates.s3.eu-west-1.amazonaws.com/root-templates/gwfcore-root.template.yaml +``` + +Steps: + +1. Go to the CloudFormation console in your region : https://eu-west-2.console.aws.amazon.com/cloudformation/home?region=eu-west-2#/ + +2. Select "Stacks" in the left menu, then "Create Stack" on the top right. Choose "With new resources" :[link](https://eu-west-2.console.aws.amazon.com/cloudformation/home?region=eu-west-2#/stacks/create) + +3. Select "Template is ready", and use "Amazon S3 URL" as template source. Provide the yaml file URL provided above. + +4. Enter a stack name. + * *Note:* This name is referred to as "GWFCORE NameSpace" in STEP 3 + * *Note:* The name gets reflected in queue names etc. Keep it short. + +5. Provide an s3 bucket name to host cromwell temp data, results and runtime requirements. + * *Note:* If the bucket exists, it must be located in the same region. + * *Note:* If the bucket exists, specify this in the next field + +6. Select the VPC ID generated in STEP 1 + +7. Provide the PRIVATE subnets (all of them) from STEP 1, for the compute environment. Match the number of subnets to your selection + +8. Set the Max vCPU count for default (spot) and High Priority (on-demand) compute environment. + * *Note:* Check your quota per region [here](https://eu-west-2.console.aws.amazon.com/servicequotas/home/services/ec2/quotas), look for "Standard" + +9. Set the maximal spot bidding price. + +10. Select the list of instance types to be used. Default "optimal" value is a safe bet. + * *Note:* To edit this, clone the compute environment in the [Batch dashboard](https://eu-west-2.console.aws.amazon.com/batch/home?region=eu-west-2#compute-environments), and play around with the options. + +11. *Optional:* Create an EFS filesystem: + * *Note:* EFS is a distributed filesystem. 
Keeping large intermediate results on EFS can improve performance by reducing S3/EC2 transfer times + * *Note:* By default, EFS performance is limited. Change the setup of the volume using the [console](https://eu-west-2.console.aws.amazon.com/efs/home?region=eu-west-2#/file-systems) to "Enhanced/Elastic" Performance. Consider the costs this implies! + * *Note:* It's recommended to create a new EFS, to make sure that mounting and network settings are correctly setup. + + + 12. *Optional:* Create an FSx filesystem: + * *Note:* see documentation in [README](README.md) + + 13. Other settings can be left as default. + + 14. Review, confirm and submit on final page. + + + ### STEP 3 : SETUP CROMWELL SERVER + + The default setup deploys a small t3.medium EC2 instance with 25Gb of storage and a RDS aurora-mysql database, to host cromwell, and some interaction tools. Although enough for testing, you'll probably have to scale up the instance for production runs. Alternatively, you can deploy cromwell on local infrastructure, preventing the RDS/EC2 costs. See below for details. + + + Used Template : + + ``` + https://cromwell-aws-cloudformation-templates.s3.eu-west-1.amazonaws.com/root-templates/cromwell-resources.template.yaml + ``` + + Steps: + + 1. Go to the CloudFormation console in your region : https://eu-west-2.console.aws.amazon.com/cloudformation/home?region=eu-west-2#/ + + 2. Select "Stacks" in the left menu, then "Create Stack" on the top right. Choose "With new resources" :[link](https://eu-west-2.console.aws.amazon.com/cloudformation/home?region=eu-west-2#/stacks/create) + + 3. Select "Template is ready", and use "Amazon S3 URL" as template source. Provide the yaml file URL provided above. + + 4. Provide a stack name. + + 5. Provide a name space. + * *Note:* This name is reflected in the key name. Keep it short. + + 6. Provide the GWFCORE Namespace from STEP 2 + + 7. Specify the VPC ID, from STEP 1 + + 8. Select a PUBLIC subnet for the cromwell server + + 9. 
Select 2+ PRIVATE subnets for the RDS database + +10. Set the instance root volume size to 25+ Gb + +11. Set a password for the cromwell database + +12. Provide Filesystem details for EFS if specified in STEP 2: + * *Note:* Get the values from [the dashboard](https://eu-west-2.console.aws.amazon.com/efs/home?region=eu-west-2#/file-systems) + * *Note:* Accesspoint is listed under filesystem details, tab "Access Points" + * *Note:* Security Group is listed under filesystem details, tab "Network" + +13. Other settings can be left as default. + +14. Review, confirm and submit on final page. + + +When the stack is ready, use the following commands to retrieve your SSH key. The exact name of the key can be retrieved from the EC2 instance "Connect" page. + +``` +KEY_NAME=key- +REGION= + +# GET KEY ID : +KEY_ID=$(aws ec2 describe-key-pairs --filters Name=key-name,Values=${KEY_NAME} --query KeyPairs[*].KeyPairId --output text --region ${REGION}) + +# GET KEY CONTENTS +mkdir -p ~/.ssh/.keys +aws ssm get-parameter --region ${REGION} --name /ec2/keypair/${KEY_ID} --with-decryption --query Parameter.Value --output text > ~/.ssh/.keys/${KEY_NAME}.pem +chmod 600 ~/.ssh/.keys/${KEY_NAME}.pem + +``` + + + +Post Install Optimizations +-------------------------- + +### EFS + +EFS data is not cleaned up automatically. Consider adding a "cleanup" step in your production WFs, to keep the storage footprint/costs low. + +Edit settings [here](https://eu-west-2.console.aws.amazon.com/efs/home?region=eu-west-2#/file-systems). Select the FS and "edit" + +* Performance : "Enhanced":"Elastic" Throughput mode is recommended for large workflows. consider extra costs +* Lifecycle management : Consider moving data to infrequent access, to reduce costs of "forgotten" data. + +### S3 + +The cromwell temporary data and results are located in the bucket specified in STEP 2. To reduce your storage footprint/costs, consider setting up lifecycle management. 
+ +* *cromwell-execution/* : Contains analysis results and logs for each executed workflow. +* *scripts/* : contains the runtime scripts for each executed task + +To set up a lifecycle: + +* go to the [s3 console](https://s3.console.aws.amazon.com/s3/buckets?region=eu-west-2), and select the bucket from STEP 1 +* Select the "Management" tab and "Create Lifecycle rule" +* analysis results: + * Set name : e.g. "Prune cromwell-execution" + * Set prefix "cromwell-execution" + * Choose actions : "Expire current versions of objects" and "Delete expired object delete markers or incomplete multipart uploads" + * Set time limits for after which objects are deleted (expired). + * Enable "delete incomplete multipart upload" checkbox and set time. +* scripts: + * repeat steps above, with prefix "scripts" + + +### LOCAL CROMWELL INSTANCE + +Running cromwell locally has the benefit of not running an EC2 instance 24/7. However, consider the following points: + +* Run cromwell as a user with a default AWS IAM profile set to the same region, and with sufficient permissions to interact with ECR, Batch and S3. + +* Setup a local database, or setup ingress rules on the RDS security group. + +* Consider load balancing by running multiple instances : see [here](https://cromwell.readthedocs.io/en/stable/Scaling/) + +* for EFS, there are specific concerns: + * expose EFS to your local network : + * Run a cheap (e.g t4g.nano) instance mounting the EFS share, assign a PEM key to access it + * Setup sshfs to mount the EFS share from that instance to your local machine at "/mnt/efs" + * REDUCE network traffic by enabling the "check-sibling-md5" and "efsMakeMD5" settings + + +### COMPUTE ENVIRONMENT + +* You might play around with the instance types to get more options: + * clone the compute environment + * double check all network settings ! + * replace the "optimal" type by "*.family" types. 
There is a maximal number of entries, so using whole families allows more types + +* Extra Queues : You might consider a dedicated queue for high disk/network jobs, by altering the launch template: + * Go to Batch console, select the spot compute environment and open the JSON tab. Look for "launchTemplateName" (eg lt-06fa9fee031254098) + * Go to EC2 console, select "Launch Templates" in the menu on the left, and search for your launch template + * On details, select "Modify Template (create new version)" in the top right under "Actions" + * On the resulting page, add a description (eg "HighNetwork"), and then open advanced settings at the bottom of the page + * At the bottom, look in "User Data". About halfway, set EBS_* settings. EBS_IOPS can go as high as 16,000 and EBS_THROUGHPUT can go as high as 1,000 MB/s + * Now clone the compute environment (double check all network settings and roles) + * specify "exact" launch template version to the version you created. + * set instance type to network/disk optimized machines (eg m5zn.6xlarge) + * have a blazing fast I/O machine (tests reached constant simultaneous 1Gb/s upload and 1Gb/s download, while transferring novaseq data from basespace to AWS/S3) + + + + + diff --git a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/README.md b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/README.md index a1cdce0ba1b..613607da42a 100644 --- a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/README.md +++ b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/README.md @@ -29,6 +29,21 @@ defined. This infrastructure and all the associated configuration still exists; however, it is moved out of the Cromwell configuration. +Deployment +---------- + +Deployment of the cromwell/AWS environment can be performed using the three cloudformation stacks: + +1. VPC : setup of the networks +2. Resources : setup of the compute environment, job queues and storage solutions +3. 
Cromwell : setup of an EC2 instance and RDS, hosting the cromwell server and submission tools. + +Along the way, all necessary IAM roles are generated. + +The full documentation is available [here](DEPLOY.md) + + + Features --------------------- ### Docker Hub Authentication @@ -42,7 +57,7 @@ Docker Hub authentication for AWS Backend enable users to access and use private dockerhub { token = "" } ``` -Stack must be deployed through https://github.com/aws-samples/aws-genomics-workflows. + ### `awsBatchRetryAttempts` @@ -154,11 +169,9 @@ backend { } // set the keys for Out-Of-Memory killing. -// system.io.memory-retry-error-keys -system{ - io{ - memory-retry-error-keys = ["OutOfMemory","Killed"] - } +// system.memory-retry-error-keys +system { + memory-retry-error-keys = ["OutOfMemory","Killed"] } ``` @@ -169,6 +182,14 @@ Workflow specific runtime options : `workflow_options.json`: } ``` +Or specify it in the cromwell config as : + +``` +workflow-options { + memory-retry-multiplier = 1.5 +} +``` + When providing the options.json file during workflow submission, jobs that were terminated due to insufficient memory will be retried 6 times, with increasing memory allocation. For example 4Gb => 6Gb => 9Gb => 13.5Gb => ... Note: Retries of jobs using the `awsBatchRetryAttempts` counter do *not* increase memory allocation. 
@@ -340,8 +361,8 @@ The following workflow highlights the following features: version 1.0 workflow TestEFS { input { - # input file for WF is located on S3 - File s3_file = 's3://aws-quickstart/quickstart-aws-vpc/templates/aws-vpc.template.yaml' + # input file for WF is located on a public S3 + File s3_file = 's3://cromwell-aws-cloudformation-templates/root-templates/aws-vpc.template.yaml' # set an input parameter holding the working dir on EFS String efs_wd = "/mnt/efs/MyTestProject" } diff --git a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/callcaching/AwsBatchBackendFileHashingActor.scala b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/callcaching/AwsBatchBackendFileHashingActor.scala index 41cb1dae97f..66287a781af 100755 --- a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/callcaching/AwsBatchBackendFileHashingActor.scala +++ b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/callcaching/AwsBatchBackendFileHashingActor.scala @@ -54,14 +54,19 @@ class AwsBatchBackendFileHashingActor(standardParams: StandardFileHashingActorPa override def customHashStrategy(fileRequest: SingleFileHashRequest): Option[Try[String]] = { val file = DefaultPathBuilder.get(fileRequest.file.valueString) if (aws_config.efsMntPoint.isDefined && file.toString.startsWith(aws_config.efsMntPoint.getOrElse("--")) && aws_config.checkSiblingMd5.getOrElse(false)) { - // check existence of the sibling file val md5 = file.sibling(s"${file.toString}.md5") - if (md5.exists) { + // check existance of the file : + if (!file.exists) { + // if missing, cache hit is invalid; return invalid md5 + Some("File Missing").map(str => Try(str)) + } + // check existence of the sibling file + else if (md5.exists) { // read the file. val md5_value: Option[String] = Some(md5.contentAsString.split("\\s+")(0)) md5_value.map(str => Try(str)) } else { - // No sibling found, fall back to default. 
+ // File present, but no sibling found, fall back to default. None } diff --git a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/io/AwsBatchGlobFunctions.scala b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/io/AwsBatchGlobFunctions.scala index 18505f6d866..1706f55370f 100644 --- a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/io/AwsBatchGlobFunctions.scala +++ b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/io/AwsBatchGlobFunctions.scala @@ -71,6 +71,7 @@ trait AwsBatchGlobFunctions extends GlobFunctions { // - according to those values : write the pattern as s3:// or as local path. // - get the wf id from the config settings. + // this function reads in the globfile and locates globbed files : "local" or NIO access is needed to the files. // for now : hard coded as local at mount point /mnt/efs. val wfid_regex = ".{8}-.{4}-.{4}-.{4}-.{12}".r val wfid = callContext.root.toString.split("/").toList.filter(element => wfid_regex.pattern.matcher(element).matches()).lastOption.getOrElse("") @@ -79,7 +80,7 @@ trait AwsBatchGlobFunctions extends GlobFunctions { val listFilePath = if (pattern.startsWith("/mnt/efs/")) { DefaultPathBuilder.get(globbedDir + "/." 
+ globPatternName + ".list") } else { - callContext.root.resolve(s"${globPatternName}.list") + callContext.root.resolve(s"${globbedDir}/.${globPatternName}.list".stripPrefix("/")) } asyncIo.readLinesAsync(listFilePath.toRealPath()) map { lines => lines.toList map { fileName => @@ -87,7 +88,7 @@ trait AwsBatchGlobFunctions extends GlobFunctions { if (pattern.startsWith("/mnt/efs/")) { s"${globbedDir}/.${globPatternName}/${fileName}" } else { - s"${callContext.root}/${globPatternName}/${fileName}" + callContext.root.resolve(s"${globbedDir}/.${globPatternName}/${fileName}".stripPrefix("/")).pathAsString } } } diff --git a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/io/AwsBatchVolume.scala b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/io/AwsBatchVolume.scala index 769c2c2f5d8..563834e8d06 100755 --- a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/io/AwsBatchVolume.scala +++ b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/io/AwsBatchVolume.scala @@ -56,7 +56,6 @@ object AwsBatchVolume { val LocalDiskPattern: Regex = raw"""^\s*local-disk\s*$$""".r def parse(s: String): Try[AwsBatchVolume] = { - val validation: ErrorOr[AwsBatchVolume] = s match { case LocalDiskPattern() => Valid(AwsBatchWorkingDisk()) @@ -69,8 +68,9 @@ object AwsBatchVolume { Valid(AwsBatchEmptyMountedDisk(DefaultPathBuilder.get(mountPoint),fsType)) case _ => s"Disk strings should be of the format 'local-disk' or '/mount/point' but got: '$s'".invalidNel - } + } + Try(validation match { case Valid(localDisk) => localDisk case Invalid(nels) =>