Added additional workflow and README updates for Quickstart [VS-183] (#…
rsasch authored and RoriCremer committed Sep 20, 2021
1 parent 67dec79 commit 57e2cdb
Showing 5 changed files with 41 additions and 14 deletions.
13 changes: 13 additions & 0 deletions .dockstore.yml
@@ -56,6 +56,15 @@ workflows:
filters:
branches:
- master
- name: GvsAssignIds
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsAssignIds.wdl
testParameterFiles:
- /scripts/variantstore/wdl/GvsAssignIds.example.inputs.json
filters:
branches:
- master
- ah_var_store
- name: GvsCreateFilterSet
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -68,6 +77,8 @@ workflows:
- name: GvsCreateAltAllele
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateAltAllele.wdl
testParameterFiles:
- /scripts/variantstore/wdl/GvsCreateAltAllele.example.inputs.json
filters:
branches:
- master
@@ -106,6 +117,7 @@ workflows:
- /scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json
filters:
branches:
- master
- ah_var_store
- name: GvsValidateVat
subclass: WDL
@@ -114,6 +126,7 @@
- /scripts/variantstore/variant_annotations_table/GvsValidateVat.example.inputs.json
filters:
branches:
- master
- ah_var_store
- name: MitochondriaPipeline
subclass: WDL
18 changes: 7 additions & 11 deletions scripts/variantstore/TERRA_QUICKSTART.md
@@ -7,7 +7,7 @@ Through this QuickStart you will learn how to use the Broad Genomic Variant Stor

The sequencing data in this quickstart came from the [AnVIL 1000G High Coverage workspace](https://app.terra.bio/#workspaces/anvil-datastorage/1000G-high-coverage-2019)

**Note:** VQSR dies with the default/recommended configuration, so we set SNP max-gaussians to 4 here...
**Note:** VQSR dies with the default/recommended configuration, so we set SNP max-gaussians to 4 here.

## Prerequisites

@@ -19,38 +19,34 @@ This quickstart assumes that you are familiar with Terra workspaces, the data mo
- BigQuery data editor
- BigQuery job user
- BigQuery Read Session User
4. These tools expect reblocked gvcf files as input.
4. These tools expect re-blocked gVCF files as input.

## 1. Import Data
In order to load data into BigQuery without hitting daily load limits, we recommend you group your input files into sample sets and follow these steps for each sample set.

## 1.1 Assign Gvs Ids
To optimize the internal queries, each sample must have a unique and consecutive integer id assigned. Run the `GvsAssignIds` workflow. This will create an appropiate id for each sample in the sample set and update the BigQuery data set with the sample name to id mapping info.
A sample set for the quickstart has already been created with 10 samples and paths to re-blocked gVCFs for each sample. Run the two import workflows against this sample set by selecting "sample_set" as the root entity type ("Step 1") and `gvs_demo-10` for the data ("Step 2"). If you are creating your own sample set, note that the sample table should have a column for the re-blocked gVCFs (`hg38_reblocked_gvcf` or `reblocked_gvcf_path`) and their index files need to be in the same location.

## 1.1 Assign Gvs IDs
To optimize the internal queries, each sample must have a unique and consecutive integer ID assigned. Run the `GvsAssignIds` workflow, which will create an appropriate ID for each sample in the sample set and update the BigQuery dataset with the sample name to ID mapping info.

These are the required parameters which must be supplied to the workflow:

| Parameter | Description |
| ----------------- | ----------- |
| project_id | The name of the google project containing the dataset |
| dataset_name | The name of the dataset you created above |
| external\_sample_names | Use `this.samples.sample_id` |
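
For illustration only, a standalone inputs JSON covering just these parameters might look like the sketch below. The fully qualified key names and the placeholder values are assumptions modeled on the naming pattern of the example inputs files in this repo; in Terra you would instead set `external_sample_names` to the expression `this.samples.sample_id` as noted above.

```json
{
  "GvsAssignIds.project_id": "my-gcp-project",
  "GvsAssignIds.dataset_name": "my_gvs_dataset",
  "GvsAssignIds.external_sample_names": ["sample_1", "sample_2", "sample_3"]
}
```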


## 1.2 Load data

Next, your reblocked gVCF files should be imported into GVS by running the `GvsImportGenomes` workflow.

The workflow should be run against a sample set indicating the samples to load. The sample table should have a column for the reblocked gVCFs (`hg38_reblocked_gvcf` or `reblocked_gvcf_path`) and their index files need to be in the same location.

A sample set for the quickstart (`gvs_demo-10`) has already been created with 10 samples.
Next, your re-blocked gVCF files should be imported into GVS by running the `GvsImportGenomes` workflow.

These are the required parameters which must be supplied to the workflow:

| Parameter | Description |
| ----------------- | ----------- |
| project_id | The name of the google project containing the dataset |
| dataset_name | The name of the dataset you created above |
| external\_sample_names | Use `this.samples.sample_id` |
| output_directory | A unique GCS path to be used for loading; it can be in the workspace bucket (e.g. `gs://fc-124-12-132-123-31/gvs/demo1`) |
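
As a rough sketch of the same idea for this step, an inputs JSON might look like the following. The key names are assumptions, the values are placeholders, and only the parameters listed above are covered; the workflow also reads the re-blocked gVCF column described in section 1 and may take further inputs not shown in this excerpt.

```json
{
  "GvsImportGenomes.project_id": "my-gcp-project",
  "GvsImportGenomes.dataset_name": "my_gvs_dataset",
  "GvsImportGenomes.external_sample_names": ["sample_1", "sample_2", "sample_3"],
  "GvsImportGenomes.output_directory": "gs://fc-124-12-132-123-31/gvs/demo1"
}
```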


12 changes: 9 additions & 3 deletions scripts/variantstore/wdl/GvsAssignIds.wdl
@@ -39,7 +39,7 @@ workflow GvsAssignIds {
service_account_json = service_account_json,
docker = docker_final,
}

output {
Boolean gvs_ids_created = true
}
@@ -69,6 +69,12 @@ task AssignIds {
command <<<
set -e

# make sure that sample names were actually passed, fail if empty
if [ ~{length(sample_names)} -eq 0 ]; then
echo "No sample names passed. Exiting"
exit 1
fi

export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

if [ ~{has_service_account_file} = 'true' ]; then
@@ -107,7 +113,7 @@ task AssignIds {
# perform actual id assignment
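# ROW_NUMBER() numbers the samples that still have a NULL sample_id from 1..N;
# adding the previously computed $offset keeps the assigned ids unique and consecutive across loads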
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
"UPDATE ~{dataset_name}.~{sample_info_table} m SET m.sample_id = id_assign.id FROM (SELECT sample_name, $offset + ROW_NUMBER() OVER() as id FROM ~{dataset_name}.~{sample_info_table} WHERE sample_id IS NULL) id_assign WHERE m.sample_name = id_assign.sample_name;"

# remove the lock table
bq --project_id=~{project_id} rm -f -t ~{dataset_name}.sample_id_assignment_lock

@@ -151,7 +157,7 @@ task CreateTables {
echo "project_id = ~{project_id}" > ~/.bigqueryrc

TABLE="~{dataset_name}.~{datatype}"

# Check that the table has not been created yet
set +e
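# allow the bq show below to fail without aborting; a non-zero exit status indicates the table does not exist yet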
bq show --project_id ~{project_id} $TABLE > /dev/null
4 changes: 4 additions & 0 deletions scripts/variantstore/wdl/GvsCreateAltAllele.example.inputs.json
@@ -0,0 +1,4 @@
{
  "GvsCreateAltAllele.data_project": "PROJECT_ID",
  "GvsCreateAltAllele.default_dataset": "DATASET"
}
8 changes: 8 additions & 0 deletions scripts/variantstore/wdl/GvsCreateAltAllele.wdl
@@ -88,6 +88,10 @@ task GetVetTableNames {
}

task CreateAltAlleleTable {
meta {
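# marking the task volatile disables call caching in Cromwell, so it re-runs even when its inputs are unchanged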
volatile: true
}

input {
String query_project_id
String dataset_project_id
@@ -152,6 +156,10 @@ task CreateAltAlleleTable {
}

task PopulateAltAlleleTable {
meta {
volatile: true
}

input {
String create_table_done
String vet_table_name
