update for genomes (#6918)
* update for genomes

* add schema files
ahaessly authored and kcibul committed Jan 29, 2021
1 parent de84e16 commit 73d6eba
Showing 15 changed files with 495 additions and 89 deletions.
232 changes: 232 additions & 0 deletions scripts/variantstore_wdl/ImportGenomes.wdl
@@ -0,0 +1,232 @@
version 1.0

workflow ImportGenomes {

  input {
    Array[File] input_vcfs
    Array[File]? input_metrics
    File interval_list
    String output_directory
    File sample_map
    String project_id
    String dataset_name
    File pet_schema
    File vet_schema
    File metadata_schema
    #TODO: determine table_id from input sample_map (including looping over multiple table_ids)
    Int table_id
    Int? preemptible_tries
    File? gatk_override
    String? docker
  }

  String docker_final = select_first([docker, "us.gcr.io/broad-gatk/gatk:4.1.7.0"])

  scatter (i in range(length(input_vcfs))) {
    if (defined(input_metrics)) {
      File input_metric = select_first([input_metrics])[i]
    }
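    # Note: WDL 1.0 cannot index an optional Array[File] directly, so the
    # defined() guard above unwraps input_metrics with select_first(); when no
    # metrics are given, input_metric stays unset (type File?).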

    call CreateImportTsvs {
      input:
        input_vcf = input_vcfs[i],
        interval_list = interval_list,
        input_metrics = input_metric,
        sample_map = sample_map,
        output_directory = output_directory,
        gatk_override = gatk_override,
        docker = docker_final,
        preemptible_tries = preemptible_tries
    }
  }

  call LoadData as LoadMetadataTsvs {
    input:
      done = CreateImportTsvs.done[0],
      project_id = project_id,
      dataset_name = dataset_name,
      storage_location = output_directory,
      datatype = "metadata",
      numbered = "false",
      partitioned = "false",
      table_id = table_id,
      schema = metadata_schema,
      preemptible_tries = preemptible_tries,
      docker = docker_final
  }

  call LoadData as LoadPetTsvs {
    input:
      done = CreateImportTsvs.done[0],
      project_id = project_id,
      dataset_name = dataset_name,
      storage_location = output_directory,
      datatype = "pet",
      table_id = table_id,
      schema = pet_schema,
      preemptible_tries = preemptible_tries,
      docker = docker_final
  }

  call LoadData as LoadVetTsvs {
    input:
      done = CreateImportTsvs.done[0],
      project_id = project_id,
      dataset_name = dataset_name,
      storage_location = output_directory,
      datatype = "vet",
      table_id = table_id,
      schema = vet_schema,
      preemptible_tries = preemptible_tries,
      docker = docker_final
  }
}


task CreateImportTsvs {
  input {
    File input_vcf
    File? input_metrics
    File interval_list
    String output_directory
    File sample_map

    # runtime
    Int? preemptible_tries
    File? gatk_override
    String docker

    String? for_testing_only
  }

  Int disk_size = ceil(size(input_vcf, "GB") * 2.5) + 20

  meta {
    description: "Creates TSV files for import into BigQuery"
  }
  parameter_meta {
    input_vcf: {
      localization_optional: true
    }
  }
  command <<<
    set -e

    # workaround for https://github.com/broadinstitute/cromwell/issues/3647
    export TMPDIR=/tmp

    export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
    ~{for_testing_only}

    gatk --java-options "-Xmx2500m" CreateVariantIngestFiles \
      -V ~{input_vcf} \
      -L ~{interval_list} \
      ~{"-QCF " + input_metrics} \
      --mode GENOMES \
      -SNM ~{sample_map} \
      --ref-version 38
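    # CreateVariantIngestFiles writes per-sample metadata_*.tsv, pet_*.tsv and
    # vet_*.tsv files into the working directory; stage them to the output
    # bucket so the LoadData tasks can pick them up.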

    gsutil cp metadata_*.tsv ~{output_directory}/metadata_tsvs/
    gsutil cp pet_*.tsv ~{output_directory}/pet_tsvs/
    gsutil cp vet_*.tsv ~{output_directory}/vet_tsvs/
  >>>
  runtime {
    docker: docker
    memory: "4 GB"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: select_first([preemptible_tries, 5])
    cpu: 2
  }
  output {
    File metadata_tsv = glob("metadata_*.tsv")[0]
    File pet_tsv = glob("pet_*.tsv")[0]
    File vet_tsv = glob("vet_*.tsv")[0]
    String done = "true"
  }
}
task LoadData {
  input {
    String project_id
    String dataset_name
    String storage_location
    String datatype
    Int table_id
    File schema
    String numbered = "true"
    String partitioned = "true"
    String load = "true"
    String uuid = ""

    # output from the previous task; consumed only to delay this task until the TSVs exist
    String done

    # runtime
    Int? preemptible_tries
    String docker

    String? for_testing_only
  }
  command <<<
    set -x
    set +e
    # make sure the dataset exists
    bq ls --project_id ~{project_id} ~{dataset_name} > /dev/null

    set -e
    ~{for_testing_only}

    DIR="~{storage_location}/~{datatype}_tsvs/"
    PARTITION_STRING=""

    if [ ~{partitioned} == "true" ]; then
      let "PARTITION_START=(~{table_id}-1)*4000+1"
      let "PARTITION_END=$PARTITION_START+3999"
      let "PARTITION_STEP=1"
      PARTITION_FIELD="sample_id"
      PARTITION_STRING="--range_partitioning=$PARTITION_FIELD,$PARTITION_START,$PARTITION_END,$PARTITION_STEP"
    fi

    printf -v PADDED_TABLE_ID "_%03d" ~{table_id}
    FILES="~{datatype}${PADDED_TABLE_ID}_*"

    if [ ~{numbered} != "true" ]; then
      PADDED_TABLE_ID="" # drop the table-id suffix from the table name; the padded id is still needed above to match the files
    fi
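    # Worked example: with table_id=1, PARTITION_START=1 and PARTITION_END=4000,
    # so the table is range-partitioned on sample_id values 1..4000,
    # PADDED_TABLE_ID is "_001", and FILES matches e.g. pet_001_*.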

    NUM_FILES=$(gsutil ls $DIR$FILES | wc -l)

    # create the table and load
    TABLE="~{dataset_name}.~{if uuid == "" then "" else uuid + "_"}~{datatype}${PADDED_TABLE_ID}"
    if [ $NUM_FILES -gt 0 ]; then
      # capture the exit code of bq show before set -e resets $?
      set +e
      bq show --project_id ~{project_id} $TABLE > /dev/null
      BQ_SHOW_RC=$?
      set -e
      if [ $BQ_SHOW_RC -ne 0 ]; then
        echo "making table $TABLE"
        bq --location=US mk ${PARTITION_STRING} --project_id=~{project_id} $TABLE ~{schema}
        #TODO: add a Google Storage Transfer for the table when we make it.
      fi
      # load should be false when using Google Storage Transfer so that the tables
      # are created by this script but no data is uploaded.
      if [ ~{load} = true ]; then
        bq load --location=US --project_id=~{project_id} --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $TABLE $DIR$FILES ~{schema}
        echo "ingested ${FILES} files from $DIR into table $TABLE"
      else
        echo "${FILES} will be ingested from $DIR by Google Storage Transfer"
      fi
    else
      echo "no ${FILES} files to process"
    fi
  >>>
  runtime {
    docker: docker
    memory: "4 GB"
    disks: "local-disk 10 HDD"
    preemptible: select_first([preemptible_tries, 5])
    cpu: 2
  }
}
13 changes: 13 additions & 0 deletions scripts/variantstore_wdl/import_genomes_inputs.json
@@ -0,0 +1,13 @@
{
  "ImportGenomes.output_directory": "gs://broad-dsp-spec-ops/scratch/andrea/genomes/import",
  "ImportGenomes.input_vcfs": ["gs://broad-dsp-spec-ops/scratch/andrea/genomes/BI_NA03949_1_99000142037_SM-GXZUN_1.reblocked.chr20.g.vcf.gz"],
  "ImportGenomes.interval_list": "gs://broad-dsp-spec-ops/scratch/andrea/genomes/wgs_calling_regions.hg38.chr20.interval_list",
  "ImportGenomes.sample_map": "gs://broad-dsp-spec-ops/scratch/andrea/genomes/sampleMap.csv",
  "ImportGenomes.metadata_schema": "gs://broad-dsp-spec-ops/scratch/andrea/schemas/sample_list_schema.json",
  "ImportGenomes.pet_schema": "gs://broad-dsp-spec-ops/scratch/andrea/schemas/pet_schema.json",
  "ImportGenomes.vet_schema": "gs://broad-dsp-spec-ops/scratch/andrea/schemas/vet_schema.json",
  "ImportGenomes.table_id": 1,
  "ImportGenomes.project_id": "broad-dsp-spec-ops",
  "ImportGenomes.dataset_name": "temp_tables",
  "ImportGenomes.gatk_override": "gs://broad-dsp-spec-ops/scratch/andrea/gatk-varstore.jar"
}
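
For reference, a run with these inputs might be launched as below; this is a hypothetical sketch assuming a local Cromwell jar (the jar filename is a placeholder), not a command from this commit:

    java -jar cromwell.jar run ImportGenomes.wdl --inputs import_genomes_inputs.json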
20 changes: 20 additions & 0 deletions scripts/variantstore_wdl/schemas/pet_schema.json
@@ -0,0 +1,20 @@
[
  {
    "description": "[DESCRIPTION]",
    "name": "location",
    "type": "Integer",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "sample_id",
    "type": "Integer",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "state",
    "type": "String",
    "mode": "Required"
  }
]
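
A table using this schema can also be created by hand for testing; a sketch with placeholder project, dataset, and table names (the workflow's LoadData task does this automatically, adding range partitioning):

    bq mk --table my-project:my_dataset.pet_001 pet_schema.json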
27 changes: 27 additions & 0 deletions scripts/variantstore_wdl/schemas/sample_list_schema.json
@@ -0,0 +1,27 @@
[
  {
    "description": "[DESCRIPTION]",
    "name": "sample_name",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "sample_id",
    "type": "Integer",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "interval_list_blob",
    "type": "String",
    "mode": "Nullable"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "inferred_state",
    "type": "String",
    "mode": "Required"
  }
]

98 changes: 98 additions & 0 deletions scripts/variantstore_wdl/schemas/vet_schema.json
@@ -0,0 +1,98 @@
[
  {
    "description": "[DESCRIPTION]",
    "name": "location",
    "type": "Integer",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "sample_id",
    "type": "Integer",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "ref",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "alt",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "AS_RAW_MQ",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "AS_RAW_MQRankSum",
    "type": "String",
    "mode": "Nullable"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "AS_QUALapprox",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "AS_RAW_ReadPosRankSum",
    "type": "String",
    "mode": "Nullable"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "AS_SB_TABLE",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "AS_VarDP",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "call_GT",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "call_AD",
    "type": "String",
    "mode": "Nullable"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "call_GQ",
    "type": "Integer",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "call_PGT",
    "type": "String",
    "mode": "Nullable"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "call_PID",
    "type": "String",
    "mode": "Nullable"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "call_PL",
    "type": "String",
    "mode": "Required"
  }
]
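
After a load, a quick row count can confirm ingestion; a hypothetical sketch with placeholder dataset and table names following the workflow's naming scheme:

    bq query --use_legacy_sql=false 'SELECT COUNT(*) FROM my_dataset.vet_001'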