Moving and testing ingest scripts from variantstore #6881
Changes from 4 commits
@@ -0,0 +1,16 @@
{
  "ImportArrays.output_directory": "gs://variantstore-test/__UUID__",
  "ImportArrays.input_vcfs": ["/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/array.vcf"],
  "ImportArrays.probe_info_file": "/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/expected_probe_info.csv",
  "ImportArrays.sample_map": "/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/sampleMap.csv",
  "ImportArrays.sample_list_schema": "/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/arrays_sample_list_schema.json",
  "ImportArrays.raw_schema": "/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/raw_array_schema.json",
  "ImportArrays.table_id": 1,
  "ImportArrays.project_id": "broad-dsde-dev",
  "ImportArrays.dataset_name": "temp_tables",
  "ImportArrays.docker": "__GATK_DOCKER__",
  "ImportArrays.CreateImportTsvs.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS",
  "ImportArrays.LoadArrays.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS",
  "ImportArrays.LoadArrays.load": "true",
  "ImportArrays.LoadArrays.uuid": "__UUID__"
}
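
Note: the __UUID__ and __GATK_DOCKER__ tokens above are placeholders filled in before the test runs. The substitution mechanism isn't part of this diff; a minimal sketch of such a step, assuming a sed-based test harness and a hypothetical template filename (the docker tag shown is just the WDL's default), might look like:

# Hypothetical pre-test substitution; the template filename and UUID generation are assumptions.
UUID=$(uuidgen)
sed -e "s/__UUID__/${UUID}/g" \
    -e "s|__GATK_DOCKER__|us.gcr.io/broad-gatk/gatk:4.1.7.0|g" \
    import_array_test_inputs.template.json > import_array_test_inputs.json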
@@ -0,0 +1,222 @@
version 1.0

workflow ImportArrays {

    input {
        Array[File] input_vcfs
        Array[File]? input_metrics
        String? probe_info_table
        File? probe_info_file
        String output_directory
        File sample_map
        String project_id
        String dataset_name
        File raw_schema
        File sample_list_schema
        #TODO: determine table_id from input sample_map (including looping over multiple table_ids)
        Int table_id

        Int? preemptible_tries
        File? gatk_override
        String? docker
    }

    String docker_final = select_first([docker, "us.gcr.io/broad-gatk/gatk:4.1.7.0"])

    scatter (i in range(length(input_vcfs))) {
        if (defined(input_metrics)) {
            File input_metric = select_first([input_metrics])[i]
        }

        call CreateImportTsvs {
            input:
                input_vcf = input_vcfs[i],
                input_metrics = input_metric,
                probe_info_table = probe_info_table,
                probe_info_file = probe_info_file,
                sample_map = sample_map,
                output_directory = output_directory,
                gatk_override = gatk_override,
                docker = docker_final,
                preemptible_tries = preemptible_tries
        }
    }

    call LoadArrays {
        input:
            metadata_tsvs = CreateImportTsvs.metadata_tsv,
            project_id = project_id,
            dataset_name = dataset_name,
            storage_location = output_directory,
            table_id = table_id,
            raw_schema = raw_schema,
            sample_list_schema = sample_list_schema,
            preemptible_tries = preemptible_tries,
            docker = docker_final
    }
}


task CreateImportTsvs {
    input {
        File input_vcf
        File? input_metrics
        String? probe_info_table
        File? probe_info_file
        String output_directory
        File sample_map

        # runtime
        Int? preemptible_tries
        File? gatk_override
        String docker

        String? for_testing_only
    }

    Int disk_size = ceil(size(input_vcf, "GB") * 2.5) + 20

    meta {
        description: "Creates a tsv file for import into BigQuery"
    }
    parameter_meta {
        input_vcf: {
            localization_optional: true
        }
    }
    command <<<
        set -e

        # workaround for https://github.com/broadinstitute/cromwell/issues/3647
        export TMPDIR=/tmp
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
        ~{for_testing_only}

        gatk --java-options "-Xmx2500m" CreateArrayIngestFiles \
            -V ~{input_vcf} \
            ~{"-QCF " + input_metrics} \
            ~{"--probe-info-file " + probe_info_file} \
            ~{"--probe-info-table " + probe_info_table} \
            -SNM ~{sample_map} \
            --ref-version 37

        gsutil cp sample_*.tsv ~{output_directory}/sample_tsvs/
        gsutil cp raw_*.tsv ~{output_directory}/raw_tsvs/
    >>>
    runtime {
        docker: docker
        memory: "4 GB"
        disks: "local-disk " + disk_size + " HDD"
        preemptible: select_first([preemptible_tries, 5])
        cpu: 2
    }
    output {
        File metadata_tsv = glob("sample_*.tsv")[0]
        File arraydata_tsv = glob("raw_*.tsv")[0]
    }
}

task LoadArrays {
    input {
        String project_id
        String dataset_name
        String storage_location
        Int table_id
        File raw_schema
        File sample_list_schema
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is false the right default? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll change it to true. |
||
String uuid = "" | ||

        #input from previous task needed to delay task from running until the other is complete
        Array[String] metadata_tsvs

        # runtime
        Int? preemptible_tries
        String docker

        String? for_testing_only
    }

    command <<<
        set -e
        ~{for_testing_only}

        SAMPLE_DIR=~{storage_location}/sample_tsvs/
        RAW_DIR=~{storage_location}/raw_tsvs/

        let "PARTITION_START=(~{table_id}-1)*4000+1"
        let "PARTITION_END=$PARTITION_START+3999"
        let "PARTITION_STEP=1"
        PARTITION_FIELD="sample_id"
        printf -v PADDED_TABLE_ID "%03d" ~{table_id}

        RAW_FILES="raw_${PADDED_TABLE_ID}_*"
        METADATA_FILES="sample_${PADDED_TABLE_ID}_*"

        NUM_RAW_FILES=$(gsutil ls $RAW_DIR${RAW_FILES} | wc -l)
        NUM_METADATA_FILES=$(gsutil ls $SAMPLE_DIR${METADATA_FILES} | wc -l)

        if [ $NUM_RAW_FILES -eq 0 -a $NUM_METADATA_FILES -eq 0 ]; then
"no files for table ${PADDED_TABLE_ID} to process in ~{storage_location}; exiting" | ||
            exit
        fi

        # create a metadata table and load
        SAMPLE_LIST_TABLE="~{dataset_name}.~{uuid + "_"}sample_list"
        if [ $NUM_METADATA_FILES -gt 0 ]; then
            set +e
            bq ls --project_id ~{project_id} ~{dataset_name} > /dev/null
            BQ_LS_EXIT_CODE=$?
            set -e
            if [ $BQ_LS_EXIT_CODE -ne 0 ]; then
echo "making dataset ~{dataset_name}" | ||
bq mk --project_id=~{project_id} ~{dataset_name} | ||
fi | ||
            set +e
            bq show --project_id ~{project_id} $SAMPLE_LIST_TABLE > /dev/null
            BQ_SHOW_EXIT_CODE=$?
            set -e
            if [ $BQ_SHOW_EXIT_CODE -ne 0 ]; then
echo "making table $SAMPLE_LIST_TABLE" | ||
bq --location=US mk --project_id=~{project_id} $SAMPLE_LIST_TABLE ~{sample_list_schema} | ||
#TODO: add a Google Storage Transfer for the table when we make it. | ||
fi | ||
if [ ~{load} = true ]; then | ||
bq load --location=US --project_id=~{project_id} --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $SAMPLE_LIST_TABLE $SAMPLE_DIR$METADATA_FILES ~{sample_list_schema} | ||
echo "ingested ${METADATA_FILES} file from $SAMPLE_DIR into table $SAMPLE_LIST_TABLE" | ||
else | ||
echo "${METADATA_FILES} will be ingested from $SAMPLE_DIR by Google Storage Transfer" | ||
fi | ||
else | ||
echo "no metadata files to process" | ||
fi | ||
|
||
# create array table | ||
TABLE="~{dataset_name}.~{uuid + "_"}arrays_${PADDED_TABLE_ID}" | ||
if [ $NUM_RAW_FILES -gt 0 ]; then | ||
            set +e
            bq show --project_id ~{project_id} $TABLE > /dev/null
            BQ_SHOW_EXIT_CODE=$?
            set -e
            if [ $BQ_SHOW_EXIT_CODE -ne 0 ]; then
echo "making table $TABLE" | ||
bq --location=US mk --range_partitioning=$PARTITION_FIELD,$PARTITION_START,$PARTITION_END,$PARTITION_STEP \ | ||
--project_id=~{project_id} $TABLE ~{raw_schema} | ||
#TODO: add a Google Storage Transfer for the table when we make it. | ||
fi | ||
if [ ~{load} = true ]; then | ||
bq load --location=US --project_id=~{project_id} --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $TABLE $RAW_DIR$RAW_FILES ~{raw_schema} | ||
echo "ingested ${RAW_FILES} files from $RAW_DIR into table $TABLE" | ||
            else
                echo "${RAW_FILES} will be ingested from $RAW_DIR by Google Storage Transfer"
            fi
        else
            echo "no raw data files to process"
        fi
    >>>
    runtime {
        docker: docker
        memory: "4 GB"
        disks: "local-disk 10 HDD"
        preemptible: select_first([preemptible_tries, 5])
        cpu: 2
    }
}
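
For reference, the let arithmetic in LoadArrays gives each table_id a contiguous block of 4000 sample_id values for BigQuery range partitioning. A standalone sketch of the same computation, with illustrative table ids:

# Mirrors the partition arithmetic in LoadArrays (values are illustrative only).
for TABLE_ID in 1 2 3; do
  PARTITION_START=$(( (TABLE_ID - 1) * 4000 + 1 ))
  PARTITION_END=$(( PARTITION_START + 3999 ))
  echo "table ${TABLE_ID}: sample_id ${PARTITION_START}..${PARTITION_END}"
done
# table 1: sample_id 1..4000
# table 2: sample_id 4001..8000
# table 3: sample_id 8001..12000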
@@ -0,0 +1,97 @@
#!/usr/bin/env bash
set -e

if [ $# -lt 5 ]; then
  echo "usage: $0 <project-id> <dataset-name> <storage-location> <table-id> <load> [uuid]"
  exit 1
fi

PROJECT_ID=$1
DATASET_NAME=$2
STORAGE_LOCATION=$3
TABLE_ID=$4
if [ $5 == "true" ]; then
  LOAD=true
else
  LOAD=false
fi

[Review comment] When do you not want to load?
[Reply] My thought was that if we end up using Google Data Transfer we need a script that will create the tables but not actually load the data. Ideally we'd add the generation of the Transfers to this script too, but I didn't get around to doing that. Now that I look at this, though, I think the code for this script is all contained within the WDL and I shouldn't have committed this extra file. I'll add a comment about the Google Data Transfer to the WDL and delete this bash script.

if [ $# -eq 6 ]; then
  UUID_FOR_TABLE="${6}_"
else
  UUID_FOR_TABLE=""
fi
SAMPLE_DIR=$STORAGE_LOCATION/sample_tsvs/
RAW_DIR=$STORAGE_LOCATION/raw_tsvs/

let "PARTITION_START=($TABLE_ID-1)*4000+1"
let "PARTITION_END=$PARTITION_START+3999"
let "PARTITION_STEP=1"
PARTITION_FIELD="sample_id"
printf -v PADDED_TABLE_ID "%03d" $TABLE_ID

RAW_FILES="raw_${PADDED_TABLE_ID}_*"
METADATA_FILES="sample_${PADDED_TABLE_ID}_*"

NUM_RAW_FILES=$(gsutil ls $RAW_DIR${RAW_FILES} | wc -l)
NUM_METADATA_FILES=$(gsutil ls $SAMPLE_DIR${METADATA_FILES} | wc -l)

if [ $NUM_RAW_FILES -eq 0 -a $NUM_METADATA_FILES -eq 0 ]; then
"no files for table ${PADDED_TABLE_ID} to process in $STORAGE_LOCATION; exiting" | ||
  exit
fi

# schema and TSV header need to be in the same order
RAW_SCHEMA="schemas/raw_array_schema.json"
SAMPLE_LIST_SCHEMA="schemas/arrays_sample_list_schema.json"

# create a metadata table and load
SAMPLE_LIST_TABLE="${DATASET_NAME}.${UUID_FOR_TABLE}sample_list"
if [ $NUM_METADATA_FILES -gt 0 ]; then
  set +e
  bq ls --project_id $PROJECT_ID $DATASET_NAME > /dev/null
  BQ_LS_EXIT_CODE=$?
  set -e
  if [ $BQ_LS_EXIT_CODE -ne 0 ]; then
echo "making dataset $DATASET_NAME" | ||
bq mk --project_id=$PROJECT_ID $DATASET_NAME | ||
fi | ||
  set +e
  bq show --project_id $PROJECT_ID $SAMPLE_LIST_TABLE > /dev/null
  BQ_SHOW_EXIT_CODE=$?
  set -e
  if [ $BQ_SHOW_EXIT_CODE -ne 0 ]; then
echo "making table $SAMPLE_LIST_TABLE" | ||
bq --location=US mk --project_id=$PROJECT_ID $SAMPLE_LIST_TABLE $SAMPLE_LIST_SCHEMA | ||
#TODO: add a Google Storage Transfer for the table when we make it. | ||
fi | ||
if [ "$LOAD" = true ]; then | ||
bq load --location=US --project_id=$PROJECT_ID --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $SAMPLE_LIST_TABLE $SAMPLE_DIR$METADATA_FILES $SAMPLE_LIST_SCHEMA | ||
echo "ingested ${METADATA_FILES} file from $SAMPLE_DIR into table $SAMPLE_LIST_TABLE" | ||
else | ||
echo "${METADATA_FILES} will be ingested from $SAMPLE_DIR by Google Storage Transfer" | ||
fi | ||
else | ||
echo "no metadata files to process" | ||
fi | ||
|
||
# create array table | ||
TABLE="${DATASET_NAME}.${UUID_FOR_TABLE}arrays_${PADDED_TABLE_ID}" | ||
if [ $NUM_RAW_FILES -gt 0 ]; then | ||
  set +e
  bq show --project_id $PROJECT_ID $TABLE > /dev/null
  BQ_SHOW_EXIT_CODE=$?
  set -e
  if [ $BQ_SHOW_EXIT_CODE -ne 0 ]; then
echo "making table $TABLE" | ||
bq --location=US mk --range_partitioning=$PARTITION_FIELD,$PARTITION_START,$PARTITION_END,$PARTITION_STEP \ | ||
--project_id=$PROJECT_ID $TABLE $RAW_SCHEMA | ||
#TODO: add a Google Storage Transfer for the table when we make it. | ||
fi | ||
if [ "$LOAD" = true ]; then | ||
bq load --location=US --project_id=$PROJECT_ID --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $TABLE $RAW_DIR$RAW_FILES $RAW_SCHEMA | ||
echo "ingested ${RAW_FILES} files from $RAW_DIR into table $TABLE" | ||
else | ||
echo "${RAW_FILES} will be ingested from $RAW_DIR | ||
by Google Storage Transfer" | ||
fi | ||
else
  echo "no raw data files to process"
fi
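
For illustration, a hypothetical invocation of this script (the script filename and uuid are assumptions; the project, dataset, and bucket values come from the test inputs above):

# Create the BigQuery tables and load the TSVs for table 1:
./ingest_arrays.sh broad-dsde-dev temp_tables gs://variantstore-test/my-test-run 1 true my-test-uuid

# Create the tables only; data loading is left to a (future) Google Storage Transfer:
./ingest_arrays.sh broad-dsde-dev temp_tables gs://variantstore-test/my-test-run 1 false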
[Review comment] I changed the name of the file from metadata_*.tsv to sample_*.tsv. It might make it clearer to update this output param (metadata_tsv). (If you do, also change it in the inputs to LoadArrays.)