diff --git a/scripts/variantstore_cromwell_tests/import_array_manifest_test.json b/scripts/variantstore_cromwell_tests/import_array_manifest_test.json
index 2301367f64a..d71b6a6b1fe 100644
--- a/scripts/variantstore_cromwell_tests/import_array_manifest_test.json
+++ b/scripts/variantstore_cromwell_tests/import_array_manifest_test.json
@@ -3,6 +3,6 @@
   "ImportArrayManifest.manifest_schema_json":"/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/manifest_schema.json",
   "ImportArrayManifest.project_id":"broad-dsde-dev",
   "ImportArrayManifest.dataset_name":"temp_tables",
-  "ImportArrayManifest.table_name": "__TABLE_NAME__",
+  "ImportArrayManifest.table_name": "__TABLE_NAME___probe_id",
   "ImportArrayManifest.LoadManifest.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS"
 }
diff --git a/scripts/variantstore_cromwell_tests/import_arrays_test.json b/scripts/variantstore_cromwell_tests/import_arrays_test.json
new file mode 100644
index 00000000000..d2d4c8f6cb6
--- /dev/null
+++ b/scripts/variantstore_cromwell_tests/import_arrays_test.json
@@ -0,0 +1,16 @@
+{
+  "ImportArrays.output_directory":"gs://variantstore-test/__UUID__",
+  "ImportArrays.input_vcfs":["/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/array.vcf"],
+  "ImportArrays.probe_info_file":"/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/expected_probe_info.csv",
+  "ImportArrays.sample_map":"/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/sampleMap.csv",
+  "ImportArrays.sample_list_schema": "/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/arrays_sample_list_schema.json",
+  "ImportArrays.raw_schema": "/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/raw_array_schema.json",
+  "ImportArrays.table_id": 1,
+  "ImportArrays.project_id": "broad-dsde-dev",
+  "ImportArrays.dataset_name": "temp_tables",
+  "ImportArrays.docker": "__GATK_DOCKER__",
+  "ImportArrays.CreateImportTsvs.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS",
+  "ImportArrays.LoadArrays.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS",
+  "ImportArrays.LoadArrays.load": "true",
+  "ImportArrays.LoadArrays.uuid": "__UUID__"
+}
diff --git a/scripts/variantstore_cromwell_tests/run_variantstore_wdl.sh b/scripts/variantstore_cromwell_tests/run_variantstore_wdl.sh
index fc2a682ff87..e9642fb2048 100644
--- a/scripts/variantstore_cromwell_tests/run_variantstore_wdl.sh
+++ b/scripts/variantstore_cromwell_tests/run_variantstore_wdl.sh
@@ -25,15 +25,23 @@ else
 fi
 echo "Docker build done =========="
 echo "Putting the newly built docker image into the json parameters"
-cd $WORKING_DIR/gatk/scripts/
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" variantstore_cromwell_tests/import_array_manifest_test.json >$WORKING_DIR/import_array_manifest_test_tmp.json
+CROMWELL_TEST_DIR="${WORKING_DIR}/gatk/scripts/variantstore_cromwell_tests"
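+# Substitute the freshly built docker image and this run's UUID into the test
+# inputs; the UUID keeps the BigQuery table names unique to this run.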
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/import_array_manifest_test.json >$WORKING_DIR/import_array_manifest_test_tmp.json
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/import_arrays_test.json >$WORKING_DIR/import_arrays_test_tmp.json
 sed -r "s/__TABLE_NAME__/$UUID/g" $WORKING_DIR/import_array_manifest_test_tmp.json > $WORKING_DIR/import_array_manifest_test_mod.json
-echo "JSON FILE (modified) ======="
+sed -r "s/__UUID__/$UUID/g" $WORKING_DIR/import_arrays_test_tmp.json > $WORKING_DIR/import_arrays_test_mod.json
+echo "MANIFEST JSON FILE (modified) ======="
 cat $WORKING_DIR/import_array_manifest_test_mod.json
+echo "INGEST JSON FILE (modified) ======="
+cat $WORKING_DIR/import_arrays_test_mod.json
 
-sed -r "s|__SERVICE_ACCOUNT__|$GOOGLE_APPLICATION_CREDENTIALS|g" variantstore_cromwell_tests/local-with-gcs.conf >$WORKING_DIR/set_up.conf
+sed -r "s|__SERVICE_ACCOUNT__|$GOOGLE_APPLICATION_CREDENTIALS|g" $CROMWELL_TEST_DIR/local-with-gcs.conf >$WORKING_DIR/set_up.conf
 echo "Updated local_backend.conf with service account"
 echo "Running ImportArrayManifest WDL through cromwell"
 ln -fs $WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrayManifest.wdl
 sudo java -Dconfig.file=$WORKING_DIR/set_up.conf -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrayManifest.wdl -i $WORKING_DIR/import_array_manifest_test_mod.json -m $WORKING_DIR/test_import_manifest_wdl.metadata
+
+echo "Running ImportArrays WDL through cromwell"
+ln -fs $WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrays.wdl
+sudo java -Dconfig.file=$WORKING_DIR/set_up.conf -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrays.wdl -i $WORKING_DIR/import_arrays_test_mod.json
diff --git a/scripts/variantstore_wdl/ImportArrays.wdl b/scripts/variantstore_wdl/ImportArrays.wdl
new file mode 100644
index 00000000000..ab1b35d9957
--- /dev/null
+++ b/scripts/variantstore_wdl/ImportArrays.wdl
@@ -0,0 +1,224 @@
+version 1.0
+
+workflow ImportArrays {
+
+  input {
+    Array[File] input_vcfs
+    Array[File]? input_metrics
+    String? probe_info_table
+    File? probe_info_file
+    String output_directory
+    File sample_map
+    String project_id
+    String dataset_name
+    File raw_schema
+    File sample_list_schema
+    #TODO: determine table_id from input sample_map (including looping over multiple table_ids)
+    Int table_id
+
+    Int? preemptible_tries
+    File? gatk_override
+    String? docker
+  }
+
+  String docker_final = select_first([docker, "us.gcr.io/broad-gatk/gatk:4.1.7.0"])
+
+  scatter (i in range(length(input_vcfs))) {
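+    # input_metrics is optional for the run as a whole; when present,
+    # select_first([input_metrics]) unwraps the Array[File]? so the i-th
+    # metrics file can be paired with the i-th VCF. Outside this if-block
+    # input_metric has type File?, matching the optional task input below.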
+    if (defined(input_metrics)) {
+      File input_metric = select_first([input_metrics])[i]
+    }
+
+    call CreateImportTsvs {
+      input:
+        input_vcf = input_vcfs[i],
+        input_metrics = input_metric,
+        probe_info_table = probe_info_table,
+        probe_info_file = probe_info_file,
+        sample_map = sample_map,
+        output_directory = output_directory,
+        gatk_override = gatk_override,
+        docker = docker_final,
+        preemptible_tries = preemptible_tries
+    }
+  }
+
+  call LoadArrays {
+    input:
+      sample_tsvs = CreateImportTsvs.sample_tsv,
+      project_id = project_id,
+      dataset_name = dataset_name,
+      storage_location = output_directory,
+      table_id = table_id,
+      raw_schema = raw_schema,
+      sample_list_schema = sample_list_schema,
+      preemptible_tries = preemptible_tries,
+      docker = docker_final
+  }
+}
+
+
+task CreateImportTsvs {
+  input {
+    File input_vcf
+    File? input_metrics
+    String? probe_info_table
+    File? probe_info_file
+    String output_directory
+    File sample_map
+
+    # runtime
+    Int? preemptible_tries
+    File? gatk_override
+    String docker
+
+    String? for_testing_only
+  }
+
+  Int disk_size = ceil(size(input_vcf, "GB") * 2.5) + 20
+
+  meta {
+    description: "Creates sample and raw TSV files for import into BigQuery"
+  }
+  parameter_meta {
+    input_vcf: {
+      localization_optional: true
+    }
+  }
+  command <<<
+    set -e
+
+    # workaround for https://github.com/broadinstitute/cromwell/issues/3647
+    export TMPDIR=/tmp
+
+    export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
+    ~{for_testing_only}
+
+    gatk --java-options "-Xmx2500m" CreateArrayIngestFiles \
+      -V ~{input_vcf} \
+      ~{"-QCF " + input_metrics} \
+      ~{"--probe-info-file " + probe_info_file} \
+      ~{"--probe-info-table " + probe_info_table} \
+      -SNM ~{sample_map} \
+      --ref-version 37
+
+    gsutil cp sample_*.tsv ~{output_directory}/sample_tsvs/
+    gsutil cp raw_*.tsv ~{output_directory}/raw_tsvs/
+  >>>
+  runtime {
+    docker: docker
+    memory: "4 GB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: select_first([preemptible_tries, 5])
+    cpu: 2
+  }
+  output {
+    File sample_tsv = glob("sample_*.tsv")[0]
+    File arraydata_tsv = glob("raw_*.tsv")[0]
+  }
+}
+
+task LoadArrays {
+  input {
+    String project_id
+    String dataset_name
+    String storage_location
+    Int table_id
+    File raw_schema
+    File sample_list_schema
+    String load = "true"
+    String uuid = ""
+
+    # output of the previous task, taken as an input only to delay this task
+    # until every CreateImportTsvs shard is complete
+    Array[String] sample_tsvs
+
+    # runtime
+    Int? preemptible_tries
+    String docker
+
+    String? for_testing_only
+  }
+
+  command <<<
+    set -e
+    ~{for_testing_only}
+
+    SAMPLE_DIR=~{storage_location}/sample_tsvs/
+    RAW_DIR=~{storage_location}/raw_tsvs/
+
+    # table N holds sample_ids (N-1)*4000+1 through N*4000, one range
+    # partition per sample_id
+    let "PARTITION_START=(~{table_id}-1)*4000+1"
+    let "PARTITION_END=$PARTITION_START+3999"
+    let "PARTITION_STEP=1"
+    PARTITION_FIELD="sample_id"
+    printf -v PADDED_TABLE_ID "%03d" ~{table_id}
+
+    RAW_FILES="raw_${PADDED_TABLE_ID}_*"
+    METADATA_FILES="sample_${PADDED_TABLE_ID}_*"
+
+    NUM_RAW_FILES=$(gsutil ls $RAW_DIR${RAW_FILES} | wc -l)
+    NUM_METADATA_FILES=$(gsutil ls $SAMPLE_DIR${METADATA_FILES} | wc -l)
+
+    if [ $NUM_RAW_FILES -eq 0 -a $NUM_METADATA_FILES -eq 0 ]; then
+      echo "no files for table ${PADDED_TABLE_ID} to process in ~{storage_location}; exiting"
+      exit
+    fi
+
+    # create a metadata table and load
+    SAMPLE_LIST_TABLE="~{dataset_name}.~{uuid + "_"}sample_list"
+    if [ $NUM_METADATA_FILES -gt 0 ]; then
+      # capture the exit status before set -e, which would otherwise reset $?
+      set +e
+      bq ls --project_id ~{project_id} ~{dataset_name} > /dev/null
+      BQ_LS_RC=$?
+      set -e
+      if [ $BQ_LS_RC -ne 0 ]; then
+        echo "making dataset ~{dataset_name}"
+        bq mk --project_id=~{project_id} ~{dataset_name}
+      fi
+      set +e
+      bq show --project_id ~{project_id} $SAMPLE_LIST_TABLE > /dev/null
+      BQ_SHOW_RC=$?
+      set -e
+      if [ $BQ_SHOW_RC -ne 0 ]; then
+        echo "making table $SAMPLE_LIST_TABLE"
+        bq --location=US mk --project_id=~{project_id} $SAMPLE_LIST_TABLE ~{sample_list_schema}
+        #TODO: add a Google Storage Transfer for the table when we make it.
+      fi
+      # load should be false if using Google Storage Transfer so that the tables
+      # will be created by this script, but no data will be uploaded.
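+      # bq load has no TSV source format; CSV plus a tab delimiter (-F "\t")
+      # is how tab-separated files are ingested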
+      if [ ~{load} = true ]; then
+        bq load --location=US --project_id=~{project_id} --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $SAMPLE_LIST_TABLE $SAMPLE_DIR$METADATA_FILES ~{sample_list_schema}
+        echo "ingested ${METADATA_FILES} files from $SAMPLE_DIR into table $SAMPLE_LIST_TABLE"
+      else
+        echo "${METADATA_FILES} will be ingested from $SAMPLE_DIR by Google Storage Transfer"
+      fi
+    else
+      echo "no metadata files to process"
+    fi
+
+    # create array table
+    TABLE="~{dataset_name}.~{uuid + "_"}arrays_${PADDED_TABLE_ID}"
+    if [ $NUM_RAW_FILES -gt 0 ]; then
+      set +e
+      bq show --project_id ~{project_id} $TABLE > /dev/null
+      BQ_SHOW_RC=$?
+      set -e
+      if [ $BQ_SHOW_RC -ne 0 ]; then
+        echo "making table $TABLE"
+        bq --location=US mk --range_partitioning=$PARTITION_FIELD,$PARTITION_START,$PARTITION_END,$PARTITION_STEP \
+          --project_id=~{project_id} $TABLE ~{raw_schema}
+        #TODO: add a Google Storage Transfer for the table when we make it.
+      fi
+      if [ ~{load} = true ]; then
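+        # $RAW_DIR$RAW_FILES is a wildcard URI, so all shards for this table
+        # are ingested in a single load job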
"Float", + "mode": "Nullable" + }, + { + "description": "[DESCRIPTION]", + "name": "NUM_SINGLETONS", + "type": "Integer", + "mode": "Nullable" + } +] + diff --git a/scripts/variantstore_wdl/schemas/raw_array_schema.json b/scripts/variantstore_wdl/schemas/raw_array_schema.json new file mode 100644 index 00000000000..30fd5338589 --- /dev/null +++ b/scripts/variantstore_wdl/schemas/raw_array_schema.json @@ -0,0 +1,44 @@ +[ + { + "description": "[DESCRIPTION]", + "name": "sample_id", + "type": "Integer", + "mode": "Required" + }, + { + "description": "[DESCRIPTION]", + "name": "probe_id", + "type": "Integer", + "mode": "Required" + }, + { + "description": "[DESCRIPTION]", + "name": "GT_encoded", + "type": "String", + "mode": "Nullable" + }, + { + "description": "[DESCRIPTION]", + "name": "NORMX", + "type": "Float", + "mode": "Required" + }, + { + "description": "[DESCRIPTION]", + "name": "NORMY", + "type": "Float", + "mode": "Required" + }, + { + "description": "[DESCRIPTION]", + "name": "BAF", + "type": "Float", + "mode": "Nullable" + }, + { + "description": "[DESCRIPTION]", + "name": "LRR", + "type": "Float", + "mode": "Nullable" + } +] diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/expected_probe_info.csv b/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/expected_probe_info.csv new file mode 100644 index 00000000000..0ccae4ab5ae --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/expected_probe_info.csv @@ -0,0 +1,3 @@ +ProbeId,Name,GenomeBuild,Chr,Position,Ref,AlleleA,AlleleB,build37Flag +0,1:5700115-A-T,37,1,5700115,A,A,T, +1,1:5700116-C-G,37,1,5700116,C,C,G,