Commit
* update for genomes
* add schema files
Showing 15 changed files with 495 additions and 89 deletions.
@@ -0,0 +1,232 @@
version 1.0

workflow ImportGenomes {

  input {
    Array[File] input_vcfs
    Array[File]? input_metrics
    File interval_list
    String output_directory
    File sample_map
    String project_id
    String dataset_name
    File pet_schema
    File vet_schema
    File metadata_schema
    #TODO: determine table_id from input sample_map (including looping over multiple table_ids)
    Int table_id
    Int? preemptible_tries
    File? gatk_override
    String? docker
  }
  String docker_final = select_first([docker, "us.gcr.io/broad-gatk/gatk:4.1.7.0"])
  scatter (i in range(length(input_vcfs))) {
    if (defined(input_metrics)) {
      File input_metric = select_first([input_metrics])[i]
    }

    call CreateImportTsvs {
      input:
        input_vcf = input_vcfs[i],
        interval_list = interval_list,
        input_metrics = input_metric,
        sample_map = sample_map,
        output_directory = output_directory,
        gatk_override = gatk_override,
        docker = docker_final,
        preemptible_tries = preemptible_tries
    }
  }

  call LoadData as LoadMetadataTsvs {
    input:
      done = CreateImportTsvs.done[0],
      project_id = project_id,
      dataset_name = dataset_name,
      storage_location = output_directory,
      datatype = "metadata",
      numbered = "false",
      partitioned = "false",
      table_id = table_id,
      schema = metadata_schema,
      preemptible_tries = preemptible_tries,
      docker = docker_final
  }
  call LoadData as LoadPetTsvs {
    input:
      done = CreateImportTsvs.done[0],
      project_id = project_id,
      dataset_name = dataset_name,
      storage_location = output_directory,
      datatype = "pet",
      table_id = table_id,
      schema = pet_schema,
      preemptible_tries = preemptible_tries,
      docker = docker_final
  }

  call LoadData as LoadVetTsvs {
    input:
      done = CreateImportTsvs.done[0],
      project_id = project_id,
      dataset_name = dataset_name,
      storage_location = output_directory,
      datatype = "vet",
      table_id = table_id,
      schema = vet_schema,
      preemptible_tries = preemptible_tries,
      docker = docker_final
  }

}


task CreateImportTsvs {
  input {
    File input_vcf
    File? input_metrics
    File interval_list
    String output_directory
    File sample_map

    # runtime
    Int? preemptible_tries
    File? gatk_override
    String docker

    String? for_testing_only
  }

  Int disk_size = ceil(size(input_vcf, "GB") * 2.5) + 20

  meta {
description: "Creates a tsv file for imort into BigQuery" | ||
  }
  parameter_meta {
    input_vcf: {
      localization_optional: true
    }
  }
  command <<<
    set -e

    # workaround for https://github.com/broadinstitute/cromwell/issues/3647
    export TMPDIR=/tmp

    export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
    ~{for_testing_only}

    gatk --java-options "-Xmx2500m" CreateVariantIngestFiles \
      -V ~{input_vcf} \
      -L ~{interval_list} \
      ~{"-QCF " + input_metrics} \
      --mode GENOMES \
      -SNM ~{sample_map} \
      --ref-version 38

    gsutil cp metadata_*.tsv ~{output_directory}/metadata_tsvs/
    gsutil cp pet_*.tsv ~{output_directory}/pet_tsvs/
    gsutil cp vet_*.tsv ~{output_directory}/vet_tsvs/
  >>>
  runtime {
    docker: docker
    memory: "4 GB"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: select_first([preemptible_tries, 5])
    cpu: 2
  }
  output {
    File metadata_tsv = glob("metadata_*.tsv")[0]
    File pet_tsv = glob("pet_*.tsv")[0]
    File vet_tsv = glob("vet_*.tsv")[0]
    String done = "true"
  }
}

task LoadData {
  input {
    String project_id
    String dataset_name
    String storage_location
    String datatype
    Int table_id
    File schema
    String numbered = "true"
    String partitioned = "true"
    String load = "true"
    String uuid = ""

    # input from previous task needed to delay task from running until the other is complete
    String done

    # runtime
    Int? preemptible_tries
    String docker

    String? for_testing_only
  }

  command <<<
    set -x
    set +e
    # make sure dataset exists
    bq ls --project_id ~{project_id} ~{dataset_name} > /dev/null

    set -e
    ~{for_testing_only}

    DIR="~{storage_location}/~{datatype}_tsvs/"
    PARTITION_STRING=""

    if [ ~{partitioned} == "true" ]; then
      let "PARTITION_START=(~{table_id}-1)*4000+1"
      let "PARTITION_END=$PARTITION_START+3999"
      let "PARTITION_STEP=1"
      PARTITION_FIELD="sample_id"
      PARTITION_STRING="--range_partitioning=$PARTITION_FIELD,$PARTITION_START,$PARTITION_END,$PARTITION_STEP"
    fi

    printf -v PADDED_TABLE_ID "_%03d" ~{table_id}
    FILES="~{datatype}${PADDED_TABLE_ID}_*"

    if [ ~{numbered} != "true" ]; then
      PADDED_TABLE_ID="" # override table id to empty string, but it is needed to get the files
    fi

    NUM_FILES=$(gsutil ls $DIR$FILES | wc -l)

    # create the table and load
    TABLE="~{dataset_name}.~{uuid + "_"}~{datatype}${PADDED_TABLE_ID}"
    if [ $NUM_FILES -gt 0 ]; then
      set +e
      bq show --project_id ~{project_id} $TABLE > /dev/null
      TABLE_EXISTS=$? # capture the exit code before re-enabling errexit; after "set -e", $? would always be 0
      set -e
      if [ $TABLE_EXISTS -ne 0 ]; then
echo "making table $TABLE" | ||
bq --location=US mk ${PARTITION_STRING} --project_id=~{project_id} $TABLE ~{schema} | ||
#TODO: add a Google Storage Transfer for the table when we make it. | ||
fi | ||
#load should be false if using Google Storage Transfer so that the tables will be created by this script, but no data will be uploaded. | ||
if [ ~{load} = true ]; then | ||
bq load --location=US --project_id=~{project_id} --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $TABLE $DIR$FILES ~{schema} | ||
echo "ingested ${FILES} file from $DIR into table $TABLE" | ||
else | ||
echo "${FILES} will be ingested from $DIR by Google Storage Transfer" | ||
fi | ||
else | ||
echo "no ${FILES} files to process" | ||
fi | ||
>>> | ||
runtime { | ||
docker: docker | ||
memory: "4 GB" | ||
disks: "local-disk 10 HDD" | ||
preemptible: select_first([preemptible_tries, 5]) | ||
cpu: 2 | ||
} | ||
} |
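To make the LoadData arithmetic concrete, here is a hand-traced sketch (not output from an actual run) of what the script effectively executes for datatype = "pet" and table_id = 1, using the project, dataset, and output_directory values from the example inputs below; the BigQuery table name is left symbolic as $TABLE and the localized schema file is written as pet_schema.json:

    # table_id = 1  =>  PARTITION_START = (1-1)*4000+1 = 1, PARTITION_END = 1+3999 = 4000, step 1
    PARTITION_STRING="--range_partitioning=sample_id,1,4000,1"

    # padded table id "_001" is appended to the datatype to build the filename glob
    DIR="gs://broad-dsp-spec-ops/scratch/andrea/genomes/import/pet_tsvs/"
    FILES="pet_001_*"

    # create the range-partitioned table from the schema, then load the matching TSVs into it
    bq --location=US mk ${PARTITION_STRING} --project_id=broad-dsp-spec-ops $TABLE pet_schema.json
    bq load --location=US --project_id=broad-dsp-spec-ops --skip_leading_rows=1 \
      --null_marker="null" --source_format=CSV -F "\t" $TABLE ${DIR}${FILES} pet_schema.json

Each increment of table_id shifts the sample_id partition range up by 4000 (table 2 covers 4001 through 8000, and so on), so every numbered pet or vet table holds at most 4000 samples.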
@@ -0,0 +1,13 @@
{
  "ImportGenomes.output_directory": "gs://broad-dsp-spec-ops/scratch/andrea/genomes/import",
  "ImportGenomes.input_vcfs": ["gs://broad-dsp-spec-ops/scratch/andrea/genomes/BI_NA03949_1_99000142037_SM-GXZUN_1.reblocked.chr20.g.vcf.gz"],
  "ImportGenomes.interval_list": "gs://broad-dsp-spec-ops/scratch/andrea/genomes/wgs_calling_regions.hg38.chr20.interval_list",
  "ImportGenomes.sample_map": "gs://broad-dsp-spec-ops/scratch/andrea/genomes/sampleMap.csv",
  "ImportGenomes.metadata_schema": "gs://broad-dsp-spec-ops/scratch/andrea/schemas/sample_list_schema.json",
  "ImportGenomes.pet_schema": "gs://broad-dsp-spec-ops/scratch/andrea/schemas/pet_schema.json",
  "ImportGenomes.vet_schema": "gs://broad-dsp-spec-ops/scratch/andrea/schemas/vet_schema.json",
  "ImportGenomes.table_id": 1,
  "ImportGenomes.project_id": "broad-dsp-spec-ops",
  "ImportGenomes.dataset_name": "temp_tables",
  "ImportGenomes.gatk_override": "gs://broad-dsp-spec-ops/scratch/andrea/gatk-varstore.jar"
}
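Assuming the workflow above is saved as ImportGenomes.wdl and this inputs file as ImportGenomes.inputs.json (the file names are not shown in this commit, so both are hypothetical), the workflow can be run with a WDL engine such as Cromwell:

    # cromwell.jar stands in for a released Cromwell jar, e.g. cromwell-<version>.jar
    java -jar cromwell.jar run ImportGenomes.wdl --inputs ImportGenomes.inputs.json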
@@ -0,0 +1,20 @@
[
  {
    "description": "[DESCRIPTION]",
    "name": "location",
    "type": "Integer",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "sample_id",
    "type": "Integer",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "state",
    "type": "String",
    "mode": "Required"
  }
]
@@ -0,0 +1,27 @@
[
  {
    "description": "[DESCRIPTION]",
    "name": "sample_name",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "sample_id",
    "type": "Integer",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "interval_list_blob",
    "type": "String",
    "mode": "Nullable"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "inferred_state",
    "type": "String",
    "mode": "Required"
  }
]
@@ -0,0 +1,98 @@
[
  {
    "description": "[DESCRIPTION]",
    "name": "location",
    "type": "Integer",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "sample_id",
    "type": "Integer",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "ref",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "alt",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "AS_RAW_MQ",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "AS_RAW_MQRankSum",
    "type": "String",
    "mode": "Nullable"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "AS_QUALapprox",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "AS_RAW_ReadPosRankSum",
    "type": "String",
    "mode": "Nullable"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "AS_SB_TABLE",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "AS_VarDP",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "call_GT",
    "type": "String",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "call_AD",
    "type": "String",
    "mode": "Nullable"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "call_GQ",
    "type": "Integer",
    "mode": "Required"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "call_PGT",
    "type": "String",
    "mode": "Nullable"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "call_PID",
    "type": "String",
    "mode": "Nullable"
  },
  {
    "description": "[DESCRIPTION]",
    "name": "call_PL",
    "type": "String",
    "mode": "Required"
  }
]
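Once LoadData has created a table from one of these schema files, the applied schema can be compared against the JSON with the bq CLI; this is a generic verification step rather than part of the commit, and the project, dataset, and table names below are placeholders:

    # print the live schema of a vet shard table as JSON for comparison with vet_schema.json
    bq show --schema --format=prettyjson broad-dsp-spec-ops:temp_tables.vet_001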