Moving and testing ingest scripts from variantstore #6881
Changes from 4 commits
@@ -0,0 +1,16 @@
{
  "ImportArrays.output_directory": "gs://variantstore-test/__UUID__",
  "ImportArrays.input_vcfs": ["/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/array.vcf"],
  "ImportArrays.probe_info_file": "/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/expected_probe_info.csv",
  "ImportArrays.sample_map": "/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/sampleMap.csv",
  "ImportArrays.sample_list_schema": "/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/arrays_sample_list_schema.json",
  "ImportArrays.raw_schema": "/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/raw_array_schema.json",
  "ImportArrays.table_id": 1,
  "ImportArrays.project_id": "broad-dsde-dev",
  "ImportArrays.dataset_name": "temp_tables",
  "ImportArrays.docker": "__GATK_DOCKER__",
  "ImportArrays.CreateImportTsvs.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS",
  "ImportArrays.LoadArrays.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS",
  "ImportArrays.LoadArrays.load": "true",
  "ImportArrays.LoadArrays.uuid": "__UUID__"
}
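
Note: the __UUID__ and __GATK_DOCKER__ tokens above are placeholders filled in before the test runs. The substitution mechanism isn't part of this diff; a minimal sketch of such a step, assuming a sed-based test harness and a hypothetical template filename (the docker tag shown is just the WDL's default), might look like:

# Hypothetical pre-test substitution; the template filename and UUID generation are assumptions.
UUID=$(uuidgen)
sed -e "s/__UUID__/${UUID}/g" \
    -e "s|__GATK_DOCKER__|us.gcr.io/broad-gatk/gatk:4.1.7.0|g" \
    import_array_test_inputs.template.json > import_array_test_inputs.json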
@@ -0,0 +1,222 @@
version 1.0

workflow ImportArrays {

    input {
        Array[File] input_vcfs
        Array[File]? input_metrics
        String? probe_info_table
        File? probe_info_file
        String output_directory
        File sample_map
        String project_id
        String dataset_name
        File raw_schema
        File sample_list_schema
        #TODO: determine table_id from input sample_map (including looping over multiple table_ids)
        Int table_id

        Int? preemptible_tries
        File? gatk_override
        String? docker
    }

    String docker_final = select_first([docker, "us.gcr.io/broad-gatk/gatk:4.1.7.0"])

    scatter (i in range(length(input_vcfs))) {
        if (defined(input_metrics)) {
            File input_metric = select_first([input_metrics])[i]
        }

        call CreateImportTsvs {
            input:
                input_vcf = input_vcfs[i],
                input_metrics = input_metric,
                probe_info_table = probe_info_table,
                probe_info_file = probe_info_file,
                sample_map = sample_map,
                output_directory = output_directory,
                gatk_override = gatk_override,
                docker = docker_final,
                preemptible_tries = preemptible_tries
        }
    }

    call LoadArrays {
        input:
            metadata_tsvs = CreateImportTsvs.metadata_tsv,
            project_id = project_id,
            dataset_name = dataset_name,
            storage_location = output_directory,
            table_id = table_id,
            raw_schema = raw_schema,
            sample_list_schema = sample_list_schema,
            preemptible_tries = preemptible_tries,
            docker = docker_final
    }
}


task CreateImportTsvs {
    input {
        File input_vcf
        File? input_metrics
        String? probe_info_table
        File? probe_info_file
        String output_directory
        File sample_map

        # runtime
        Int? preemptible_tries
        File? gatk_override
        String docker

        String? for_testing_only
    }

    Int disk_size = ceil(size(input_vcf, "GB") * 2.5) + 20

    meta {
        description: "Creates a tsv file for import into BigQuery"
    }
    parameter_meta {
        input_vcf: {
            localization_optional: true
        }
    }
    command <<<
        set -e

        # workaround for https://github.com/broadinstitute/cromwell/issues/3647
        export TMPDIR=/tmp
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
        ~{for_testing_only}

        gatk --java-options "-Xmx2500m" CreateArrayIngestFiles \
            -V ~{input_vcf} \
            ~{"-QCF " + input_metrics} \
            ~{"--probe-info-file " + probe_info_file} \
            ~{"--probe-info-table " + probe_info_table} \
            -SNM ~{sample_map} \
            --ref-version 37

        gsutil cp sample_*.tsv ~{output_directory}/sample_tsvs/
        gsutil cp raw_*.tsv ~{output_directory}/raw_tsvs/
    >>>
    runtime {
        docker: docker
        memory: "4 GB"
        disks: "local-disk " + disk_size + " HDD"
        preemptible: select_first([preemptible_tries, 5])
        cpu: 2
    }
    output {
        File metadata_tsv = glob("sample_*.tsv")[0]
        File arraydata_tsv = glob("raw_*.tsv")[0]
    }
}

task LoadArrays {
    input {
        String project_id
        String dataset_name
        String storage_location
        Int table_id
        File raw_schema
        File sample_list_schema
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is false the right default? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll change it to true. |
||
String uuid = "" | ||

        #input from previous task needed to delay task from running until the other is complete
        Array[String] metadata_tsvs

        # runtime
        Int? preemptible_tries
        String docker

        String? for_testing_only
    }

    command <<<
        set -e
        ~{for_testing_only}

        SAMPLE_DIR=~{storage_location}/sample_tsvs/
        RAW_DIR=~{storage_location}/raw_tsvs/

        let "PARTITION_START=(~{table_id}-1)*4000+1"
        let "PARTITION_END=$PARTITION_START+3999"
        let "PARTITION_STEP=1"
        PARTITION_FIELD="sample_id"
        printf -v PADDED_TABLE_ID "%03d" ~{table_id}

        RAW_FILES="raw_${PADDED_TABLE_ID}_*"
        METADATA_FILES="sample_${PADDED_TABLE_ID}_*"

        NUM_RAW_FILES=$(gsutil ls $RAW_DIR${RAW_FILES} | wc -l)
        NUM_METADATA_FILES=$(gsutil ls $SAMPLE_DIR${METADATA_FILES} | wc -l)

        if [ $NUM_RAW_FILES -eq 0 -a $NUM_METADATA_FILES -eq 0 ]; then
"no files for table ${PADDED_TABLE_ID} to process in ~{storage_location}; exiting" | ||
            exit
        fi

        # create a metadata table and load
        SAMPLE_LIST_TABLE="~{dataset_name}.~{uuid + "_"}sample_list"
        if [ $NUM_METADATA_FILES -gt 0 ]; then
            set +e
            bq ls --project_id ~{project_id} ~{dataset_name} > /dev/null
            BQ_LS_EXIT_CODE=$?
            set -e
            if [ $BQ_LS_EXIT_CODE -ne 0 ]; then
echo "making dataset ~{dataset_name}" | ||
bq mk --project_id=~{project_id} ~{dataset_name} | ||
fi | ||
            set +e
            bq show --project_id ~{project_id} $SAMPLE_LIST_TABLE > /dev/null
            BQ_SHOW_EXIT_CODE=$?
            set -e
            if [ $BQ_SHOW_EXIT_CODE -ne 0 ]; then
echo "making table $SAMPLE_LIST_TABLE" | ||
bq --location=US mk --project_id=~{project_id} $SAMPLE_LIST_TABLE ~{sample_list_schema} | ||
#TODO: add a Google Storage Transfer for the table when we make it. | ||
fi | ||
if [ ~{load} = true ]; then | ||
bq load --location=US --project_id=~{project_id} --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $SAMPLE_LIST_TABLE $SAMPLE_DIR$METADATA_FILES ~{sample_list_schema} | ||
echo "ingested ${METADATA_FILES} file from $SAMPLE_DIR into table $SAMPLE_LIST_TABLE" | ||
else | ||
echo "${METADATA_FILES} will be ingested from $SAMPLE_DIR by Google Storage Transfer" | ||
fi | ||
else | ||
echo "no metadata files to process" | ||
fi | ||
|
||
# create array table | ||
TABLE="~{dataset_name}.~{uuid + "_"}arrays_${PADDED_TABLE_ID}" | ||
if [ $NUM_RAW_FILES -gt 0 ]; then | ||
            set +e
            bq show --project_id ~{project_id} $TABLE > /dev/null
            BQ_SHOW_EXIT_CODE=$?
            set -e
            if [ $BQ_SHOW_EXIT_CODE -ne 0 ]; then
echo "making table $TABLE" | ||
bq --location=US mk --range_partitioning=$PARTITION_FIELD,$PARTITION_START,$PARTITION_END,$PARTITION_STEP \ | ||
--project_id=~{project_id} $TABLE ~{raw_schema} | ||
#TODO: add a Google Storage Transfer for the table when we make it. | ||
fi | ||
if [ ~{load} = true ]; then | ||
bq load --location=US --project_id=~{project_id} --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $TABLE $RAW_DIR$RAW_FILES ~{raw_schema} | ||
echo "ingested ${RAW_FILES} files from $RAW_DIR into table $TABLE" | ||
            else
                echo "${RAW_FILES} will be ingested from $RAW_DIR by Google Storage Transfer"
            fi
        else
            echo "no raw data files to process"
        fi
    >>>
    runtime {
        docker: docker
        memory: "4 GB"
        disks: "local-disk 10 HDD"
        preemptible: select_first([preemptible_tries, 5])
        cpu: 2
    }
}
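
For reference, the let arithmetic in LoadArrays gives each table_id a contiguous block of 4000 sample_id values for BigQuery range partitioning. A standalone sketch of the same computation, with illustrative table ids:

# Mirrors the partition arithmetic in LoadArrays (values are illustrative only).
for TABLE_ID in 1 2 3; do
  PARTITION_START=$(( (TABLE_ID - 1) * 4000 + 1 ))
  PARTITION_END=$(( PARTITION_START + 3999 ))
  echo "table ${TABLE_ID}: sample_id ${PARTITION_START}..${PARTITION_END}"
done
# table 1: sample_id 1..4000
# table 2: sample_id 4001..8000
# table 3: sample_id 8001..12000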
@@ -0,0 +1,97 @@
#!/usr/bin/env bash
set -e

if [ $# -lt 5 ]; then
  echo "usage: $0 <project-id> <dataset-name> <storage-location> <table-id> <load> [uuid]"
  exit 1
fi

PROJECT_ID=$1
DATASET_NAME=$2
STORAGE_LOCATION=$3
TABLE_ID=$4
if [ $5 == "true" ]; then
  LOAD=true
else
  LOAD=false
fi

[Review comment] When do you not want to load?
[Reply] My thought was that if we end up using Google Data Transfer we need a script that will create the tables but not actually load the data. Ideally we'd add the generation of the Transfers to this script too, but I didn't get around to doing that. Now that I look at this, though, I think the code for this script is all contained within the WDL and I shouldn't have committed this extra file. I'll add a comment about the Google Data Transfer to the WDL and delete this bash script.

if [ $# -eq 6 ]; then
  UUID_FOR_TABLE="${6}_"
else
  UUID_FOR_TABLE=""
fi
SAMPLE_DIR=$STORAGE_LOCATION/sample_tsvs/
RAW_DIR=$STORAGE_LOCATION/raw_tsvs/

let "PARTITION_START=($TABLE_ID-1)*4000+1"
let "PARTITION_END=$PARTITION_START+3999"
let "PARTITION_STEP=1"
PARTITION_FIELD="sample_id"
printf -v PADDED_TABLE_ID "%03d" $TABLE_ID

RAW_FILES="raw_${PADDED_TABLE_ID}_*"
METADATA_FILES="sample_${PADDED_TABLE_ID}_*"

NUM_RAW_FILES=$(gsutil ls $RAW_DIR${RAW_FILES} | wc -l)
NUM_METADATA_FILES=$(gsutil ls $SAMPLE_DIR${METADATA_FILES} | wc -l)

if [ $NUM_RAW_FILES -eq 0 -a $NUM_METADATA_FILES -eq 0 ]; then
"no files for table ${PADDED_TABLE_ID} to process in $STORAGE_LOCATION; exiting" | ||
  exit
fi

# schema and TSV header need to be in the same order
RAW_SCHEMA="schemas/raw_array_schema.json"
SAMPLE_LIST_SCHEMA="schemas/arrays_sample_list_schema.json"

# create a metadata table and load
SAMPLE_LIST_TABLE="${DATASET_NAME}.${UUID_FOR_TABLE}sample_list"
if [ $NUM_METADATA_FILES -gt 0 ]; then
  set +e
  bq ls --project_id $PROJECT_ID $DATASET_NAME > /dev/null
  BQ_LS_EXIT_CODE=$?
  set -e
  if [ $BQ_LS_EXIT_CODE -ne 0 ]; then
echo "making dataset $DATASET_NAME" | ||
bq mk --project_id=$PROJECT_ID $DATASET_NAME | ||
fi | ||
  set +e
  bq show --project_id $PROJECT_ID $SAMPLE_LIST_TABLE > /dev/null
  BQ_SHOW_EXIT_CODE=$?
  set -e
  if [ $BQ_SHOW_EXIT_CODE -ne 0 ]; then
echo "making table $SAMPLE_LIST_TABLE" | ||
bq --location=US mk --project_id=$PROJECT_ID $SAMPLE_LIST_TABLE $SAMPLE_LIST_SCHEMA | ||
#TODO: add a Google Storage Transfer for the table when we make it. | ||
fi | ||
if [ "$LOAD" = true ]; then | ||
bq load --location=US --project_id=$PROJECT_ID --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $SAMPLE_LIST_TABLE $SAMPLE_DIR$METADATA_FILES $SAMPLE_LIST_SCHEMA | ||
echo "ingested ${METADATA_FILES} file from $SAMPLE_DIR into table $SAMPLE_LIST_TABLE" | ||
else | ||
echo "${METADATA_FILES} will be ingested from $SAMPLE_DIR by Google Storage Transfer" | ||
fi | ||
else | ||
echo "no metadata files to process" | ||
fi | ||
|
||
# create array table | ||
TABLE="${DATASET_NAME}.${UUID_FOR_TABLE}arrays_${PADDED_TABLE_ID}" | ||
if [ $NUM_RAW_FILES -gt 0 ]; then | ||
  set +e
  bq show --project_id $PROJECT_ID $TABLE > /dev/null
  BQ_SHOW_EXIT_CODE=$?
  set -e
  if [ $BQ_SHOW_EXIT_CODE -ne 0 ]; then
echo "making table $TABLE" | ||
bq --location=US mk --range_partitioning=$PARTITION_FIELD,$PARTITION_START,$PARTITION_END,$PARTITION_STEP \ | ||
--project_id=$PROJECT_ID $TABLE $RAW_SCHEMA | ||
#TODO: add a Google Storage Transfer for the table when we make it. | ||
fi | ||
if [ "$LOAD" = true ]; then | ||
bq load --location=US --project_id=$PROJECT_ID --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $TABLE $RAW_DIR$RAW_FILES $RAW_SCHEMA | ||
echo "ingested ${RAW_FILES} files from $RAW_DIR into table $TABLE" | ||
else | ||
echo "${RAW_FILES} will be ingested from $RAW_DIR | ||
by Google Storage Transfer" | ||
fi | ||
else
  echo "no raw data files to process"
fi
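
For illustration, a hypothetical invocation of this script (the script filename and uuid are assumptions; the project, dataset, and bucket values come from the test inputs above):

# Create the BigQuery tables and load the TSVs for table 1:
./ingest_arrays.sh broad-dsde-dev temp_tables gs://variantstore-test/my-test-run 1 true my-test-uuid

# Create the tables only; data loading is left to a (future) Google Storage Transfer:
./ingest_arrays.sh broad-dsde-dev temp_tables gs://variantstore-test/my-test-run 1 false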
[Review comment] I changed the name of the file from metadata_*.tsv to sample_*.tsv. It might make it clearer to update this output param (metadata_tsv). (If you do, also change it in the inputs to LoadArrays.)