Moving and testing ingest scripts from variantstore #6881

Merged · 5 commits · Oct 20, 2020
Changes from 4 commits
scripts/variantstore_cromwell_tests/import_array_manifest_test.json
@@ -3,6 +3,6 @@
"ImportArrayManifest.manifest_schema_json":"/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/manifest_schema.json",
"ImportArrayManifest.project_id":"broad-dsde-dev",
"ImportArrayManifest.dataset_name":"temp_tables",
"ImportArrayManifest.table_name": "__TABLE_NAME__",
"ImportArrayManifest.table_name": "__TABLE_NAME___probe_id",
"ImportArrayManifest.LoadManifest.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS"
}
16 changes: 16 additions & 0 deletions scripts/variantstore_cromwell_tests/import_arrays_test.json
@@ -0,0 +1,16 @@
{
"ImportArrays.output_directory":"gs://variantstore-test/__UUID__",
"ImportArrays.input_vcfs":["/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/array.vcf"],
"ImportArrays.probe_info_file":"/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/expected_probe_info.csv",
"ImportArrays.sample_map":"/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/sampleMap.csv",
"ImportArrays.sample_list_schema": "/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/arrays_sample_list_schema.json",
"ImportArrays.raw_schema": "/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/raw_array_schema.json",
"ImportArrays.table_id": 1,
"ImportArrays.project_id": "broad-dsde-dev",
"ImportArrays.dataset_name": "temp_tables",
"ImportArrays.docker": "__GATK_DOCKER__",
"ImportArrays.CreateImportTsvs.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS",
"ImportArrays.LoadArrays.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS",
"ImportArrays.LoadArrays.load": "true",
"ImportArrays.LoadArrays.uuid": "__UUID__"
}
16 changes: 12 additions & 4 deletions scripts/variantstore_cromwell_tests/run_variantstore_wdl.sh
@@ -25,15 +25,23 @@ else
fi
echo "Docker build done =========="
echo "Putting the newly built docker image into the json parameters"
cd $WORKING_DIR/gatk/scripts/
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" variantstore_cromwell_tests/import_array_manifest_test.json >$WORKING_DIR/import_array_manifest_test_tmp.json
CROMWELL_TEST_DIR="${WORKING_DIR}/gatk/scripts/variantstore_cromwell_tests"
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/import_array_manifest_test.json >$WORKING_DIR/import_array_manifest_test_tmp.json
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/import_arrays_test.json >$WORKING_DIR/import_arrays_test_tmp.json
sed -r "s/__TABLE_NAME__/$UUID/g" $WORKING_DIR/import_array_manifest_test_tmp.json > $WORKING_DIR/import_array_manifest_test_mod.json
echo "JSON FILE (modified) ======="
sed -r "s/__UUID__/$UUID/g" $WORKING_DIR/import_arrays_test_tmp.json > $WORKING_DIR/import_arrays_test_mod.json
echo "MANIFEST JSON FILE (modified) ======="
cat $WORKING_DIR/import_array_manifest_test_mod.json
echo "INGEST JSON FILE (modified) ======="
cat $WORKING_DIR/import_arrays_test_mod.json

sed -r "s|__SERVICE_ACCOUNT__|$GOOGLE_APPLICATION_CREDENTIALS|g" variantstore_cromwell_tests/local-with-gcs.conf >$WORKING_DIR/set_up.conf
sed -r "s|__SERVICE_ACCOUNT__|$GOOGLE_APPLICATION_CREDENTIALS|g" $CROMWELL_TEST_DIR/local-with-gcs.conf >$WORKING_DIR/set_up.conf
echo "Updated local_backend.conf with service account"

echo "Running ImportArrayManifest WDL through cromwell"
ln -fs $WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrayManifest.wdl
sudo java -Dconfig.file=$WORKING_DIR/set_up.conf -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrayManifest.wdl -i $WORKING_DIR/import_array_manifest_test_mod.json -m $WORKING_DIR/test_import_manifest_wdl.metadata

echo "Running ImportArrays WDL through cromwell"
ln -fs $WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrays.wdl
sudo java -Dconfig.file=$WORKING_DIR/set_up.conf -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrays.wdl -i $WORKING_DIR/import_arrays_test_mod.json
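For context, a minimal sketch of the placeholder substitution this script performs; the working directory, docker hash, and UUID below are illustrative stand-ins, not the values CI actually uses:

# Illustrative stand-ins for variables set earlier in run_variantstore_wdl.sh
WORKING_DIR=/tmp/variantstore_test          # hypothetical
HASH_TO_USE=abc1234                         # hypothetical docker tag suffix
UUID=travis_20201020_abc1234                # hypothetical per-run identifier
CROMWELL_TEST_DIR="${WORKING_DIR}/gatk/scripts/variantstore_cromwell_tests"

# __GATK_DOCKER__ -> broadinstitute/gatk:<hash>, then __UUID__ -> per-run id,
# so each CI run writes to its own GCS prefix and BigQuery tables.
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" \
    "$CROMWELL_TEST_DIR/import_arrays_test.json" \
    > "$WORKING_DIR/import_arrays_test_tmp.json"
sed -r "s/__UUID__/$UUID/g" \
    "$WORKING_DIR/import_arrays_test_tmp.json" \
    > "$WORKING_DIR/import_arrays_test_mod.json"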
222 changes: 222 additions & 0 deletions scripts/variantstore_wdl/ImportArrays.wdl
@@ -0,0 +1,222 @@
version 1.0

workflow ImportArrays {

input {
Array[File] input_vcfs
Array[File]? input_metrics
String? probe_info_table
File? probe_info_file
String output_directory
File sample_map
String project_id
String dataset_name
File raw_schema
File sample_list_schema
#TODO: determine table_id from input sample_map (including looping over multiple table_ids)
Int table_id

Int? preemptible_tries
File? gatk_override
String? docker
}

String docker_final = select_first([docker, "us.gcr.io/broad-gatk/gatk:4.1.7.0"])

scatter (i in range(length(input_vcfs))) {
if (defined(input_metrics)) {
File input_metric = select_first([input_metrics])[i]
}

call CreateImportTsvs {
input:
input_vcf = input_vcfs[i],
input_metrics = input_metric,
probe_info_table = probe_info_table,
probe_info_file = probe_info_file,
sample_map = sample_map,
output_directory = output_directory,
gatk_override = gatk_override,
docker = docker_final,
preemptible_tries = preemptible_tries
}
}

call LoadArrays {
input:
metadata_tsvs = CreateImportTsvs.metadata_tsv,
project_id = project_id,
dataset_name = dataset_name,
storage_location = output_directory,
table_id = table_id,
raw_schema = raw_schema,
sample_list_schema = sample_list_schema,
preemptible_tries = preemptible_tries,
docker = docker_final
}
}


task CreateImportTsvs {
input {
File input_vcf
File? input_metrics
String? probe_info_table
File? probe_info_file
String output_directory
File sample_map

# runtime
Int? preemptible_tries
File? gatk_override
String docker

String? for_testing_only
}

Int disk_size = ceil(size(input_vcf, "GB") * 2.5) + 20

meta {
description: "Creates a tsv file for imort into BigQuery"
}
parameter_meta {
input_vcf: {
localization_optional: true
}
}
command <<<
set -e

#workaround for https://github.com/broadinstitute/cromwell/issues/3647
export TMPDIR=/tmp
export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
~{for_testing_only}

gatk --java-options "-Xmx2500m" CreateArrayIngestFiles \
-V ~{input_vcf} \
~{"-QCF " + input_metrics} \
~{"--probe-info-file " + probe_info_file} \
~{"--probe-info-table " + probe_info_table} \
-SNM ~{sample_map} \
--ref-version 37

gsutil cp sample_*.tsv ~{output_directory}/sample_tsvs/
gsutil cp raw_*.tsv ~{output_directory}/raw_tsvs/
>>>
runtime {
docker: docker
memory: "4 GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: select_first([preemptible_tries, 5])
cpu: 2
}
output {
File metadata_tsv = glob("sample_*.tsv")[0]

Contributor:
I changed the name of the file from metadata_*.tsv to sample_*.tsv. It might be clearer to update this output param to match. (If you do, also change it in the inputs to LoadArrays.)

File arraydata_tsv = glob("raw_*.tsv")[0]
}
}

task LoadArrays {
input {
String project_id
String dataset_name
String storage_location
Int table_id
File raw_schema
File sample_list_schema
String load = "false"

Contributor:
Is false the right default?

Author:
I'll change it to true.

String uuid = ""

#input from previous task needed to delay task from running until the other is complete
Array[String] metadata_tsvs

# runtime
Int? preemptible_tries
String docker

String? for_testing_only
}

command <<<
set -e
~{for_testing_only}

SAMPLE_DIR=~{storage_location}/sample_tsvs/
RAW_DIR=~{storage_location}/raw_tsvs/

let "PARTITION_START=(~{table_id}-1)*4000+1"
let "PARTITION_END=$PARTITION_START+3999"
let "PARTITION_STEP=1"
PARTITION_FIELD="sample_id"
printf -v PADDED_TABLE_ID "%03d" ~{table_id}

RAW_FILES="raw_${PADDED_TABLE_ID}_*"
METADATA_FILES="sample_${PADDED_TABLE_ID}_*"

NUM_RAW_FILES=$(gsutil ls $RAW_DIR${RAW_FILES} | wc -l)
NUM_METADATA_FILES=$(gsutil ls $SAMPLE_DIR${METADATA_FILES} | wc -l)

if [ $NUM_RAW_FILES -eq 0 -a $NUM_METADATA_FILES -eq 0 ]; then
"no files for table ${PADDED_TABLE_ID} to process in ~{storage_location}; exiting"
exit
fi

# create a metadata table and load
SAMPLE_LIST_TABLE="~{dataset_name}.~{uuid + "_"}sample_list"
if [ $NUM_METADATA_FILES -gt 0 ]; then
set +e
bq ls --project_id ~{project_id} ~{dataset_name} > /dev/null
DATASET_EXISTS=$?  # capture the exit code; set -e itself resets $? to 0
set -e
if [ $DATASET_EXISTS -ne 0 ]; then
echo "making dataset ~{dataset_name}"
bq mk --project_id=~{project_id} ~{dataset_name}
fi
set +e
bq show --project_id ~{project_id} $SAMPLE_LIST_TABLE > /dev/null
TABLE_EXISTS=$?  # capture the exit code; set -e itself resets $? to 0
set -e
if [ $TABLE_EXISTS -ne 0 ]; then
echo "making table $SAMPLE_LIST_TABLE"
bq --location=US mk --project_id=~{project_id} $SAMPLE_LIST_TABLE ~{sample_list_schema}
#TODO: add a Google Storage Transfer for the table when we make it.
fi
if [ ~{load} = true ]; then
bq load --location=US --project_id=~{project_id} --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $SAMPLE_LIST_TABLE $SAMPLE_DIR$METADATA_FILES ~{sample_list_schema}
echo "ingested ${METADATA_FILES} file from $SAMPLE_DIR into table $SAMPLE_LIST_TABLE"
else
echo "${METADATA_FILES} will be ingested from $SAMPLE_DIR by Google Storage Transfer"
fi
else
echo "no metadata files to process"
fi

# create array table
TABLE="~{dataset_name}.~{uuid + "_"}arrays_${PADDED_TABLE_ID}"
if [ $NUM_RAW_FILES -gt 0 ]; then
set +e
bq show --project_id ~{project_id} $TABLE > /dev/null
TABLE_EXISTS=$?  # capture the exit code; set -e itself resets $? to 0
set -e
if [ $TABLE_EXISTS -ne 0 ]; then
echo "making table $TABLE"
bq --location=US mk --range_partitioning=$PARTITION_FIELD,$PARTITION_START,$PARTITION_END,$PARTITION_STEP \
--project_id=~{project_id} $TABLE ~{raw_schema}
#TODO: add a Google Storage Transfer for the table when we make it.
fi
if [ ~{load} = true ]; then
bq load --location=US --project_id=~{project_id} --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $TABLE $RAW_DIR$RAW_FILES ~{raw_schema}
echo "ingested ${RAW_FILES} files from $RAW_DIR into table $TABLE"
else
echo "${RAW_FILES} will be ingested from $RAW_DIR
by Google Storage Transfer"
fi
else
echo "no raw data files to process"
fi
>>>
runtime {
docker: docker
memory: "4 GB"
disks: "local-disk 10 HDD"
preemptible: select_first([preemptible_tries, 5])
cpu: 2
}
}
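As a side note, a small sketch of the partition arithmetic LoadArrays uses for the range-partitioned arrays table; the table_id value here is illustrative:

# Each table_id covers a block of 4000 sample_id values, partitioned with step 1.
TABLE_ID=2
let "PARTITION_START=($TABLE_ID-1)*4000+1"   # -> 4001
let "PARTITION_END=$PARTITION_START+3999"    # -> 8000
printf -v PADDED_TABLE_ID "%03d" $TABLE_ID   # -> 002
echo "arrays_${PADDED_TABLE_ID} spans sample_id $PARTITION_START through $PARTITION_END"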
97 changes: 97 additions & 0 deletions scripts/variantstore_wdl/bq_ingest_arrays.sh
@@ -0,0 +1,97 @@
#!/usr/bin/env bash
set -e

if [ $# -lt 5 ]; then
echo "usage: $0 <project-id> <dataset-name> <storage-location> <table-id> <load> <uuid>"
exit 1
fi

PROJECT_ID=$1
DATASET_NAME=$2
STORAGE_LOCATION=$3
TABLE_ID=$4
if [ $5 == "true" ]; then
LOAD=true
else
LOAD=false
fi

Contributor:
When do you not want to load?

Author:
My thought was that if we end up using Google Data Transfer, we need a script that will create the tables but not actually load the data. Ideally we'd add the generation of the Transfers to this script too, but I didn't get around to doing that.

Now that I look at this, though, I think the code for this script is all contained within the WDL and I shouldn't have committed this extra file. I'll add a comment about the Google Data Transfer to the WDL and delete this bash script.

if [ $# -eq 6 ]; then
UUID_FOR_TABLE="${6}_"
else
UUID_FOR_TABLE=""
fi
SAMPLE_DIR=$STORAGE_LOCATION/sample_tsvs/
RAW_DIR=$STORAGE_LOCATION/raw_tsvs/

let "PARTITION_START=($TABLE_ID-1)*4000+1"
let "PARTITION_END=$PARTITION_START+3999"
let "PARTITION_STEP=1"
PARTITION_FIELD="sample_id"
printf -v PADDED_TABLE_ID "%03d" $TABLE_ID

RAW_FILES="raw_${PADDED_TABLE_ID}_*"
METADATA_FILES="sample_${PADDED_TABLE_ID}_*"

NUM_RAW_FILES=$(gsutil ls $RAW_DIR${RAW_FILES} | wc -l)
NUM_METADATA_FILES=$(gsutil ls $SAMPLE_DIR${METADATA_FILES} | wc -l)

if [ $NUM_RAW_FILES -eq 0 -a $NUM_METADATA_FILES -eq 0 ]; then
"no files for table ${PADDED_TABLE_ID} to process in $STORAGE_LOCATION; exiting"
exit
fi

# schema and TSV header need to be the same order
RAW_SCHEMA="schemas/raw_array_schema.json"
SAMPLE_LIST_SCHEMA="schemas/arrays_sample_list_schema.json"

# create a metadata table and load
SAMPLE_LIST_TABLE="${DATASET_NAME}.${UUID_FOR_TABLE}sample_list"
if [ $NUM_METADATA_FILES -gt 0 ]; then
set +e
bq ls --project_id $PROJECT_ID $DATASET_NAME > /dev/null
DATASET_EXISTS=$?  # capture the exit code; set -e itself resets $? to 0
set -e
if [ $DATASET_EXISTS -ne 0 ]; then
echo "making dataset $DATASET_NAME"
bq mk --project_id=$PROJECT_ID $DATASET_NAME
fi
set +e
bq show --project_id $PROJECT_ID $SAMPLE_LIST_TABLE > /dev/null
TABLE_EXISTS=$?  # capture the exit code; set -e itself resets $? to 0
set -e
if [ $TABLE_EXISTS -ne 0 ]; then
echo "making table $SAMPLE_LIST_TABLE"
bq --location=US mk --project_id=$PROJECT_ID $SAMPLE_LIST_TABLE $SAMPLE_LIST_SCHEMA
#TODO: add a Google Storage Transfer for the table when we make it.
fi
if [ "$LOAD" = true ]; then
bq load --location=US --project_id=$PROJECT_ID --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $SAMPLE_LIST_TABLE $SAMPLE_DIR$METADATA_FILES $SAMPLE_LIST_SCHEMA
echo "ingested ${METADATA_FILES} file from $SAMPLE_DIR into table $SAMPLE_LIST_TABLE"
else
echo "${METADATA_FILES} will be ingested from $SAMPLE_DIR by Google Storage Transfer"
fi
else
echo "no metadata files to process"
fi

# create array table
TABLE="${DATASET_NAME}.${UUID_FOR_TABLE}arrays_${PADDED_TABLE_ID}"
if [ $NUM_RAW_FILES -gt 0 ]; then
set +e
bq show --project_id $PROJECT_ID $TABLE > /dev/null
TABLE_EXISTS=$?  # capture the exit code; set -e itself resets $? to 0
set -e
if [ $TABLE_EXISTS -ne 0 ]; then
echo "making table $TABLE"
bq --location=US mk --range_partitioning=$PARTITION_FIELD,$PARTITION_START,$PARTITION_END,$PARTITION_STEP \
--project_id=$PROJECT_ID $TABLE $RAW_SCHEMA
#TODO: add a Google Storage Transfer for the table when we make it.
fi
if [ "$LOAD" = true ]; then
bq load --location=US --project_id=$PROJECT_ID --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $TABLE $RAW_DIR$RAW_FILES $RAW_SCHEMA
echo "ingested ${RAW_FILES} files from $RAW_DIR into table $TABLE"
else
echo "${RAW_FILES} will be ingested from $RAW_DIR
by Google Storage Transfer"
fi
else
echo "no raw data files to process"
fi
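For illustration, a hypothetical invocation of this script; the project, dataset, bucket path, and UUID below are made-up values, not real test resources:

# usage: bq_ingest_arrays.sh <project-id> <dataset-name> <storage-location> <table-id> <load> <uuid>
bash bq_ingest_arrays.sh \
    my-gcp-project \
    my_dataset \
    gs://my-bucket/array_ingest \
    1 \
    true \
    run_1234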
