Skip to content

Commit

Permalink
Support for TDR DRS URIs in Import (#7528)
Browse files (browse the repository at this point in the history)
* removed streaming

* removed streaming

* support for indexes as input

* move to same directory

* move to same directory

* move to same directory part two

* comments

* comments

* comments
  • Loading branch information
kcibul authored Nov 2, 2021
1 parent fb67e8e commit 08446bb
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 9 deletions.
1 change: 1 addition & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ workflows:
branches:
- master
- ah_var_store
- kc_no_streaming
- name: GvsPrepareCallset
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsPrepareCallset.wdl
Expand Down
26 changes: 17 additions & 9 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ workflow GvsImportGenomes {

input {
Array[File] input_vcfs
Array[File] input_vcf_indexes
Array[String] external_sample_names
File interval_list
String output_directory
Expand Down Expand Up @@ -102,15 +103,17 @@ workflow GvsImportGenomes {
call CreateFOFNs {
input:
input_vcf_list = write_lines(input_vcfs),
input_vcf_index_list = write_lines(input_vcf_indexes),
sample_name_list = write_lines(external_sample_names),
batch_size = batch_size,
run_uuid = SetLock.run_uuid
}

scatter (i in range(length(CreateFOFNs.vcf_batch_fofns))) {
scatter (i in range(length(CreateFOFNs.vcf_batch_vcf_fofns))) {
call CreateImportTsvs {
input:
input_vcfs = read_lines(CreateFOFNs.vcf_batch_fofns[i]),
input_vcfs = read_lines(CreateFOFNs.vcf_batch_vcf_fofns[i]),
input_vcf_indexes = read_lines(CreateFOFNs.vcf_batch_vcf_index_fofns[i]),
sample_names = read_lines(CreateFOFNs.vcf_sample_name_fofns[i]),
interval_list = interval_list,
service_account_json_path = service_account_json_path,
Expand Down Expand Up @@ -411,6 +414,7 @@ task CheckForDuplicateData {
task CreateFOFNs {
input {
File input_vcf_list
File input_vcf_index_list
File sample_name_list
Int batch_size
String run_uuid
Expand All @@ -420,6 +424,7 @@ task CreateFOFNs {
set -e

split -d -a 5 -l ~{batch_size} ~{input_vcf_list} batched_vcfs.
split -d -a 5 -l ~{batch_size} ~{input_vcf_index_list} batched_vcf_indexes.
split -d -a 5 -l ~{batch_size} ~{sample_name_list} batched_sample_names.
}

Expand All @@ -433,14 +438,16 @@ task CreateFOFNs {
}

output {
Array[File] vcf_batch_fofns = glob("batched_vcfs.*")
Array[File] vcf_batch_vcf_fofns = glob("batched_vcfs.*")
Array[File] vcf_batch_vcf_index_fofns = glob("batched_vcf_indexes.*")
Array[File] vcf_sample_name_fofns = glob("batched_sample_names.*")
}
}

task CreateImportTsvs {
input {
Array[File] input_vcfs
Array[File] input_vcf_indexes
Array[String] sample_names
File interval_list
String output_directory
Expand Down Expand Up @@ -475,6 +482,7 @@ task CreateImportTsvs {
localization_optional: true
}
}

command <<<
set -e

Expand Down Expand Up @@ -503,21 +511,21 @@ task CreateImportTsvs {

# translate WDL arrays into BASH arrays
VCFS_ARRAY=(~{sep=" " input_vcfs})
VCF_INDEXES_ARRAY=(~{sep=" " input_vcf_indexes})
SAMPLE_NAMES_ARRAY=(~{sep=" " sample_names})

# loop over the BASH arrays (See https://stackoverflow.com/questions/6723426/looping-over-arrays-printing-both-index-and-value)
for i in "${!VCFS_ARRAY[@]}"; do
input_vcf="${VCFS_ARRAY[$i]}"
input_vcf_basename=$(basename $input_vcf)
updated_input_vcf=$input_vcf
input_vcf_index="${VCFS_ARRAY[$i]}.tbi"
input_vcf_index="${VCF_INDEXES_ARRAY[$i]}"
sample_name="${SAMPLE_NAMES_ARRAY[$i]}"

if [ ~{has_service_account_file} = 'true' ]; then
gsutil cp $input_vcf .
gsutil cp $input_vcf_index .
updated_input_vcf=$input_vcf_basename
fi
# we always do our own localization
gsutil cp $input_vcf .
gsutil cp $input_vcf_index .
updated_input_vcf=$input_vcf_basename

# check whether these files have already been generated
DO_TSV_GENERATION='true'
Expand Down

0 comments on commit 08446bb

Please sign in to comment.