Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GVS / Hail VDS integration test [VS-639] #8086

Merged
merged 49 commits into from
Nov 10, 2022
Merged
Show file tree
Hide file tree
Changes from 46 commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
7bbf4d3
wip
mcovarr Oct 24, 2022
7fc099b
integration
mcovarr Oct 24, 2022
757fa22
hopefully fail in more interesting ways
mcovarr Oct 25, 2022
059e7b1
spike-worthy hackery
mcovarr Oct 25, 2022
c422cf8
increase hackery
mcovarr Oct 25, 2022
7a1ea90
huh
mcovarr Oct 25, 2022
f41226e
maybe
mcovarr Oct 25, 2022
c62a283
wip
mcovarr Oct 25, 2022
d835a2e
wip
mcovarr Oct 25, 2022
bc3d001
doh
mcovarr Oct 25, 2022
67a8453
fix
mcovarr Oct 25, 2022
27f2690
cleanup
mcovarr Oct 25, 2022
b4a0e96
oops
mcovarr Oct 25, 2022
1a3c10c
fix
mcovarr Oct 25, 2022
23333c7
more spikey wip
mcovarr Oct 27, 2022
1834e5f
wip
mcovarr Oct 27, 2022
4daa85e
oops
mcovarr Oct 27, 2022
f3a4084
gah
mcovarr Oct 27, 2022
b7ca91d
revert many of the differences with non-hail integration test
mcovarr Oct 27, 2022
d1d71b0
fixes
mcovarr Oct 27, 2022
c85d452
so much DRYing
mcovarr Oct 27, 2022
d207592
restore my beauteous whitespace
mcovarr Oct 27, 2022
c751af3
oops need drop_state NONE
mcovarr Oct 27, 2022
99e867c
hackery for short cycle times
mcovarr Oct 27, 2022
1210bb2
Revert "hackery for short cycle times"
mcovarr Oct 28, 2022
3b0ec84
fixees
mcovarr Oct 28, 2022
54c06c8
wip
mcovarr Nov 2, 2022
2926093
wip
mcovarr Nov 2, 2022
397268f
fix from Tim
mcovarr Nov 2, 2022
0b0b84a
Revert "wip"
mcovarr Nov 2, 2022
e3fb81f
hacked up to resume
mcovarr Nov 3, 2022
b1d5dad
whoops
mcovarr Nov 3, 2022
721cc2a
Revert "hacked up to resume"
mcovarr Nov 3, 2022
2978931
uber integration wdl
mcovarr Nov 3, 2022
08ebbcc
separate test run prefixes
mcovarr Nov 3, 2022
86c6e9b
cleanup
mcovarr Nov 4, 2022
b0b1160
Revert "separate test run prefixes"
mcovarr Nov 4, 2022
a228651
rework
mcovarr Nov 4, 2022
80eb983
fixes / improvements
mcovarr Nov 4, 2022
901a3ed
gah
mcovarr Nov 4, 2022
c49641a
oops
mcovarr Nov 7, 2022
98dcede
checkpoint
mcovarr Nov 8, 2022
0b029e3
cleanup
mcovarr Nov 8, 2022
ba03a9f
cleanup, update references
mcovarr Nov 8, 2022
f8a4bb5
oops
mcovarr Nov 9, 2022
ed64917
omg
mcovarr Nov 9, 2022
308c703
comment my trickery
mcovarr Nov 9, 2022
b42d7b1
delete erroneous comment
mcovarr Nov 10, 2022
bd65357
remove junk
mcovarr Nov 10, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -201,14 +201,29 @@ workflows:
branches:
- master
- ah_var_store
- name: GvsQuickstartVcfIntegration
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl
filters:
branches:
- master
- ah_var_store
- name: GvsQuickstartHailIntegration
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl
filters:
branches:
- master
- ah_var_store
- vs_639_hail_testing_spike
- name: GvsQuickstartIntegration
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
filters:
branches:
- master
- ah_var_store
- vs_707_azure_setup
- vs_639_hail_testing_spike
- name: GvsIngestTieout
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl
Expand Down
4 changes: 2 additions & 2 deletions scripts/variantstore/wdl/GvsAssignIds.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ task AssignIds {
bq --project_id=~{project_id} rm -f -t ~{dataset_name}.sample_id_assignment_lock
>>>
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
memory: "3.75 GB"
disks: "local-disk " + 10 + " HDD"
cpu: 1
Expand Down Expand Up @@ -196,7 +196,7 @@ task CreateCostObservabilityTable {
fi
>>>
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
}
output {
Boolean done = true
Expand Down
10 changes: 5 additions & 5 deletions scripts/variantstore/wdl/GvsCallsetStatistics.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ task CreateTables {
fi
>>>
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
disks: "local-disk 500 HDD"
}
output {
Expand Down Expand Up @@ -400,7 +400,7 @@ task CollectMetricsForChromosome {
Boolean done = true
}
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
disks: "local-disk 500 HDD"
}
}
Expand Down Expand Up @@ -471,7 +471,7 @@ task AggregateMetricsAcrossChromosomes {
Boolean done = true
}
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
disks: "local-disk 500 HDD"
}
}
Expand Down Expand Up @@ -543,7 +543,7 @@ task CollectStatistics {
Boolean done = true
}
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
disks: "local-disk 500 HDD"
}
}
Expand Down Expand Up @@ -572,7 +572,7 @@ task ExportToCSV {
File callset_statistics = "~{statistics_table}.csv"
}
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
disks: "local-disk 500 HDD"
}
}
8 changes: 5 additions & 3 deletions scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import "GvsUtils.wdl" as Utils

workflow GvsExtractAvroFilesForHail {
input {
Boolean go = true
String project_id
String dataset
String filter_set_name
Expand Down Expand Up @@ -61,6 +62,7 @@ workflow GvsExtractAvroFilesForHail {
String vds_output_path = GenerateHailScripts.vds_output_path
String sites_only_vcf_output_path = GenerateHailScripts.sites_only_vcf_output_path
String vat_inputs_output_path = GenerateHailScripts.vat_inputs_output_path
String avro_prefix = ExtractFromNonSuperpartitionedTables.output_prefix
}
}

Expand All @@ -80,7 +82,7 @@ task OutputPath {
File out = stdout()
}
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
disks: "local-disk 500 HDD"
}
}
Expand Down Expand Up @@ -151,7 +153,7 @@ task ExtractFromNonSuperpartitionedTables {
}

runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
disks: "local-disk 500 HDD"
}
}
Expand Down Expand Up @@ -218,7 +220,7 @@ task ExtractFromSuperpartitionedTables {
}

runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
disks: "local-disk 500 HDD"
}
}
Expand Down
6 changes: 3 additions & 3 deletions scripts/variantstore/wdl/GvsExtractCallset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ task SumBytes {
print(total_mb);"
>>>
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
memory: "3 GB"
disks: "local-disk 500 HDD"
preemptible: 3
Expand Down Expand Up @@ -394,7 +394,7 @@ task CreateManifest {
}

runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
memory: "3 GB"
disks: "local-disk 500 HDD"
preemptible: 3
Expand Down Expand Up @@ -436,7 +436,7 @@ task GenerateSampleListFile {
}

runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
memory: "3 GB"
disks: "local-disk 500 HDD"
preemptible: 3
Expand Down
6 changes: 3 additions & 3 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ task CreateFOFNs {
>>>
# NOTE(review): this comment says alpine's `split` lacks the -d option, but the image
# below is the `-alpine` variant — confirm whether the comment is stale or the image is wrong.
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
bootDiskSizeGb: 15
memory: "3 GB"
disks: "local-disk 10 HDD"
Expand Down Expand Up @@ -293,7 +293,7 @@ task SetIsLoadedColumn {
AND sls2.status = "FINISHED")'
>>>
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
memory: "1 GB"
disks: "local-disk 10 HDD"
cpu: 1
Expand Down Expand Up @@ -382,7 +382,7 @@ task GetUningestedSampleIds {
bq --project_id=~{project_id} rm -f=true ~{temp_table}
>>>
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
memory: "1 GB"
disks: "local-disk 10 HDD"
preemptible: 5
Expand Down
6 changes: 3 additions & 3 deletions scripts/variantstore/wdl/GvsPopulateAltAllele.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ task GetMaxSampleId {
}

runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
memory: "3 GB"
disks: "local-disk 10 HDD"
preemptible: 3
Expand Down Expand Up @@ -138,7 +138,7 @@ task GetVetTableNames {
split -l $num_tables_per_file vet_tables.csv vet_tables_
>>>
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
memory: "3 GB"
disks: "local-disk 10 HDD"
preemptible: 3
Expand Down Expand Up @@ -199,7 +199,7 @@ task CreateAltAlleleTable {

>>>
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-alpine"
memory: "3 GB"
disks: "local-disk 10 HDD"
cpu: 1
Expand Down
145 changes: 145 additions & 0 deletions scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
version 1.0

import "GvsUtils.wdl" as Utils
import "GvsExtractAvroFilesForHail.wdl" as ExtractAvroFilesForHail
import "GvsQuickstartVcfIntegration.wdl" as QuickstartVcfIntegration

# End-to-end GVS / Hail VDS integration test: runs the VCF quickstart integration,
# extracts Avro files from the resulting dataset, then builds a Hail VDS and ties
# it out against the quickstart's extracted VCFs.
workflow GvsQuickstartHailIntegration {
input {
# Git branch of broadinstitute/gatk whose scripts and references are under test.
String branch_name
# Prebuilt Hail wheel to install in the tie-out task.
String hail_wheel = "gs://gvs-internal-scratch/hail-wheels/2022-10-18/0.2.102-964bee061eb0/hail-0.2.102-py3-none-any.whl"
}

String project_id = "gvs-internal"

# Ingest + extract VCFs; drop_state NONE and no filter override so the VCF output
# is directly comparable to the VDS built below.
call QuickstartVcfIntegration.GvsQuickstartVcfIntegration {
input:
branch_name = branch_name,
drop_state = "NONE",
extract_do_not_filter_override = false,
dataset_suffix = "hail",
}

# `go` sequences this call after the VCF integration completes.
call ExtractAvroFilesForHail.GvsExtractAvroFilesForHail {
input:
go = GvsQuickstartVcfIntegration.done,
project_id = project_id,
dataset = GvsQuickstartVcfIntegration.dataset_name,
filter_set_name = GvsQuickstartVcfIntegration.filter_set_name,
scatter_width = 10,
}

# Build the VDS from the Avro files and compare it against the tie-out VCFs.
call CreateAndTieOutVds {
input:
branch_name = branch_name,
hail_wheel = hail_wheel,
avro_prefix = GvsExtractAvroFilesForHail.avro_prefix,
vds_destination_path = GvsExtractAvroFilesForHail.vds_output_path,
tieout_vcfs = GvsQuickstartVcfIntegration.output_vcfs,
tieout_vcf_indexes = GvsQuickstartVcfIntegration.output_vcf_indexes,
}

output {
Array[File] output_vcfs = GvsQuickstartVcfIntegration.output_vcfs
Array[File] output_vcf_indexes = GvsQuickstartVcfIntegration.output_vcf_indexes
Float total_vcfs_size_mb = GvsQuickstartVcfIntegration.total_vcfs_size_mb
File manifest = GvsQuickstartVcfIntegration.manifest
String vds_output_path = GvsExtractAvroFilesForHail.vds_output_path
Boolean done = true
}
}


# Builds a Hail VDS from extracted Avro files, ties it out against the quickstart
# VCFs via pytest, and copies the VDS to its destination path.
task CreateAndTieOutVds {
    input {
        File hail_wheel
        String branch_name
        String avro_prefix
        String vds_destination_path
        Array[File] tieout_vcfs
        Array[File] tieout_vcf_indexes
    }
    parameter_meta {
        # VCFs/indexes are bulk-downloaded with `gcloud storage cp` below instead of
        # being localized by Cromwell, which is much faster for many small files.
        tieout_vcfs: {
            localization_optional: true
        }
        tieout_vcf_indexes: {
            localization_optional: true
        }
    }
    command <<<
        # Prepend date, time and pwd to xtrace log entries.
        PS4='\D{+%F %T} \w $ '
        set -o errexit -o nounset -o pipefail -o xtrace

        # Fetch the Hail import / join / tie-out scripts straight from the branch under
        # test rather than baking them into an image; this way changes to those scripts
        # on the branch are exercised by this test without any rebuild step.
        script_url_prefix="https://raw.githubusercontent.com/broadinstitute/gatk/~{branch_name}/scripts/variantstore/wdl/extract"

        for script in hail_gvs_import.py hail_join_vds_vcfs.py gvs_vds_tie_out.py
        do
            curl --silent --location --remote-name "${script_url_prefix}/${script}"
        done

        # Create a manifest of VCFs and indexes to bulk download with `gcloud storage cp`.
        touch vcf_manifest.txt
        # This is extremely noisy and not interesting, turn off xtrace.
        set +o xtrace
        for file in ~{sep=' ' tieout_vcfs} ~{sep=' ' tieout_vcf_indexes}
        do
            echo "$file" >> vcf_manifest.txt
        done
        # xtrace back on
        set -o xtrace

        # Copy VCFs and indexes to the current directory.
        gcloud storage cp -I . < vcf_manifest.txt

        # `avro_prefix` includes a trailing `avro` so don't add another `avro` here.
        gcloud storage cp --recursive ~{avro_prefix} "$PWD"

        export REFERENCES_PATH="$PWD/references"
        mkdir -p "${REFERENCES_PATH}"

        gcloud storage cp 'gs://hail-common/references/Homo_sapiens_assembly38.fasta*' "${REFERENCES_PATH}"

        # Temurin Java 8 (required by Hail / Spark).
        apt-get -qq install wget apt-transport-https gnupg
        wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add -
        echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list
        apt-get -qq update
        apt -qq install -y temurin-8-jdk

        pip install ~{hail_wheel}
        export PYSPARK_SUBMIT_ARGS='--driver-memory 16g --executor-memory 16g pyspark-shell'

        export WORK="$PWD/work"
        mkdir "${WORK}"

        export TEMP_PATH="$WORK/temp"
        mkdir "${TEMP_PATH}"

        export VDS_PATH="$WORK/gvs_import.vds"
        export AVRO_PATH="$PWD/avro"

        python3 ./hail_gvs_import.py --avro-path "${AVRO_PATH}" --vds-path "${VDS_PATH}" --temp-path "${TEMP_PATH}" --references-path "${REFERENCES_PATH}"

        export JOINED_MATRIX_TABLE_PATH="${WORK}/joined.mt"

        python3 ./hail_join_vds_vcfs.py --vds-path "${VDS_PATH}" --joined-matrix-table-path "${JOINED_MATRIX_TABLE_PATH}" *.vcf.gz

        # Copy up the VDS
        gcloud storage cp --recursive "${VDS_PATH}" ~{vds_destination_path}

        pip install pytest
        ln -s "${WORK}/joined.mt" .
        pytest ./gvs_vds_tie_out.py
    >>>
    runtime {
        # `slim` here to be able to use Java
        docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:409.0.0-slim"
        disks: "local-disk 2000 HDD"
        memory: "30 GiB"
    }
    output {
        Boolean done = true
    }
}
Loading