Subsampler and augur workflow #483

Merged
merged 52 commits into master from subsampler-dp
Sep 26, 2023
Commits (52)
a74b0bf
test subsampler
schaluva Aug 22, 2023
b861b55
set outputs
schaluva Aug 22, 2023
4299b4c
point to right dir
schaluva Aug 22, 2023
388be77
add in all rule inputs
schaluva Aug 22, 2023
53839db
reset push to dockstore
schaluva Aug 22, 2023
8b94ebe
hold output from opt/subsampler dir
schaluva Aug 22, 2023
c4b8671
output /opt..
schaluva Aug 22, 2023
3bd21bc
test
schaluva Aug 22, 2023
eb5124d
check directory structures
schaluva Aug 22, 2023
744e8af
original dir pwd
schaluva Aug 22, 2023
f180d6f
figure out how to get to root
schaluva Aug 22, 2023
93ab0ae
get back to cromwell root
schaluva Aug 22, 2023
a13e4f9
make copies of outputs in cromwell_root
schaluva Aug 22, 2023
3563559
expand to more rules
schaluva Aug 23, 2023
6c7d1c6
add in subsampler
schaluva Aug 23, 2023
a67b21f
remove default values for dates
schaluva Aug 23, 2023
e31adcf
make start and end optional
schaluva Aug 23, 2023
5bffdd4
fix rule name
schaluva Aug 23, 2023
e7f7347
clean up and add subsample outputs to wf outputs
schaluva Aug 24, 2023
22feb4a
initial setup of subsampler->augur pipeline
dpark01 Sep 8, 2023
6289875
wdl syntax fixes
dpark01 Sep 8, 2023
f550d81
fix subsample_by_casecounts output refs, add some extra outputs to au…
dpark01 Sep 8, 2023
7933f21
remove genbank test from cromwell unit testing
dpark01 Sep 8, 2023
6ba944c
Merge branch 'master' into subsampler-dp
dpark01 Sep 8, 2023
8728b92
Merge branch 'master' into subsampler-dp
dpark01 Sep 8, 2023
e6dbd51
Merge branch 'master' into subsampler-dp
dpark01 Sep 9, 2023
8d65690
Merge branch 'master' into subsampler-dp
dpark01 Sep 10, 2023
dd3258a
bump nextstrain docker from 20211012T20 to 20230905T19
dpark01 Sep 11, 2023
4d1ea6b
switch from xz to zst
dpark01 Sep 11, 2023
4cae41b
refresh dockstore
schaluva Sep 14, 2023
024f6e8
refresh dockstore with PR not as draft
schaluva Sep 14, 2023
95a961a
bump AUGUR_RECURSION_LIMIT to 100k
dpark01 Sep 14, 2023
dd67453
more recursion limit on augur translate
dpark01 Sep 18, 2023
1c6112e
merge in nextstrain updates
dpark01 Sep 19, 2023
21d1adc
fix delocalization
dpark01 Sep 19, 2023
59f16e3
fix delocalization
dpark01 Sep 20, 2023
cc1b737
make more subsampler inputs optional and add more typing to inputs
dpark01 Sep 21, 2023
fc5c2c7
formatting
dpark01 Sep 21, 2023
a8926ff
add counts to output and blank some default inputs
dpark01 Sep 21, 2023
bcbd13e
depart from snakemake and just run dag steps manually
dpark01 Sep 22, 2023
27eb2b3
crank memory, add zstd support, fail fast
dpark01 Sep 22, 2023
6d0520a
update default compression of subsample output and add more file patt…
dpark01 Sep 22, 2023
cb01dc0
bump hardware requests and add hardware utilization telemetry
dpark01 Sep 22, 2023
54ec1ab
oops, bump to wdl 1 command block
dpark01 Sep 22, 2023
194082c
add progress statements
dpark01 Sep 22, 2023
59ad5d5
work in execution platforms CWD instead of in /opt/subsampler which i…
dpark01 Sep 22, 2023
3f8c374
expose ram as knob
dpark01 Sep 22, 2023
ee01ab2
absolute path to scripts
dpark01 Sep 22, 2023
ef36163
fix the broken --keep function
dpark01 Sep 22, 2023
3095833
reshape vm defaults and subsampler default inputs
dpark01 Sep 25, 2023
3345925
ram telemetry not working, revert to 30GB
dpark01 Sep 25, 2023
0fba1bc
lower default baseline again to 1e-4, fix spelling of start-date and …
dpark01 Sep 25, 2023
12 changes: 11 additions & 1 deletion .dockstore.yml
@@ -50,6 +50,11 @@ workflows:
    primaryDescriptorPath: /pipes/WDL/workflows/augur_from_msa.wdl
    testParameterFiles:
      - empty.json
  - name: augur_from_msa_with_subsampler
    subclass: WDL
    primaryDescriptorPath: /pipes/WDL/workflows/augur_from_msa_with_subsampler.wdl
    testParameterFiles:
      - empty.json
  - name: bams_multiqc
    subclass: WDL
    primaryDescriptorPath: /pipes/WDL/workflows/bams_multiqc.wdl
@@ -324,6 +329,11 @@ workflows:
    primaryDescriptorPath: /pipes/WDL/workflows/scaffold_and_refine.wdl
    testParameterFiles:
      - empty.json
  - name: subsample_by_casecounts
    subclass: WDL
    primaryDescriptorPath: /pipes/WDL/workflows/subsample_by_casecounts.wdl
    testParameterFiles:
      - empty.json
  - name: subsample_by_metadata
    subclass: WDL
    primaryDescriptorPath: /pipes/WDL/workflows/subsample_by_metadata.wdl
@@ -358,4 +368,4 @@ workflows:
    subclass: WDL
    primaryDescriptorPath: /pipes/WDL/workflows/bam_to_qiime.wdl
    testParameterFiles:
      - empty.json
      - empty.json
151 changes: 151 additions & 0 deletions pipes/WDL/tasks/tasks_interhost.wdl
@@ -1,5 +1,156 @@
version 1.0

task subsample_by_cases {
    meta {
        description: "Run subsampler to get downsampled dataset and metadata proportional to epidemiological case counts."
    }
    input {
        File metadata
        File case_data

        String id_column
        String geo_column
        String date_column = "date"
        String unit = "week"

        File? keep_file
        File? remove_file
        File? filter_file
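        # baseline: sampling proportion passed to correct_bias.py below (echoed into the stats report header)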
        Float baseline = 0.0001
        Int? seed_num
        String? start_date
        String? end_date

        String docker = "quay.io/broadinstitute/subsampler"
        Int machine_mem_gb = 30
    }
    command <<<
        set -e -o pipefail
        mkdir -p data outputs

        # decompress if compressed
        echo "staging and decompressing input data files"
        if [[ ~{metadata} == *.gz ]]; then
            cat "~{metadata}" | pigz -d > data/metadata.tsv
        elif [[ ~{metadata} == *.zst ]]; then
            cat "~{metadata}" | zstd -d > data/metadata.tsv
        else
            ln -s "~{metadata}" data/metadata.tsv
        fi
        if [[ ~{case_data} == *.gz ]]; then
            cat "~{case_data}" | pigz -d > data/case_data.tsv
        elif [[ ~{case_data} == *.zst ]]; then
            cat "~{case_data}" | zstd -d > data/case_data.tsv
        else
            ln -s "~{case_data}" data/case_data.tsv
        fi

        ## replicate snakemake DAG manually
        # rule genome_matrix
        # Generate matrix of genome counts per day, for each element in column ~{geo_column}
        echo "getting genome matrix"
        python3 /opt/subsampler/scripts/get_genome_matrix.py \
            --metadata data/metadata.tsv \
            --index-column ~{geo_column} \
            --date-column ~{date_column} \
            ~{"--start-date " + start_date} \
            ~{"--end-date " + end_date} \
            --output outputs/genome_matrix_days.tsv
        date;uptime;free

        # rule unit_conversion
        # Generate matrix of genome and case counts per epiweek
        echo "converting matrices to epiweeks"
        python3 /opt/subsampler/scripts/aggregator.py \
            --input outputs/genome_matrix_days.tsv \
            --unit ~{unit} \
            --format integer \
            --output outputs/matrix_genomes_unit.tsv
        python3 /opt/subsampler/scripts/aggregator.py \
            --input data/case_data.tsv \
            --unit ~{unit} \
            --format integer \
            ~{"--start-date " + start_date} \
            ~{"--end-date " + end_date} \
            --output outputs/matrix_cases_unit.tsv
        date;uptime;free

        # rule correct_bias
        # Correct under- and oversampling genome counts based on epidemiological data
        echo "create bias-correction matrix"
        python3 /opt/subsampler/scripts/correct_bias.py \
            --genome-matrix outputs/matrix_genomes_unit.tsv \
            --case-matrix outputs/matrix_cases_unit.tsv \
            --index-column code \
            ~{"--baseline " + baseline} \
            --output1 outputs/weekly_sampling_proportions.tsv \
            --output2 outputs/weekly_sampling_bias.tsv \
            --output3 outputs/matrix_genomes_unit_corrected.tsv
        date;uptime;free

        # rule subsample
        # Sample genomes and metadata according to the corrected genome matrix
        echo "subsample data according to bias-correction"
        # subsampler_timeseries says --keep is optional but actually fails if you don't specify one
        cp /dev/null data/keep.txt
        ~{"cp " + keep_file + " data/keep.txt"}
        python3 /opt/subsampler/scripts/subsampler_timeseries.py \
            --metadata data/metadata.tsv \
            --genome-matrix outputs/matrix_genomes_unit_corrected.tsv \
            --index-column ~{id_column} \
            --geo-column ~{geo_column} \
            --date-column ~{date_column} \
            --time-unit ~{unit} \
            --keep data/keep.txt \
            ~{"--remove " + remove_file} \
            ~{"--filter-file " + filter_file} \
            ~{"--seed " + seed_num} \
            ~{"--start-date " + start_date} \
            ~{"--end-date " + end_date} \
            --weekasdate no \
            --sampled-sequences outputs/selected_sequences.txt \
            --sampled-metadata outputs/selected_metadata.tsv \
            --report outputs/sampling_stats.txt
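        # prepend the sampling proportion used as a header line on the stats report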
        echo '# Sampling proportion: ~{baseline}' | cat - outputs/sampling_stats.txt > temp && mv temp outputs/sampling_stats.txt
        date;uptime;free

        # move outputs from container's temp dir to host-accessible working dir for delocalization
        echo "wrap up"
        mv outputs/* .
        # get counts
        cat selected_sequences.txt | wc -l | tee NUM_OUT
        # get hardware utilization
        set +o pipefail
        cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
        cat /proc/loadavg > CPU_LOAD
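        # cgroup memory accounting may be unavailable on some execution platforms; fall back to 0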
        { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES

    >>>
    runtime {
        docker: docker
        memory: machine_mem_gb + " GB"
        cpu: 2
        disks: "local-disk 200 HDD"
        disk: "200 GB"
        dx_instance_type: "mem3_ssd1_v2_x4"
    }
    output {
        File genome_matrix_days = "genome_matrix_days.tsv"
        File matrix_genomes_unit = "matrix_genomes_unit.tsv"
        File matrix_cases_unit = "matrix_cases_unit.tsv"
        File weekly_sampling_proportions = "weekly_sampling_proportions.tsv"
        File weekly_sampling_bias = "weekly_sampling_bias.tsv"
        File matrix_genomes_unit_corrected = "matrix_genomes_unit_corrected.tsv"
        File selected_sequences = "selected_sequences.txt"
        File selected_metadata = "selected_metadata.tsv"
        File sampling_stats = "sampling_stats.txt"
        Int num_selected = read_int("NUM_OUT")
        Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
        Int runtime_sec = ceil(read_float("UPTIME_SEC"))
        String cpu_load = read_string("CPU_LOAD")
    }
}
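
# --- Illustrative usage (not part of this diff) ---
# A minimal sketch of how a workflow might call the new subsample_by_cases task;
# for the actual registered wrapper, see /pipes/WDL/workflows/subsample_by_casecounts.wdl.
# The import path and column values below ("strain", "division") are hypothetical examples.
#
# import "../tasks/tasks_interhost.wdl" as interhost
#
# workflow subsample_example {
#     input {
#         File metadata    # sequence metadata TSV (optionally .gz or .zst compressed)
#         File case_data   # case counts per location over time
#     }
#     call interhost.subsample_by_cases {
#         input:
#             metadata   = metadata,
#             case_data  = case_data,
#             id_column  = "strain",
#             geo_column = "division"
#     }
#     output {
#         File selected_metadata  = subsample_by_cases.selected_metadata
#         File selected_sequences = subsample_by_cases.selected_sequences
#         Int  num_selected       = subsample_by_cases.num_selected
#     }
# }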

task multi_align_mafft_ref {
    input {
        File reference_fasta