diff --git a/tests/functional/filter/cram/subsample-output-group-by-sizes-error.t b/tests/functional/filter/cram/subsample-output-group-by-sizes-error.t new file mode 100644 index 000000000..d129af77c --- /dev/null +++ b/tests/functional/filter/cram/subsample-output-group-by-sizes-error.t @@ -0,0 +1,14 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +--output-group-by-sizes does not work without --group-by-weights. + + $ ${AUGUR} filter \ + > --metadata "$TESTDIR/../data/metadata.tsv" \ + > --group-by year month \ + > --subsample-max-sequences 100 \ + > --output-group-by-sizes target_group_sizes.tsv \ + > --output-strains strains.txt + ERROR: --output-group-by-sizes is only available for --group-by-weights. It may be added to other sampling methods in the future - see + [2] diff --git a/tests/functional/filter/cram/subsample-weighted-and-uniform-mix.t b/tests/functional/filter/cram/subsample-weighted-and-uniform-mix.t new file mode 100644 index 000000000..850f45fca --- /dev/null +++ b/tests/functional/filter/cram/subsample-weighted-and-uniform-mix.t @@ -0,0 +1,109 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Generate metadata file with 250 rows. + + $ echo "strain date location" > metadata.tsv + $ for i in $(seq 1 50); do + > echo "2000A_$i 2000 A" >> metadata.tsv + > echo "2000B_$i 2000 B" >> metadata.tsv + > echo "2001A_$i 2001 A" >> metadata.tsv + > echo "2001B_$i 2001 B" >> metadata.tsv + > echo "2002B_$i 2002 B" >> metadata.tsv + > done + +Weight locations A:B as 2:1. This is reflected in target_group_sizes.tsv below. 
+ + $ cat >weights.tsv <<~~ + > location weight + > A 2 + > B 1 + > ~~ + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --group-by location \ + > --group-by-weights weights.tsv \ + > --subsample-max-sequences 100 \ + > --subsample-seed 0 \ + > --output-group-by-sizes target_group_sizes.tsv \ + > --output-strains strains.txt 2>/dev/null + + $ cat target_group_sizes.tsv + location weight _augur_filter_target_size_int + A 2 67 + B 1 33 + +Using 1:1 weights is similarly straightforward, with 50 sequences from each location. + + $ cat >weights.tsv <<~~ + > location weight + > A 1 + > B 1 + > ~~ + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --group-by location \ + > --group-by-weights weights.tsv \ + > --subsample-max-sequences 100 \ + > --subsample-seed 0 \ + > --output-group-by-sizes target_group_sizes.tsv \ + > --output-strains strains.txt 2>/dev/null + + $ cat target_group_sizes.tsv + location weight _augur_filter_target_size_int + A 1 50 + B 1 50 + +Keep the 1:1 location weighting, but add uniform sampling on year. +The uniform sampling happens "within" each weighted column value, so the 1:1 +location weighting is reflected even though there is an imbalance in years +available per location. + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --group-by year location \ + > --group-by-weights weights.tsv \ + > --subsample-max-sequences 100 \ + > --subsample-seed 0 \ + > --output-group-by-sizes target_group_sizes.tsv \ + > --output-strains strains.txt 2>/dev/null + + $ cat target_group_sizes.tsv + year location weight _augur_filter_target_size_int + 2000 A 0.5 25 + 2000 B 0.3333333333333333 16 + 2001 A 0.5 25 + 2001 B 0.3333333333333333 16 + 2002 B 0.3333333333333333 17 + +If a single sequence is added for group (2002,A), the weighting now appears +"equal" among all years and locations. 
+ +However, there is only 1 sequence available in (2002,A), much lower than the +requested 17, so only 83 sequences are output, fewer than the requested maximum of 100. + + $ echo "2002A_1 2002 A" >> metadata.tsv + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --group-by year location \ + > --group-by-weights weights.tsv \ + > --subsample-max-sequences 100 \ + > --subsample-seed 0 \ + > --output-group-by-sizes target_group_sizes.tsv \ + > --output-strains strains.txt 2>/dev/null + + $ cat target_group_sizes.tsv + year location weight _augur_filter_target_size_int + 2000 A 0.3333333333333333 17 + 2000 B 0.3333333333333333 16 + 2001 A 0.3333333333333333 16 + 2001 B 0.3333333333333333 16 + 2002 A 0.3333333333333333 17 + 2002 B 0.3333333333333333 17 + + $ wc -l strains.txt + \s*83 .* (re) diff --git a/tests/functional/filter/cram/subsample-weighted-invalid-file.t b/tests/functional/filter/cram/subsample-weighted-invalid-file.t new file mode 100644 index 000000000..13f24173a --- /dev/null +++ b/tests/functional/filter/cram/subsample-weighted-invalid-file.t @@ -0,0 +1,61 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Set up files. + + $ cat >metadata.tsv <<~~ + > strain date location + > SEQ1 2000-01-01 A + > SEQ2 2000-01-02 A + > SEQ3 2000-01-03 B + > SEQ4 2000-01-04 B + > SEQ5 2000-02-01 A + > SEQ6 2000-02-02 A + > SEQ7 2000-03-01 B + > SEQ8 2000-03-02 B + > ~~ + +Weights must be non-negative. + + $ cat >weights.tsv <<~~ + > location weight + > A 2 + > B 1 + > C -1 + > ~~ + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --group-by location \ + > --group-by-weights weights.tsv \ + > --subsample-max-sequences 6 \ + > --subsample-seed 0 \ + > --output-strains strains.txt + Sampling with weights defined by weights.tsv. + ERROR: Bad weights file 'weights.tsv'. + Found negative weights on the following lines: [4] + 'weight' column must be non-negative. + [2] + +Weights must be numeric. 
+ + $ cat >weights.tsv <<~~ + > location weight + > A yes + > B 1 + > C no + > ~~ + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --group-by location \ + > --group-by-weights weights.tsv \ + > --subsample-max-sequences 6 \ + > --subsample-seed 0 \ + > --output-strains strains.txt + Sampling with weights defined by weights.tsv. + ERROR: Bad weights file 'weights.tsv'. + Found non-numeric weights on the following lines: [2, 4] + 'weight' column must be numeric. + [2] diff --git a/tests/functional/filter/cram/subsample-weighted.t b/tests/functional/filter/cram/subsample-weighted.t new file mode 100644 index 000000000..5b8972142 --- /dev/null +++ b/tests/functional/filter/cram/subsample-weighted.t @@ -0,0 +1,142 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Set up files. + + $ cat >metadata.tsv <<~~ + > strain date location + > SEQ1 2000-01-01 A + > SEQ2 2000-01-02 A + > SEQ3 2000-01-03 B + > SEQ4 2000-01-04 B + > SEQ5 2000-02-01 A + > SEQ6 2000-02-02 A + > SEQ7 2000-03-01 B + > SEQ8 2000-03-02 B + > ~~ + +Sampling with location weights only. + + $ cat >weights.tsv <<~~ + > location weight + > A 2 + > B 1 + > C 3 + > ~~ + +This should take 4 from location A and 2 from location B. The weight for +location C is ignored because there are no corresponding rows in the metadata. + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --group-by location \ + > --group-by-weights weights.tsv \ + > --subsample-max-sequences 6 \ + > --subsample-seed 0 \ + > --output-strains strains.txt + Sampling with weights defined by weights.tsv. + NOTE: Skipping 1 group due to lack of entries in metadata. + 2 strains were dropped during filtering + 2 were dropped because of subsampling criteria + 6 strains passed all filters + + $ cat strains.txt + SEQ1 + SEQ2 + SEQ5 + SEQ6 + SEQ7 + SEQ8 + +Sampling with weights on location and uniform sampling on date (--group-by +month) should work. 
+ + $ cat >weights.tsv <<~~ + > location weight + > A 2 + > B 1 + > ~~ + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --group-by location month \ + > --group-by-weights weights.tsv \ + > --subsample-max-sequences 6 \ + > --subsample-seed 0 \ + > --output-strains strains.txt + Sampling with weights defined by weights.tsv. + NOTE: Skipping 2 groups due to lack of entries in metadata. + NOTE: Weights were not provided for the column 'month'. Using equal weights across values in that column. + 2 strains were dropped during filtering + 2 were dropped because of subsampling criteria + 6 strains passed all filters + +Sampling with incomplete weights should show an error. + + $ cat >weights.tsv <<~~ + > location weight + > A 2 + > ~~ + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --group-by location \ + > --group-by-weights weights.tsv \ + > --subsample-max-sequences 6 \ + > --subsample-seed 0 \ + > --output-strains strains.txt + Sampling with weights defined by weights.tsv. + ERROR: The input metadata contains 1 group that is missing from the weights file. Re-run with --output-group-by-missing-weights to continue. + [2] + +Re-running with --output-group-by-missing-weights (here also grouping by month) downgrades the error to a warning and writes the missing groups to a file that can be used to complete the weights file. + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --group-by month location \ + > --group-by-weights weights.tsv \ + > --subsample-max-sequences 6 \ + > --subsample-seed 0 \ + > --output-group-by-missing-weights missing-weights.tsv \ + > --output-strains strains.txt + Sampling with weights defined by weights.tsv. + NOTE: Skipping 1 group due to lack of entries in metadata. + NOTE: Weights were not provided for the column 'month'. Using equal weights across values in that column. + WARNING: The input metadata contains 2 groups that are missing from the weights file. Sequences from these groups will be dropped. + All missing groups added to 'missing-weights.tsv'. 
+ 4 strains were dropped during filtering + 4 were dropped because of subsampling criteria + 4 strains passed all filters + + $ cat missing-weights.tsv + location weight + B + +When --group-by-weights is specified, every column in the weights file (other +than 'weight') must also be provided in --group-by. + + $ cat >weights.tsv <<~~ + > location weight + > A 2 + > B 1 + > ~~ + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --group-by-weights weights.tsv \ + > --subsample-max-sequences 6 \ + > --subsample-seed 0 \ + > --output-strains strains.txt + ERROR: Columns in --group-by-weights must be a subset of columns provided in --group-by. + [2] + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --group-by month \ + > --group-by-weights weights.tsv \ + > --subsample-max-sequences 6 \ + > --subsample-seed 0 \ + > --output-strains strains.txt + ERROR: Columns in --group-by-weights must be a subset of columns provided in --group-by. + [2]