🚧 Add initial tests for weighted sampling

nextstrain · Jul 17, 2024 · 7d041b4 · 7d041b4
1 parent 6d47edb
commit 7d041b4
Show file tree

Hide file tree

Showing 3 changed files with 285 additions and 0 deletions.
diff --git a/tests/functional/filter/cram/subsample-output-group-by-sizes-error.t b/tests/functional/filter/cram/subsample-output-group-by-sizes-error.t
@@ -0,0 +1,14 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+--output-group-by-sizes does not work without --group-by-weights.
+
+  $ ${AUGUR} filter \
+  >   --metadata "$TESTDIR/../data/metadata.tsv" \
+  >   --group-by year month \
+  >   --subsample-max-sequences 100 \
+  >   --output-group-by-sizes target_group_sizes.tsv \
+  >   --output-strains strains.txt
+  ERROR: --output-group-by-sizes is only available for --group-by-weights. It may be added to other sampling methods in the future - see <https://github.com/nextstrain/augur/issues/new>
+  [2]
diff --git a/tests/functional/filter/cram/subsample-weighted-and-uniform-mix.t b/tests/functional/filter/cram/subsample-weighted-and-uniform-mix.t
@@ -0,0 +1,109 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Generate metadata file with 250 rows.
+
+  $ echo "strain	date	location" > metadata.tsv
+  $ for i in $(seq 1 50); do
+  >     echo "2000A_$i	2000	A" >> metadata.tsv
+  >     echo "2000B_$i	2000	B" >> metadata.tsv
+  >     echo "2001A_$i	2001	A" >> metadata.tsv
+  >     echo "2001B_$i	2001	B" >> metadata.tsv
+  >     echo "2002B_$i	2002	B" >> metadata.tsv
+  > done
+
+Weight locations A:B as 2:1. This is reflected in target_group_sizes.tsv below.
+
+  $ cat >weights.tsv <<~~
+  > location	weight
+  > A	2
+  > B	1
+  > ~~
+
+  $ ${AUGUR} filter \
+  >   --metadata metadata.tsv \
+  >   --group-by location \
+  >   --group-by-weights weights.tsv \
+  >   --subsample-max-sequences 100 \
+  >   --subsample-seed 0 \
+  >   --output-group-by-sizes target_group_sizes.tsv \
+  >   --output-strains strains.txt 2>/dev/null
+
+  $ cat target_group_sizes.tsv
+  location	weight	_augur_filter_target_size_int
+  A	2	67
+  B	1	33
+
+Using 1:1 weights is similarly straightforward, with 50 sequences from each location.
+
+  $ cat >weights.tsv <<~~
+  > location	weight
+  > A	1
+  > B	1
+  > ~~
+
+  $ ${AUGUR} filter \
+  >   --metadata metadata.tsv \
+  >   --group-by location \
+  >   --group-by-weights weights.tsv \
+  >   --subsample-max-sequences 100 \
+  >   --subsample-seed 0 \
+  >   --output-group-by-sizes target_group_sizes.tsv \
+  >   --output-strains strains.txt 2>/dev/null
+
+  $ cat target_group_sizes.tsv
+  location	weight	_augur_filter_target_size_int
+  A	1	50
+  B	1	50
+
+Keep the 1:1 location weighting, but add uniform sampling on year.
+The uniform sampling happens "within" each weighted column value, so the 1:1
+location weighting is reflected even though there is an imbalance in years
+available per location.
+
+  $ ${AUGUR} filter \
+  >   --metadata metadata.tsv \
+  >   --group-by year location \
+  >   --group-by-weights weights.tsv \
+  >   --subsample-max-sequences 100 \
+  >   --subsample-seed 0 \
+  >   --output-group-by-sizes target_group_sizes.tsv \
+  >   --output-strains strains.txt 2>/dev/null
+
+  $ cat target_group_sizes.tsv
+  year	location	weight	_augur_filter_target_size_int
+  2000	A	0.5	25
+  2000	B	0.3333333333333333	16
+  2001	A	0.5	25
+  2001	B	0.3333333333333333	16
+  2002	B	0.3333333333333333	17
+
+If a single sequence is added for group (2002,A), the weighting now appears
+"equal" among all years and locations.
+
+However, there is only 1 sequence available in (2002,A), far fewer than the
+requested 17, so the total number of sequences output (83) is lower than the 100 requested.
+
+  $ echo "2002A_1	2002	A" >> metadata.tsv
+
+  $ ${AUGUR} filter \
+  >   --metadata metadata.tsv \
+  >   --group-by year location \
+  >   --group-by-weights weights.tsv \
+  >   --subsample-max-sequences 100 \
+  >   --subsample-seed 0 \
+  >   --output-group-by-sizes target_group_sizes.tsv \
+  >   --output-strains strains.txt 2>/dev/null
+
+  $ cat target_group_sizes.tsv
+  year	location	weight	_augur_filter_target_size_int
+  2000	A	0.3333333333333333	17
+  2000	B	0.3333333333333333	16
+  2001	A	0.3333333333333333	16
+  2001	B	0.3333333333333333	16
+  2002	A	0.3333333333333333	17
+  2002	B	0.3333333333333333	17
+
+  $ wc -l strains.txt
+  \s*83 .* (re)
diff --git a/tests/functional/filter/cram/subsample-weighted.t b/tests/functional/filter/cram/subsample-weighted.t
@@ -0,0 +1,162 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Set up files.
+
+  $ cat >metadata.tsv <<~~
+  > strain	date	location
+  > SEQ1	2000-01-01	A
+  > SEQ2	2000-01-02	A
+  > SEQ3	2000-01-03	B
+  > SEQ4	2000-01-04	B
+  > SEQ5	2000-02-01	A
+  > SEQ6	2000-02-02	A
+  > SEQ7	2000-03-01	B
+  > SEQ8	2000-03-02	B
+  > ~~
+
+Sampling with location weights only.
+
+  $ cat >weights.tsv <<~~
+  > location	weight
+  > A	2
+  > B	1
+  > C	3
+  > ~~
+
+This should take 4 from location A and 2 from location B. The weight for
+location C is ignored because there are no corresponding rows in the metadata.
+
+  $ ${AUGUR} filter \
+  >   --metadata metadata.tsv \
+  >   --group-by location \
+  >   --group-by-weights weights.tsv \
+  >   --subsample-max-sequences 6 \
+  >   --subsample-seed 0 \
+  >   --output-strains strains.txt
+  Sampling with weights defined by weights.tsv.
+  NOTE: Skipping 1 group due to lack of entries in metadata.
+  2 strains were dropped during filtering
+  	2 were dropped because of subsampling criteria
+  6 strains passed all filters
+
+  $ cat strains.txt
+  SEQ1
+  SEQ2
+  SEQ5
+  SEQ6
+  SEQ7
+  SEQ8
+
+Sampling with weights on location and uniform sampling on date (--group-by
+month) should work.
+
+  $ cat >weights.tsv <<~~
+  > location	weight
+  > A	2
+  > B	1
+  > ~~
+
+  $ ${AUGUR} filter \
+  >   --metadata metadata.tsv \
+  >   --group-by location month \
+  >   --group-by-weights weights.tsv \
+  >   --subsample-max-sequences 6 \
+  >   --subsample-seed 0 \
+  >   --output-strains strains.txt
+  Sampling with weights defined by weights.tsv.
+  NOTE: Skipping 2 groups due to lack of entries in metadata.
+  NOTE: Weights were not provided for the column 'month'. Using equal weights across values in that column.
+  2 strains were dropped during filtering
+  	2 were dropped because of subsampling criteria
+  6 strains passed all filters
+
+Sampling with incomplete weights should show an error.
+
+  $ cat >weights.tsv <<~~
+  > location	weight
+  > A	2
+  > ~~
+
+  $ ${AUGUR} filter \
+  >   --metadata metadata.tsv \
+  >   --group-by location \
+  >   --group-by-weights weights.tsv \
+  >   --subsample-max-sequences 6 \
+  >   --subsample-seed 0 \
+  >   --output-strains strains.txt
+  Sampling with weights defined by weights.tsv.
+  ERROR: 1 groups appear in the metadata but are missing from the weights file. Re-run with --output-group-by-missing-weights to continue.
+  [2]
+
+Re-running with --output-group-by-missing-weights shows a warning and writes a file listing the groups that still need weights.
+
+  $ ${AUGUR} filter \
+  >   --metadata metadata.tsv \
+  >   --group-by location \
+  >   --group-by-weights weights.tsv \
+  >   --subsample-max-sequences 6 \
+  >   --subsample-seed 0 \
+  >   --output-group-by-missing-weights missing-weights.tsv \
+  >   --output-strains strains.txt
+  Sampling with weights defined by weights.tsv.
+  WARNING: 1 groups appear in the metadata but are missing from the weights file. Sequences from these groups will be dropped.
+  All missing groups added to 'missing-weights.tsv'.
+  4 strains were dropped during filtering
+  	4 were dropped because of subsampling criteria
+  4 strains passed all filters
+
+  $ cat missing-weights.tsv
+  location	weight
+  B	
+
+When --group-by-weights is specified, every column in the weights file (other
+than 'weight') must also be provided in --group-by.
+
+  $ cat >weights.tsv <<~~
+  > location	weight
+  > A	2
+  > B	1
+  > ~~
+
+  $ ${AUGUR} filter \
+  >   --metadata metadata.tsv \
+  >   --group-by-weights weights.tsv \
+  >   --subsample-max-sequences 6 \
+  >   --subsample-seed 0 \
+  >   --output-strains strains.txt
+  ERROR: Columns in --group-by-weights must be a subset of columns provided in --group-by.
+  [2]
+
+  $ ${AUGUR} filter \
+  >   --metadata metadata.tsv \
+  >   --group-by month \
+  >   --group-by-weights weights.tsv \
+  >   --subsample-max-sequences 6 \
+  >   --subsample-seed 0 \
+  >   --output-strains strains.txt
+  ERROR: Columns in --group-by-weights must be a subset of columns provided in --group-by.
+  [2]
+
+Negative weights are not allowed.
+
+  $ cat >weights.tsv <<~~
+  > location	weight
+  > A	2
+  > B	1
+  > C	-1
+  > ~~
+
+  $ ${AUGUR} filter \
+  >   --metadata metadata.tsv \
+  >   --group-by location \
+  >   --group-by-weights weights.tsv \
+  >   --subsample-max-sequences 6 \
+  >   --subsample-seed 0 \
+  >   --output-strains strains.txt
+  Sampling with weights defined by weights.tsv.
+  ERROR: Bad weights file 'weights.tsv'.
+  Found negative weights on the following lines: [3]
+  Weights must be non-negative.
+  [2]