Skip to content

Commit

Permalink
Merge pull request #1459: filter: Print all log messages to stderr
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin authored May 6, 2024
2 parents 03ed408 + 37df8ff commit 556e999
Show file tree
Hide file tree
Showing 34 changed files with 112 additions and 53 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@

* validation: we no longer exit with a non-zero exit code when the requested validation mode is "warn" [#1440][] (@jameshadfield)
* validation: we no longer perform any validation when the requested validation mode is "skip" [#1440][] (@jameshadfield)
* filter: Send all log messages to `stderr`. This allows output to be written to `stdout` (e.g. `--output-strains /dev/stdout`). [#1459][] (@victorlin)

[#1440]: https://github.com/nextstrain/augur/pull/1440
[#1445]: https://github.com/nextstrain/augur/pull/1445
[#1459]: https://github.com/nextstrain/augur/pull/1459

## 24.3.0 (18 March 2024)

Expand Down
14 changes: 7 additions & 7 deletions augur/filter/_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,9 +277,9 @@ def run(args):
raise AugurError(error)

if (probabilistic_used):
print(f"Sampling probabilistically at {sequences_per_group:0.4f} sequences per group, meaning it is possible to have more than the requested maximum of {args.subsample_max_sequences} sequences after filtering.")
print_err(f"Sampling probabilistically at {sequences_per_group:0.4f} sequences per group, meaning it is possible to have more than the requested maximum of {args.subsample_max_sequences} sequences after filtering.")
else:
print(f"Sampling at {sequences_per_group} per group.")
print_err(f"Sampling at {sequences_per_group} per group.")

if queues_by_group is None:
# We know all of the possible groups now from the first pass through
Expand Down Expand Up @@ -414,10 +414,10 @@ def run(args):
total_strains_passed = len(valid_strains)
total_strains_filtered = len(metadata_strains) + num_excluded_by_lack_of_metadata - total_strains_passed

print(f"{total_strains_filtered} {'strain was' if total_strains_filtered == 1 else 'strains were'} dropped during filtering")
print_err(f"{total_strains_filtered} {'strain was' if total_strains_filtered == 1 else 'strains were'} dropped during filtering")

if num_excluded_by_lack_of_metadata:
print(f"\t{num_excluded_by_lack_of_metadata} had no metadata")
print_err(f"\t{num_excluded_by_lack_of_metadata} had no metadata")

report_template_by_filter_name = {
include_exclude_rules.filter_by_sequence_index.__name__: "{count} had no sequence data",
Expand Down Expand Up @@ -446,11 +446,11 @@ def run(args):
parameters["count"] = count
parameters["were"] = "was" if count == 1 else "were"
parameters["they"] = "it" if count == 1 else "they"
print("\t" + report_template_by_filter_name[filter_name].format(**parameters))
print_err("\t" + report_template_by_filter_name[filter_name].format(**parameters))

if (group_by and args.sequences_per_group) or args.subsample_max_sequences:
seed_txt = ", using seed {}".format(args.subsample_seed) if args.subsample_seed else ""
print(f"\t{num_excluded_subsamp} {'was' if num_excluded_subsamp == 1 else 'were'} dropped because of subsampling criteria{seed_txt}")
print_err(f"\t{num_excluded_subsamp} {'was' if num_excluded_subsamp == 1 else 'were'} dropped because of subsampling criteria{seed_txt}")

if total_strains_passed == 0:
empty_results_message = "All samples have been dropped! Check filter rules and metadata file format."
Expand All @@ -463,4 +463,4 @@ def run(args):
else:
raise ValueError(f"Encountered unhandled --empty-output-reporting method {args.empty_output_reporting!r}")

print(f"{total_strains_passed} {'strain' if total_strains_passed == 1 else 'strains'} passed all filters")
print_err(f"{total_strains_passed} {'strain' if total_strains_passed == 1 else 'strains'} passed all filters")
2 changes: 2 additions & 0 deletions augur/io/print.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@


def print_err(*args):
    """Print the given values to stderr.

    When data goes to stdout (most cases), this should be used for any
    informational messages, not just errors/warnings.

    All positional arguments are forwarded unchanged to the built-in
    ``print``, so they are space-separated and followed by a newline.
    """
    # Look up sys.stderr at call time (not import time) so that callers which
    # temporarily redirect stderr still capture these messages.
    print(*args, file=sys.stderr)
10 changes: 7 additions & 3 deletions tests/functional/filter/cram/filter-empty-output-reporting.t
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,30 @@ Test the default behavior for empty results is an error.
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --exclude-all \
> --output-strains filtered_strains.txt > /dev/null
12 strains were dropped during filtering
12 were dropped by `--exclude-all`
ERROR: All samples have been dropped! Check filter rules and metadata file format.
[2]
$ wc -l filtered_strains.txt
\s*0 .* (re)

Repeat with the --empty-output-reporting=warn option.
This whould output a warning message but no error.
This should output a warning message but no error.

$ ${AUGUR} filter \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --exclude-all \
> --output-strains filtered_strains.txt \
> --empty-output-reporting warn > /dev/null
12 strains were dropped during filtering
12 were dropped by `--exclude-all`
WARNING: All samples have been dropped! Check filter rules and metadata file format.
0 strains passed all filters
$ wc -l filtered_strains.txt
\s*0 .* (re)

Ignore empty results with the --empty-output-reporting=silent option.
Make sure all 3 output types are empty, except the metadata output should still include the header.
This should not output any messages to stderr.

$ ${AUGUR} filter \
> --metadata "$TESTDIR/../data/metadata.tsv" \
Expand All @@ -38,7 +42,7 @@ This should not output any messages to stderr.
> --output-sequences filtered_seqs.fasta \
> --output-metadata filtered_metadata.tsv \
> --output-strains filtered_strains.txt \
> --empty-output-reporting silent > /dev/null
> --empty-output-reporting silent 2>/dev/null
$ wc -l filtered_seqs.fasta
\s*0 .* (re)
$ diff <(head -n 1 filtered_metadata.tsv) <(head -n 1 "$TESTDIR/../data/metadata.tsv")
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/filter/cram/filter-exclude-include.t
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ Force include one South American record by country to get two total records.
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --exclude-where "region=South America" "region=North America" "region=Southeast Asia" \
> --include-where "country=Ecuador" \
> --output-strains filtered_strains.txt > /dev/null
> --output-strains filtered_strains.txt 2>/dev/null
$ wc -l filtered_strains.txt
\s*2 .* (re)
4 changes: 2 additions & 2 deletions tests/functional/filter/cram/filter-exclude-where-multiple.t
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Scenario 1: Run command with one --exclude-where flag and multiple values
$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --exclude-where "region=A" "region=B" \
> --output-strains filtered_strains.txt > /dev/null
> --output-strains filtered_strains.txt 2>/dev/null

Both exclusions are applied.

Expand All @@ -30,7 +30,7 @@ Scenario 2: Run command with two --exclude-where flags
> --metadata metadata.tsv \
> --exclude-where "region=A" \
> --exclude-where "region=B" \
> --output-strains filtered_strains.txt > /dev/null
> --output-strains filtered_strains.txt 2>/dev/null

Both exclusions are applied.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ Test all outputs with --include-where.
> --output-metadata metadata-filtered.tsv \
> --output-strains strains-filtered.txt \
> --output-sequences sequences-filtered.fasta \
> > /dev/null 2>&1
> 2>/dev/null
$ cat metadata-filtered.tsv | tail -n+2 | sort -k1
a\t1 (esc)
b\t2 (esc)
Expand Down Expand Up @@ -72,7 +72,7 @@ Test all outputs with --include.
> --output-metadata metadata-filtered.tsv \
> --output-strains strains-filtered.txt \
> --output-sequences sequences-filtered.fasta \
> > /dev/null 2>&1
> 2>/dev/null
$ cat metadata-filtered.tsv | tail -n+2 | sort -k1
a\t1 (esc)
b\t2 (esc)
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/filter/cram/filter-max-date.t
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Test that --max-date is inclusive.
$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --max-date 2020-03-01 \
> --output-strains filtered_strains.txt > /dev/null
> --output-strains filtered_strains.txt 2>/dev/null
$ sort filtered_strains.txt
SEQ_1
SEQ_2
4 changes: 2 additions & 2 deletions tests/functional/filter/cram/filter-metadata-date-formats.t
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Test that 2020 is evaluated as 2020-XX-XX.
$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --min-date 2020-02-01 \
> --output-strains filtered_strains.txt > /dev/null
> --output-strains filtered_strains.txt 2>/dev/null
$ sort filtered_strains.txt
SEQ_2
SEQ_3
Expand All @@ -26,7 +26,7 @@ Test that 2020.0, 2020, and 2020-XX-XX all pass --min-date 2019
$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --min-date 2019 \
> --output-strains filtered_strains.txt > /dev/null
> --output-strains filtered_strains.txt 2>/dev/null
$ sort filtered_strains.txt
SEQ_1
SEQ_2
Expand Down
4 changes: 2 additions & 2 deletions tests/functional/filter/cram/filter-metadata-delimiter.t
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Comma-delimited metadata is allowed by default. However, the output metadata wil
$ ${AUGUR} filter \
> --metadata metadata.txt \
> --exclude-where column=A \
> --output-metadata filtered.txt > /dev/null
> --output-metadata filtered.txt 2>/dev/null
$ cat filtered.txt
strain\tcolumn (esc)
SEQ_2\tB (esc)
Expand Down Expand Up @@ -62,7 +62,7 @@ Allow colon-delimited metadata. However, the output metadata will be tab-delimit
> --metadata metadata.txt \
> --metadata-delimiters ':' \
> --exclude-where column=A \
> --output-metadata filtered.txt > /dev/null
> --output-metadata filtered.txt 2>/dev/null
$ cat filtered.txt
strain\tcolumn (esc)
SEQ_2\tB (esc)
2 changes: 1 addition & 1 deletion tests/functional/filter/cram/filter-min-date.t
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Test that --min-date is inclusive.
$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --min-date 2020-02-26 \
> --output-strains filtered_strains.txt > /dev/null
> --output-strains filtered_strains.txt 2>/dev/null
$ sort filtered_strains.txt
SEQ_1
SEQ_2
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ This should produce no results because the intersection of metadata and sequence
> --max-date 2020-01-30 \
> --output-strains filtered_strains.txt > /dev/null
Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
13 strains were dropped during filtering
1 had no metadata
12 had no sequence data
ERROR: All samples have been dropped! Check filter rules and metadata file format.
[2]
$ wc -l filtered_strains.txt
Expand All @@ -27,6 +30,9 @@ Repeat with sequence and strain outputs. We should get the same results.
> --output-strains filtered_strains.txt \
> --output-sequences filtered.fasta > /dev/null
Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
13 strains were dropped during filtering
1 had no metadata
12 had no sequence data
ERROR: All samples have been dropped! Check filter rules and metadata file format.
[2]
$ wc -l filtered_strains.txt
Expand All @@ -42,6 +48,9 @@ Since we expect metadata to be filtered by presence of strains in input sequence
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --output-strains filtered_strains.txt > /dev/null
Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
13 strains were dropped during filtering
1 had no metadata
12 had no sequence data
ERROR: All samples have been dropped! Check filter rules and metadata file format.
[2]
$ wc -l filtered_strains.txt
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/filter/cram/filter-numerical-ids.t
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Test that nothing is filtered out due to missing sequence data.
> --metadata metadata.tsv \
> --sequences sequences.fasta \
> --output-strains filtered_strains.txt \
> > /dev/null 2>&1
> 2>/dev/null
$ sort filtered_strains.txt
1
2
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/filter/cram/filter-output-contents.t
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ The purpose of this test file is to check format and consistency among the
> --no-probabilistic-sampling \
> --output-metadata filtered_metadata.tsv \
> --output-strains filtered_strains.txt \
> --output filtered.fasta > /dev/null
> --output filtered.fasta 2>/dev/null

Check that the header row is identical between input and output metadata.

Expand Down
6 changes: 3 additions & 3 deletions tests/functional/filter/cram/filter-output-metadata-header.t
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Quoted columns containing the tab delimiter are left unchanged.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --output-metadata filtered_metadata.tsv > /dev/null
> --output-metadata filtered_metadata.tsv 2>/dev/null

$ head -n 1 filtered_metadata.tsv
strain "col 1"
Expand All @@ -28,7 +28,7 @@ Quoted columns without the tab delimiter are stripped of the quotes.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --output-metadata filtered_metadata.tsv > /dev/null
> --output-metadata filtered_metadata.tsv 2>/dev/null

$ head -n 1 filtered_metadata.tsv
strain col1
Expand All @@ -42,7 +42,7 @@ Any other columns with quotes are quoted, and pre-existing quotes are escaped b

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --output-metadata filtered_metadata.tsv > /dev/null
> --output-metadata filtered_metadata.tsv 2>/dev/null

$ head -n 1 filtered_metadata.tsv
strain "col""1" "col2"""
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Test that --include_where still works with filtering on query.
> --metadata metadata.tsv \
> --query "quality=='good' & location=='colorado'" \
> --include-where "location=nevada" \
> --output-strains filtered_strains.txt > /dev/null
> --output-strains filtered_strains.txt 2>/dev/null
$ sort filtered_strains.txt
SEQ_1
SEQ_3
2 changes: 1 addition & 1 deletion tests/functional/filter/cram/filter-query-and-include.t
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Test that --include_where still works with filtering on query.
> --metadata metadata.tsv \
> --query "quality=='good' & location=='colorado'" \
> --include include.txt \
> --output-strains filtered_strains.txt > /dev/null
> --output-strains filtered_strains.txt 2>/dev/null
$ sort filtered_strains.txt
SEQ_1
SEQ_3
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ The 'region name' column is query-able by backtick quoting.
$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query '(`region name` == "A")' \
> --output-strains filtered_strains.txt > /dev/null
> --output-strains filtered_strains.txt 2>/dev/null

$ sort filtered_strains.txt
SEQ_1
Expand Down
10 changes: 5 additions & 5 deletions tests/functional/filter/cram/filter-query-example.t
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ First, select strains from Brazil (there should be 1).
$ ${AUGUR} filter \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --query "country == 'Brazil'" \
> --output-strains filtered_strains.brazil.txt > /dev/null
> --output-strains filtered_strains.brazil.txt 2>/dev/null
$ wc -l filtered_strains.brazil.txt
\s*1 .* (re)

Expand All @@ -17,7 +17,7 @@ Then, select strains from Colombia (there should be 3).
$ ${AUGUR} filter \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --query "country == 'Colombia'" \
> --output-strains filtered_strains.colombia.txt > /dev/null
> --output-strains filtered_strains.colombia.txt 2>/dev/null
$ wc -l filtered_strains.colombia.txt
\s*3 .* (re)

Expand All @@ -29,7 +29,7 @@ Finally, exclude all sequences except those from the two sets of strains (there
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --exclude-all \
> --include filtered_strains.brazil.txt filtered_strains.colombia.txt \
> --output filtered.fasta > /dev/null
> --output filtered.fasta 2>/dev/null
$ grep "^>" filtered.fasta | wc -l
\s*4 (re)

Expand All @@ -42,7 +42,7 @@ We should get the same outputs without building a sequence index on the fly, bec
> --exclude-all \
> --include filtered_strains.brazil.txt filtered_strains.colombia.txt \
> --output filtered.fasta \
> --output-metadata filtered.tsv > /dev/null
> --output-metadata filtered.tsv 2>/dev/null
$ grep "^>" filtered.fasta | wc -l
\s*4 (re)

Expand All @@ -58,6 +58,6 @@ Alternately, exclude the sequences from Brazil and Colombia (N=4) and records wi
> --sequence-index "$TESTDIR/../data/sequence_index.tsv" \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --exclude filtered_strains.brazil.txt filtered_strains.colombia.txt \
> --output filtered.fasta > /dev/null
> --output filtered.fasta 2>/dev/null
$ grep "^>" filtered.fasta | wc -l
\s*7 (re)
4 changes: 2 additions & 2 deletions tests/functional/filter/cram/filter-query-numerical.t
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ The 'coverage' column should be query-able by numerical comparisons.
$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage >= 0.95" \
> --output-strains filtered_strains.txt > /dev/null
> --output-strains filtered_strains.txt 2>/dev/null

$ sort filtered_strains.txt
SEQ_2
Expand Down Expand Up @@ -52,7 +52,7 @@ However, that is still possible by explicitly specifying that it is a string col
> --metadata metadata.tsv \
> --query "coverage.str.endswith('.95')" \
> --query-columns coverage:str \
> --output-strains filtered_strains.txt > /dev/null
> --output-strains filtered_strains.txt 2>/dev/null

$ sort filtered_strains.txt
SEQ_2
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/filter/cram/filter-query-str.t
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Create metadata file for testing.
$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "column.str.startswith('value')" \
> --output-strains filtered_strains.txt > /dev/null
> --output-strains filtered_strains.txt 2>/dev/null

$ sort filtered_strains.txt
SEQ_1
Expand Down
4 changes: 4 additions & 0 deletions tests/functional/filter/cram/filter-sequences-vcf.t
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ Filter TB strains from VCF and save as a list of filtered strains.
> --output filtered.vcf \
> --output-strains filtered_strains.txt > /dev/null
Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
162 strains were dropped during filtering
155 had no sequence data
7 were dropped because they were earlier than 2012.0 or missing a date
3 strains passed all filters
$ wc -l filtered_strains.txt
\s*3 .* (re)

Expand Down
Loading

0 comments on commit 556e999

Please sign in to comment.