Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ancestral, translate] node data validation improvements #1440

Merged
merged 5 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@
from .io.vcf import is_vcf as is_filename_vcf
from treetime.vcf_utils import read_vcf, write_vcf
from collections import defaultdict
from .types import ValidationMode
from .util_support.node_data_file import NodeDataObject
from .export_v2 import validation_mode_help_message

def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
marginal=False, fill_overhangs=True, infer_tips=False,
Expand Down Expand Up @@ -329,6 +332,12 @@ def register_parser(parent_subparsers):
"the gene name.")
output_group.add_argument('--output-vcf', type=str, help='name of output VCF file which will include ancestral seqs')

general_group = parser.add_argument_group(
"general",
)
general_group.add_argument('--validation-mode', type=ValidationMode, choices=[mode for mode in ValidationMode], default=ValidationMode.ERROR,
help=validation_mode_help_message)

return parser

def validate_arguments(args, is_vcf):
Expand Down Expand Up @@ -465,6 +474,9 @@ def run(args):
oh.write(f">{node.name}\n{aa_result['tt'].sequence(node, as_string=True, reconstructed=True)}\n")

out_name = get_json_name(args, '.'.join(args.alignment.split('.')[:-1]) + '_mutations.json')
# use NodeDataObject to perform validation on the file before it's written
NodeDataObject(anc_seqs, out_name, args.validation_mode)

write_json(anc_seqs, out_name)
print("ancestral mutations written to", out_name, file=sys.stdout)

Expand Down
28 changes: 15 additions & 13 deletions augur/export_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -854,6 +854,20 @@ def node_data_prop_is_normal_trait(name):

return True

validation_mode_help_message = """
Control if optional validation checks are performed and what
happens if they fail.

'error' and 'warn' modes perform validation and emit messages about
failed validation checks. 'error' mode causes a non-zero exit
status if any validation checks failed, while 'warn' does not.

'skip' mode performs no validation.

Note that some validation checks are non-optional and as such are
not affected by this setting.
"""


def register_parser(parent_subparsers):
parser = parent_subparsers.add_parser("v2", help=__doc__)
Expand Down Expand Up @@ -917,19 +931,7 @@ def register_parser(parent_subparsers):
type=ValidationMode,
choices=[mode for mode in ValidationMode],
default=ValidationMode.ERROR,
help="""
Control if optional validation checks are performed and what
happens if they fail.

'error' and 'warn' modes perform validation and emit messages about
failed validation checks. 'error' mode causes a non-zero exit
status if any validation checks failed, while 'warn' does not.

'skip' mode performs no validation.

Note that some validation checks are non-optional and as such are
not affected by this setting.
""")
help=validation_mode_help_message)
optional_settings.add_argument(
'--skip-validation',
dest="validation_mode",
Expand Down
14 changes: 11 additions & 3 deletions augur/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
from treetime.vcf_utils import read_vcf
from augur.errors import AugurError
from textwrap import dedent
from .types import ValidationMode
from .util_support.node_data_file import NodeDataObject
from .export_v2 import validation_mode_help_message

class MissingNodeError(Exception):
pass
Expand Down Expand Up @@ -335,12 +338,12 @@ def sequences_vcf(reference_fasta, vcf):
ref = compress_seq['reference']
return (sequences, ref)

def sequences_json(node_data_json, tree):
def sequences_json(node_data_json, tree, validation_mode):
"""
Extract the full nuc sequence for each node in the provided node-data JSON.
Returns a dict, keys are node names and values are a string of the genome sequence (nuc)
"""
node_data = read_node_data(node_data_json)
node_data = read_node_data(node_data_json, validation_mode=validation_mode)
if node_data is None:
raise AugurError("could not read node data (incl sequences)")
# extract sequences from node meta data
Expand Down Expand Up @@ -370,6 +373,8 @@ def register_parser(parent_subparsers):
parser.add_argument('--alignment-output', type=str, help="write out translated gene alignments. "
"If a VCF-input, a .vcf or .vcf.gz will be output here (depending on file ending). If fasta-input, specify the file name "
"like so: 'my_alignment_%%GENE.fasta', where '%%GENE' will be replaced by the name of the gene")
parser.add_argument('--validation-mode', type=ValidationMode, choices=[mode for mode in ValidationMode], default=ValidationMode.ERROR, help=validation_mode_help_message)

vcf_only = parser.add_argument_group(
title="VCF specific",
description="These arguments are only applicable if the input (--ancestral-sequences) is in VCF format."
Expand Down Expand Up @@ -440,7 +445,7 @@ def run(args):
if len(features_without_variation):
print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation)))
else:
(reference, sequences) = sequences_json(args.ancestral_sequences, tree)
(reference, sequences) = sequences_json(args.ancestral_sequences, tree, args.validation_mode)
translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if fname!='nuc'}
for fname, feat in features.items():
if fname=='nuc':
Expand Down Expand Up @@ -470,6 +475,9 @@ def run(args):

output_data = {'annotations':annotations, 'nodes':aa_muts, 'reference': reference_translations}
out_name = get_json_name(args, '.'.join(args.tree.split('.')[:-1]) + '_aa-mutations.json')
# use NodeDataObject to perform validation on the file before it's written
NodeDataObject(output_data, out_name, args.validation_mode)

write_json(output_data, out_name)
print("amino acid mutations written to", out_name, file=sys.stdout)

Expand Down
102 changes: 62 additions & 40 deletions augur/util_support/node_data_file.py
jameshadfield marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -60,45 +60,67 @@ def items(self):
return filtered_attrs.items()

def validate(self):
if self.annotations:
try:
validate_json(
self.annotations,
load_json_schema("schema-annotations.json"),
self.fname,
if self.validation_mode is ValidationMode.SKIP:
return # don't perform validation (i.e. skip it)

try:
if self.annotations:
try:
# validate_json will print any errors to STDERR
validate_json(
self.annotations,
load_json_schema("schema-annotations.json"),
self.fname,
)
except ValidateError as err:
raise ValidateError(
f"{self.fname} contains an `annotations` attribute of an invalid JSON format. Was it "
"produced by different version of augur the one you are currently using "
f" ({__version__})? Please check the program that produced that JSON file."
) from err

if not isinstance(self.nodes, dict):
raise ValidateError(
f"`nodes` value in {self.fname} is not a dictionary. Please check the formatting of this JSON!"
)
except ValidateError as err:
raise AugurError(
f"{self.fname} contains an `annotations` attribute of an invalid JSON format. Was it "
"produced by different version of augur the one you are currently using "
f" ({__version__})? Please check the program that produced that JSON file."
) from err

if not isinstance(self.nodes, dict):
raise AugurError(
f"`nodes` value in {self.fname} is not a dictionary. Please check the formatting of this JSON!"
)

if not isinstance(self.branches, dict):
raise AugurError(
f"`branches` value in {self.fname} is not a dictionary. Please check the formatting of this JSON!"
)

if not self.nodes and not self.branches:
print_err(
f"WARNING: {self.fname} has empty or nonexistent `nodes` and `branches`. Please check the formatting of this JSON!"
)

if self.validation_mode is not ValidationMode.SKIP and self.is_generated_by_incompatible_augur:
msg = (
f"Augur version incompatibility detected: the JSON {self.fname} was generated by "
f"{self.generated_by}, which is incompatible with the current augur version "
f"({__version__}). We suggest you rerun the pipeline using the current version of "
"augur."
)
if self.validation_mode is ValidationMode.ERROR:
raise AugurError(msg)
elif self.validation_mode is ValidationMode.WARN:
print_err(f"WARNING: {msg}")

if not isinstance(self.branches, dict):
raise ValidateError(
f"`branches` value in {self.fname} is not a dictionary. Please check the formatting of this JSON!"
)

if not self.nodes and not self.branches:
print_err(
f"WARNING: {self.fname} has empty or nonexistent `nodes` and `branches`. Please check the formatting of this JSON!"
)

if self.is_generated_by_incompatible_augur:
msg = (
f"Augur version incompatibility detected: the JSON {self.fname} was generated by "
f"{self.generated_by}, which is incompatible with the current augur version "
f"({__version__}). We suggest you rerun the pipeline using the current version of "
"augur."
)
raise ValidateError(msg)
except ValidateError as e:
if self.validation_mode is ValidationMode.WARN:
# string-representation of error mirrors how augur ultimately handles AugurError
print_err(f"WARNING: {e}")
else:
raise ValueError(f"unknown validation mode: {self.validation_mode!r}")
# Re-raising within an except block uses implicit chaining (via __context__);
# however, since we ultimately catch AugurError and only use its string
# representation, this isn't particularly helpful. One day we may add a
# dev-flag to allow printing the traceback or similar.
raise AugurError(*e.args)


class NodeDataObject(NodeDataFile):
    """
    In-memory counterpart of NodeDataFile.

    NodeDataObject is identical to NodeDataFile except it takes a node-data
    dict rather than loading the node data from a file. Validation runs as
    part of construction, so any validation failure surfaces when the object
    is created.

    Parameters
    ----------
    node_data_json : dict
        The node-data structure to wrap and validate.
    fname : str
        Name used to refer to this data in validation messages (typically
        the path the data is about to be written to).
    validation_mode : ValidationMode
        Stored on the instance and consulted by ``validate()``.
        Defaults to ``ValidationMode.ERROR``.
    """
    def __init__(self, node_data_json, fname, validation_mode=ValidationMode.ERROR):
        # The parent initialiser is deliberately not called: the data is
        # supplied directly instead of being loaded from a file.
        self.fname = fname
        self.validation_mode = validation_mode
        self.attrs = node_data_json
        self.validate()
2 changes: 1 addition & 1 deletion augur/util_support/node_data_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class NodeDataReader:

If a tree file is specified, it is used to verify the node names.

If validation_mode is set to :py:attr:`augur.types.ValidationMode.SKIP`, Augur version of node data files is not checked.
If validation_mode is set to :py:attr:`augur.types.ValidationMode.SKIP`, no validation is performed.
"""

def __init__(self, filenames, tree_file=None, validation_mode=ValidationMode.ERROR):
Expand Down
4 changes: 2 additions & 2 deletions augur/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -870,7 +870,7 @@ def _parse(feat):
for segment in feat.location.parts # segment: SimpleLocation
]
else:
raise AugurError(f"Encountered a genome feature with an unknown location type {type(feat.location):q}")
raise AugurError(f"Encountered a genome feature with an unknown location type '{type(feat.location)}'")
a['strand'] = {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]
a['type'] = feat.type # (unused by auspice)
if ref_seq_name:
Expand All @@ -883,6 +883,6 @@ def _parse(feat):
if fname=='nuc':
assert annotations['nuc']['strand'] == '+', "Nuc feature must be +ve strand"
elif annotations[fname]['strand'] not in ['+', '-']:
print("WARNING: Feature {fname:q} uses a strand which auspice cannot display")
print(f"WARNING: Feature '{fname}' uses a strand which auspice cannot display")

return annotations
2 changes: 2 additions & 0 deletions tests/functional/translate/cram/genes.t
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ as a feature ('nuc' in this case)
Couldn't find gene gene3 in GFF or GenBank file
Read in 2 features from reference sequence file
Validating schema of .+ (re)
Validating schema of .+ (re)
amino acid mutations written to .+ (re)

$ python3 "$SCRIPTS/diff_jsons.py" \
Expand All @@ -39,6 +40,7 @@ Using a text file rather than command line arguments
Couldn't find gene gene3 in GFF or GenBank file
Read in 2 features from reference sequence file
Validating schema of .+ (re)
Validating schema of .+ (re)
amino acid mutations written to .+ (re)

$ python3 "$SCRIPTS/diff_jsons.py" \
Expand Down
1 change: 1 addition & 0 deletions tests/functional/translate/cram/translate-with-genbank.t
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Translate amino acids for genes using a GenBank file.
WARNING: 1 CDS features skipped as they didn't have a locus_tag or gene qualifier.
Read in 3 features from reference sequence file
Validating schema of '.+nt_muts.json'... (re)
Validating schema of .* (re)
amino acid mutations written to .* (re)

$ python3 "$SCRIPTS/diff_jsons.py" $DATA/zika/aa_muts_genbank.json aa_muts.json \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
> --output-node-data aa_muts.json
Read in 3 features from reference sequence file
Validating schema of '.+/nt_muts.json'... (re)
Validating schema of .* (re)
amino acid mutations written to .* (re)

Other than the sequence ids which will include a temporary path, the JSONs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
> --output-node-data aa_muts.json
Read in 3 features from reference sequence file
Validating schema of '.+/nt_muts.json'... (re)
Validating schema of .* (re)
amino acid mutations written to .* (re)

$ python3 "${SCRIPTS}/diff_jsons.py" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ This is an identical test setup as `translate-with-gff-and-gene.t` but using loc
> --output-node-data aa_muts.json
Read in 3 features from reference sequence file
Validating schema of '.+/nt_muts.json'... (re)
Validating schema of .* (re)
amino acid mutations written to .* (re)

$ python3 "${SCRIPTS}/diff_jsons.py" \
Expand Down
1 change: 1 addition & 0 deletions tests/functional/translate/cram/vcf-with-root-mutation.t
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ to the provided reference.fasta)
> --vcf-reference "$ANC_DATA/reference.fasta" \
> --vcf-reference-output reference.fasta
Read in 3 features from reference sequence file
Validating schema of 'aa_muts.json'...
amino acid mutations written to aa_muts.json

The _reference_ produced is the actual reference, not using the mutations in the tree
Expand Down
1 change: 1 addition & 0 deletions tests/functional/translate/cram/vcf.t
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Setup
> --vcf-reference "$ANC_DATA/reference.fasta" \
> --vcf-reference-output reference.fasta
Read in 3 features from reference sequence file
Validating schema of 'aa_muts.json'...
amino acid mutations written to aa_muts.json

$ cat reference.fasta
Expand Down
Loading