Skip to content

Commit

Permalink
[ancestral, translate] validate output node-data
Browse files Browse the repository at this point in the history
By leveraging the existing validation designed for reading node-data
files we not only validate against the annotation schema but also
perform other sanity checks on the node-data structure for which a
schema does not (yet) exist.

As part of this we add a `--skip-validation` argument (following `augur
export v2`) which skips validation of the output node-data and
(translate only) skips validation of the input node-data.
  • Loading branch information
jameshadfield committed Mar 16, 2024
1 parent a6fce6f commit 34b0787
Show file tree
Hide file tree
Showing 10 changed files with 45 additions and 3 deletions.
11 changes: 11 additions & 0 deletions augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
from .io.vcf import is_vcf as is_filename_vcf
from treetime.vcf_utils import read_vcf, write_vcf
from collections import defaultdict
from .types import ValidationMode
from .util_support.node_data_file import NodeDataObject

def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
marginal=False, fill_overhangs=True, infer_tips=False,
Expand Down Expand Up @@ -329,6 +331,12 @@ def register_parser(parent_subparsers):
"the gene name.")
output_group.add_argument('--output-vcf', type=str, help='name of output VCF file which will include ancestral seqs')

general_group = parser.add_argument_group(
"general",
)
general_group.add_argument('--skip-validation', action='store_true',
help="Skip validation of input/output files. Use at your own risk!")

return parser

def validate_arguments(args, is_vcf):
Expand Down Expand Up @@ -465,6 +473,9 @@ def run(args):
oh.write(f">{node.name}\n{aa_result['tt'].sequence(node, as_string=True, reconstructed=True)}\n")

out_name = get_json_name(args, '.'.join(args.alignment.split('.')[:-1]) + '_mutations.json')
# use NodeDataObject to perform validation on the file before it's written
NodeDataObject(anc_seqs, out_name, **{'validation_mode': ValidationMode.SKIP if args.skip_validation else {}})

write_json(anc_seqs, out_name)
print("ancestral mutations written to", out_name, file=sys.stdout)

Expand Down
15 changes: 12 additions & 3 deletions augur/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
from treetime.vcf_utils import read_vcf
from augur.errors import AugurError
from textwrap import dedent
from .types import ValidationMode
from .util_support.node_data_file import NodeDataObject


class MissingNodeError(Exception):
pass
Expand Down Expand Up @@ -335,12 +338,12 @@ def sequences_vcf(reference_fasta, vcf):
ref = compress_seq['reference']
return (sequences, ref)

def sequences_json(node_data_json, tree):
def sequences_json(node_data_json, tree, skip_validation):
"""
Extract the full nuc sequence for each node in the provided node-data JSON.
Returns a dict, keys are node names and values are a string of the genome sequence (nuc)
"""
node_data = read_node_data(node_data_json)
node_data = read_node_data(node_data_json, **{'validation_mode': ValidationMode.SKIP if skip_validation else {}})
if node_data is None:
raise AugurError("could not read node data (incl sequences)")
# extract sequences from node meta data
Expand Down Expand Up @@ -370,6 +373,9 @@ def register_parser(parent_subparsers):
parser.add_argument('--alignment-output', type=str, help="write out translated gene alignments. "
"If a VCF-input, a .vcf or .vcf.gz will be output here (depending on file ending). If fasta-input, specify the file name "
"like so: 'my_alignment_%%GENE.fasta', where '%%GENE' will be replaced by the name of the gene")
parser.add_argument('--skip-validation', action='store_true',
help="Skip validation of input/output files. Use at your own risk!")

vcf_only = parser.add_argument_group(
title="VCF specific",
description="These arguments are only applicable if the input (--ancestral-sequences) is in VCF format."
Expand Down Expand Up @@ -440,7 +446,7 @@ def run(args):
if len(features_without_variation):
print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation)))
else:
(reference, sequences) = sequences_json(args.ancestral_sequences, tree)
(reference, sequences) = sequences_json(args.ancestral_sequences, tree, args.skip_validation)
translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if fname!='nuc'}
for fname, feat in features.items():
if fname=='nuc':
Expand Down Expand Up @@ -470,6 +476,9 @@ def run(args):

output_data = {'annotations':annotations, 'nodes':aa_muts, 'reference': reference_translations}
out_name = get_json_name(args, '.'.join(args.tree.split('.')[:-1]) + '_aa-mutations.json')
# use NodeDataObject to perform validation on the file before it's written
NodeDataObject(output_data, out_name, **{'validation_mode': ValidationMode.SKIP if args.skip_validation else {}})

write_json(output_data, out_name)
print("amino acid mutations written to", out_name, file=sys.stdout)

Expand Down
14 changes: 14 additions & 0 deletions augur/util_support/node_data_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,17 @@ def validate(self):
print_err(f"WARNING: {msg}")
else:
raise ValueError(f"unknown validation mode: {self.validation_mode!r}")



class NodeDataObject(NodeDataFile):
"""
NodeDataObject is identical to NodeDataFile except it takes a node-data dict
rather than loading the node data from a file
"""
def __init__(self, node_data_json, fname, validation_mode=ValidationMode.ERROR):
self.fname = fname
self.validation_mode = validation_mode
self.attrs = node_data_json
if self.validation_mode != ValidationMode.SKIP:
self.validate()
2 changes: 2 additions & 0 deletions tests/functional/translate/cram/genes.t
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ as a feature ('nuc' in this case)
Couldn't find gene gene3 in GFF or GenBank file
Read in 2 features from reference sequence file
Validating schema of .+ (re)
Validating schema of .+ (re)
amino acid mutations written to .+ (re)

$ python3 "$SCRIPTS/diff_jsons.py" \
Expand All @@ -39,6 +40,7 @@ Using a text file rather than command line arguments
Couldn't find gene gene3 in GFF or GenBank file
Read in 2 features from reference sequence file
Validating schema of .+ (re)
Validating schema of .+ (re)
amino acid mutations written to .+ (re)

$ python3 "$SCRIPTS/diff_jsons.py" \
Expand Down
1 change: 1 addition & 0 deletions tests/functional/translate/cram/translate-with-genbank.t
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Translate amino acids for genes using a GenBank file.
WARNING: 1 CDS features skipped as they didn't have a locus_tag or gene qualifier.
Read in 3 features from reference sequence file
Validating schema of '.+nt_muts.json'... (re)
Validating schema of .* (re)
amino acid mutations written to .* (re)

$ python3 "$SCRIPTS/diff_jsons.py" $DATA/zika/aa_muts_genbank.json aa_muts.json \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
> --output-node-data aa_muts.json
Read in 3 features from reference sequence file
Validating schema of '.+/nt_muts.json'... (re)
Validating schema of .* (re)
amino acid mutations written to .* (re)

Other than the sequence ids which will include a temporary path, the JSONs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
> --output-node-data aa_muts.json
Read in 3 features from reference sequence file
Validating schema of '.+/nt_muts.json'... (re)
Validating schema of .* (re)
amino acid mutations written to .* (re)

$ python3 "${SCRIPTS}/diff_jsons.py" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ This is an identical test setup as `translate-with-gff-and-gene.t` but using loc
> --output-node-data aa_muts.json
Read in 3 features from reference sequence file
Validating schema of '.+/nt_muts.json'... (re)
Validating schema of .* (re)
amino acid mutations written to .* (re)

$ python3 "${SCRIPTS}/diff_jsons.py" \
Expand Down
1 change: 1 addition & 0 deletions tests/functional/translate/cram/vcf-with-root-mutation.t
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ to the provided reference.fasta)
> --vcf-reference "$ANC_DATA/reference.fasta" \
> --vcf-reference-output reference.fasta
Read in 3 features from reference sequence file
Validating schema of 'aa_muts.json'...
amino acid mutations written to aa_muts.json

The _reference_ produced is the actual reference, not using the mutations in the tree
Expand Down
1 change: 1 addition & 0 deletions tests/functional/translate/cram/vcf.t
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Setup
> --vcf-reference "$ANC_DATA/reference.fasta" \
> --vcf-reference-output reference.fasta
Read in 3 features from reference sequence file
Validating schema of 'aa_muts.json'...
amino acid mutations written to aa_muts.json

$ cat reference.fasta
Expand Down

0 comments on commit 34b0787

Please sign in to comment.