[ancestral, translate] validate output node-data

By leveraging the existing validation designed for reading node-data files we not only validate against the annotation schema but also perform other sanity checks on the node-data structure for which a schema does not (yet) exist. As part of this we add a `--skip-validation` argument (following `augur export v2`) which skips validation of the output node-data and (translate only) skips validation of the input node-data.
nextstrain · Mar 16, 2024 · 34b0787 · 34b0787
1 parent a6fce6f
commit 34b0787
Show file tree

Hide file tree

Showing 10 changed files with 45 additions and 3 deletions.
diff --git a/augur/ancestral.py b/augur/ancestral.py
@@ -34,6 +34,8 @@
 from .io.vcf import is_vcf as is_filename_vcf
 from treetime.vcf_utils import read_vcf, write_vcf
 from collections import defaultdict
+from .types import ValidationMode
+from .util_support.node_data_file import NodeDataObject
 
 def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
                                  marginal=False, fill_overhangs=True, infer_tips=False,
@@ -329,6 +331,12 @@ def register_parser(parent_subparsers):
                         "the gene name.")
     output_group.add_argument('--output-vcf', type=str, help='name of output VCF file which will include ancestral seqs')
 
+    general_group = parser.add_argument_group(
+        "general",
+    )
+    general_group.add_argument('--skip-validation', action='store_true', 
+                        help="Skip validation of input/output files. Use at your own risk!")
+
     return parser
 
 def validate_arguments(args, is_vcf):
@@ -465,6 +473,9 @@ def run(args):
                         oh.write(f">{node.name}\n{aa_result['tt'].sequence(node, as_string=True, reconstructed=True)}\n")
 
     out_name = get_json_name(args, '.'.join(args.alignment.split('.')[:-1]) + '_mutations.json')
+    # use NodeDataObject to perform validation on the file before it's written
+    NodeDataObject(anc_seqs, out_name, **{'validation_mode': ValidationMode.SKIP if args.skip_validation else {}})
+
     write_json(anc_seqs, out_name)
     print("ancestral mutations written to", out_name, file=sys.stdout)
 

diff --git a/augur/translate.py b/augur/translate.py
@@ -22,6 +22,9 @@
 from treetime.vcf_utils import read_vcf
 from augur.errors import AugurError
 from textwrap import dedent
+from .types import ValidationMode
+from .util_support.node_data_file import NodeDataObject
+
 
 class MissingNodeError(Exception):
     pass
@@ -335,12 +338,12 @@ def sequences_vcf(reference_fasta, vcf):
     ref = compress_seq['reference']
     return (sequences, ref)
 
-def sequences_json(node_data_json, tree):
+def sequences_json(node_data_json, tree, skip_validation):
     """
     Extract the full nuc sequence for each node in the provided node-data JSON.
     Returns a dict, keys are node names and values are a string of the genome sequence (nuc)
     """
-    node_data = read_node_data(node_data_json)
+    node_data = read_node_data(node_data_json, **{'validation_mode': ValidationMode.SKIP if skip_validation else {}})
     if node_data is None:
         raise AugurError("could not read node data (incl sequences)")
     # extract sequences from node meta data
@@ -370,6 +373,9 @@ def register_parser(parent_subparsers):
     parser.add_argument('--alignment-output', type=str, help="write out translated gene alignments. "
                                    "If a VCF-input, a .vcf or .vcf.gz will be output here (depending on file ending). If fasta-input, specify the file name "
                                    "like so: 'my_alignment_%%GENE.fasta', where '%%GENE' will be replaced by the name of the gene")
+    parser.add_argument('--skip-validation', action='store_true', 
+                        help="Skip validation of input/output files. Use at your own risk!")
+
     vcf_only = parser.add_argument_group(
         title="VCF specific",
         description="These arguments are only applicable if the input (--ancestral-sequences) is in VCF format."
@@ -440,7 +446,7 @@ def run(args):
         if len(features_without_variation):
             print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation)))  
     else:
-        (reference, sequences) = sequences_json(args.ancestral_sequences, tree)
+        (reference, sequences) = sequences_json(args.ancestral_sequences, tree, args.skip_validation)
         translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if fname!='nuc'}
         for fname, feat in features.items():
             if fname=='nuc':
@@ -470,6 +476,9 @@ def run(args):
 
     output_data = {'annotations':annotations, 'nodes':aa_muts, 'reference': reference_translations}
     out_name = get_json_name(args, '.'.join(args.tree.split('.')[:-1]) + '_aa-mutations.json')
+    # use NodeDataObject to perform validation on the file before it's written
+    NodeDataObject(output_data, out_name, **{'validation_mode': ValidationMode.SKIP if args.skip_validation else {}})
+
     write_json(output_data, out_name)
     print("amino acid mutations written to", out_name, file=sys.stdout)
 

diff --git a/augur/util_support/node_data_file.py b/augur/util_support/node_data_file.py
@@ -103,3 +103,17 @@ def validate(self):
                 print_err(f"WARNING: {msg}")
             else:
                 raise ValueError(f"unknown validation mode: {self.validation_mode!r}")
+
+
+
+class NodeDataObject(NodeDataFile):
+    """
+    NodeDataObject is identical to NodeDataFile except it takes a node-data dict
+    rather than loading the node data from a file
+    """
+    def __init__(self, node_data_json, fname, validation_mode=ValidationMode.ERROR):
+        self.fname = fname
+        self.validation_mode = validation_mode
+        self.attrs = node_data_json
+        if self.validation_mode != ValidationMode.SKIP:
+            self.validate()
diff --git a/tests/functional/translate/cram/genes.t b/tests/functional/translate/cram/genes.t
@@ -18,6 +18,7 @@ as a feature ('nuc' in this case)
   Couldn't find gene gene3 in GFF or GenBank file
   Read in 2 features from reference sequence file
   Validating schema of .+ (re)
+  Validating schema of .+ (re)
   amino acid mutations written to .+ (re)
 
   $ python3 "$SCRIPTS/diff_jsons.py" \
@@ -39,6 +40,7 @@ Using a text file rather than command line arguments
   Couldn't find gene gene3 in GFF or GenBank file
   Read in 2 features from reference sequence file
   Validating schema of .+ (re)
+  Validating schema of .+ (re)
   amino acid mutations written to .+ (re)
 
   $ python3 "$SCRIPTS/diff_jsons.py" \

diff --git a/tests/functional/translate/cram/translate-with-genbank.t b/tests/functional/translate/cram/translate-with-genbank.t
@@ -15,6 +15,7 @@ Translate amino acids for genes using a GenBank file.
   WARNING: 1 CDS features skipped as they didn't have a locus_tag or gene qualifier.
   Read in 3 features from reference sequence file
   Validating schema of '.+nt_muts.json'... (re)
+  Validating schema of .* (re)
   amino acid mutations written to .* (re)
 
   $ python3 "$SCRIPTS/diff_jsons.py" $DATA/zika/aa_muts_genbank.json aa_muts.json \

diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t
@@ -20,6 +20,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
   >   --output-node-data aa_muts.json
   Read in 3 features from reference sequence file
   Validating schema of '.+/nt_muts.json'... (re)
+  Validating schema of .* (re)
   amino acid mutations written to .* (re)
 
 Other than the sequence ids which will include a temporary path, the JSONs

diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene.t b/tests/functional/translate/cram/translate-with-gff-and-gene.t
@@ -20,6 +20,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
   >   --output-node-data aa_muts.json
   Read in 3 features from reference sequence file
   Validating schema of '.+/nt_muts.json'... (re)
+  Validating schema of .* (re)
   amino acid mutations written to .* (re)
 
   $ python3 "${SCRIPTS}/diff_jsons.py" \

diff --git a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t
@@ -20,6 +20,7 @@ This is an identical test setup as `translate-with-gff-and-gene.t` but using loc
   >   --output-node-data aa_muts.json
   Read in 3 features from reference sequence file
   Validating schema of '.+/nt_muts.json'... (re)
+  Validating schema of .* (re)
   amino acid mutations written to .* (re)
 
   $ python3 "${SCRIPTS}/diff_jsons.py" \

diff --git a/tests/functional/translate/cram/vcf-with-root-mutation.t b/tests/functional/translate/cram/vcf-with-root-mutation.t
@@ -23,6 +23,7 @@ to the provided reference.fasta)
   >  --vcf-reference "$ANC_DATA/reference.fasta" \
   >  --vcf-reference-output reference.fasta
   Read in 3 features from reference sequence file
+  Validating schema of 'aa_muts.json'...
   amino acid mutations written to aa_muts.json
 
 The _reference_ produced is the actual reference, not using the mutations in the tree

diff --git a/tests/functional/translate/cram/vcf.t b/tests/functional/translate/cram/vcf.t
@@ -15,6 +15,7 @@ Setup
   >  --vcf-reference "$ANC_DATA/reference.fasta" \
   >  --vcf-reference-output reference.fasta
   Read in 3 features from reference sequence file
+  Validating schema of 'aa_muts.json'...
   amino acid mutations written to aa_muts.json
 
   $ cat reference.fasta