nextstrain · jameshadfield · Mar 19, 2024 · Mar 19, 2024 · Mar 16, 2024 · Mar 16, 2024
diff --git a/augur/ancestral.py b/augur/ancestral.py
@@ -34,6 +34,9 @@
 from .io.vcf import is_vcf as is_filename_vcf
 from treetime.vcf_utils import read_vcf, write_vcf
 from collections import defaultdict
+from .types import ValidationMode
+from .util_support.node_data_file import NodeDataObject
+from .export_v2 import validation_mode_help_message
 
 def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
                                  marginal=False, fill_overhangs=True, infer_tips=False,
@@ -329,6 +332,12 @@ def register_parser(parent_subparsers):
                         "the gene name.")
     output_group.add_argument('--output-vcf', type=str, help='name of output VCF file which will include ancestral seqs')
 
+    general_group = parser.add_argument_group(
+        "general",
+    )
+    general_group.add_argument('--validation-mode', type=ValidationMode, choices=[mode for mode in ValidationMode], default=ValidationMode.ERROR,
+                               help=validation_mode_help_message)
+
     return parser
 
 def validate_arguments(args, is_vcf):
@@ -465,6 +474,9 @@ def run(args):
                         oh.write(f">{node.name}\n{aa_result['tt'].sequence(node, as_string=True, reconstructed=True)}\n")
 
     out_name = get_json_name(args, '.'.join(args.alignment.split('.')[:-1]) + '_mutations.json')
+    # use NodeDataObject to perform validation on the file before it's written
+    NodeDataObject(anc_seqs, out_name, args.validation_mode)
+
     write_json(anc_seqs, out_name)
     print("ancestral mutations written to", out_name, file=sys.stdout)
 

diff --git a/augur/export_v2.py b/augur/export_v2.py
@@ -854,6 +854,20 @@ def node_data_prop_is_normal_trait(name):
 
     return True
 
+validation_mode_help_message = """
+    Control if optional validation checks are performed and what
+    happens if they fail.
+
+    'error' and 'warn' modes perform validation and emit messages about
+    failed validation checks.  'error' mode causes a non-zero exit
+    status if any validation checks failed, while 'warn' does not.
+
+    'skip' mode performs no validation.
+
+    Note that some validation checks are non-optional and as such are
+    not affected by this setting.
+"""
+
 
 def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("v2", help=__doc__)
@@ -917,19 +931,7 @@ def register_parser(parent_subparsers):
         type=ValidationMode,
         choices=[mode for mode in ValidationMode],
         default=ValidationMode.ERROR,
-        help="""
-            Control if optional validation checks are performed and what
-            happens if they fail.
-
-            'error' and 'warn' modes perform validation and emit messages about
-            failed validation checks.  'error' mode causes a non-zero exit
-            status if any validation checks failed, while 'warn' does not.
-
-            'skip' mode performs no validation.
-
-            Note that some validation checks are non-optional and as such are
-            not affected by this setting.
-        """)
+        help=validation_mode_help_message)
     optional_settings.add_argument(
         '--skip-validation',
         dest="validation_mode",

diff --git a/augur/translate.py b/augur/translate.py
@@ -22,6 +22,9 @@
 from treetime.vcf_utils import read_vcf
 from augur.errors import AugurError
 from textwrap import dedent
+from .types import ValidationMode
+from .util_support.node_data_file import NodeDataObject
+from .export_v2 import validation_mode_help_message
 
 class MissingNodeError(Exception):
     pass
@@ -335,12 +338,12 @@ def sequences_vcf(reference_fasta, vcf):
     ref = compress_seq['reference']
     return (sequences, ref)
 
-def sequences_json(node_data_json, tree):
+def sequences_json(node_data_json, tree, validation_mode):
     """
     Extract the full nuc sequence for each node in the provided node-data JSON.
     Returns a dict, keys are node names and values are a string of the genome sequence (nuc)
     """
-    node_data = read_node_data(node_data_json)
+    node_data = read_node_data(node_data_json, validation_mode=validation_mode)
     if node_data is None:
         raise AugurError("could not read node data (incl sequences)")
     # extract sequences from node meta data
@@ -370,6 +373,8 @@ def register_parser(parent_subparsers):
     parser.add_argument('--alignment-output', type=str, help="write out translated gene alignments. "
                                    "If a VCF-input, a .vcf or .vcf.gz will be output here (depending on file ending). If fasta-input, specify the file name "
                                    "like so: 'my_alignment_%%GENE.fasta', where '%%GENE' will be replaced by the name of the gene")
+    parser.add_argument('--validation-mode', type=ValidationMode, choices=[mode for mode in ValidationMode], default=ValidationMode.ERROR, help=validation_mode_help_message)
+
     vcf_only = parser.add_argument_group(
         title="VCF specific",
         description="These arguments are only applicable if the input (--ancestral-sequences) is in VCF format."
@@ -440,7 +445,7 @@ def run(args):
         if len(features_without_variation):
             print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation)))  
     else:
-        (reference, sequences) = sequences_json(args.ancestral_sequences, tree)
+        (reference, sequences) = sequences_json(args.ancestral_sequences, tree, args.validation_mode)
         translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if fname!='nuc'}
         for fname, feat in features.items():
             if fname=='nuc':
@@ -470,6 +475,9 @@ def run(args):
 
     output_data = {'annotations':annotations, 'nodes':aa_muts, 'reference': reference_translations}
     out_name = get_json_name(args, '.'.join(args.tree.split('.')[:-1]) + '_aa-mutations.json')
+    # use NodeDataObject to perform validation on the file before it's written
+    NodeDataObject(output_data, out_name, args.validation_mode)
+
     write_json(output_data, out_name)
     print("amino acid mutations written to", out_name, file=sys.stdout)
 

diff --git a/augur/util_support/node_data_file.py b/augur/util_support/node_data_file.py
@@ -60,45 +60,67 @@ def items(self):
         return filtered_attrs.items()
 
     def validate(self):
+        """Validate unless SKIP; a ValidateError becomes a warning in WARN mode, else AugurError."""
-        if self.annotations:
-            try:
-                validate_json(
-                    self.annotations,
-                    load_json_schema("schema-annotations.json"),
-                    self.fname,
+        if self.validation_mode is ValidationMode.SKIP:
+            return  # SKIP mode: none of the checks below are run
+
+        try:
+            if self.annotations:
+                try:
+                    # validate_json will print any errors to STDERR
+                    validate_json(
+                        self.annotations,
+                        load_json_schema("schema-annotations.json"),
+                        self.fname,
+                    )
+                except ValidateError as err:
+                    raise ValidateError(
+                        f"{self.fname} contains an `annotations` attribute of an invalid JSON format. Was it "
+                        "produced by a different version of augur than the one you are currently using "
+                        f"({__version__})? Please check the program that produced that JSON file."
+                    ) from err
+
+            if not isinstance(self.nodes, dict):
+                raise ValidateError(
+                    f"`nodes` value in {self.fname} is not a dictionary. Please check the formatting of this JSON!"
                 )
-            except ValidateError as err:
-                raise AugurError(
-                    f"{self.fname} contains an `annotations` attribute of an invalid JSON format. Was it "
-                    "produced by different version of augur the one you are currently using "
-                    f" ({__version__})? Please check the program that produced that JSON file."
-                ) from err
-
-        if not isinstance(self.nodes, dict):
-            raise AugurError(
-                f"`nodes` value in {self.fname} is not a dictionary. Please check the formatting of this JSON!"
-            )
-
-        if not isinstance(self.branches, dict):
-            raise AugurError(
-                f"`branches` value in {self.fname} is not a dictionary. Please check the formatting of this JSON!"
-            )
-
-        if not self.nodes and not self.branches:
-            print_err(
-                f"WARNING: {self.fname} has empty or nonexistent `nodes` and `branches`. Please check the formatting of this JSON!"
-            )
-
-        if self.validation_mode is not ValidationMode.SKIP and self.is_generated_by_incompatible_augur:
-            msg = (
-                f"Augur version incompatibility detected: the JSON {self.fname} was generated by "
-                f"{self.generated_by}, which is incompatible with the current augur version "
-                f"({__version__}). We suggest you rerun the pipeline using the current version of "
-                "augur."
-            )
-            if self.validation_mode is ValidationMode.ERROR:
-                raise AugurError(msg)
-            elif self.validation_mode is ValidationMode.WARN:
-                print_err(f"WARNING: {msg}")
+
+            if not isinstance(self.branches, dict):
+                raise ValidateError(
+                    f"`branches` value in {self.fname} is not a dictionary. Please check the formatting of this JSON!"
+                )
+
+            if not self.nodes and not self.branches:
+                print_err(
+                    f"WARNING: {self.fname} has empty or nonexistent `nodes` and `branches`. Please check the formatting of this JSON!"
+                )
+
+            if self.is_generated_by_incompatible_augur:
+                msg = (
+                    f"Augur version incompatibility detected: the JSON {self.fname} was generated by "
+                    f"{self.generated_by}, which is incompatible with the current augur version "
+                    f"({__version__}). We suggest you rerun the pipeline using the current version of "
+                    "augur."
+                )
+                raise ValidateError(msg)
+        except ValidateError as e:
+            if self.validation_mode is ValidationMode.WARN:
+                # string-representation of error mirrors how augur ultimately handles AugurError
+                print_err(f"WARNING: {e}")
             else:
-                raise ValueError(f"unknown validation mode: {self.validation_mode!r}")
+                # Raising inside `except` chains implicitly (via __context__); since AugurError is
+                # ultimately reported via its string representation that isn't particularly helpful.
+                # One day we may add a dev-flag to allow printing the traceback or similar.
+                raise AugurError(*e.args)
+
+
+class NodeDataObject(NodeDataFile):
+    """
+    Variant of NodeDataFile for node data already held in a dict: the parent
+    initializer is bypassed, nothing is loaded here, and validate() runs at once.
+    """
+    def __init__(self, node_data_json, fname, validation_mode=ValidationMode.ERROR):
+        self.attrs = node_data_json
+        self.validation_mode = validation_mode
+        self.fname = fname
+        self.validate()
diff --git a/augur/util_support/node_data_reader.py b/augur/util_support/node_data_reader.py
@@ -16,7 +16,7 @@ class NodeDataReader:
 
     If a tree file is specified, it is used to verify the node names.
 
-    If validation_mode is set to :py:attr:`augur.types.ValidationMode.SKIP`, Augur version of node data files is not checked.
+    If validation_mode is set to :py:attr:`augur.types.ValidationMode.SKIP` no validation is performed.
     """
 
     def __init__(self, filenames, tree_file=None, validation_mode=ValidationMode.ERROR):

diff --git a/augur/utils.py b/augur/utils.py
@@ -870,7 +870,7 @@ def _parse(feat):
                 for segment in feat.location.parts # segment: SimpleLocation
             ]
         else:
-            raise AugurError(f"Encountered a genome feature with an unknown location type {type(feat.location):q}")
+            raise AugurError(f"Encountered a genome feature with an unknown location type '{type(feat.location)}'")
         a['strand'] = {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]
         a['type'] = feat.type  # (unused by auspice)
         if ref_seq_name:
@@ -883,6 +883,6 @@ def _parse(feat):
         if fname=='nuc':
             assert annotations['nuc']['strand'] == '+', "Nuc feature must be +ve strand"
         elif annotations[fname]['strand'] not in ['+', '-']:
-            print("WARNING: Feature {fname:q} uses a strand which auspice cannot display")
+            print(f"WARNING: Feature '{fname}' uses a strand which auspice cannot display")
 
     return annotations
diff --git a/tests/functional/translate/cram/genes.t b/tests/functional/translate/cram/genes.t
@@ -18,6 +18,7 @@ as a feature ('nuc' in this case)
   Couldn't find gene gene3 in GFF or GenBank file
   Read in 2 features from reference sequence file
   Validating schema of .+ (re)
+  Validating schema of .+ (re)
   amino acid mutations written to .+ (re)
 
   $ python3 "$SCRIPTS/diff_jsons.py" \
@@ -39,6 +40,7 @@ Using a text file rather than command line arguments
   Couldn't find gene gene3 in GFF or GenBank file
   Read in 2 features from reference sequence file
   Validating schema of .+ (re)
+  Validating schema of .+ (re)
   amino acid mutations written to .+ (re)
 
   $ python3 "$SCRIPTS/diff_jsons.py" \

diff --git a/tests/functional/translate/cram/translate-with-genbank.t b/tests/functional/translate/cram/translate-with-genbank.t
@@ -15,6 +15,7 @@ Translate amino acids for genes using a GenBank file.
   WARNING: 1 CDS features skipped as they didn't have a locus_tag or gene qualifier.
   Read in 3 features from reference sequence file
   Validating schema of '.+nt_muts.json'... (re)
+  Validating schema of .* (re)
   amino acid mutations written to .* (re)
 
   $ python3 "$SCRIPTS/diff_jsons.py" $DATA/zika/aa_muts_genbank.json aa_muts.json \

diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t
@@ -20,6 +20,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
   >   --output-node-data aa_muts.json
   Read in 3 features from reference sequence file
   Validating schema of '.+/nt_muts.json'... (re)
+  Validating schema of .* (re)
   amino acid mutations written to .* (re)
 
 Other than the sequence ids which will include a temporary path, the JSONs

diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene.t b/tests/functional/translate/cram/translate-with-gff-and-gene.t
@@ -20,6 +20,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
   >   --output-node-data aa_muts.json
   Read in 3 features from reference sequence file
   Validating schema of '.+/nt_muts.json'... (re)
+  Validating schema of .* (re)
   amino acid mutations written to .* (re)
 
   $ python3 "${SCRIPTS}/diff_jsons.py" \

diff --git a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t
@@ -20,6 +20,7 @@ This is an identical test setup as `translate-with-gff-and-gene.t` but using loc
   >   --output-node-data aa_muts.json
   Read in 3 features from reference sequence file
   Validating schema of '.+/nt_muts.json'... (re)
+  Validating schema of .* (re)
   amino acid mutations written to .* (re)
 
   $ python3 "${SCRIPTS}/diff_jsons.py" \

diff --git a/tests/functional/translate/cram/vcf-with-root-mutation.t b/tests/functional/translate/cram/vcf-with-root-mutation.t
@@ -23,6 +23,7 @@ to the provided reference.fasta)
   >  --vcf-reference "$ANC_DATA/reference.fasta" \
   >  --vcf-reference-output reference.fasta
   Read in 3 features from reference sequence file
+  Validating schema of 'aa_muts.json'...
   amino acid mutations written to aa_muts.json
 
 The _reference_ produced is the actual reference, not using the mutations in the tree

diff --git a/tests/functional/translate/cram/vcf.t b/tests/functional/translate/cram/vcf.t
@@ -15,6 +15,7 @@ Setup
   >  --vcf-reference "$ANC_DATA/reference.fasta" \
   >  --vcf-reference-output reference.fasta
   Read in 3 features from reference sequence file
+  Validating schema of 'aa_muts.json'...
   amino acid mutations written to aa_muts.json
 
   $ cat reference.fasta