nextstrain · corneliusroemer · Nov 9, 2024 · Nov 9, 2024 · Nov 9, 2024 · Nov 9, 2024
diff --git a/CHANGES.md b/CHANGES.md
@@ -11,10 +11,13 @@
 
 * index: Previously specifying a directory that does not exist in the path to `--output` would result in an incorrect error stating that the input file does not exist. It now shows the correct path responsible for the error. [#1644][] (@victorlin)
 * curate format-dates: Update help docs and improve failure messages to show use of `--expected-date-formats`. [#1653][] (@joverlee521)
+* tests: Fix false negative tests due to incorrect use of DeepDiff's `--exclude-regex-paths` option. [#1665][] [#1666][] (@corneliusroemer)
 
 [#1644]: https://github.com/nextstrain/augur/issues/1644
 [#1653]: https://github.com/nextstrain/augur/pull/1653
 [#1656]: https://github.com/nextstrain/augur/pull/1656
+[#1665]: https://github.com/nextstrain/augur/issues/1665
+[#1666]: https://github.com/nextstrain/augur/pull/1666
 
 ## 26.0.0 (17 September 2024)
 

diff --git a/docs/contribute/DEV_DOCS.md b/docs/contribute/DEV_DOCS.md
@@ -89,6 +89,8 @@ To compare JSON outputs with stochastic numerical values, use `scripts/diff_json
 
 Both tree and JSON comparison scripts rely on [deepdiff](https://deepdiff.readthedocs.io/en/latest/) for underlying comparisons.
 
+When using `diff_jsons.py`'s `--exclude-regex-paths` argument, make sure to escape any special regex characters, in particular square brackets: write `\['seqid'\]`, not `['seqid']`. See [#1665](https://github.com/nextstrain/augur/issues/1665) for what happens if you don't.
+
 #### When to use which type of test
 
 1. Unit tests should be used for the [public API](https://docs.nextstrain.org/projects/augur/en/stable/api/public/index.html).

diff --git a/scripts/diff_jsons.py b/scripts/diff_jsons.py
@@ -3,6 +3,7 @@
 import argparse
 import deepdiff
 import json
+import re
 
 from augur.argparse_ import ExtendOverwriteDefault
 
@@ -15,12 +16,22 @@
     parser.add_argument("first_json", help="first JSON to compare")
     parser.add_argument("second_json", help="second JSON to compare")
     parser.add_argument("--significant-digits", type=int, default=5, help="number of significant digits to use when comparing numeric values")
-    parser.add_argument("--exclude-paths", nargs="+", action=ExtendOverwriteDefault, help="list of paths to exclude from consideration when performing a diff", default=["root['generated_by']['version']"])
+    parser.add_argument("--exclude-paths", nargs="+", action=ExtendOverwriteDefault, help="list of paths to exclude from consideration when performing a diff", default=["root['generated_by']", "root['meta']['updated']"])
     parser.add_argument("--exclude-regex-paths", nargs="+", action="extend", help="list of path regular expressions to exclude from consideration when performing a diff")
     parser.add_argument("--ignore-numeric-type-changes", action="store_true", help="ignore numeric type changes in the diff (e.g., int of 1 to float of 1.0)")
 
     args = parser.parse_args()
 
+    # Test for most fatal errors in regex path usage
+    # Exclude regexes should never match `'`, otherwise the diff is always going to pass
+    for regex in args.exclude_regex_paths or []:
+        result = re.compile(regex).search("'")
+        if result is not None:
+            raise Exception(
+                f"Exclude regex {regex} matches `'` which means this diff will always pass which is probably not what you want.\n"
+                "You probably forgot to escape something in your regex. See for example: https://stackoverflow.com/a/79173188/7483211"
+            )
+
     with open(args.first_json, "r") as fh:
         first_json = json.load(fh)
 

diff --git a/tests/functional/ancestral/cram/ambiguous-positions.t b/tests/functional/ancestral/cram/ambiguous-positions.t
@@ -36,7 +36,7 @@ Setup
 
   $ python3 "$SCRIPTS/diff_jsons.py" \
   >   expected.json "nt_muts.json" \
-  >   --exclude-regex-paths "root\['annotations'\]" "root\['generated_by'\]" "root\['reference'\]"
+  >   --exclude-paths "annotations" "generated_by" "reference"
   {}
 
   $ python3 "$SCRIPTS/compare-json-vcf.py" \

diff --git a/tests/functional/ancestral/cram/case-sensitive.t b/tests/functional/ancestral/cram/case-sensitive.t
@@ -15,8 +15,7 @@ Change the _reference_ to lowercase
 
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
   >   "$TESTDIR/../data/simple-genome/nt_muts.ref-seq.json" \
-  >   "nt_muts.ref-seq.json" \
-  >   --exclude-paths "root['generated_by']"
+  >   "nt_muts.ref-seq.json" 
   {}
 
 
@@ -37,6 +36,5 @@ be lowecase which will be compared against the uppercase reference
 
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
   >   "$TESTDIR/../data/simple-genome/nt_muts.ref-seq.json" \
-  >   "nt_muts.ref-seq.json" \
-  >   --exclude-paths "root['generated_by']"
+  >   "nt_muts.ref-seq.json"
   {}
diff --git a/tests/functional/ancestral/cram/general.t b/tests/functional/ancestral/cram/general.t
@@ -19,8 +19,7 @@ node-data JSON we diff against.
 
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
   >   "$TESTDIR/../data/simple-genome/nt_muts.ref-seq.json" \
-  >   "nt_muts.ref-seq.json" \
-  >   --exclude-paths "root['generated_by']"
+  >   "nt_muts.ref-seq.json"
   {}
 
 Same as above but without providing a `--root-sequence`. The effect of this on behaviour is:
@@ -39,6 +38,5 @@ mutations (as there's nothing to compare the root node to)
 
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
   >   "$TESTDIR/../data/simple-genome/nt_muts.no-ref-seq.json" \
-  >   "nt_muts.no-ref-seq.json" \
-  >   --exclude-paths "root['generated_by']"
+  >   "nt_muts.no-ref-seq.json"
   {}
diff --git a/tests/functional/ancestral/cram/infer-amino-acid-sequences-with-root-sequence.t b/tests/functional/ancestral/cram/infer-amino-acid-sequences-with-root-sequence.t
@@ -20,7 +20,7 @@ ancestor).
 Check that the reference length was correctly exported as the nuc annotation
 
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
-  >   --exclude-regex-paths "['seqid']" -- \
+  >   --exclude-regex-paths "\['seqid'\]" -- \
   >   "$TESTDIR/../data/ancestral_mutations_with_root_sequence.json" \
   >   "$CRAMTMP/$TESTFILE/ancestral_mutations.json"
   {}
diff --git a/tests/functional/ancestral/cram/vcf-multi-allele.t b/tests/functional/ancestral/cram/vcf-multi-allele.t
@@ -24,7 +24,7 @@ See <https://github.com/nextstrain/augur/issues/1380> for the bug this is testin
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
   >   "$DATA/nt_muts.ref-seq.json" \
   >   nt_muts.json \
-  >   --exclude-regex-paths "root\['nodes'\]\['.+'\]\['sequence'\]" "root\['generated_by'\]"
+  >   --exclude-regex-paths "root\['nodes'\]\['.+'\]\['sequence'\]"
   {'iterable_item_added': {"root['nodes']['sample_B']['muts'][0]": 'A30G'}}
 
   $ cat > expected.vcf <<EOF

diff --git a/tests/functional/ancestral/cram/vcf.t b/tests/functional/ancestral/cram/vcf.t
@@ -21,7 +21,7 @@ but it will have the reference sequence attached.
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
   >   "$DATA/nt_muts.ref-seq.json" \
   >   "nt_muts.vcf-input.ref-seq.json" \
-  >   --exclude-regex-paths "root\['nodes'\]\['.+'\]\['sequence'\]" "root\['generated_by'\]"
+  >   --exclude-regex-paths "root\['nodes'\]\['.+'\]\['sequence'\]"
   {}
 
 Here's the same mutations as in $DATA/nt_muts.ref-seq.json,

diff --git a/tests/functional/ancestral/data/ancestral_mutations_with_root_sequence.json b/tests/functional/ancestral/data/ancestral_mutations_with_root_sequence.json
diff --git a/tests/functional/clades/cram/membership-and-label.t b/tests/functional/clades/cram/membership-and-label.t
@@ -13,6 +13,5 @@ Test custom membership key + label key. The only change should be the key names
 
   $ cat clades_custom.json | sed "s/lineage/clade_membership/" | sed "s/origin/clade/" > clades_sed.json
 
-  $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/clades.json" clades_sed.json \
-  >   --exclude-paths "root['generated_by']"
+  $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/clades.json" clades_sed.json
   {}
diff --git a/tests/functional/clades/cram/no-label.t b/tests/functional/clades/cram/no-label.t
@@ -11,6 +11,5 @@ Test the ability to _not_ export a branch label (same logic as not exporting the
   >   --label-name none \
   >   --output-node-data clades_no-labels.json &>/dev/null
 
-  $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/clades.json" clades_no-labels.json \
-  >   --exclude-paths "root['generated_by']"
+  $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/clades.json" clades_no-labels.json
   {'dictionary_item_removed': [root['branches']]}
diff --git a/tests/functional/clades/cram/root-clade-identification.t b/tests/functional/clades/cram/root-clade-identification.t
@@ -12,8 +12,7 @@ This is an oversight and ideally would be fixed
   >   --clades "$TESTDIR/../data/toy_clades_nuc.tsv" \
   >   --output-node-data toy_clades_1.json &>/dev/null
 
-  $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/toy_clades_1.json" toy_clades_1.json \
-  >   --exclude-paths "root['generated_by']"
+  $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/toy_clades_1.json" toy_clades_1.json
   {}
 
 A clade which exists at the root is identified (and correctly propogated) if the root sequence
@@ -25,8 +24,7 @@ is explicitly set.
   >   --clades "$TESTDIR/../data/toy_clades_nuc.tsv" \
   >   --output-node-data toy_clades_2a.json &>/dev/null
 
-  $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/toy_clades_2.json" toy_clades_2a.json \
-  >   --exclude-paths "root['generated_by']"
+  $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/toy_clades_2.json" toy_clades_2a.json
   {}
 
 A clade which exists at the root is identified (and correctly propogated) without a root sequence
@@ -38,6 +36,5 @@ if the (branch leading to the) root has the clade-defining mutation.
   >   --clades "$TESTDIR/../data/toy_clades_nuc.tsv" \
   >   --output-node-data toy_clades_2b.json &>/dev/null
 
-  $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/toy_clades_2.json" toy_clades_2b.json \
-  >   --exclude-paths "root['generated_by']"
+  $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/toy_clades_2.json" toy_clades_2b.json
   {}
diff --git a/tests/functional/refine/cram/not-timetree-mutations-per-site.t b/tests/functional/refine/cram/not-timetree-mutations-per-site.t
@@ -28,5 +28,5 @@ Confirm that trees match expected topology and branch lengths, given that the ou
   >   "$TESTDIR/../data/mutations_per_site_branch_lengths.json" \
   >   branch_lengths.json \
   >   --significant-digits 0 \
-  >   --exclude-paths "root['generated_by']['version']" "root['input_tree']"
+  >   --exclude-paths "generated_by" "input_tree"
   {}
diff --git a/tests/functional/refine/cram/not-timetree-mutations.t b/tests/functional/refine/cram/not-timetree-mutations.t
@@ -29,5 +29,5 @@ Confirm that trees match expected topology and branch lengths, given that the ou
   >   "$TESTDIR/../data/integer_branch_lengths.json" \
   >   branch_lengths.json \
   >   --significant-digits 0 \
-  >   --exclude-paths "root['generated_by']['version']" "root['input_tree']" "root['alignment']"
+  >   --exclude-paths "generated_by" "input_tree" "alignment"
   {}
diff --git a/tests/functional/translate/cram/general.t b/tests/functional/translate/cram/general.t
@@ -18,7 +18,7 @@ which validate the output will fail as it's missing a 'nuc' annotation.
   $ python3 "$SCRIPTS/diff_jsons.py" \
   >   "$DATA/aa_muts.json" \
   >   "aa_muts.json" \
-  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" "root['meta']['updated']" 
+  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
   {}
 
 Same as above but using a GenBank file. This changes the 'type' of the annotations,
@@ -33,5 +33,5 @@ but this is irrelevant for Auspice's use and simply reflects the reference sourc
   $ python3 "$SCRIPTS/diff_jsons.py" \
   >   "$DATA/aa_muts.json" \
   >   "aa_muts.genbank.json" \
-  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" "root\['annotations'\]\['.+'\]\['type'\]" "root['meta']['updated']" 
+  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['(seqid|type)'\]"
   {}
diff --git a/tests/functional/translate/cram/genes.t b/tests/functional/translate/cram/genes.t
@@ -21,9 +21,9 @@ as a feature ('nuc' in this case)
   amino acid mutations written to .+ (re)
 
   $ python3 "$SCRIPTS/diff_jsons.py" \
-  >   "$DATA/aa_muts.json" \
-  >   "aa_muts.genes-args.json" \
-  >   --exclude-regex-paths "seqid" "gene1" "root['meta']['updated']"
+  >  "$DATA/aa_muts.json" \
+  >  "aa_muts.genes-args.json" \
+  >  --exclude-regex-paths "seqid" "gene1"
   {}
 Using a text file rather than command line arguments
 
@@ -43,7 +43,6 @@ Using a text file rather than command line arguments
   amino acid mutations written to .+ (re)
 
   $ python3 "$SCRIPTS/diff_jsons.py" \
-  >   "aa_muts.genes-args.json" \
-  >   "aa_muts.genes-txt.json" \
-  > --exclude-paths "root['meta']['updated']"
+  >  "aa_muts.genes-args.json" \
+  >  "aa_muts.genes-txt.json"
   {}
diff --git a/tests/functional/translate/cram/gff.t b/tests/functional/translate/cram/gff.t
@@ -77,7 +77,7 @@ GFF file with 'region' removed, so the only genome information is the ##sequence
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
   >   "$DATA/aa_muts.json" \
   >   "aa_muts.pragma-only.json" \
-  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" "root['meta']['updated']"
+  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
   {}
 
 GFF file with no genome coordinate information

diff --git a/tests/functional/translate/cram/root-mutations.t b/tests/functional/translate/cram/root-mutations.t
@@ -27,5 +27,5 @@ is unchanged (MPCG*). There is also a mutation E4G at the root node to compensat
   $ python3 "$SCRIPTS/diff_jsons.py" \
   >   "$DATA/aa_muts.json" \
   >   "aa_muts.json" \
-  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" "root['meta']['updated']"
+  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
   {'values_changed': {"root['reference']['gene1']": {'new_value': 'MPCE*', 'old_value': 'MPCG*'}}, 'iterable_item_added': {"root['nodes']['node_root']['aa_muts']['gene1'][0]": 'E4G'}}
diff --git a/tests/functional/translate/cram/translate-with-genbank.t b/tests/functional/translate/cram/translate-with-genbank.t
@@ -18,5 +18,5 @@ Translate amino acids for genes using a GenBank file.
   amino acid mutations written to .* (re)
 
   $ python3 "$SCRIPTS/diff_jsons.py" $DATA/zika/aa_muts_genbank.json aa_muts.json \
-  >  --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" "root['meta']['updated']"
+  >  --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
   {}
diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t
@@ -26,7 +26,7 @@ Other than the sequence ids which will include a temporary path, the JSONs
 should be identical.
 
   $ python3 "${SCRIPTS}/diff_jsons.py" \
-  >  --exclude-regex-paths "['seqid']" -- \
+  >  --exclude-regex-paths "\['seqid'\]" -- \
   >  "${DATA}/zika/aa_muts_gff.json" \
   >  aa_muts.json
   {}
diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene.t b/tests/functional/translate/cram/translate-with-gff-and-gene.t
@@ -23,7 +23,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
   amino acid mutations written to .* (re)
 
   $ python3 "${SCRIPTS}/diff_jsons.py" \
-  >  --exclude-regex-paths "['seqid']" -- \
+  >  --exclude-regex-paths "\['seqid'\]" -- \
   >  "${DATA}/zika/aa_muts_gff.json" \
   >  aa_muts.json
   {}
diff --git a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t
@@ -23,7 +23,7 @@ This is an identical test setup as `translate-with-gff-and-gene.t` but using loc
   amino acid mutations written to .* (re)
 
   $ python3 "${SCRIPTS}/diff_jsons.py" \
-  >  --exclude-regex-paths "['seqid']" -- \
+  >  --exclude-regex-paths "\['seqid'\]" -- \
   >  "${DATA}/zika/aa_muts_gff.json" \
   >  aa_muts.json
   {}
diff --git a/tests/functional/translate/cram/vcf-with-root-mutation.t b/tests/functional/translate/cram/vcf-with-root-mutation.t
@@ -43,5 +43,5 @@ node is E (and so are all the other nodes))
   $ python3 "$SCRIPTS/diff_jsons.py" \
   >   aa_muts.truth.json \
   >   aa_muts.json \
-  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" "root['meta']['updated']"
+  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
   {}
diff --git a/tests/functional/translate/cram/vcf.t b/tests/functional/translate/cram/vcf.t
@@ -26,7 +26,7 @@ Setup
   $ python3 "$SCRIPTS/diff_jsons.py" \
   >   "$DATA/aa_muts.json" \
   >   aa_muts.json \
-  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" "root['meta']['updated']"
+  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
   {}
 
 ------------------------------   MISSING TEST   ----------------------------------

diff --git a/tests/functional/translate/data/zika/aa_muts_gff.json b/tests/functional/translate/data/zika/aa_muts_gff.json
@@ -13,11 +13,18 @@
       "start": 457,
       "strand": "+",
       "type": "gene"
+    },
+    "nuc": {
+      "end": 10769,
+      "seqid": "genemap.gff",
+      "start": 1,
+      "strand": "+",
+      "type": "##sequence-region pragma"
     }
   },
   "generated_by": {
     "program": "augur",
-    "version": "16.0.3"
+    "version": "26.0.0"
   },
   "nodes": {
     "BRA/2016/FC_6706": {
@@ -91,7 +98,10 @@
       }
     },
     "NODE_0000006": {
-      "aa_muts": {},
+      "aa_muts": {
+        "CA": [],
+        "PRO": []
+      },
       "aa_sequences": {
         "CA": "MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRMVLAILAFLRFTAIKPSLGLINRWGSVGKKEAMEIIKKFKKDLAAMLRIINARKEKKRRGADTSVGIVGLLLTTAMA",
         "PRO": "AEVTRRGSAYYMYLDRNDAGEAISFPTTLGMNKCYIQIMDLGHMCDATMSYECPMLDEGVEPDDVDCWCNTTSTWVVYGTCHHKKGEARRSRR"