nextstrain · corneliusroemer · Oct 24, 2023 · victorlin · Oct 26, 2023 · victorlin
diff --git a/CHANGES.md b/CHANGES.md
@@ -2,6 +2,9 @@
 
 ## __NEXT__
 
+### Features
+
+* Add `clades2` command that infers clades at internal and external nodes based on a tree and metadata. Currently a stripped down version of `traits` but that is an implementation detail.  (@corneliusroemer)
 
 ## 23.1.0 (22 September 2023)
 

diff --git a/augur/__init__.py b/augur/__init__.py
@@ -31,6 +31,7 @@
     "translate",
     "reconstruct_sequences",
     "clades",
+    "clades2",
     "traits",
     "sequence_traits",
     "lbi",

diff --git a/augur/clades2.py b/augur/clades2.py
@@ -0,0 +1,137 @@
+"""
+Infer clades at internal nodes from metadata
+"""
+
+from collections import defaultdict
+import sys
+
+from Bio import Phylo
+import numpy as np
+from treetime.wrappers import reconstruct_discrete_traits
+
+from .errors import AugurError
+from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata
+from .utils import write_json
+
+def mugration_inference(tree=None, seq_meta=None, field=None, missing='?'):
+    """
+    Infer likely ancestral states of a discrete character assuming a time reversible model.
+
+    Parameters
+    ----------
+    tree : str
+        name of tree file
+    seq_meta : pandas.DataFrame
+        meta data associated with sequences
+    field : str, optional
+        meta data field to use
+    missing : str, optional
+        character that is to be interpreted as missing data, default='?'
+
+    Returns
+    -------
+    T : Bio.Phylo.BaseTree.Tree
+        Biophyton tree
+    """
+
+    T = Phylo.read(tree, 'newick')
+    traits = {}
+    nodes = {n.name:n for n in T.get_terminals()}
+    for name, meta in seq_meta.iterrows():
+        if field in meta and name in nodes and meta[field] != missing:
+            traits[name] = meta[field]
+    unique_states = list(set(traits.values()))
+
+    if len(unique_states)==0:
+        print("WARNING: no states found for clade reconstruction. Using `Unassigned`.", file=sys.stderr)
+        for node in T.find_clades():
+            node.__setattr__(field, "Unassigned")
+        return T
+    elif len(unique_states)==1:
+        print("WARNING: only one state found for clade reconstruction:", unique_states, file=sys.stderr)
+        for node in T.find_clades():
+            node.__setattr__(field, unique_states[0])
+        return T
+    elif len(unique_states)<300:
+        tt, letter_to_state, reverse_alphabet = reconstruct_discrete_traits(T, traits, missing_data=missing)
+    else:
+        raise AugurError("ERROR: 300 or more distinct discrete states found. TreeTime is currently not set up to handle that many states.")
+
+    if tt is None:
+        raise AugurError("ERROR in discrete state reconstruction in TreeTime. Please look for errors above.")
+
+    # attach inferred states as e.g. node.region = 'africa'
+    for node in tt.tree.find_clades():
+        node.__setattr__(field, letter_to_state[node.cseq[0]])
+
+    return tt.tree
+
+
+def register_parser(parent_subparsers):
+    parser = parent_subparsers.add_parser("clades2", help=__doc__)
+    parser.add_argument('--tree', '-t', required=True, help="tree to assign clades on")
+    parser.add_argument('--metadata', required=True, metavar="FILE", help="table with metadata")
+    parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+",
+                        help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
+    parser.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+",
+                        help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
+    parser.add_argument('--clade-column', required=True, help='column name of metadata field to use for clade inference')
+    parser.add_argument('--output-field-name', type=str, default='clade_membership', help='name of field to save clade inferences to')
+    parser.add_argument('--output-node-data', required=True, type=str, help='name of JSON file to save clade inferences to')
+    parser.epilog = "Note that missing data must be represented by a `?` character. Missing data will currently be inferred."
+    return parser
+
+
+def run(args):
+    """run clade inference
+
+    Parameters
+    ----------
+    args : argparse.Namespace
+        command line arguments are parsed by argparse
+    """
+    tree_fname = args.tree
+    try:
+        traits = read_metadata(
+            args.metadata,
+            delimiters=args.metadata_delimiters,
+            id_columns=args.metadata_id_columns)
+    except InvalidDelimiter:
+        raise AugurError(
+                f"Could not determine the delimiter of {args.metadata!r}. "
+                f"Valid delimiters are: {args.metadata_delimiters!r}. "
+                "This can be changed with --metadata-delimiters."
+            )
+
+    T = Phylo.read(tree_fname, 'newick')
+    missing_internal_node_names = [n.name is None for n in T.get_nonterminals()]
+    if np.all(missing_internal_node_names):
+        print("\n*** WARNING: Tree has no internal node names!", file=sys.stderr)
+        print("*** Without internal node names, ancestral traits can't be linked up to the correct node later.", file=sys.stderr)
+        print("*** If you want to use 'augur export' later, re-run this command with the output of 'augur refine'.", file=sys.stderr)
+        print("*** If you haven't run 'augur refine', you can add node names to your tree by running:", file=sys.stderr)
+        print("*** augur refine --tree %s --output-tree <filename>.nwk" % (tree_fname), file=sys.stderr)
+        print("*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'", file=sys.stderr)
+
+
+    mugration_states = defaultdict(dict)
+
+    from treetime import version as treetime_version
+    print(f"augur clades2 is using TreeTime version {treetime_version}")
+
+    T= mugration_inference(tree=tree_fname, seq_meta=traits, field=args.clade_column)
+
+    if T is None: # something went wrong
+        raise AugurError("TreeTime failed to infer ancestral states. Something went wrong.")
+
+    for node in T.find_clades():
+        mugration_states[node.name][args.output_field_name] = getattr(node, args.clade_column)
+
+
+    write_json({"nodes":mugration_states},args.output_node_data)
+
+    print("\nInferred ancestral states of clade character using TreeTime:"
+          "\n\tSagulenko et al. TreeTime: Maximum-likelihood phylodynamic analysis"
+          "\n\tVirus Evolution, vol 4, https://academic.oup.com/ve/article/4/1/vex042/4794731\n", file=sys.stdout)
+
+    print("results written to", args.output_node_data, file=sys.stdout)
diff --git a/tests/functional/clades2.t b/tests/functional/clades2.t
@@ -0,0 +1,28 @@
+Integration tests for augur clades2.
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ export AUGUR="${AUGUR:-../../bin/augur}"
+
+Infer clades
+
+  $ ${AUGUR} clades2 \
+  >  --metadata "clades2/metadata.tsv" \
+  >  --tree "clades2/tree.nwk" \
+  >  --clade-column clade \
+  >  --output-node-data "$TMP/clades.json" > /dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py" "clades2/clades.json" "$TMP/clades.json"
+  {}
+  $ rm -f "$TMP/clades.json"
+
+Infer clades where some tips are not in metadata
+
+  $ ${AUGUR} clades2 \
+  >  --metadata "clades2/partial_metadata.tsv" \
+  >  --tree "clades2/tree.nwk" \
+  >  --clade-column clade \
+  >  --output-node-data "$TMP/clades.json" > /dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py" "clades2/clades.json" "$TMP/clades.json"
+  {}
+  $ rm -f "$TMP/clades.json"
diff --git a/tests/functional/clades2/clades.json b/tests/functional/clades2/clades.json
@@ -0,0 +1,62 @@
+{
+  "generated_by": {
+    "program": "augur",
+    "version": "23.1.0"
+  },
+  "nodes": {
+    "BRA/2016/FC_6706": {
+      "clade_membership": "B"
+    },
+    "COL/FLR_00008/2015": {
+      "clade_membership": "A.1"
+    },
+    "Colombia/2016/ZC204Se": {
+      "clade_membership": "A"
+    },
+    "DOM/2016/BB_0183": {
+      "clade_membership": "B.1"
+    },
+    "EcEs062_16": {
+      "clade_membership": "B.1"
+    },
+    "HND/2016/HU_ME59": {
+      "clade_membership": "B.2"
+    },
+    "NODE_0000001": {
+      "clade_membership": "A"
+    },
+    "NODE_0000002": {
+      "clade_membership": "B.1"
+    },
+    "NODE_0000003": {
+      "clade_membership": "B.1"
+    },
+    "NODE_0000004": {
+      "clade_membership": "B.2"
+    },
+    "NODE_0000005": {
+      "clade_membership": "B"
+    },
+    "NODE_0000006": {
+      "clade_membership": "A"
+    },
+    "NODE_0000007": {
+      "clade_membership": "A.1"
+    },
+    "NODE_0000008": {
+      "clade_membership": "A.1"
+    },
+    "PAN/CDC_259359_V1_V3/2015": {
+      "clade_membership": "A.1"
+    },
+    "PRVABC59": {
+      "clade_membership": "B.2"
+    },
+    "VEN/UF_1/2016": {
+      "clade_membership": "A.1"
+    },
+    "ZKC2/2016": {
+      "clade_membership": "B.1"
+    }
+  }
+}
diff --git a/tests/functional/clades2/metadata.tsv b/tests/functional/clades2/metadata.tsv
@@ -0,0 +1,11 @@
+strain	clade
+BRA/2016/FC_6706	B
+COL/FLR_00008/2015	A.1
+Colombia/2016/ZC204Se	A
+DOM/2016/BB_0183	B.1
+EcEs062_16	?
+HND/2016/HU_ME59	B.2
+PAN/CDC_259359_V1_V3/2015	?
+PRVABC59	?
+VEN/UF_1/2016	?
+ZKC2/2016	?
diff --git a/tests/functional/clades2/partial_metadata.tsv b/tests/functional/clades2/partial_metadata.tsv
@@ -0,0 +1,9 @@
+strain	clade
+BRA/2016/FC_6706	B
+COL/FLR_00008/2015	A.1
+Colombia/2016/ZC204Se	A
+DOM/2016/BB_0183	B.1
+EcEs062_16	?
+PAN/CDC_259359_V1_V3/2015	?
+VEN/UF_1/2016	?
+ZKC2/2016	?
diff --git a/tests/functional/clades2/tree.nwk b/tests/functional/clades2/tree.nwk
@@ -0,0 +1 @@
+((Colombia/2016/ZC204Se:0.00105368,(PAN/CDC_259359_V1_V3/2015:0.00076051,(COL/FLR_00008/2015:0.00044440,VEN/UF_1/2016:0.00089377)NODE_0000008:0.00038502)NODE_0000007:0.00019253)NODE_0000001:0.00080159,(BRA/2016/FC_6706:0.00214920,(ZKC2/2016:0.00173693,(HND/2016/HU_ME59:0.00206150,PRVABC59:0.00135309)NODE_0000004:0.00013537,(EcEs062_16:0.00175918,DOM/2016/BB_0183:0.00184905)NODE_0000002:0.00021565)NODE_0000003:0.00013737)NODE_0000005:0.00019772)NODE_0000006:0.00100000;