Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP [refine] unique internal node names #1451

Draft
wants to merge 3 commits into
base: james/export-multitree
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 37 additions & 9 deletions augur/export_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,24 @@ def order_nodes(node):
return od


def read_trees(filenames):
trees = [Phylo.read(fname, 'newick') for fname in filenames]
# augur export requires unique node names (both terminal and external) as these
# are used to associate metadata/node-data with nodes. Any duplication is fatal.
# The exception to this is unlabelled node names, which auspice will handle but
# won't be associated with any metadata within export.
node_names = [clade.name for tree in trees for clade in tree.root.find_clades()]
if None in node_names:
raise AugurError(f"Tree contains unnamed nodes. If these are internal nodes you may wish to run "+
"`augur refine --tree <newick> --output-tree <newick>` to name them.")
if len(set(node_names))!=len(node_names):
from collections import Counter
dups = [name for name, count in Counter(node_names).items() if count>1]
raise AugurError(f"{len(dups)} node names occur multiple times in the tree: " +
", ".join([f"'{v}'" for v in dups[0:5]]) + ("..." if len(dups)>5 else ""))
return (trees, node_names)


def node_div(T, node_attrs):
"""
Scans the provided tree & metadata to see if divergence is defined, and if so returns
Expand Down Expand Up @@ -732,7 +750,12 @@ def _recursively_set_data(node):
node['branch_attrs'] = branch_attrs[node['name']]
for child in node.get("children", []):
_recursively_set_data(child)
_recursively_set_data(data_json["tree"])

if isinstance(data_json["tree"], list):
for subtree in data_json['tree']:
_recursively_set_data(subtree)
else:
_recursively_set_data(data_json["tree"])


def set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns):
Expand Down Expand Up @@ -821,7 +844,11 @@ def _recursively_set_data(node):
for child in node.get("children", []):
_recursively_set_data(child)

_recursively_set_data(data_json["tree"])
if isinstance(data_json["tree"], list):
for subtree in data_json['tree']:
_recursively_set_data(subtree)
else:
_recursively_set_data(data_json["tree"])

def node_data_prop_is_normal_trait(name):
# those traits / keys / attrs which are not "special" and can be exported
Expand Down Expand Up @@ -875,7 +902,7 @@ def register_parser(parent_subparsers):
required = parser.add_argument_group(
title="REQUIRED"
)
required.add_argument('--tree','-t', metavar="newick", required=True, help="Phylogenetic tree, usually output from `augur refine`")
required.add_argument('--tree','-t', metavar="newick", nargs='+', required=True, help="Phylogenetic tree(s), usually output from `augur refine`")
required.add_argument('--output', metavar="JSON", required=True, help="Output file (typically for visualisation in auspice)")

config = parser.add_argument_group(
Expand Down Expand Up @@ -1059,12 +1086,12 @@ def create_branch_labels(branch_attrs, node_data, branch_data):
continue
branch_attrs[node_name]["labels"][label_key] = label_value

def parse_node_data_and_metadata(T, node_data, metadata):
def parse_node_data_and_metadata(node_names, node_data, metadata):
node_data_names = set()
metadata_names = set()

# assign everything to node_attrs, exclusions considered later
node_attrs = {clade.name: {} for clade in T.root.find_clades()}
node_attrs = {name: {} for name in node_names}

# first pass: metadata
for metadata_id, node in metadata.items():
Expand All @@ -1088,7 +1115,7 @@ def parse_node_data_and_metadata(T, node_data, metadata):
# third pass: create `branch_attrs`. The data comes from
# (a) some keys within `node_data['nodes']` (for legacy reasons)
# (b) the `node_data['branches']` dictionary, which currently only defines labels
branch_attrs = {clade.name: defaultdict(dict) for clade in T.root.find_clades()}
branch_attrs = {name: defaultdict(dict) for name in node_names}
create_branch_mutations(branch_attrs, node_data)
create_branch_labels(branch_attrs, node_data['nodes'], node_data.get('branches', {}))

Expand Down Expand Up @@ -1173,9 +1200,9 @@ def run(args):
metadata_file = {}

# parse input files
T = Phylo.read(args.tree, 'newick')
(trees, node_names) = read_trees(args.tree)
node_data, node_attrs, node_data_names, metadata_names, branch_attrs = \
parse_node_data_and_metadata(T, node_data_file, metadata_file)
parse_node_data_and_metadata(node_names, node_data_file, metadata_file)
config = get_config(args)
additional_metadata_columns = get_additional_metadata_columns(config, args.metadata_columns, metadata_names)

Expand Down Expand Up @@ -1205,7 +1232,8 @@ def run(args):
set_filters(data_json, config)

# set tree structure
data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs, node_div(T, node_attrs))
trees_json = [convert_tree_to_json_structure(tree.root, node_attrs, node_div(tree, node_attrs)) for tree in trees]
data_json["tree"] = trees_json[0] if len(trees_json)==1 else trees_json
set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns)
set_branch_attrs_on_tree(data_json, branch_attrs)

Expand Down
16 changes: 16 additions & 0 deletions augur/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ def register_parser(parent_subparsers):
default='mutations-per-site', help='Units in which sequence divergences is exported.')
parser.add_argument('--seed', type=int, help='seed for random number generation')
parser.add_argument('--verbosity', type=int, default=1, help='treetime verbosity, between 0 and 6 (higher values more output)')
parser.add_argument('--content-aware-internal-node-names', action="store_true")
parser.set_defaults(covariance=True)
return parser

Expand Down Expand Up @@ -342,6 +343,21 @@ def are_sequence_states_different(nuc1, nuc2):
print("ERROR: divergence unit",args.divergence_units,"not supported!", file=sys.stderr)
return 1

if args.content_aware_internal_node_names:
terminals = [n.name for n in T.find_clades() if n.is_terminal()]
terminals.sort()
internals = {n.name for n in T.find_clades() if not n.is_terminal()}
assert all(n.startswith('NODE_') for n in internals)
# compute hash similar to git short commit hash
import hashlib
id = hashlib.sha256("".join(terminals).encode('utf-8')).hexdigest()[0:7]
def rename(name):
if name not in internals: return name
return f"NODE_{id}_{name.split('_')[1]}"
Comment on lines +353 to +356
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The hashing of the terminals of the tree feels arbitrary and not driven by the actual properties of the hash, esp. given the truncation of it. For example, two trees with the same terminals but different structures will collide. Why not simply produce unique ids for each node instead? (I still think we should do that.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example, two trees with the same terminals but different structures will collide.

This functionality is motivated by multi-trees where terminal node names cannot be shared across trees.

for n in T.find_clades():
n.name = rename(n.name)
node_data['nodes'] = {rename(name):data for name,data in node_data['nodes'].items()}

# Export refined tree and node data
tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f', branch_length_only=True)
print("updated tree written to",tree_fname, file=sys.stdout)
Expand Down
36 changes: 28 additions & 8 deletions augur/validate_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import sys
from collections import defaultdict

def ensure_no_duplicate_names(root, ValidateError):
def ensure_no_duplicate_names(tree, ValidateError):
"""
Check that all node names are identical, which is required for auspice (v2) JSONs.
"""
Expand All @@ -18,10 +18,14 @@ def recurse(node):
names.add(node["name"])
if "children" in node:
[recurse(child) for child in node["children"]]
recurse(root)
if isinstance(tree, list):
for subtree in tree:
recurse(subtree)
else:
recurse(tree)


def collectTreeAttrsV2(root, warn):
def collectTreeAttrsV2(tree, warn):
"""
Collect all keys specified on `node["node_attrs"]` throughout the tree
and the values associated with them. Note that this will only look at
Expand All @@ -47,7 +51,12 @@ def recurse(node):
[recurse(child) for child in node["children"]]
else:
num_terminal += 1
recurse(root)

if isinstance(tree, list):
for subtree in tree:
recurse(subtree)
else:
recurse(tree)

for data in seen.values():
if data["count"] == num_nodes:
Expand All @@ -56,7 +65,7 @@ def recurse(node):
return(seen, num_terminal)


def collectMutationGenes(root):
def collectMutationGenes(tree):
"""
Returns a set of all genes specified in the tree in the "aa_muts" objects
"""
Expand All @@ -67,17 +76,28 @@ def recurse(node):
genes.update(mutations.keys())
if "children" in node:
[recurse(child) for child in node["children"]]
recurse(root)

if isinstance(tree, list):
for subtree in tree:
recurse(subtree)
else:
recurse(tree)

genes -= {"nuc"}
return genes

def collectBranchLabels(root):
def collectBranchLabels(tree):
labels = set()
def recurse(node):
labels.update(node.get("branch_attrs", {}).get("labels", {}).keys())
if "children" in node:
[recurse(child) for child in node["children"]]
recurse(root)

if isinstance(tree, list):
for subtree in tree:
recurse(subtree)
else:
recurse(tree)
return labels

def verifyMainJSONIsInternallyConsistent(data, ValidateError):
Expand Down
10 changes: 10 additions & 0 deletions tests/functional/export_v2/cram/duplicate-node-names.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
$ source "$TESTDIR"/_setup.sh

$ echo "(tipA:1,(tipA:1,tipA:1)internalBC:2,(tipD:3,tipE:4,tipF:1)internalDEF:5)ROOT:0;" > tree1.nwk

$ ${AUGUR} export v2 \
> --tree tree1.nwk \
> --output minimal.json
ERROR: 1 node names occur multiple times in the tree: 'tipA'
[2]

25 changes: 14 additions & 11 deletions tests/functional/export_v2/cram/minify-output.t
Original file line number Diff line number Diff line change
Expand Up @@ -16,27 +16,29 @@ names to create larger file sizes with less recursion.
> }

A small tree is not automatically minified.
The unminified output is 13KB which is considered acceptable.
The unminified output is 15KB which is considered acceptable.

$ echo "$(generate_newick 10);" > small_tree_raw.nwk

Use augur refine to add internal node names
$ ${AUGUR} refine --tree small_tree_raw.nwk --output-tree small_tree.nwk &>/dev/null

$ echo "$(generate_newick 10);" > small_tree.nwk

$ ${AUGUR} export v2 \
> --tree small_tree.nwk \
> --skip-validation \
> --output output.json &>/dev/null

$ head -c 20 output.json
{
"version": "v2", (no-eol)

$ ls -l output.json | awk '{print $5}'
13813
15039

It can be forcefully minified with an argument.

$ ${AUGUR} export v2 \
> --tree small_tree.nwk \
> --skip-validation \
> --minify-json \
> --output output.json &>/dev/null

Expand All @@ -48,21 +50,22 @@ even if it may seem "falsey".

$ AUGUR_MINIFY_JSON=0 ${AUGUR} export v2 \
> --tree small_tree.nwk \
> --skip-validation \
> --output output.json &>/dev/null

$ head -c 20 output.json
{"version": "v2", "m (no-eol)


A large tree, when forcefully not minified, has an output size of 6MB which is
A large tree, when forcefully not minified, has an output size of 8MB which is
considered large.

$ echo "$(generate_newick 500);" > big_tree.nwk
$ echo "$(generate_newick 500);" > big_tree_raw.nwk

Use augur refine to add internal node names
$ ${AUGUR} refine --tree big_tree_raw.nwk --output-tree big_tree.nwk &>/dev/null

$ ${AUGUR} export v2 \
> --tree big_tree.nwk \
> --skip-validation \
> --no-minify-json \
> --output output.json &>/dev/null

Expand All @@ -71,7 +74,7 @@ considered large.
"version": "v2", (no-eol)

$ ls -l output.json | awk '{print $5}'
6568454
8591420

This means it is automatically minified.

Expand All @@ -84,4 +87,4 @@ This means it is automatically minified.
{"version": "v2", "m (no-eol)

$ ls -l output.json | awk '{print $5}'
561436
576414
18 changes: 18 additions & 0 deletions tests/functional/export_v2/cram/missing-internal-nodes.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
Setup

$ source "$TESTDIR"/_setup.sh

Test a tree with unlabelled internal nodes.
In Augur v24.3.0 and earlier these would work _sometimes_ depending
on what code paths were hit, but in most real-life cases would raise
an uncaught error. We now check for these when we parse the tree.

$ echo "(tipA:1,(tipB:1,tipC:1):2,(tipD:3,tipE:4,tipF:1):5):0;" > tree1.nwk

$ ${AUGUR} export v2 \
> --tree tree1.nwk \
> --metadata "$TESTDIR/../data/dataset1_metadata_with_name.tsv" \
> --node-data "$TESTDIR/../data/div_node-data.json" \
> --output test1.json
ERROR: Tree contains unnamed nodes.+ (re)
[2]
53 changes: 53 additions & 0 deletions tests/functional/export_v2/cram/multi-tree.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@

Setup

$ source "$TESTDIR"/_setup.sh

Create a small second tree (which has different names/labels than 'tree.nwk')
$ cat > tree2.nwk <<~~
> (tipG:1,(tipH:1,tipI:1)internalHI:2)SECOND_ROOT:0;
> ~~

Minimal export -- no node data, no metadata etc etc
$ ${AUGUR} export v2 \
> --tree "$TESTDIR/../data/tree.nwk" tree2.nwk \
> --output minimal.json &> /dev/null

More realistic export - with node_data for all nodes and metadata for some of them

$ cat > metadata.tsv <<~~
> strain something
> tipA foo
> tipB foo
> tipC foo
> tipG bar
> tipH bar
> ~~


$ cat > node-data.json <<~~
> {
> "nodes": {
> "ROOT": {"mutation_length": 0},
> "tipA": {"mutation_length": 1},
> "internalBC": {"mutation_length": 2},
> "tipB": {"mutation_length": 1},
> "tipC": {"mutation_length": 1},
> "internalDEF": {"mutation_length": 5},
> "tipD": {"mutation_length": 3},
> "tipE": {"mutation_length": 4},
> "tipF": {"mutation_length": 1},
> "SECOND_ROOT": {"mutation_length": 0},
> "tipG": {"mutation_length": 1},
> "internalHI": {"mutation_length": 2},
> "tipH": {"mutation_length": 1},
> "tipI": {"mutation_length": 1}
> }
> }
> ~~

$ ${AUGUR} export v2 \
> --tree "$TESTDIR/../data/tree.nwk" tree2.nwk \
> --metadata metadata.tsv --color-by-metadata something \
> --node-data node-data.json \
> --output output.json &> /dev/null
Loading