From ed38ffdd31ba6e2000536649e3dd5fec2a5a8b7f Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Fri, 18 Jun 2021 15:24:28 +0100 Subject: [PATCH] Add analysis tuts --- analysing_tree_sequences.md | 25 ++ analysing_trees.md | 408 +++++++++++++++++++++++++++++++++ data/different_time_samples.ts | Bin 6764 -> 6764 bytes data/parsimony_map.pickle | Bin 0 -> 761 bytes data/parsimony_map.ts | Bin 0 -> 7156 bytes data/parsimony_simple.ts | Bin 0 -> 7156 bytes data/tree_traversals.ts | Bin 5740 -> 5740 bytes 7 files changed, 433 insertions(+) create mode 100644 analysing_tree_sequences.md create mode 100644 analysing_trees.md create mode 100644 data/parsimony_map.pickle create mode 100644 data/parsimony_map.ts create mode 100644 data/parsimony_simple.ts diff --git a/analysing_tree_sequences.md b/analysing_tree_sequences.md new file mode 100644 index 00000000..02fec1de --- /dev/null +++ b/analysing_tree_sequences.md @@ -0,0 +1,25 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_analysing_tree_sequences)= + +# Analysing tree sequences + +:::{todo} +Add all stats material from https://tskit.dev/tskit/docs/stable/tutorial.html + +Should we rename this "calculating statistics", or something else with the word "statistics" in the name? +::: \ No newline at end of file diff --git a/analysing_trees.md b/analysing_trees.md new file mode 100644 index 00000000..3c9f7555 --- /dev/null +++ b/analysing_trees.md @@ -0,0 +1,408 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +```{code-cell} ipython3 +:tags: [remove-cell] +import io +import pickle + +import msprime +import tskit + +def tree_traversals(): + nodes = """\ + id is_sample time + 0 1 0 + 1 1 0 + 2 1 0 + 3 1 0 + 4 1 0 + 5 0 1 + 6 0 2 + 7 0 3 + """ + edges = """\ + left right parent child + 0 1 5 0,1,2 + 0 1 6 3,4 + 0 1 7 5,6 + """ + # NB same tree as used above, and we're using the same diagram. + ts = tskit.load_text( + nodes=io.StringIO(nodes), edges=io.StringIO(edges), strict=False + ) + ts.dump("data/tree_traversals.ts") + +def different_time_samples(): + samples = [ + msprime.Sample(0, 0), + msprime.Sample(0, 1), + msprime.Sample(0, 20), + ] + ts = msprime.simulate( + Ne=1e6, + samples=samples, + demographic_events=[ + msprime.PopulationParametersChange(time=10, growth_rate=2, population_id=0), + ], + random_seed=42, + ) + ts.dump("data/different_time_samples.ts") + +def parsimony_simple(): + ts = msprime.sim_ancestry(3, random_seed=42) + ts.dump("data/parsimony_simple.ts") + +def parsimony_map(): + # pick a seed that gives the tips in the right order + ts = msprime.sim_ancestry(3, sequence_length=100, random_seed=38) + ts.dump("data/parsimony_map.ts") + ts = msprime.sim_mutations( + ts, rate=0.01, random_seed=123, discrete_genome=False) + data = [ + {'pos':v.site.position, 'alleles': v.alleles, 'genotypes':v.genotypes} + for v in ts.variants()] + with open("data/parsimony_map.pickle", "wb") as f: + pickle.dump(data, f) + + +def create_notebook_data(): + tree_traversals() + different_time_samples() + parsimony_simple() + parsimony_map() + +# create_notebook_data() # uncomment to recreate the tree seqs used in this notebook +``` + +(sec_analysing_trees)= +# Analysing trees + +There are a number of different ways we might want to analyse a single {class}`Tree`. +Most involve some sort of traversal over the nodes, mutations, or branches in the tree. +{program}`tskit` provides various way of traversing through a tree, and also some +built in phylogenetic algorithms such as {meth}`Tree.map_mutations` which efficently +places mutations ("characters" in phylogenetic terminology) on a given tree. + + +(sec_analysing_trees_traversals)= + +## Tree traversals + +There are various a single {class}`Tree`, traversals in various orders are possible using the +{meth}`~Tree.nodes` iterator. For example, in the following tree we can visit the +nodes in different orders: + + +```{code-cell} ipython3 +import tskit +from IPython.display import SVG, display + +ts = tskit.load("data/tree_traversals.ts") +tree = ts.first() +display(SVG(tree.draw_svg())) + +for order in ["preorder", "inorder", "postorder"]: + print(f"{order}:\t", list(tree.nodes(order=order))) +``` + +Much of the time, the specific ordering of the nodes is not important +and we can leave it out (defaulting to preorder traversal). For example, +here we compute the total branch length of a tree: + +```{code-cell} ipython3 +total_branch_length = sum(tree.branch_length(u) for u in tree.nodes()) +print(f"Total branch length: {total_branch_length}") +``` + +Note that this is also available as the {attr}`Tree.total_branch_length` attribute. + +### Traversing upwards + +For many applications it is useful to be able to traverse upwards from the +leaves. We can do this using the {meth}`Tree.parent` method, which +returns the parent of a node. For example, we can traverse upwards from +each of the samples in the tree: + +```{code-cell} ipython3 +for u in tree.samples(): + path = [] + v = u + while v != tskit.NULL: + path.append(v) + v = tree.parent(v) + print(u, "->", path) +``` + +### Traversals with information + +Sometimes we will need to traverse down the tree while maintaining +some information about the nodes that are above it. While this +can be done using recursive algorithms, it is often more convenient +and efficient to use an iterative approach. Here, for example, +we define an iterator that yields all nodes in preorder along with +their path length to root: + +```{code-cell} ipython3 +def preorder_dist(tree): + for root in tree.roots: + stack = [(root, 0)] + while len(stack) > 0: + u, distance = stack.pop() + yield u, distance + for v in tree.children(u): + stack.append((v, distance + 1)) + +print(list(preorder_dist(tree))) +``` + + +(sec_tutorial_networkx)= + +## Networkx + +Traversals and other network analysis can also be performed using the sizeable +[networkx](https://networkx.github.io/documentation/stable/index.html) +library. This can be achieved by calling {meth}`Tree.as_dict_of_dicts` to +convert a {class}`Tree` instance to a format that can be imported by networkx to +create a graph: + +```{code-cell} ipython3 +import networkx as nx + +g = nx.DiGraph(tree.as_dict_of_dicts()) +print(sorted(g.edges)) +``` + +### Traversing upwards in networkx + +We can revisit the above examples and traverse upwards with +networkx using a depth-first search algorithm: + +```{code-cell} ipython3 +g = nx.DiGraph(tree.as_dict_of_dicts()) +for u in tree.samples(): + path = [u] + [parent for parent, child, _ in + nx.edge_dfs(g, source=u, orientation="reverse")] + print(u, "->", path) +``` + + +### Calculating distances to the root + +Similarly, we can yield the nodes of a tree along with their distance to the +root in pre-order in networkx as well. Running this on the example above gives us +the same result as before: + +```{code-cell} ipython3 +g = nx.DiGraph(tree.as_dict_of_dicts()) +for root in tree.roots: + print(sorted(list(nx.shortest_path_length(g, source=root).items()))) +``` + + +### Finding nearest neighbors + +If some samples in a tree are not at time 0, then finding the nearest neighbor +of a sample is a bit more involved. Instead of writing our own traversal code +we can again draw on a networkx algorithm. +Let us start with an example tree with three samples that were sampled at +different time points: + +```{code-cell} ipython3 +ts = tskit.load("data/different_time_samples.ts") +tree = ts.first() +SVG(tree.draw_svg(y_axis=True, time_scale="rank")) +``` + +The generation times for these nodes are as follows: + +```{code-cell} ipython3 +for u in tree.nodes(): + print(f"Node {u}: time {tree.time(u)}") +``` + +Note that samples 0 and 1 are about 35 generations apart from each other even though +they were sampled at almost the same time. This is why samples 0 and 1 are +closer to sample 2 than to each other. + +For this nearest neighbor search we will be traversing up and down the tree, +so it is easier to treat the tree as an undirected graph: + +```{code-cell} ipython3 +g = nx.Graph(tree.as_dict_of_dicts()) +``` + +When converting the tree to a networkx graph the edges are annotated with their +branch length: + +```{code-cell} ipython3 +for e in g.edges(data=True): + print(e) +``` + +We can now use the "branch_length" field as a weight for a weighted shortest path +search: + +```{code-cell} ipython3 +import collections +import itertools + +# a dictionary of dictionaries to represent our distance matrix +dist_dod = collections.defaultdict(dict) +for source, target in itertools.combinations(tree.samples(), 2): + dist_dod[source][target] = nx.shortest_path_length( + g, source=source, target=target, weight="branch_length" + ) + dist_dod[target][source] = dist_dod[source][target] + +# extract the nearest neighbor of nodes 0, 1, and 2 +nearest_neighbor_of = [min(dist_dod[u], key=dist_dod[u].get) for u in range(3)] + +print(dict(zip(range(3), nearest_neighbor_of))) +``` + + +(sec_analysing_trees_parsimony)= + +## Parsimony + +The {meth}`Tree.map_mutations` method finds a parsimonious explanation for a +set of discrete character observations on the samples in a tree using classical +phylogenetic algorithms. + +```{code-cell} ipython3 +tree = tskit.load("data/parsimony_simple.ts").first() +alleles = ["red", "blue", "green"] +genotypes = [0, 0, 0, 0, 1, 2] +styles = [f".n{j} > .sym {{fill: {alleles[g]}}}" for j, g in enumerate(genotypes)] +display(SVG(tree.draw_svg(style="".join(styles)))) + +ancestral_state, mutations = tree.map_mutations(genotypes, alleles) +print("Ancestral state = ", ancestral_state) +for mut in mutations: + print(f"Mutation: node = {mut.node} derived_state = {mut.derived_state}") +``` + +So, the algorithm has concluded, quite reasonably, that the most parsimonious +description of this state is that the ancestral state is red and there was +a mutation to blue and green over nodes 4 and 5. + +### Building tables + +One of the main uses of {meth}`Tree.map_mutations` is to position mutations on a tree +to encode observed data. In the following example we show how a set +of tables can be updated using the {ref}`Tables API`; here we +infer the location of mutations in an simulated tree sequence of one tree, +given a set of site positions with their genotypes and allelic states: + +```{code-cell} ipython3 +import pickle +ts = tskit.load("data/parsimony_map.ts") +with open("data/parsimony_map.pickle", "rb") as file: + data = pickle.load(file) # Load saved variant data from a file +display(SVG(ts.draw_svg(size=(500, 300), time_scale="rank"))) +print("Variant data: pos, genotypes & alleles as described by the ts.variants() iterator:") +for i, v in enumerate(data): + print(f"Site {i} (pos {v['pos']:7.4f}): alleles: {v['alleles']}, genotypes: {v['genotypes']}") +print() +tree = ts.first() # there's only one tree anyway +tables = ts.dump_tables() +# Infer the sites and mutations from the variants. +for variant in data: + ancestral_state, mutations = tree.map_mutations(variant["genotypes"], variant['alleles']) + site_id = tables.sites.add_row(variant['pos'], ancestral_state=ancestral_state) + info = f"Site {site_id}: parsimony sets ancestral state to {ancestral_state}" + parent_offset = len(tables.mutations) + for mut in mutations: + parent = mut.parent + if parent != tskit.NULL: + parent += parent_offset + mut_id = tables.mutations.add_row( + site_id, node=mut.node, parent=parent, derived_state=mut.derived_state) + info += f", and places mutation {mut.id} to {mut.derived_state} above node {mut.node}" + print(info) +new_ts = tables.tree_sequence() +``` + +```{code-cell} ipython3 +mut_labels = {} # An array of labels for the mutations +for mut in new_ts.mutations(): # Make pretty labels showing the change in state + site = new_ts.site(mut.site) + older_mut = mut.parent >= 0 # is there an older mutation at the same position? + prev = new_ts.mutation(mut.parent).derived_state if older_mut else site.ancestral_state + mut_labels[site.id] = f"{mut.id}: {prev}→{mut.derived_state}" + +display(SVG(new_ts.draw_svg(size=(500, 300), mutation_labels=mut_labels, time_scale="rank"))) +``` + + +### Parsimony and missing data + +The Hartigan parsimony algorithm in {meth}`Tree.map_mutations` can also take missing data +into account when finding a set of parsimonious state transitions. We do this by +specifying the special value {data}`tskit.MISSING_DATA` (-1) as the state, which is +treated by the algorithm as "could be anything". + +For example, here we state that sample 0 is missing, and use the colour white to indicate +this: + +```{code-cell} ipython3 +tree = tskit.load("data/parsimony_simple.ts").first() +alleles = ["red", "blue", "green", "white"] +genotypes = [tskit.MISSING_DATA, 0, 0, 0, 1, 2] +styles = [f".n{j} > .sym {{fill: {alleles[g]}}}" for j, g in enumerate(genotypes)] +display(SVG(tree.draw_svg(style="".join(styles)))) + +ancestral_state, mutations = tree.map_mutations(genotypes, alleles) +print("Ancestral state = ", ancestral_state) +for mut in mutations: + print(f"Mutation: node = {mut.node} derived_state = {mut.derived_state}") +``` + + +The algorithm decided, again, quite reasonably, that the most parsimonious explanation +for the input data is the same as before. Thus, if we used this information to fill +out mutation table as above, we would impute the missing value for 0 as red. + +The output of the algorithm can be a little surprising at times. Consider this example:: + +```{code-cell} ipython3 +tree = msprime.simulate(6, random_seed=42).first() +alleles = ["red", "blue", "white"] +genotypes = [1, -1, 0, 0, 0, 0] +node_colours = {j: alleles[g] for j, g in enumerate(genotypes)} +ancestral_state, mutations = tree.map_mutations(genotypes, alleles) +print("Ancestral state = ", ancestral_state) +for mut in mutations: + print(f"Mutation: node = {mut.node} derived_state = {mut.derived_state}") +SVG(tree.draw(node_colours=node_colours)) +``` + + +Note that this is putting a mutation to blue over node 6, **not** node 0 as +we might expect. Thus, we impute here that node 1 is blue. It is important +to remember that the algorithm is minimising the number of state transitions; +this may not correspond always to what we might consider the most parsimonious +explanation. + +## Fast tree-based algorithms using numba + +:::{todo} +A short example here of how to use numba to speed up tree based dynamic programming +algorithms. +::: + diff --git a/data/different_time_samples.ts b/data/different_time_samples.ts index 5884e9734dfa9880fbe124f637b47a27dd1183bc..71ae1c07298fbdf7d3aa61830d227c77b2043aa7 100644 GIT binary patch delta 66 zcmaE3^2TI?pp>9Rh@q*KiJ_IDv7WKHftj)4WF;vzm6Q|%<1};AG+onVOG{mI6Z14( V3xiYxT?519L^Bg31CyjQa{yT15g`Bo delta 66 zcmaE3^2TI?pp>9_h>?+%skxPjiJpO}rHPUGWF;vz6$8s8!?Z+GOWm}jBy(K@!_+ig Vvt)xrU6bTggJkn0gQVodGyrEW5@rAZ diff --git a/data/parsimony_map.pickle b/data/parsimony_map.pickle new file mode 100644 index 0000000000000000000000000000000000000000..55dba4efe4f725c62625ee528956fb88b8689b8e GIT binary patch literal 761 zcmZo*nfi{20SscNXw**8=wU9%FP`G=ps?@s$ps7yJ?x1&IjK3R#Z!71-GSI~O52nk z&h*s0{F2H7kf>;0X>LKKUUGg>s$OnsPDy5BQBh*$lpdb=qSWO4yyB9g(&UmUlc)5s zf)!5bVb4o}D49IPo1t}zGh@<}_9;PAG`tzT*;=P$aQCpLfQ$xdVaha|(%Au#W$t0D znd0Z?=k*^5z=StL$&{o{XEsJgkn^ViIi*1NXE3=tSX5ris$gKq0O|<=y4e}%<_z`> zt_)s~g&F+F)`zdg9$u9*YYESJGTs?C9NFF6y))*P4jkOi!nHG`m^(L+1%+{?p7P@4&aHm08g UfA!y83D!(RRDmKI995}$0J@~~{r~^~ literal 0 HcmV?d00001 diff --git a/data/parsimony_map.ts b/data/parsimony_map.ts new file mode 100644 index 0000000000000000000000000000000000000000..314f29a859de09543e78902eab5dbacc43402ac9 GIT binary patch literal 7156 zcmc(iO^6&-5XZ-FjbHI2YE-n-c<`_@J0H87*-=S?$wi|E6C|Rc>Fs$l+ev?n-94ME zOC%zq;z7X=Jc=S7BASCpBnr8RxhOe!FnSU_h=^b0pa;>a?)vZazRWni5pkgR^{c91 zRlRzz-+TMqy<2u(zUIa?=afpNyXZN66a87m?|iX6bY=E(sq}xpCVN>U{ZY*&qIh0- z*c;at<2i5sy(0V-S+~>^Ub-Wj3qjmSZexC_{nFIoldW?#X z`kfLU`JO0PKluGl<8P2cz#r`N7WMmCc+~%P;m_3pt&j2lDg4c%uL?hI{NUHG&A=Ol z-(8SL{te-;68@0z=ShE5gZgh39@GA!@S`}$5r4Pv`24_Qe6ZJBtiM=`|N1K9KcmG5 zUo`$38h@l@;gTs z-fDTQzqMz{WBuGJJnCOGf7HJ&Jn9b~{!u5rg?_unpD0*=)UTtBzbKFT^@W#dEEVMu z|7qc+Da<4Ot7n-%>i3rL*Gqwm=8yUNNcdYc9{%-tLm&FDguhx8CkxIW;{Pc8dX0yF z#M4{oFOb0CzbqIZ^LLBzIKNiQPY93ovs(Vnvy6}RbI)1wsQ+%^ao!_;^!2)< z_%FULvlz!?{$efvrtpX}szv@!3Xf?AkNeap9`Rq*`1ykIG5*8a_`#1GKj!b4@VM{1 zw#xc_B0TONzX-o!)WAiK`kfLU_m3N7BN=ypL;X%`ydnI1W8#CqQVt$*Q#vX9`(rL( z$bWzPOLN&D|Y6UFP1$!Qh^vmZg)QE zgpqG2lY#9kd0kLZ?1q6GwABF$&k-y)h*gx3)C!}vidN_%<%bKlx2$m&+;-pg;>nI@ zcVjYL7I>j!vy)*mNjCLJ!Cc0hr?OG6$VEir2cgi458v%ZX~ywnTg6GP%QKPd zf*T@2gn=sp zhDsv(w$dpbDg>Vb7++?I-cS>n#LTi6QcE9ZzwfqXFXFwE&qqFDdEer*;39hXf8Y!0 z!QM!(|J#rA;D_@7_E_~wNVp`!HcUV6MYMgfUSn$JK@!qlPX$Wo#*ku8==*j+8Ie2J zw`BiXOx@E4=y7q3261%Eg>S*-uMSH6|^-;cCc83Q-5GAst%RQvZL-m5%hh_v*(n@<&|niG!LO~#fle*(P&a3 zg&!P4q)3-4t9qVGmovWU(Y22hRi0iIEVxk^(21RpXvqw-X^t0TI?_pEZndNRF0F3E z8%5=8Ya%(B`VBD*sKzYVP+1=B~Ok2iIJ2CFF z7Y(CnRHs|DYOB#Oc5K~6`g-jSBMQT$bs*@MH`ZpSE46y9(s*Ka^1*G}w{6+E&D!0Z zv1X1CFl1~iIDE@6upgn|&VJa%=DiI=AAp2;u@ z0wgwZfD4EVQUGy6f?PmhlemC#35vu85S-uwViyN42wvImAA5>!+7pCm$y2WP-uK?C zSLLdjr*GY}ux;zkt!Gp!m7D4L?Hc;CiQn05`^fg}ierfiymHVTb^I7o>^RPEA zE61z6)$e)cFUq==KJ(li*=%9WtIS`n@fgo22Z(=M+W5=zsNaBjp2kX99`PS% zo}1D<;=gd3`J;ZXF@Gf&xNQEIzxSEnrSb5u&l~#Cf5H4EtTRzI$NlwK$@rN6L(Jp+Kh8YrGb%px4{7;-$owb{ za^(N4mOps-hrQlH|2p&7KTm8j{ySR!;Nf47i$3(9X#A;?@lpS;wfxKS$m190u^)iH z9tI;?;r2v4=`4A^o4xv?n`Cc+NNm?m?2SDu>ULwHnOL#2Bm%bV+fsyVAidsF(v4(b zCzGKa2!34_GWMd-3te%T!m9|D7sf&+B();x3b{ciMG!69{<_9n_S}K($CF*(?!{!f zF7PAAR!)Y=IQ60mV;m;?2Xh{8k;+EBAr}_0AA~~7J_4^N(~RRuSHwx7t7f9m1vf;5 z2)Qd5|2;?;xuW1AouEQLjt0^x8YEsIfG~( zE1lAzLhvbo@nx3i4K?9O%q;s6we(>Q2A<1%k=i@e`KXRqwQrq6k2(+3|G?+cgT0Yn z|F@sYNj)kb^{Cv`qjFP^x-a3*RH^(tPhI-``!XqR({eIO=h6DEEA>jqk&H8WZ5Q<(eAIH9w( zr{WHL-@N|_5`YB^j*v#umf=a^DlScPVnh1(Pq0@rT6{n_31z=#W*eEsN4(iF%tozV ztD6&s8Ash@g>DBVT;pLIW)Sx!ZC^^SF%9zw32CpV0wr`~NHHf00z0IP$ekBhynn5x z?r8(`RdI|KadgZ@bhP%DR!N}7suI|^1?CFZqe-D4R*y~&H60+-CLAn!VY=g6bW#^| zWGgHu3OVgDTH0itg!kUue|{ENvG-8HdEbvbw~%^3Q|;%R7t*4@uAPf%6PJmlcK%!@ zLRXVyCySLI`?)czx>PUAmc3jN4g$-!7lp6t%jJ|b5m8{pLah*Es!0`QK0k*@k*-yx z>iZtu&eT1?qT7E|WIxK+qd_{Kfi<Y7`NN2hS4_a)15}W z(`p$9_TEkUX5&UfMp4o^91g0pjk)PsquHpn9-f=LYu|1A_AKnP4z_2knJJY+VACZ! zOmjG#3Dd>9nk=dBoW|v;#i>S_E+sXLTMlGeq6^8kYIC)0<@?^Ev}O9a6Mx8#lZd~a zf_XFjxKyn4$VfvK)b(1cHj{owy;v=uwPL8IQqhn19QD7h8g#WjQ*E^GZcKNk8l6U~ z)|#Dd&rRhUQ~p0G&beIYG#c&cc6X{efBU&(YBjir#(0O7ekfq Avj6}9 literal 0 HcmV?d00001 diff --git a/data/tree_traversals.ts b/data/tree_traversals.ts index 875ccd54cdd3952aa694004f7ff4a27d32ac1d01..ca97e5c9f09b13a6409d60c64c1e410064119c06 100644 GIT binary patch delta 44 zcmaE(^G0WbhnPx|sj-ETd2*_5T5__1ZknZmiLQx}iG^;macZJbvT0gsQd$ZCK$#7g delta 44 zcmaE(^G0WbhnPx=p_!SXNphNQl9^?aZnAljp{}K|Ns?}&rLmz=s)d=cg-H?sJFg6^