From 9e52653facd33420ca0611126b86a078ec2c198f Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Sat, 19 Jun 2021 22:42:00 +0100 Subject: [PATCH 1/4] Port table tut material into data_structures --- _config.yml | 1 + data/construction_example.trees | Bin 0 -> 5612 bytes data/tables_example.trees | Bin 0 -> 5820 bytes data/tables_example_muts.trees | Bin 0 -> 5988 bytes data_structures.md | 518 ++++++++++++++++++++++++++++++++ getting_started.md | 72 ++++- 6 files changed, 575 insertions(+), 16 deletions(-) create mode 100644 data/construction_example.trees create mode 100644 data/tables_example.trees create mode 100644 data/tables_example_muts.trees diff --git a/_config.yml b/_config.yml index 8f31b82a..32833dbc 100644 --- a/_config.yml +++ b/_config.yml @@ -33,6 +33,7 @@ sphinx: tskit: ["https://tskit.dev/tskit/docs/stable", null] msprime: ["https://tskit.dev/msprime/docs/stable", null] pyslim: ["https://pyslim.readthedocs.io/en/stable", null] + numpy: ["https://numpy.org/doc/stable/", null] myst_enable_extensions: - colon_fence - deflist diff --git a/data/construction_example.trees b/data/construction_example.trees new file mode 100644 index 0000000000000000000000000000000000000000..5c7fc9af8c63afa7db79f551d1afa3671c93d7b1 GIT binary patch literal 5612 zcmbW5Plyyp6vo>m2D6FrPa=sL41$LovW|?q>mL#rArLWv2)THhw9_@S%}#fZT|Gm{ zMLmcH59Si_D29OG#p4DQya-+rJZesXZuv_+s}@Cxp8L4%sS8WKF;5dx8;A<@OlNYXExc%>#$eevJr@kuv^#28Z0~~i9^^^Y_JRW1umFND~;lW!g&*yLR3-avG`{3Ds zSN-gN9X$I_p844)Idc7T!QXf6Kl|4f`*-Epzdm>zqvy)A{yFd%oO#y2{DS(~zaPQB zg9cMS*Ja;n^Zagu--U$-=>N3)oB4kQzZLvL$Ma+T$KbaM-nIUE6ejA$UGNf16njy(H+0{rm%9zTS*F858F`)`T;zw5}e zKU(PD#~)%WR54QDS@6Wjzgof3&H9%Gf0T(BSTS<{H^lzQQ(pz{@%(Oq=kMRI*4V#$ z;Q82JaMVwIkHGWy2YL3VN+0yWYPy!1MlGUSt1$68g!z z*8fHDcOC1q|Gx|U(JtrLdEmL;aV_M)s z9+u*W98uUw3&pw?D4mr|a}p(!+yoaZj8ZXpb&)s;R4GMqKqXrz{j^mM$ihfP{cI>P z4F@V{>fBM~rkWu{F{3o}REARbFh?PKXxYw=^69Xw#z;2tcs%TMbXL};gmDXkgwsCV z*4ayX{ch0>Ynx&tpRO_QBpDZElsoEq%Z4{^bMNu)^BR{Lx#>+BGrbgMd$Uw2U4)qbcabaWF`7(X*ZcR=BAfkF zX1KTi@>eOu+nt3=I$JqyVj0iT#F=>cD)VwKV~s6iM)sb@Xl`b6!X95WeLPZfm30>G p#_2{-KM?GnKe;g5I=IiD4d&`%u literal 0 HcmV?d00001 diff --git a/data/tables_example.trees b/data/tables_example.trees new file mode 100644 index 0000000000000000000000000000000000000000..51cda18120b8139144bafcfb1be545ab3c1b4105 GIT binary patch literal 5820 zcmbW5J&fE$6vrJPd>kLS@ST7Hktis*?B#BAxggL5i3*S?pg5|PW6$ndS=)=pyNale z0ue+(8lpfIfsl~WP=W{*LZShs4Q*&ZNCZSd6D1zc|L^s^tZ!$}l5hOxz2AHPdE>RM z`{?yo-gaoE+J|zrnNr&x5~10>b9{Z-9Rq{$=p9)=&Pxq6H6v-*n`ue--?G@LS+v zHV*9ntKhlqt~~2+f`3ZX=l#_rfPO9NpLNvF`X7q=E&C9<4p7+nZ z{7bv6&->@KUGnVzCV0N@ssA3?aAC85fmr_)^nVu5{nJAKb;tdq{`bKXBhTl#kX@Oq ze_rs5j^Dvt|0S`0^0U^@{res~zju~Ve-=;uKY{1>N8eFD`*#&Qzdy(`f7LK?LH~b( zA3D}2zYi}Sr14yN>OTygzkl+~FJxCH{V#&&{d2}qKkL5W;2bCupN8!jH z=vH^&_jD0jO$^e&H%X?=L3-WAHBOt`Asp+iQ+Czcv|tK(Sc)TZ#8EFR73)^0bY3&f zPSi|tQ>Iv9l8V8%7fI4k)l!rjRI_zD%7S`89>ppi<(0%N8mq9Qi;1Rbs;MA~nWW)N zV<>f3IZD};Wji~`r=z+WBiY2`_OO?ey|OkXY_}jtOxnBKP7c%IXi#>;+NRiKPPdqM z+8nY^b=exjEFG)FPl7^xrh?*^aQh6i)IyDjVoTy-J-V6%pw+u0Rb=iARMN})rf0fU zmddl@w^eTG)JgIs!1dWkT2n)?i8gYW7Gv?l9F3w7&m!}`8T0%!qdD&^PIK@X#?kZI zCrd{)T5Iz6ZQ-ZQaS}Q2mC3n9W4_sp&%_w*#>e-R@tGLrDlX?bOdg)oz?(Z9 z)8gga4){$OTenxstvw#m%&*xj>; zr+5$}9`s^RPeBm^`3DRt2qJhe;?d1XiH9hGMDef}(XM&lZ%=Q{bmnaudaLWlr>b7P zp6+Jfdu8SIyQcR|@9;eDx$F!c%KmKQxCOB%ciD?Opl{x1FQ%cFiQD0LA3Wn9yWfqM zzUBA`{M}addf=g!ikb`ZPg+0uy}K>A5B#bl&;A#{-vj<6 zc&Md<^M3(6x80Rz{Z;S}tNOgZ${L_QQT0zb_Rsq7s`})ae^UL=6#t=PeeT~k;PKe- zTzU5YGkESl`APkAU4Md~g5hJw{>lFh9=EaQ%5(iYaN}*4=l!?)26@isQSh9pmPfN~s6+GYfGvII2fYO=2 zq1OMBBhURyRR3$>Rk;zB{l5*K+fJU(^AQjR)<3QIQ;zky{&Q;mnSWCM&w%Is zbIq}T)_)B=zhB8us?YsD0iN&wP1K*n)Bl$0-@}XG=CXyu&i+48{ZE1C{au!Z_0NIl z^K%bhL}j3=e_r)Z-nITuil1?;&-q_c{gaX_6I{M6s7)uU!JF)E~+xo+YljJjbr zFmGn%r9wUICMHfZYdwks6L0awrW39E?c5Wthe6+OCzWQ~ZzWl1ZfHjhUy_Woq4lDR zYaD00PB_+Er0lA<^a2y|uoNY7bi!6V=s2kaCP@pb>_kB&H^#&Y!&D3&Ez*qwQ%Et` zpn{%6{kTyKNW+fl^wYe>I9xYDJ;@AZW_FW9Y-X5-meNr6o#z;|oojZo!+c3t>_%Fa zc-$WLa=2GkXA4_3G^C97?zYBW)a$ng)3CY}8`_cz^NzD2`&gH)(Tk&X)AhTJtohh9 zvR}feg;@%r(nPi;onA3IHv&k^8-0^yuD4CMm9ELmlFFb|8fU+)!Iq9Sk{LaNFgU(jw(=7C{bSksuR$_e z%VV{b!})mgd2nVr+Vc+7tnEd}Beg?|a|_kubIsY~bJbvWF_?d9w&~9=%q}h+e0r(6 Kv{XIRT>2lY&!05_ literal 0 HcmV?d00001 diff --git a/data_structures.md b/data_structures.md index 568185dd..fddcbc4d 100644 --- a/data_structures.md +++ b/data_structures.md @@ -11,12 +11,530 @@ kernelspec: name: python3 --- +```{currentmodule} tskit +``` + +```{code-cell} ipython3 +:tags: [remove-cell] +import io +import pickle + +import msprime +import tskit + +def tables_examples(): + nodes = """\ + id is_sample time + 0 1 0 + 1 1 0 + 2 1 0 + 3 0 0.15 + 4 0 0.6 + 5 0 0.8 + 6 0 1.0 + """ + edges = """\ + left right parent child + 20 80 3 0 + 20 80 3 2 + 0 100 4 1 + 0 20 4 2 + 80 100 4 2 + 20 80 4 3 + 80 100 5 0 + 80 100 5 4 + 0 20 6 0 + 0 20 6 4 + """ + sites = """\ + id position ancestral_state + 0 15 A + 1 42 G + 2 60 T + """ + mutations = """\ + site node derived_state parent time + 0 4 G -1 0.9 + 1 1 A -1 0.4 + 2 3 C -1 0.55 + 2 0 T 2 0.1 + """ + ts = tskit.load_text( + nodes=io.StringIO(nodes), + edges=io.StringIO(edges), + strict=False, + ) + ts.dump("data/tables_example.trees") + ts = tskit.load_text( + nodes=io.StringIO(nodes), + edges=io.StringIO(edges), + sites=io.StringIO(sites), + mutations=io.StringIO(mutations), + strict=False, + ) + ts.dump("data/tables_example_muts.trees") + +def construction_example(): + nodes = """\ + id is_sample time + 0 1 0 + 1 1 0 + 2 0 10 + 3 1 0 + 4 0 20 + """ + edges = """\ + left right parent child + 0 1000 2 0 + 0 1000 2 1 + 0 1000 4 2 + 0 1000 4 3 + """ + sites = """\ + id position ancestral_state + 0 500 A + """ + mutations = """\ + site node derived_state parent + 0 3 G -1 + """ + ts = tskit.load_text( + nodes=io.StringIO(nodes), + edges=io.StringIO(edges), + sites=io.StringIO(sites), + mutations=io.StringIO(mutations), + strict=False, + ) + ts.dump("data/construction_example.trees") + + +def create_notebook_data(): + tables_examples() + construction_example() + +# create_notebook_data() # uncomment to recreate the tree seqs used in this notebook +``` + + (sec_data_structures)= # Data structures +(sec_data_structures_tables)= ## Tables +Internally, a tree sequence can be thought of as a set of tables, and {program}`tskit` +provides an interface for dealing with these tables directly. This is particularly +relevant when you wish to {ref}`edit or otherwise modify` +a tree sequence, although tables are also useful for bulk access to data contained in +the tree sequence, such as the times of all nodes. + +There are eight tables that together define a tree sequence, although some may be empty, +and together they form a {class}`TableCollection`. +The tables are defined in the official {program}`tskit` documentation for +{ref}`Table Definitions `, and the +{ref}`Tables API ` section in the docs describes how to work with +them. In this tutorial we give some pointers about what you can and cannot do with them. + +### Correspondance between tables and trees + +Consider the following sequence of trees: + +```{code-cell} ipython3 +from IPython.display import SVG +ts = tskit.load("data/tables_example.trees") +SVG(ts.draw_svg(y_axis=True)) +``` + +Ancestral recombination events have produced three different trees +that relate the three sampled genomes ``0``, ``1``, and ``2`` to each other +along the chromosome of length 100. + +#### Node and edge tables + +Each node in each of the above trees represents a particular ancestral genome +(a *haploid* genome; diploid individuals would be represented by two nodes). +Details about each node, including the time it lived, are stored in a +{class}`NodeTable` (details {ref}`here`) + +```{code-cell} ipython3 +ts.tables.nodes +``` + +Importantly, the first column, ``id``, is not actually recorded, and is +only shown when printing out node tables (as here) for convenience. +The second column, ``flags`` records a ``1`` for the individuals that are *samples*, +i.e., whose entire genealogical history is recorded by these trees. +(Note that the trees above record that node 3 inherited from node 4 +on the middle portion of the genome, but not on the ends.) + +The way the nodes are related to each other (i.e. the tree topology) is stored +in an {class}`EdgeTable` (details {ref}`here`). +Since some edges are present in more than one tree (e.g., node 1 inherits from node 4 +across the entire sequence), each edge contains not only the IDs of a parent and a child +node, but also the left and right positions which define the genomic region for which it +appears in the trees: + +```{code-cell} ipython3 +ts.tables.edges +``` + +Since node 3 is most recent, the edge that says that nodes 0 and 2 inherit +from node 3 on the interval between 0.2 and 0.8 comes first. Next are the +edges from node 4: there are four of these, as the edge from node 4 to node +1 is shared across the entire sequence, and for each of the three +genomic intervals there is an additional child node. At this +point, we know the full tree on the middle interval. Finally, edges +specifying the common ancestor of 0 and 4 on the remaining intervals (parents 6 +and 5 respectively) allow us to construct all trees across the entire interval. + + + +#### Sites and mutations tables + +Most tree sequences have DNA variation data associated with them, +{ref}`stored as mutations overlaid on the trees`: + +```{code-cell} ipython3 +ts = tskit.load("data/tables_example_muts.trees") +mut_labels = {} # An array of labels for the mutations, listing position & allele change +for mut in ts.mutations(): # This entire loop is just to make pretty labels + site = ts.site(mut.site) + older_mut = mut.parent >= 0 # is there an older mutation at the same position? + prev = ts.mutation(mut.parent).derived_state if older_mut else site.ancestral_state + mut_labels[mut.id] = "{}→{} @{:g}".format(prev, mut.derived_state, site.position) + +SVG(ts.draw_svg(y_axis=True, mutation_labels=mut_labels)) +``` + +There are four mutations in the depiction above, +marked by red crosses: one above node ``4`` on the first tree which records an A to G +transition at position 15, another above node ``1`` in the second tree which records a G +to A transition at position 45, and the final two above nodes ``2`` and ``0`` recording +transitions, both at position 60, on the second tree. The positions are recorded in the +{class}`SiteTable` (details {ref}`here`): + +```{code-cell} ipython3 +ts.tables.sites +``` + +As with node tables, the ``id`` column is **not** actually recorded, but is +implied by the position in the table. The mutations themselves are recorded in the +{class}`MutationTable` (details {ref}`here`). +This associates each mutation with a site ID, the ID of the node above which the +mutation occurs, the derived state to which the allele has mutated, and (optionally) a +time at which the mutation occured. + + +```{code-cell} ipython3 +ts.tables.mutations +``` + +Where there are multiple mutations at the same site, there can be mutations +stacked on top of each other. The "parent" column therefore contains the ID of the +mutation immediately above the current one at the same site, or -1 if there is no parent. + +These sites and mutations allow us to calculate the DNA sequence, or haplotype, for each +of the sample nodes: + +```{code-cell} ipython3 +for sample, h in enumerate(ts.haplotypes()): + print(f"Sample {sample}: {h}") +``` + +#### Other tables + +The other tables in a {class}`TableCollection` are the {class}`IndividualTable` (which +records which {ref}`individual organism` a node is contained +in), the {class}`PopulationTable` and the {class}`MigrationTable`, and the +{class}`ProvenanceTable` which records the {ref}`tskit:sec_provenance` of the data in +a tree sequence. These won't be covered in this tutorial. + +#### Metadata + +You may have noticed ``metadata`` columns in all the tables above. All tables can have +(optional) metadata attached to their rows, as described in the {ref}`sec_metadata` +section of the official {program}`tskit` documentation. +The {ref}`metadata tutorial` provides more information about how +to set and use these metadata columns in tree sequence tables. + + +### Accessing table data + +To look at a the contents of a table, you can simply `print` it: + +```{code-cell} ipython3 +print(ts.tables.mutations) +``` + +But {program}`tskit` also provides access to the rows and columns of each table. + +#### Row access + +Rows can be accessed using square braces, which will return an object containing the +raw values: + +```{code-cell} ipython3 +row = ts.tables.mutations[1] +print(f"A mutation at site {row.site} exists at time {row.time}") +print(row) +``` + +Additionally, many rows can be extracted into a new table using slices or +{ref}`numpy indexing` + +```{code-cell} ipython3 +ts.tables.mutations[2:4] +``` + +:::{note} +When manipulating table data, it is quite possible to create a table collection that +does not correspond to a valid tree sequence. For instance, if we replaced the mutations +table in our original tree sequence with the table above, the parent column +would refer to an invalid mutation ID (ID 2, when in the new tables we only have +mutations with IDs 0 and 1). In this particular case there is a method, +{meth}`TableCollection.compute_mutation_parents`, which will recalculate the parent IDs +from the trees, but there is no general way to ensure that a manipulated table will +remain valid. +::: + +Rows can also be *added* to a table using ``.add_row()``. However, when modifying tables +from a tree sequence, you should always modify a *copy* of the original data, +using the {meth}`TreeSequence.dump_tables` method: + +```{code-cell} ipython3 +new_tables = ts.dump_tables() +new_pos = 10 +new_site_id = new_tables.sites.add_row(position=new_pos, ancestral_state="G") +print("New empty site allocated at position {new_pos} with ID {new_site_id}") +new_tables.sites +``` + +Note that one of the requirements of a tree sequence is that the sites are listed in +order of position, so this new table will require sorting (see +{ref}`sec_data_structures_tables_creating`) + + +#### Column access + +An entire column in a table can be extracted as a {program}`numpy` array from the table +object. For instance, if ``n`` is a {class}`NodeTable`, then ``n.time`` +will return an array containing the birth times of the individuals whose genomes +are represented by the nodes in the table. *However*, it is important to note that this +is a copy of the data, and modifying individual elements of ``n.time`` will *not* change +the node table ``n``. To change the column data, you need to take a copy, modify it, +and assign it back in. For example, here we add 0.25 to every ``time`` except the first +in the node table from our current example: + +```{code-cell} ipython3 +node_table = new_tables.nodes +times = node_table.time +print("Old node times:", times) +times[1:] = times[1:] + 0.25 +node_table.time = times +node_table +``` + +When assigning columns like this, an error will be raised if the column is not of the +expected length: + +```{code-cell} ipython3 +:tags: [raises-exception] +node_table.time = times[2:] +``` + +(sec_data_structures_tables_creating)= + +### Turning tables into a tree sequence + +The {meth}`TableCollection.tree_sequence` method will attempt to turn a table collection +into a tree sequence. This is not guaranteed to work: it's possible you have created a +nonsensical tree sequence where, for example, a child node has multiple parents at +a given position. The {ref}`tskit:sec_valid_tree_sequence_requirements` section of the +official tskit docs lists the requirements; these include the correct order for rows in +a number of tables. For instance, the sites table is +{ref}`required` to be sorted in order of position. Since +this is not true of the sites table in the ``new_tables`` collection we have just +created, we need to {meth}`~TableCollection.sort` the table collection before turning +it into a tree sequence + +```{code-cell} ipython3 +# new_ts = new_tables.tree_sequence() # This won't work +new_tables.sort() +new_ts = new_tables.tree_sequence() # Now it will +# Plot without mutation labels, for clarity +SVG(new_ts.draw_svg(y_axis=True, y_gridlines=True, mutation_labels={})) +``` + +Here you can see that a new empty site has been added at position 10 (represented by a +tickmark above the X axis with no mutations on it), and all the nodes except node 0 +have had their times increased by 0.25. + +### Constructing a tree sequence + +With the tools above in mind, we can now see how to construct a tree sequence by hand. +It's unlikely that you would ever need to do this from scratch, but it's helpful to +understand how tables work. We'll build the simplest possible tree sequence, a single +tree that looks like this: + +```{code-cell} ipython3 +SVG(tskit.load("data/construction_example.trees").draw_svg(y_axis=True)) +``` + +Starting with an empty set of tables, we can fill, say, the node information by using +{meth}`NodeTable.add_row` as follows: + +```{code-cell} ipython3 +tables = tskit.TableCollection(sequence_length=1e3) +node_table = tables.nodes # set up an alias, for efficiency +node_table.add_row(flags=tskit.NODE_IS_SAMPLE) # Node 0: defaults to time 0 +node_table.add_row(flags=tskit.NODE_IS_SAMPLE) # Node 1: defaults to time 0 +node_table.add_row(time=10) # Node 2: not a sample +node_table.add_row(flags=tskit.NODE_IS_SAMPLE) # Node 3 +node_table.add_row(time=20) # Node 4 +node_table +``` + +The ``.add_row()`` method is natural (and should be reasonably efficient) if +new records appear one-by-one. Alternatively ``.set_columns()`` can be used to +set columns for all the rows at once, by passing in numpy arrays of the appropriate +type. We'll use that for the edges: + +```{code-cell} ipython3 +import numpy as np +edge_table = tables.edges +edge_table.set_columns( + left=np.array([0.0, 0.0, 0.0, 0.0]), + right=np.array([1e3, 1e3, 1e3, 1e3]), + parent=np.array([2, 2, 4, 4], dtype=np.int32), # References IDs in the node table + child=np.array([0, 1, 2, 3], dtype=np.int32), # References IDs in the node table +) +edge_table +``` + +And finally we can add a site and a mutation: here we'll use 0/1 mutations rather than +ATGC. + +```{code-cell} ipython3 +site_id = tables.sites.add_row(position=500.0, ancestral_state='0') +tables.mutations.add_row(site=site_id, node=2, derived_state='1') +ts = tables.tree_sequence() +print("A hand-built tree sequence!") +SVG(ts.draw_svg(y_axis=True)) +``` + +:::{note} +The ordering requirements for a valid tree sequence +{ref}`do not specify` that rows in the +node table have to be sorted in any particular order, e.g. by time. By convention, +sample nodes are often the first ones listed in the node table, and this is the node +order returned by {meth}`~TreeSequence.simplify`, but the example above shows that +sample nodes need not necessarily be those with the IDs $0..n$. +::: + +(sec_data_structures_editing)= + +## Editing tree sequences + +Sometimes we wish to make some minor modifications to a tree sequence that has +been generated by a simulation. However, tree sequence objects are **immutable** +and so we cannot edit them in place. + + +Several {program}`tskit` methods will return a new tree sequence that has been +modified in some way. For example: +- {meth}`TreeSequence.delete_sites` returns a tree sequence with certain sites + deleted +- {meth}`TreeSequence.delete_intervals` and {meth}`TreeSequence.keep_intervals` + return a tree sequence whose trees cover a smaller fraction of the genome (and which can + be combined with {meth}`TreeSequence.trim` to change the coordinate system) +- {meth}`TreeSequence.simplify` returns a new tree sequence with some sample nodes removed + (see the {ref}`simplification tutorial`). +- {meth}`TreeSequence.union` returns a new tree sequence formed by merging two other tree + sequences together. + +However, if you want to do something not covered by those built-in methods, you will +need to edit a copy of the underlying tables and then create a new tree sequence from +the modified tables. In the following example, we use this +approach to remove all singleton sites from a given tree sequence. + +```{code-cell} ipython3 +import dataclasses +def strip_singletons(ts): + tables = ts.dump_tables() + tables.sites.clear() + tables.mutations.clear() + for tree in ts.trees(): + for site in tree.sites(): + assert len(site.mutations) == 1 # Only supports infinite sites muts. + mut = site.mutations[0] + if tree.num_samples(mut.node) > 1: + site_id = tables.sites.append(site) + mut = dataclasses.replace(mut, site=site_id) # set the new site id + tables.mutations.append(mut) + return tables.tree_sequence() +``` + +This function takes a tree sequence containing some infinite sites mutations as +input, and returns a copy in which all singleton sites have been removed. +The approach is very simple: we get a copy of the underlying +table data in a {class}`TableCollection` object, and first clear the +site and mutation tables. We then consider each site in turn, +and if the number of samples with +the mutation is greater than one, we add the site and mutation to our +output tables using {meth}`SiteTable.append` and {meth}`MutationTable.append`. These +functions act exactly like {meth}`SiteTable.add_row` and {meth}`MutationTable.add_row` +but they take an existing site or mutation and add all its details as a new row. + + +:::{note} +In this code we consider only simple infinite sites mutations, +where we cannot have back or recurrent mutations. These would require a slightly +more involved approach where we keep a map of mutation IDs so that +mutation ``parent`` values could be computed. +::: + +After considering each site, we then create a new tree sequence using +the {meth}`TableCollection.tree_sequence` method on our updated tables. +We can test this on a tree sequence that has been mutated using +{func}`msprime.sim_mutations` with the ``discrete_genome`` +parameter set to ``False``, which creates infinite sites mutations: + +```{code-cell} ipython3 +import msprime + +ts = msprime.sim_ancestry(10, random_seed=123) +ts = msprime.sim_mutations(ts, rate=10, discrete_genome=False, random_seed=123) +print(ts.num_sites, "sites in the simulated tree sequence") + +ts_new = strip_singletons(ts) +print(ts_new.num_sites, "sites after removing singletons") +``` + +:::{todo} +Add another example here where we use the array oriented API to edit +the nodes and edges of a tree sequence. Perhaps recapitating would be a +good example? +::: + +(sec_data_structures_trees)= ## Trees + +A {class}`Tree` represents a single tree in a {class}`TreeSequence`. +The {program}`tskit` Tree implementation differs from most tree libraries by +using **integer IDs** to refer to nodes rather than objects. Thus, when we wish to +find the parent of the node with ID '0', we use ``tree.parent(0)``, which +returns another integer. If '0' does not have a parent in the current tree +(e.g., if it is a root), then the special value {data}`tskit.NULL` +({math}`-1`) is returned. The children of a node are found using the +{meth}`Tree.children` method. To obtain information about a particular node, +one may either use ``tree.tree_sequence.node(u)`` to which returns the +corresponding {class}`Node` instance, or use the {attr}`Tree.time` or +{attr}`Tree.population` shorthands. + +:::{todo} +Add an example of using the tree structure. Note that traversing through the +structure is covered in a different tutorial. +::: diff --git a/getting_started.md b/getting_started.md index 5f6f08ea..37b5d8e9 100644 --- a/getting_started.md +++ b/getting_started.md @@ -56,6 +56,8 @@ ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=4321) ts ``` +You can see that there are many thousands of trees in this tree sequence. + :::{note} Since we simulated the ancestry of 20 *diploid* individuals, our tree sequence contains 40 *sample nodes*, one for each genome. @@ -68,25 +70,61 @@ sequence. This process underlies many tree sequence algorithms, including those encounter later in this tutorial for calculating {ref}`population genetic statistics`. To iterate over your own tree sequence you can use -{meth}`TreeSequence.trees()`. Here, for example, is -how to check whether all trees in the tree sequence have fully coalesced (which is to be -expected in reverse-time, coalescent simulations, but not always for tree sequences -produced by {ref}`forward simulation `). +{meth}`TreeSequence.trees()`. + +```{code-cell} ipython3 +print(f"== Tree sequence has {ts.num_trees} trees ==") +for tree in ts.trees(): + print(f"Tree {tree.index} covers {tree.interval}") + if tree.index >= 4: + print("...") + break +print(f"Tree {ts.last().index} covers {ts.last().interval}") +``` + +Here we've also used {meth}`TreeSequence.last()` to access +the last tree directly; it may not surprise you that there's a corresponding +{meth}`TreeSequence.first()` method to return the first tree. + +:::{warning} +The code above doesn't store the tree anywhere, but merely performs a calculation on +each tree within the ``for`` loop. That's because, for efficiency, the +{meth}`trees()` method repeatedly returns the same tree object, +updated internally to reflect the (usually small) changes from tree to tree along the +sequence. For that reason, this won't work: + +``` +list(ts.trees()) # Don't do this! Every tree in the list will be an identical "null" tree +``` + +If you really do want separate instances of each tree (inefficient, and for large tree +sequences risks using up all your computer memory), you can use the +{meth}`TreeSequence.aslist()` method. +::: + +Above, we stopped iterating after Tree 4 to limit the printed output, but iterating +forwards through trees in a tree sequence (or indeed backwards using the standard Python +``reversed`` function) is efficient. That means it's quick, for example to check if all +the trees in a tree sequence have fully coalesced (which is to be expected in +reverse-time, coalescent simulations, but not always for tree sequences produced by +{ref}`forward simulation `). ```{code-cell} ipython3 +import time +start = time.time() for tree in ts.trees(): if tree.has_multiple_roots: print("Tree {tree.index} has not coalesced") break else: - print("All trees in the tree sequence have coalesced") + print(f"All {ts.num_trees} trees have coalesced. Checked in {time.time()-start} seconds") ``` -Since all the trees *have* coalesced, at each position in the genome this tree sequence -must have only one most recent common ancestor (MRCA) of the 40 sample nodes. Below, we -iterate over the trees again, finding the IDs of the root (MRCA) node for each tree. The +Now that we know all trees have coalesced, we know that at each position in the genome +all the 40 sample nodes must have one most recent common ancestor (MRCA). Below, we +iterate over the trees, finding the IDs of the root (MRCA) node for each tree. The time of this root node can be found via the {meth}`tskit.TreeSequence.node` method, which -returns a {class}`node object` with attributes including the node time: +returns a {class}`node object` whose attributes include the node time: ```{code-cell} ipython3 import matplotlib.pyplot as plt @@ -105,17 +143,19 @@ plt.show() ``` It's obvious that there's something unusual about the trees in the middle of this -chromosome, where the selective sweep occurred. It's not particularly efficient to pull -out a tree from the middle of a tree sequence, but it *can* be done, via the -{meth}`TreeSequence.at()` method. Here's the tree at the location -of the sweep, drawn using the {meth}`Tree.draw_svg()` method -(see the {ref}`visualization tutorial ` for more visualization -possibilities): +chromosome, where the selective sweep occurred. + +Currently, it's not particularly efficient to pull out individual trees from the middle +of a tree sequence, but it *can* be done, via the +{meth}`TreeSequence.at()` method. Here's the tree at location +$5\ 000\ 000$ (the position of the sweep) drawn using the +{meth}`Tree.draw_svg()` method (see the +{ref}`visualization tutorial ` for more visualization possibilities): ```{code-cell} ipython3 from IPython.display import SVG -swept_tree = ts.at(5_000_000) # or you can get e.g. the first tree using ts.first() +swept_tree = ts.at(5_000_000) # or you can get e.g. the nth tree using ts.at_index(n) intvl = swept_tree.interval print(f"Tree number {swept_tree.index}, which runs from position {intvl.left} to {intvl.right}:") # Draw it at a wide size, to make room for all 40 tips From fd6f7f922da524c3447223a7836161b02e615fee Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Sat, 19 Jun 2021 22:43:01 +0100 Subject: [PATCH 2/4] Port tree traversal material into new analysis tut And create a stub for tree sequence analysis --- _toc.yml | 4 + analysing_tree_sequences.md | 25 ++ analysing_trees.md | 409 ++++++++++++++++++++++++++++++ data/different_time_samples.trees | Bin 0 -> 6764 bytes data/parsimony_map.pickle | Bin 0 -> 761 bytes data/parsimony_map.trees | Bin 0 -> 7156 bytes data/parsimony_simple.trees | Bin 0 -> 7156 bytes data/tree_traversals.trees | Bin 0 -> 5740 bytes requirements.txt | 1 + 9 files changed, 439 insertions(+) create mode 100644 analysing_tree_sequences.md create mode 100644 analysing_trees.md create mode 100644 data/different_time_samples.trees create mode 100644 data/parsimony_map.pickle create mode 100644 data/parsimony_map.trees create mode 100644 data/parsimony_simple.trees create mode 100644 data/tree_traversals.trees diff --git a/_toc.yml b/_toc.yml index 9bdd798c..ae0e00e0 100644 --- a/_toc.yml +++ b/_toc.yml @@ -13,6 +13,10 @@ parts: - file: simplification - file: metadata - file: tskitr +- caption: Analysis + chapters: + - file: analysing_tree_sequences + - file: analysing_trees - caption: Backward time simulations # TODO this will be broken down more finely as we add more content chapters: diff --git a/analysing_tree_sequences.md b/analysing_tree_sequences.md new file mode 100644 index 00000000..02fec1de --- /dev/null +++ b/analysing_tree_sequences.md @@ -0,0 +1,25 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_analysing_tree_sequences)= + +# Analysing tree sequences + +:::{todo} +Add all stats material from https://tskit.dev/tskit/docs/stable/tutorial.html + +Should we rename this "calculating statistics", or something else with the word "statistics" in the name? +::: \ No newline at end of file diff --git a/analysing_trees.md b/analysing_trees.md new file mode 100644 index 00000000..819c3061 --- /dev/null +++ b/analysing_trees.md @@ -0,0 +1,409 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +```{code-cell} ipython3 +:tags: [remove-cell] +import io +import pickle + +import msprime +import tskit + +def tree_traversals(): + nodes = """\ + id is_sample time + 0 1 0 + 1 1 0 + 2 1 0 + 3 1 0 + 4 1 0 + 5 0 1 + 6 0 2 + 7 0 3 + """ + edges = """\ + left right parent child + 0 1 5 0,1,2 + 0 1 6 3,4 + 0 1 7 5,6 + """ + # NB same tree as used above, and we're using the same diagram. + ts = tskit.load_text( + nodes=io.StringIO(nodes), edges=io.StringIO(edges), strict=False + ) + ts.dump("data/tree_traversals.trees") + +def different_time_samples(): + samples = [ + msprime.Sample(0, 0), + msprime.Sample(0, 1), + msprime.Sample(0, 20), + ] + ts = msprime.simulate( + Ne=1e6, + samples=samples, + demographic_events=[ + msprime.PopulationParametersChange(time=10, growth_rate=2, population_id=0), + ], + random_seed=42, + ) + ts.dump("data/different_time_samples.trees") + +def parsimony_simple(): + ts = msprime.sim_ancestry(3, random_seed=42) + ts.dump("data/parsimony_simple.trees") + +def parsimony_map(): + # pick a seed that gives the tips in the right order + ts = msprime.sim_ancestry(3, sequence_length=100, random_seed=38) + ts.dump("data/parsimony_map.trees") + ts = msprime.sim_mutations( + ts, rate=0.01, random_seed=123, discrete_genome=False) + data = [ + {'pos':v.site.position, 'alleles': v.alleles, 'genotypes':v.genotypes} + for v in ts.variants()] + with open("data/parsimony_map.pickle", "wb") as f: + pickle.dump(data, f) + + +def create_notebook_data(): + tree_traversals() + different_time_samples() + parsimony_simple() + parsimony_map() + +# create_notebook_data() # uncomment to recreate the tree seqs used in this notebook +``` + +(sec_analysing_trees)= +# Analysing trees + +There are a number of different ways we might want to analyse a single {class}`Tree`. +Most involve some sort of traversal over the nodes, mutations, or branches in the tree. +{program}`tskit` provides various way of traversing through a tree, and also some +built in phylogenetic algorithms such as {meth}`Tree.map_mutations` which efficently +places mutations ("characters" in phylogenetic terminology) on a given tree. + + +(sec_analysing_trees_traversals)= + +## Tree traversals + +There are various a single {class}`Tree`, traversals in various orders are possible using the +{meth}`~Tree.nodes` iterator. For example, in the following tree we can visit the +nodes in different orders: + + +```{code-cell} ipython3 +import tskit +from IPython.display import SVG, display + +ts = tskit.load("data/tree_traversals.trees") +tree = ts.first() +display(SVG(tree.draw_svg())) + +for order in ["preorder", "inorder", "postorder"]: + print(f"{order}:\t", list(tree.nodes(order=order))) +``` + +Much of the time, the specific ordering of the nodes is not important +and we can leave it out (defaulting to preorder traversal). For example, +here we compute the total branch length of a tree: + +```{code-cell} ipython3 +total_branch_length = sum(tree.branch_length(u) for u in tree.nodes()) +print(f"Total branch length: {total_branch_length}") +``` + +Note that this is also available as the {attr}`Tree.total_branch_length` attribute. + +### Traversing upwards + +For many applications it is useful to be able to traverse upwards from the +leaves. We can do this using the {meth}`Tree.parent` method, which +returns the parent of a node. For example, we can traverse upwards from +each of the samples in the tree: + +```{code-cell} ipython3 +for u in tree.samples(): + path = [] + v = u + while v != tskit.NULL: + path.append(v) + v = tree.parent(v) + print(u, "->", path) +``` + +### Traversals with information + +Sometimes we will need to traverse down the tree while maintaining +some information about the nodes that are above it. While this +can be done using recursive algorithms, it is often more convenient +and efficient to use an iterative approach. Here, for example, +we define an iterator that yields all nodes in preorder along with +their path length to root: + +```{code-cell} ipython3 +def preorder_dist(tree): + for root in tree.roots: + stack = [(root, 0)] + while len(stack) > 0: + u, distance = stack.pop() + yield u, distance + for v in tree.children(u): + stack.append((v, distance + 1)) + +print(list(preorder_dist(tree))) +``` + + +(sec_tutorial_networkx)= + +## Networkx + +Traversals and other network analysis can also be performed using the sizeable +[networkx](https://networkx.github.io/documentation/stable/index.html) +library. This can be achieved by calling {meth}`Tree.as_dict_of_dicts` to +convert a {class}`Tree` instance to a format that can be imported by networkx to +create a graph: + +```{code-cell} ipython3 +import networkx as nx + +g = nx.DiGraph(tree.as_dict_of_dicts()) +print(sorted(g.edges)) +``` + +### Traversing upwards in networkx + +We can revisit the above examples and traverse upwards with +networkx using a depth-first search algorithm: + +```{code-cell} ipython3 +g = nx.DiGraph(tree.as_dict_of_dicts()) +for u in tree.samples(): + path = [u] + [parent for parent, child, _ in + nx.edge_dfs(g, source=u, orientation="reverse")] + print(u, "->", path) +``` + + +### Calculating distances to the root + +Similarly, we can yield the nodes of a tree along with their distance to the +root in pre-order in networkx as well. Running this on the example above gives us +the same result as before: + +```{code-cell} ipython3 +g = nx.DiGraph(tree.as_dict_of_dicts()) +for root in tree.roots: + print(sorted(list(nx.shortest_path_length(g, source=root).items()))) +``` + + +### Finding nearest neighbors + +If some samples in a tree are not at time 0, then finding the nearest neighbor +of a sample is a bit more involved. Instead of writing our own traversal code +we can again draw on a networkx algorithm. +Let us start with an example tree with three samples that were sampled at +different time points: + +```{code-cell} ipython3 +ts = tskit.load("data/different_time_samples.trees") +tree = ts.first() +SVG(tree.draw_svg(y_axis=True, time_scale="rank")) +``` + +The generation times for these nodes are as follows: + +```{code-cell} ipython3 +for u in tree.nodes(): + print(f"Node {u}: time {tree.time(u)}") +``` + +Note that samples 0 and 1 are about 35 generations apart from each other even though +they were sampled at almost the same time. This is why samples 0 and 1 are +closer to sample 2 than to each other. + +For this nearest neighbor search we will be traversing up and down the tree, +so it is easier to treat the tree as an undirected graph: + +```{code-cell} ipython3 +g = nx.Graph(tree.as_dict_of_dicts()) +``` + +When converting the tree to a networkx graph the edges are annotated with their +branch length: + +```{code-cell} ipython3 +for e in g.edges(data=True): + print(e) +``` + +We can now use the "branch_length" field as a weight for a weighted shortest path +search: + +```{code-cell} ipython3 +import collections +import itertools + +# a dictionary of dictionaries to represent our distance matrix +dist_dod = collections.defaultdict(dict) +for source, target in itertools.combinations(tree.samples(), 2): + dist_dod[source][target] = nx.shortest_path_length( + g, source=source, target=target, weight="branch_length" + ) + dist_dod[target][source] = dist_dod[source][target] + +# extract the nearest neighbor of nodes 0, 1, and 2 +nearest_neighbor_of = [min(dist_dod[u], key=dist_dod[u].get) for u in range(3)] + +print(dict(zip(range(3), nearest_neighbor_of))) +``` + + +(sec_analysing_trees_parsimony)= + +## Parsimony + +The {meth}`Tree.map_mutations` method finds a parsimonious explanation for a +set of discrete character observations on the samples in a tree using classical +phylogenetic algorithms. + +```{code-cell} ipython3 +tree = tskit.load("data/parsimony_simple.trees").first() +alleles = ["red", "blue", "green"] +genotypes = [0, 0, 0, 0, 1, 2] +styles = [f".n{j} > .sym {{fill: {alleles[g]}}}" for j, g in enumerate(genotypes)] +display(SVG(tree.draw_svg(style="".join(styles)))) + +ancestral_state, mutations = tree.map_mutations(genotypes, alleles) +print("Ancestral state = ", ancestral_state) +for mut in mutations: + print(f"Mutation: node = {mut.node} derived_state = {mut.derived_state}") +``` + +So, the algorithm has concluded, quite reasonably, that the most parsimonious +description of this state is that the ancestral state is red and there was +a mutation to blue and green over nodes 4 and 5. + +### Building tables + +One of the main uses of {meth}`Tree.map_mutations` is to position mutations on a tree +to encode observed data. In the following example we show how a set +of tables can be updated using the {ref}`Tables API`; here we +infer the location of mutations in an simulated tree sequence of one tree, +given a set of site positions with their genotypes and allelic states: + +```{code-cell} ipython3 +import pickle +ts = tskit.load("data/parsimony_map.trees") +with open("data/parsimony_map.pickle", "rb") as file: + data = pickle.load(file) # Load saved variant data from a file +display(SVG(ts.draw_svg(size=(500, 300), time_scale="rank"))) +print("Variant data: pos, genotypes & alleles as described by the ts.variants() iterator:") +for i, v in enumerate(data): + print(f"Site {i} (pos {v['pos']:7.4f}): alleles: {v['alleles']}, genotypes: {v['genotypes']}") +print() +tree = ts.first() # there's only one tree anyway +tables = ts.dump_tables() +# Infer the sites and mutations from the variants. +for variant in data: + ancestral_state, mutations = tree.map_mutations(variant["genotypes"], variant['alleles']) + site_id = tables.sites.add_row(variant['pos'], ancestral_state=ancestral_state) + info = f"Site {site_id}: parsimony sets ancestral state to {ancestral_state}" + parent_offset = len(tables.mutations) + for mut in mutations: + parent = mut.parent + if parent != tskit.NULL: + parent += parent_offset + mut_id = tables.mutations.add_row( + site_id, node=mut.node, parent=parent, derived_state=mut.derived_state) + info += f", and places mutation {mut.id} to {mut.derived_state} above node {mut.node}" + print(info) +new_ts = tables.tree_sequence() +``` + +```{code-cell} ipython3 +mut_labels = {} # An array of labels for the mutations +for mut in new_ts.mutations(): # Make pretty labels showing the change in state + site = new_ts.site(mut.site) + older_mut = mut.parent >= 0 # is there an older mutation at the same position? + prev = new_ts.mutation(mut.parent).derived_state if older_mut else site.ancestral_state + mut_labels[site.id] = f"{mut.id}: {prev}→{mut.derived_state}" + +display(SVG(new_ts.draw_svg(size=(500, 300), mutation_labels=mut_labels, time_scale="rank"))) +``` + + +### Parsimony and missing data + +The Hartigan parsimony algorithm in {meth}`Tree.map_mutations` can also take missing data +into account when finding a set of parsimonious state transitions. We do this by +specifying the special value {data}`tskit.MISSING_DATA` (-1) as the state, which is +treated by the algorithm as "could be anything". + +For example, here we state that sample 0 is missing, and use the colour white to indicate +this: + +```{code-cell} ipython3 +tree = tskit.load("data/parsimony_simple.trees").first() +alleles = ["red", "blue", "green", "white"] +genotypes = [tskit.MISSING_DATA, 0, 0, 0, 1, 2] +styles = [f".n{j} > .sym {{fill: {alleles[g]}}}" for j, g in enumerate(genotypes)] +display(SVG(tree.draw_svg(style="".join(styles)))) + +ancestral_state, mutations = tree.map_mutations(genotypes, alleles) +print("Ancestral state = ", ancestral_state) +for mut in mutations: + print(f"Mutation: node = {mut.node} derived_state = {mut.derived_state}") +``` + + +The algorithm decided, again, quite reasonably, that the most parsimonious explanation +for the input data is the same as before. Thus, if we used this information to fill +out mutation table as above, we would impute the missing value for 0 as red. + +The output of the algorithm can be a little surprising at times. Consider this example:: + +```{code-cell} ipython3 +tree = msprime.simulate(6, random_seed=42).first() +alleles = ["red", "blue", "white"] +genotypes = [1, -1, 0, 0, 0, 0] +node_colours = {j: alleles[g] for j, g in enumerate(genotypes)} +ancestral_state, mutations = tree.map_mutations(genotypes, alleles) +print("Ancestral state = ", ancestral_state) +for mut in mutations: + print(f"Mutation: node = {mut.node} derived_state = {mut.derived_state}") +SVG(tree.draw(node_colours=node_colours)) +``` + + +Note that this is putting a mutation to blue over node 6, **not** node 0 as +we might expect. Thus, we impute here that node 1 is blue. It is important +to remember that the algorithm is minimising the number of state transitions; +this may not correspond always to what we might consider the most parsimonious +explanation. + +## Fast tree-based algorithms using numba + +:::{todo} +Add a few examples here of how to use numba to speed up tree based dynamic programming +algorithms. There are a good number of worked-up examples with timings in +[issue #63](https://github.com/tskit-dev/tutorials/issues/63) +::: + diff --git a/data/different_time_samples.trees b/data/different_time_samples.trees new file mode 100644 index 0000000000000000000000000000000000000000..233cd155aad284de56962e40dab4369c56230760 GIT binary patch literal 6764 zcmbW5O^6&t6vroOWQ`w*8j(ba%@`MckkN2Ye%6_c#?ko{q)Zker{vi%e&3Pojm{Re)9l5i#H5+u;gv#ckz5; zuX)&v*LmxocbUJ(ObZ8@=V|7!lQkbO|A56qpXC7YuUY(j&iGj0x6I$gfxttb6(9Ax z!941}kh6a9`?JOOnTJ2v*(2)n8}q3DW#(_O0c(!+{muNttba3S{^0lRF(8ibdpUXJ z-(db;j{gz!R=OLesQ)qMf#u~9|19$lS@ChcHUwq?^va6=F%roFS&sN`Sn z-^qc&|By32_U{1mcz%YnM z-;BrpDJ%a8e25`X#t8qfF%Jy<9}l14?b`F?=1f5obgfrpRD}z@~F%2%;UI%e+&j0qiA=9nr!tv zzil3U(TUAdAY!-e#_sx=6LmUDSSF5Y^+dpyeOHQ*4W!rY#hpk7Zaf*ff#Am_A(a<} zUf32Z6kbPgyif@llhlc1TgY`fDS~Lp^@kd7$!iZ>Urly=x2wo>DDb0}tDTIKaq4Lm z);Lb~KjysNG-X!3E*BQDAB4ilJ_4^RlZ@45Tc|iS)jN@zf*T`3gpn&4ZwwMfZIN=3 zY*1>hqJeCs4Pq}4!5|){D7_`ob`%M+#K>j{A(@dINHR^4_s~a@_Hbq@JMyQzbT&F= zZO_NUJdXCt%#p(S87E{3&FMD6e$*fMNj1z|2OG)MB>aw(vG=hm6QeJqB@wz|i;|b3 zMK@ui1YT06Itkj6pr5uL3K>x1m4P77jxWM)+|xBvlS!yp(%nk7bgU430$_c{60@-; zyotuLAJIq`b1?AQ+>3N?)$hk!GkQ}Vv1awJ?tAEcg~H9J47+iR)derE#8L>AnCSoS z0+-H5`_#E@*8dYHWB$<`e$(oI$NZ%A+k3`3r{?{y{`u1RzBd2--j(a_j;VS0TPv9Z z=5XsR%@s?vO0`lg9V(Vo)QOj=Ym;!Tq}#OC2C6UVSZTe*)Jm5~NOzDdW9sAy){26_ z4cnSjUZAf?f+{+czHpRxQ9BI>zE9{W{amXiKb0y8%!wQs=%XJjdSPYowy?0(N{If8C$tP5~% zt)%Z8J=OGrypRsNzEtbh-IdXD+(UMCUBqEGAyR`Qf#d07YMJA-d{-&Qp*gmtGR$JN zvM_YAFx-=)JvZ!%(xvmnn>|RfJV*YjB-3M!$9na=whE|Y`V@ip$aEiPSO$%E=#?wT zRMJLAfun@bDNgBy3`^e?p`h|ca^$Kb$vCMJ`=(YSV~6X-G7+{d31`ruGvm0jn~EO} z0>^h3g|CxZpYNyvk}8&iBwf+>J^C!r-x&14L5k`nN?C+Uo{U0zSHvV*D@EEH-IG;C zFB5t!9dqTf7wQY3TfLUN;#b@2sVsW-uot~>z=NgUT!m%@?uh$+g$|#EGR>DE~aIHCAsnu(h#;eWA z7mlAie&qCV=j`l^Gc%=g2weJl3zHmHj8HGGzZ&;0X>LKKUUGg>s$OnsPDy5BQBh*$lpdb=qSWO4yyB9g(&UmUlc)5s zf)!5bVb4o}D49IPo1t}zGh@<}_9;PAG`tzT*;=P$aQCpLfQ$xdVaha|(%Au#W$t0D znd0Z?=k*^5z=StL$&{o{XEsJgkn^ViIi*1NXE3=tSX5ris$gKq0O|<=y4e}%<_z`> zt_)s~g&F+F)`zdg9$u9*YYESJGTs?C9NFF6y))*P4jkOi!nHG`m^(L+1%+{?p7P@4&aHm08g UfA!y83D!(RRDmKI995}$0J@~~{r~^~ literal 0 HcmV?d00001 diff --git a/data/parsimony_map.trees b/data/parsimony_map.trees new file mode 100644 index 0000000000000000000000000000000000000000..0ac459ab82006c3416d7900221d731472b96ff44 GIT binary patch literal 7156 zcmc(iO^g&(5XT1vLHqzekPi(u9S2edxM|CjK0h~5%@(D=cxU!8#) zgg;P_NB%Y8Zxa5v@K;N}UxWJZ5FXQhLim0h!$aA6n=#$&K1lbets8TmT{>lkNlR% z!donl^|$&md90s1g-891=8yVUg-89t!$0bzx6tp_`11wpkNQn%<1fmiejVXu8cRia z#D7+JX$td*fATW(NB!OseytR^X#SYLkA&Z-@$j$D8~V_HE&R=*xKMEZ5dSCP*K0id zBc9$uzf1yy|FU3w%-=@gaegh99}*txXR-Xfml+@H=Yh-QQU3$N`^&1A_?-3sN*V6^#WBv~bkMsYG@TgC}_|PBK@;@hh zKMr!_|B{wJc=(6C-a>y$cZz$Dd-GM|G@Zm**t5f_saRN?q!X0u$6+UG20bpuIX zQ{+AGBTak0msPgwFLv|UaLQ~i%OiX3&Q;b&3JZIZkS5GlcP4Cy?T(jLBkOapt}Gsb z-(E8D-cyyuXh-3U3Y?%p$w#U|U&5slc*&V^5;P@#J8wNN6d+blb`*KGJr%T)X|7p3 zl7>nm`nJ+3?I{GG0vKOriC#|=nZ(Sp9a2jlW~bveWiR5rlg~#!VtL=mwx}fP2xjtQcfbJ;?yJGBeA#lsvC05 z=f>}xGwZmv%yX@|<~XnPpEI+K%$6hG ztQ%&fY?Uo@$S~t@DmhHI0}{^3unp6X+YxPFtk;-|d5na#*HeKKx-q0!BlLYIpp3|! z>)Wz_&8F^Y1N68!MvXW+=0ZAJ+tafo;IVQ73%5X0R4j{aU78b$VYleu;MoAcd$2$0 z2I+=x(?Olnk*%^DVIWx#(9#~;NM!5n`tKeEcI+O_Id6NR+stL|(@c9^&JAcyVAJl3 zX%CMQn>YThOa)C%k{v8|_tftii)xC>Wjj%;s|Y&2?KzXmCv^16c$gf3TE6W4Cus8NHk}L*)+$qF&*h7F?Tr8VV737 zVe)~?l1H5&9)3t29^URoERU3@oH$igt@7qxBI=E2ydqCk1ZBiY^4$V}_TJ|{6A zbY=}>+^{y+D^|T$GxqM-Px@-*9wQ3Fq<$pm3~#ASY%W)-m2&N=iIInQ?%uh5-%k6$ z_?SI5$~pKB-Jydtha;IVUAwc%G=JtaE|=FPk1}0KJdFGIW?G_i$=1pfDNyDAtz2k`E+vT)%4rqSm}w8hRUg}a;-d;enzdBm(N^2RFzfq!+-0Ft+C5GN$?1r%OVE}&dMmbj1zPH+M72q*_G2&!l5_l%oec5H-LY24FQ z^;K0@_jI?P+rRtJm7BJ0x}a1l-9^v&o9WLQeiyUtiOuQD<=p>$OZu{j`@@>cSn)jb zus5zN#;d&5?Vh<{S!Un>|N_4}OpD>x8%)MHqD z)b9-Q$oF)?`oZsa8o!kb0e`U5Th#9t=28FKnZHm6v_8iF2lKbEzRLWl@q@qV>J;3@ z{7gX}`8SxqhWQiBU&Q@k4eGy#c})9@%n#!rNBkM)@%e$r_+YQMSbvcg|Ft#7e@2TB zzG(b6G=8aIeE9!_dEPckMS0Y3nR(1Vc=*@rruTnfej_VR70e%geq)}Oaj7Ve{4U^y zw^knO@9K5(SUJ zcQB9d|5+YL(fnJ?WBt6yJkGOW!$AD|n8*EfsbGA}{}JYK{-0zX^%)i)`p2~VPcc7? zgB_G~EvHjr+2F7AZV zx8w1^_65Jr3mLg#;0BI3M&VTi%MBtSW0G2-bc9@`lfn<@ZEr>6&bv#;5IQ2~BAK9EKMMP@oi~VGU-3I~DfAUf)Zqk@i)vOcss9?;sg_ zAE-)W^kg_M0y}6^@=~3#p|Kv)^|e-iy@Usm@1r#HxMk5_;5mp#BHGlpgGj z^!mU3R8Hzq`KU+brXH1>denUhccxP5+%$FR_wU;{I^xEaSPBs(9@RY(d%IpYRWsz6 z&kc1%T@|ltt!j>Qi(dbepYlKG85InS%27Rk4)~4o@qg&QzDxf6^us!2k zEhO$~1N2mJj0SOZ%!PEc_U0BypvI~aSiJ>i3df~Mp&(Y5P7XC4Ak-%8&$>af<6Cr6 z=X7K%tacc1+9R~I$+lzOdo%ypSztx(V>#zNFLa$;>OM`imvL@DivqiLCZRR|NB}3KjnuSd9I|8M9(%zsri|*& zR;}7n6L4wJrKyTg!S7`KiE{qn9_b7!SiuT>h4H^=YadtmSGLwl{6sYz>cLgnDw zbcqg<9FC>JWbrPwx`thEl{?}E5E>|bZwdUb^z166< zsuPu|W}`V#%QmL$e^Q)tnXcWL*kyO@+49cWiDr3H)ThdwiP}VY*JQIg+nBPelg(X! E0aA7Dv;Y7A literal 0 HcmV?d00001 diff --git a/data/tree_traversals.trees b/data/tree_traversals.trees new file mode 100644 index 0000000000000000000000000000000000000000..cdc41671381dc7fb750264352d271469814c1505 GIT binary patch literal 5740 zcmcJT%c~qk6vjJhG+rNx#zzt*CXhuIUT1Qzno$(5;37o9fSc@ww!3C-lYWd{JwtG% z3lVW)Hs%jdT;(5#s7t|(;xch1#FYp>aN$N`)ztU(^qf0$yXJxoJyoab{LcC6RKJ*y zUVrti2iA_Q?F)k7Wpk#FnV%JoyWsoYLD#tt_CJSR=N{P0ntL(vG59suM;>vVHs0#C z$EV;QbY^e?Jj|ude$0Fh{!zgnSW3DH=D#lZk!OC^?|bm~BOm#;^|O9A!L$BXJnP5h z{UZ49z%w5GWXt;f4xaVD0samN2%GEw7yOf$|JhSN`J;y%c#QF}k!|&({!Q=?fj_|u zf$^5Ktp97^Av^Hpng1gACq#Z;UrW*loYx}%X;1yk|DniFp7GoAeQxp7p;7p6`3==krk3oi^(iiS>_A|2Cfcr-lA2p8H4r?}H~sp7o(# z*)soC!Cz+}2?(3(zb4jCzHR-~_Z4`4|K3=!e&2%U_s0>>?|9blCU|~-kY|0wvTKv` ze+&MEXMXYr@!~<4z?Y}~li+#(lV|)|u;_69MeyAA)1La7|4s1xekITNB5iGQ|1W{( z`@f6)Z9M1S75aNC>iwin= z_vI<%Pw;#mkUvF-C9R5wN_V2|B#m7+RfEd)GF63fScUUezZeX(5-xomZL18Pqp(yt z9Li+4T@8vd3#(2ZW(wVjDs@uiNgk`qCcVwjPjaow%8d4lGFIi>T&c2R5~lS`GD+fb znCi|T4Tsve)&pq~g*M2dF-UJtagB?{c5%mgo0MDioE8ir4pXs3j4TVrF4D zvecEj>ljnnwdE>1i>H&O8Y|iQ9_En|U#iI52|Gs^rQvEtm@UahP49XTfL8B}m5JO>RX(h?ZOwFN znyMk!iXOVq&+W%+nG^c%T*>~C9=KMbo zn}es#Wl#Gb&HVo~f4pLb%;Dcw`chIoQrwZld)-iX6zpI?9OOz-UanQpwf V-c{ko`k?!E?}f8r)Qh6c{{TV@ZCwBW literal 0 HcmV?d00001 diff --git a/requirements.txt b/requirements.txt index c49632c0..c833bd18 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ matplotlib seaborn pandas numpy +networkx msprime>=1.0 From 4e48c8419c4aa2ca5d4d71d425b9b1d63cf37048 Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Mon, 21 Jun 2021 14:25:39 +0100 Subject: [PATCH 3/4] Don't use dataclasses --- data_structures.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/data_structures.md b/data_structures.md index fddcbc4d..3c87de18 100644 --- a/data_structures.md +++ b/data_structures.md @@ -460,7 +460,6 @@ the modified tables. In the following example, we use this approach to remove all singleton sites from a given tree sequence. ```{code-cell} ipython3 -import dataclasses def strip_singletons(ts): tables = ts.dump_tables() tables.sites.clear() @@ -471,7 +470,7 @@ def strip_singletons(ts): mut = site.mutations[0] if tree.num_samples(mut.node) > 1: site_id = tables.sites.append(site) - mut = dataclasses.replace(mut, site=site_id) # set the new site id + mut = mut.replace(site=site_id) # set the new site id tables.mutations.append(mut) return tables.tree_sequence() ``` From ab86d34760aa4abd24d53d9d8d4e0e30e21c54c0 Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Mon, 21 Jun 2021 17:18:20 +0100 Subject: [PATCH 4/4] Add in a callout to add uses for random tree access Issue https://github.com/tskit-dev/tskit/issues/684 --- getting_started.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/getting_started.md b/getting_started.md index 37b5d8e9..929e62b7 100644 --- a/getting_started.md +++ b/getting_started.md @@ -146,9 +146,11 @@ It's obvious that there's something unusual about the trees in the middle of thi chromosome, where the selective sweep occurred. Currently, it's not particularly efficient to pull out individual trees from the middle -of a tree sequence, but it *can* be done, via the +of a tree sequence +(please comment on [this issue](https://github.com/tskit-dev/tskit/issues/684) if you +have a need for this to be efficient) but it *can* be done, via the {meth}`TreeSequence.at()` method. Here's the tree at location -$5\ 000\ 000$ (the position of the sweep) drawn using the +$5\ 000\ 000$ --- the position of the sweep --- drawn using the {meth}`Tree.draw_svg()` method (see the {ref}`visualization tutorial ` for more visualization possibilities):