From 5463a25ddd880300a52d735536c8ce8df9101eab Mon Sep 17 00:00:00 2001 From: Cecile Ane Date: Fri, 1 Nov 2024 14:44:19 -0500 Subject: [PATCH] doc: added man and lib pages from PhyloNetworks --- docs/make.jl | 26 +- docs/src/index.md | 42 ++- docs/src/lib/internals.md | 21 ++ docs/src/lib/public.md | 17 ++ docs/src/man/bootstrap.md | 339 ++++++++++++++++++++ docs/src/man/dist_reroot.md | 261 ++++++++++++++++ docs/src/man/expectedCFs.md | 111 +++++++ docs/src/man/fixednetworkoptim.md | 183 +++++++++++ docs/src/man/inputdata.md | 187 ++++++++++++ docs/src/man/multiplealleles.md | 147 +++++++++ docs/src/man/snaq_plot.md | 390 ++++++++++++++++++++++++ docs/src/man/ticr_howtogetQuartetCFs.md | 260 ++++++++++++++++ src/auxiliary.jl | 6 +- 13 files changed, 1977 insertions(+), 13 deletions(-) create mode 100644 docs/src/lib/internals.md create mode 100644 docs/src/lib/public.md create mode 100644 docs/src/man/bootstrap.md create mode 100644 docs/src/man/dist_reroot.md create mode 100644 docs/src/man/expectedCFs.md create mode 100644 docs/src/man/fixednetworkoptim.md create mode 100644 docs/src/man/inputdata.md create mode 100644 docs/src/man/multiplealleles.md create mode 100644 docs/src/man/snaq_plot.md create mode 100644 docs/src/man/ticr_howtogetQuartetCFs.md diff --git a/docs/make.jl b/docs/make.jl index 951d04e..5eda64d 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -5,19 +5,41 @@ DocMeta.setdocmeta!(SNaQ, :DocTestSetup, :(using SNaQ, PhyloNetworks); recursive makedocs(; modules=[SNaQ], - authors="Claudia Solis-Lemus , Paul Bastide , Cecile Ane , and contributors", + authors="Claudia Solis-Lemus , Cécile Ané , and contributors", sitename="SNaQ.jl", format=Documenter.HTML(; canonical="https://JuliaPhylo.github.io/SNaQ.jl", edit_link="dev", assets=String[], + prettyurls = get(ENV, "CI", nothing) == "true", # easier local build + size_threshold = 600 * 2^10, + size_threshold_warn = 500 * 2^10, # 600 KiB ), + # exception, so warning-only for :missing_docs. List all others: + warnonly = Documenter.except(:autodocs_block, :cross_references, :docs_block, + :doctest, :eval_block, :example_block, :footnote, :linkcheck_remotes, + :linkcheck, :meta_block, :parse_error, :setup_block), pages=[ "Home" => "index.md", + "Manual" => [ + "Input Data for SNaQ" => "man/inputdata.md", + "TICR pipeline" => "man/ticr_howtogetQuartetCFs.md", + "Network estimation and display" => "man/snaq_plot.md", + "Network comparison and manipulation" => "man/dist_reroot.md", + "Candidate Networks" => "man/fixednetworkoptim.md", + "Extract Expected CFs" => "man/expectedCFs.md", + "Bootstrap" => "man/bootstrap.md", + "Multiple Alleles" => "man/multiplealleles.md", + ], + "Library" => [ + "Public" => "lib/public.md", + "Internals" => "lib/internals.md", + ] ], ) deploydocs(; repo="github.com/JuliaPhylo/SNaQ.jl", - devbranch="dev", + push_preview = true, + devbranch="main", ) diff --git a/docs/src/index.md b/docs/src/index.md index b88be7e..2f5cfb2 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,14 +1,40 @@ -```@meta -CurrentModule = SNaQ -``` - # SNaQ -Documentation for [SNaQ](https://github.com/JuliaPhylo/SNaQ.jl). +[SNaQ](https://github.com/JuliaPhylo/SNaQ.jl) is a [Julia](http://julialang.org) +package that implements the SNaQ method by +[Solís-Lemus & Cécile Ané (2016)](https://doi.org/10.1371/journal.pgen.1005896) +to estimate a phylogenetic network from quartet concordance factors. +See the [PhyloNetworks](https://github.com/JuliaPhylo/PhyloNetworks.jl) +package, which SNaQ depends on, for background on phylogenetic networks +and for how to get help. + +## references -```@index +See them in +[bibtex format](https://github.com/juliaphylo/SNaQ.jl/blob/master/CITATION.bib). + +## Manual + +```@contents +Pages = [ + "man/inputdata.md", + "man/ticr_howtogetQuartetCFs.md", + "man/snaq_plot.md", + "man/dist_reroot.md", + "man/fixednetworkoptim.md", + "man/expectedCFs.md", + "man/bootstrap.md", + "man/multiplealleles.md", +] +Depth = 3 ``` -```@autodocs -Modules = [SNaQ] +For help on individual functions, see the library: + +```@contents +Pages = [ + "lib/public.md", + "lib/internal.md", +] +Depth = 3 ``` diff --git a/docs/src/lib/internals.md b/docs/src/lib/internals.md new file mode 100644 index 0000000..54347f7 --- /dev/null +++ b/docs/src/lib/internals.md @@ -0,0 +1,21 @@ +# internal documentation + +Documentation for `SNaQ`'s internal functions. +These functions are not exported and their access (API) should not be +considered stable. But they can still be used, like this for example: +`SNaQ.foo()` for a function named `foo()`. + + +## functions & types + +```@autodocs +Modules = [SNaQ] +Public = false +Order = [:type,:function,:constant] +``` + +## index + +```@index +Pages = ["internal.md"] +``` diff --git a/docs/src/lib/public.md b/docs/src/lib/public.md new file mode 100644 index 0000000..072872f --- /dev/null +++ b/docs/src/lib/public.md @@ -0,0 +1,17 @@ +# public documentation + +Documentation for `SNaQ`'s public (exported) interface. + +## functions & types + +```@autodocs +Modules = [SNaQ] +Private = false +Order = [:function,:type] +``` + +## index + +```@index +Pages = ["public.md"] +``` diff --git a/docs/src/man/bootstrap.md b/docs/src/man/bootstrap.md new file mode 100644 index 0000000..713a549 --- /dev/null +++ b/docs/src/man/bootstrap.md @@ -0,0 +1,339 @@ +```@setup bootstrap +using PhyloNetworks, SNaQ +mkpath("../assets/figures") +``` +# Bootstrap + +## Running a bootstrap analysis + +There are two ways to do a bootstrap analysis. + +First, we can use a table of quartet CFs estimated with credibility intervals, +such as if we used BUCKy. +The [TICR pipeline](@ref) outputs a CF table with extra columns for credibility +intervals. We could then read that table and get bootstrap networks like this, +and tweak options as needed: + +```julia +using DataFrames, CSV +df = CSV.read("tableCF_withCI.csv", DataFrame) +bootnet = bootsnaq(startnetwork, df, hmax=1, filename="bootstrap") +``` + +Alternatively, we can use bootstrap gene trees: one file of bootstrap trees per gene. +Here, the input is a text file that lists all the bootstrap files (one per gene). +We demonstrate this second option here. + +The names of all our bootstrap files are listed in "BSlistfiles". +(ASTRAL can use the same file to do its own bootstrap, see the +[wiki](https://github.com/juliaphylo/PhyloNetworks.jl/wiki/Gene-Trees:-RAxML) +for more details). +The function `readBootstrapTrees` can read this list of file names, then +read each bootstrap file to get the bootstrap sample for each gene. +We can use them to sample input gene trees at random, one per gene, +and estimate a network from them. We ask the `bootsnaq` function +to repeat this resampling of bootstrap gene trees several times. + +```julia +bootTrees = readBootstrapTrees("BSlistfiles"); +bootnet = bootsnaq(net0, bootTrees, hmax=1, nrep=10, runs=3, + filename="bootsnaq", seed=4321) +``` + +The bootstrap networks are saved in the `boostrap.out` file, so they +can be read in a new session with +`bootnet = readMultiTopology("bootsnaq.out")`. To save the bootstrap networks to +a different file (perhaps after having re-rooted them with an +outgroup), we could do this: `writeMultiTopology(bootnet, "bootstrapNets.tre")`. + +The example above asks for 10 bootstrap replicates, +which is definitely too few, to make the example run faster. +We might also increase the number of optimization runs (`runs`) +done for each bootstrap replicate. This bootstrap was run with the +default 10 runs per replicate, and 100 bootstrap replicates, +and the 100 bootstrap networks come with the package: + +```@example bootstrap +bootnet = readMultiTopology(joinpath(dirname(pathof(SNaQ)), "..","examples","bootsnaq.out")); +length(bootnet) +``` + +If we used a specified list of quartets on the original data, we +should use that same list for the bootstrap analysis through the +option `quartetfile`. + +## support for tree edges + +Now that we have 100 bootstrap networks, we need to summarize +what they have in common (highly supported features) and what they +don't (areas of uncertainty). + +Before summarizing these bootstrap networks on the best network, +it is best to re-read this network to get a reproducible internal numbering +of its nodes and edges, used later for mapping bootstrap support to edges. +```@example bootstrap +net1 = readTopology(joinpath(dirname(pathof(SNaQ)), "..","examples","net1.out")) +``` + +It turns out that the direction of gene flow is quite uncertain +in this example (see below) with a wrong direction inferred sometimes, +so we re-root our best network net1 to the base of O,E, for the figures +to be less confusing later. + +```@setup bootstrap +rootonedge!(net1, 7) +``` +```@example bootstrap +using PhyloPlots, RCall +R"name <- function(x) file.path('..', 'assets', 'figures', x)" # hide +R"svg(name('net1_rotate1_1.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(net1, showedgenumber=true); # edge 7 leads to O+E +R"dev.off()" # hide +rootonedge!(net1, 7) # makes (O,E) outgroup clade +R"svg(name('net1_rotate1_2.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(net1, shownodenumber=true); +R"dev.off()" # hide +nothing # hide +``` +![net1_rotate1 1](../assets/figures/net1_rotate1_1.svg) +![net1_rotate1 2](../assets/figures/net1_rotate1_2.svg) + +Edges cross: but rotating at node -6 should remove this crossing +of edges +```@example bootstrap +rotate!(net1, -6) +``` +```@example bootstrap +R"svg(name('net1_rotate2.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(net1, showgamma=true); +R"dev.off()" # hide +nothing # hide +``` +![net1_rotate2](../assets/figures/net1_rotate2.svg) + +We can now summarize our bootstrap networks. +The functions `treeEdgesBootstrap` and `hybridBootstrapSupport` +read all bootstrap networks and map the edges / nodes +onto a reference network: here net1. +```@example bootstrap +BSe_tree, tree1 = treeEdgesBootstrap(bootnet,net1); +``` +This calculates the major tree `tree1` displayed in `net1`, that is, +the tree obtained by following the major parent (γ>0.5) of each hybrid node. +This tree can be visualized like this, with edge numbers shown for later use. +```@example bootstrap +R"svg(name('major_tree.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(tree1, showedgenumber=true); +R"dev.off()" # hide +nothing # hide +``` +![major_tree](../assets/figures/major_tree.svg) + +Next, we can look at bootstrap table `BSe_tree`, which has one row for +each tree edge in `net1`. One column contains the edge number +(same as shown in the plot) and another column contains the edge +bootstrap support: the proportion of bootstrap replicates in which this edge was +found in the major tree of the inferred network. +We can see the full bootstrap table and see +which tree edges have bootstrap support lower than 100% (none here) with +```@repl bootstrap +using DataFrames # for showall() below +show(BSe_tree, allrows=true, allcols=true) +filter(row -> row[:proportion] < 100, BSe_tree) +``` +Finally, we can map the bootstrap proportions onto the network or its main tree +by passing the bootstrap table to the `edgelabel` option of `plot`: +```@example bootstrap +R"svg(name('boot_tree_net_1.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(tree1, edgelabel=BSe_tree); +R"dev.off()" # hide +R"svg(name('boot_tree_net_2.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(net1, edgelabel=BSe_tree); +R"dev.off()" # hide +nothing # hide +``` +![boot_tree_net 1](../assets/figures/boot_tree_net_1.svg) +![boot_tree_net 2](../assets/figures/boot_tree_net_2.svg) + +(Here, it is important that the numbers assigned to edges when building the boostrap +table --those in `net1` at the time-- correspond to the current edge numbers +in `tree1` and `net1`. That was the purpose of reading the network from the +output file of `snaq!` earlier, for consistency across different Julia sessions.) + +If we wanted to plot only certain bootstrap values, like those below 100% (1.0), +we could do this: +```julia +plot(net1, edgelabel=filter(row -> row[:proportion] < 100, BSe_tree)); +``` + +## support for hybrid edges and hybrid nodes + +Summarizing the placement of reticulations is not standard. +The function `hybridBootstrapSupport` attempts to do so. +The descendants of a given hybrid node form the "recipient" or "hybrid" clade, +and is obtained after removing all other reticulations. +If reticulation is due to gene flow or introgression, the minor hybrid edge (with γ<0.5) +represents this event. The descendants of the lineage from which gene flow originated +is then a second "sister" of the hybrid clade. Because of the reticulation event, +the hybrid clade has 2 sister clades, not 1: the major sister (through the major hybrid edge +with γ>0.5) and the minor sister (through the minor hybrid edge with γ<0.5). +Note that the network says *nothing* about the process: its shows the *relationships* only. +We can calculate the frequency that each clade is a hybrid clade, or a major or minor sister +for some other hybrid, in the bootstrap networks: +```@example bootstrap +BSn, BSe, BSc, BSgam, BSedgenum = hybridBootstrapSupport(bootnet, net1); +``` +Let's look at the results. +We can list all the clades and the percentage of bootstrap networks (bootstrap support) +in which each clade is a hybrid or sister to a hybrid: +```@repl bootstrap +BSn +``` +If a clade contains a single taxon, it is listed with its taxon name. +The clade found in the best network is listed with its tag, starting with H (e.g. "H7"). +The name of other clades start with "c_" followed by their number in the best network, if they +do appear in the best network. +The node numbers, as used internally in the best network, are listed in a separate column. +They can be used later to display the bootstrap support values onto the network. +Various columns give the bootstrap support that each clade is a hybrid, or a (major/minor) sister +to a hybrid. The last column gives the bootstrap support for the full relationship in the +best network: same hybrid with same two sisters. +These bootstrap values are associated with nodes (or possibly, their parent edges). + +To see what is the clade named "H7", for instance: +```@repl bootstrap +BSc # this might be too big +show(BSc, allrows=true, allcols=true) +# BSc[BSc[!,:H7], :taxa] # just a different syntax to subset the data in the same way +filter(row -> row[:H7], BSc).taxa +``` +We can also get bootstrap values associated with edges, to describe the support that a given +hybrid clade has a given sister clade. +```@repl bootstrap +BSe +``` +Here, each row describes a pair of 2 clades: one being the hybrid, the other being its sister, +connected by a hybrid edge. The first rows corresponds to hybrid edges in the best network. Other +rows correspond to edges seen in bootstrap networks but not in the reference network. +```@repl bootstrap +BSedgenum +``` +lists all the hybrid edges in the best network, two for each hybrid node: +the major parent edge and then the minor parent edge. +In our case, there is only one reticulation, so only 2 hybrid edges. + +We can plot the bootstrap values of the 2 hybrid edges in the best network: +```@example bootstrap +R"svg(name('boot_net_net.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(net1, edgelabel=BSe[:,[:edge,:BS_hybrid_edge]]); +R"dev.off()" # hide +nothing # hide +``` +![boot_net_net](../assets/figures/boot_net_net.svg) + +This is showing the bootstrap support each hybrid edge: percentage of bootstrap trees with an +edge from the same sister clade to the same hybrid clade. +Alternatively, we could show the bootstrap support for the full reticulation relationships in +the network, one at each hybrid node (support for same hybrid with same sister clades). +Here, we find that A received gene flow from E (and is sister to B otherwise) in just 32% +of bootstrap networks. In another 1% bootstrap, A received gene flow from another source. +```@example bootstrap +R"svg(name('boot_net_ret.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(net1, nodelabel=BSn[!,[:hybridnode,:BS_hybrid_samesisters]]); +R"dev.off()" # hide +nothing # hide +``` +![boot_net_ret](../assets/figures/boot_net_ret.svg) + +Below is example code to place tree edge support and hybrid edge support +on the same plot. + +```julia +tmp = filter(row -> !ismissing(row[:edge]), BSe) # filter rows +select!(tmp, [:edge,:BS_hybrid_edge]) # select 2 columns only +rename!(tmp, :BS_hybrid_edge => :proportion) # rename those columns, to match names in BSe_tree +rename!(tmp, :edge => :edgeNumber) +tmp = vcat(BSe_tree, tmp) +plot(net1, edgelabel=tmp, nodelabel=BSn[:, [:hybridnode,:BS_hybrid_samesisters]]) +``` + +### Who are the hybrids in bootstrap networks? + +On a different plot, we can show the bootstrap support for hybrid clades, +first mapped to each node with positive hybrid support, +and then mapped on the parent edge of these nodes. +A is estimated as a hybrid in only 33% of our bootstrap networks. +In another 44%, it is the lineage to (E,O) that is estimated as +being of hybrid origin. +```@example bootstrap +R"svg(name('boot_net_hyb_1.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(net1, nodelabel=filter(row->row[:BS_hybrid]>0, BSn)[!,[:hybridnode,:BS_hybrid]]); +R"dev.off()" # hide +nothing # hide +R"svg(name('boot_net_hyb_2.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(net1, edgelabel=filter(row->row[:BS_hybrid]>0, BSn)[!,[:edge,:BS_hybrid]]); +R"dev.off()" # hide +nothing # hide +``` +![boot_net_hyb 1](../assets/figures/boot_net_hyb_1.svg) +![boot_net_hyb 2](../assets/figures/boot_net_hyb_2.svg) + +### Where is the origin of gene flow? + +We can plot the support for the various placements +of the gene flow origin (minor sister clade), +first mapped to each node with positive support for being the origin of gene flow, +and then mapped along the parent edge of these nodes. +We filtered clades to show those with sister support > 5%: +```@example bootstrap +R"svg(name('boot_net_clade_1.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(net1, nodelabel=filter(r->r[:BS_minor_sister]>5, BSn)[!,[:node,:BS_minor_sister]]); +R"dev.off()" # hide +nothing # hide +R"svg(name('boot_net_clade_2.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(net1, edgelabel=filter(r->r[:BS_minor_sister]>5, BSn)[!,[:edge,:BS_minor_sister]]); +R"dev.off()" # hide +nothing # hide +``` +![boot_net_clade 1](../assets/figures/boot_net_clade_1.svg) +![boot_net_clade 2](../assets/figures/boot_net_clade_2.svg) + +In our best network, the lineage to E is estimated as the origin +of gene flow, but this is recovered in only 41% of our bootstrap networks. +In another 49%, it is the lineage to A that is estimated as the *origin* +of gene flow: so gene flow is estimated in the opposite direction. +In this example, there is support for gene flow between (A,B) and (E,O), +but there is much uncertainty about its exact placement and about its direction. + +Mapping the support for major sister clades might be interesting too: +```julia +plot(net1, nodelabel=filter(r->r[:BS_major_sister]>5, BSn)[!,[:node,:BS_major_sister]]) +``` + +The estimated heritability γ on hybrid edges in the reference network, when present in a +bootstrap network, was also extracted: +```@repl bootstrap +BSgam[1:3,:] # first 3 rows only +``` +γ=0 values are for bootstrap replicates that did not have the edge in their network. +Basic summaries on γ values for a given edge, say the minor parent, +could be obtained like this: +```@repl bootstrap +minimum(BSgam[:,2]) +maximum(BSgam[:,2]) +using Statistics # for functions like mean and std (standard deviation) +mean(BSgam[:,2]) +std(BSgam[:,2]) +``` diff --git a/docs/src/man/dist_reroot.md b/docs/src/man/dist_reroot.md new file mode 100644 index 0000000..d2d37cc --- /dev/null +++ b/docs/src/man/dist_reroot.md @@ -0,0 +1,261 @@ +```@setup dist_reroot +using PhyloNetworks, SNaQ +mkpath("../assets/figures") +exampledir = joinpath(dirname(pathof(SNaQ)), "..","examples") +raxmltrees = joinpath(exampledir,"raxmltrees.tre") +raxmlCF = readTableCF(writeTableCF(countquartetsintrees(readMultiTopology(raxmltrees), showprogressbar=false)...)) +astralfile = joinpath(exampledir,"astral.tre") +astraltree = readMultiTopology(astralfile)[102] # 102th tree = last tree here +net0 = readTopology(joinpath(exampledir,"net0.out")) +net1 = readTopology(joinpath(exampledir,"net1.out")) +net0.loglik = 53.53150526187732 +net1.loglik = 28.31506721890958 +``` +# Comparing and manipulating networks + +Examples below follow those in [Getting a Network](@ref). + +## Comparing networks / trees + +Is the SNaQ tree (network with h=0) the same as the ASTRAL tree? +We can calculate their Robinson-Foulds distance: + +```@repl dist_reroot +hardwiredClusterDistance(astraltree, net0, false) +``` +The last option `false` is to consider topologies as unrooted. +The RF distance is 0, so the two unrooted topologies are the same. +If we had considered them as rooted, with whatever root they +currently have in their internal representation, +we would find a difference: +```@repl dist_reroot +hardwiredClusterDistance(astraltree, net0, true) +``` + +## Re-rooting trees and networks + +We can re-root our networks with the outgroup, O, +and then re-compare the ASTRAL tree and the SNaQ tree +as rooted topologies (and find no difference): +```@repl dist_reroot +rootatnode!(astraltree, "O") +rootatnode!(net0, "O") +hardwiredClusterDistance(astraltree, net0, true) +``` +```@example dist_reroot +using PhyloPlots, RCall +R"name <- function(x) file.path('..', 'assets', 'figures', x)" +R"svg(name('net0_O.svg'), width=4, height=4)" +R"par"(mar=[0,0,0,0]) +plot(net0); +R"dev.off()" +nothing # hide +``` +![net0_O](../assets/figures/net0_O.svg) + +Note that, as in previous chapters, we use the possibilities of `RCall` +to save the plot. We only show this commands once, but they will be run +behind the scene each time a plot is called. + +After trees/networks are rooted with a correct outgroup, +their visualization is more meaningful. + +Networks can be re-rooted at a given node or along a given edge. +Get help (type `?`) on the functions `rootatnode!` and `rootonedge!` +for more info. There are examples in the [Bootstrap](@ref) section. + +If the network is plotted with crossing edges, you may identify +ways to rotate the children edges at some nodes to untangle some crossing edges. +This can be done using the function `rotate!`. +See an example in the [Bootstrap](@ref) section, or type `?` then `rotate!`. + +## What if the root conflicts with the direction of a reticulation? + +With 1 hybridization or more, the direction of hybrid edges +constrain the position of the root. The root cannot be downstream of hybrid edges. +Any hybrid node has to be younger than, or of the same age as both of its parents. +So time has to flow "downwards" of any hybrid node, and the root cannot be +placed "below" a hybrid node. +An attempt to re-root the network at a position incompatible with hybrid edges +will fail, with a `RootMismatch` error. To show an example, let's use the +network below. We plotted the edge numbers, because we will want to use them +later to place the root. + +```@example dist_reroot +net7taxa = readTopology("(C,D,((O,(E,#H7:::0.196):0.314):0.664,(((A1,A2))#H7:::0.804,B):10.0):10.0);") +R"svg(name('reroot_net7taxa_1.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(net7taxa, showgamma=true, showedgenumber=true, tipoffset=0.2); +R"dev.off()"; # hide +nothing # hide +``` +![reroot net7taxa 1](../assets/figures/reroot_net7taxa_1.svg) + +Let's imagine that the A1 and A2 are our outgroups, and we estimated the network above. +According to this network, time must flow from the hybrid node towards A1 and A2. +So any attempt to reroot the network with A1 as outgroup, or with A2 as outgroup, +or with the A clade (on edge 11), will fail with a `RootMismatch` error: + +```julia +rootatnode!(net7taxa, "A1"); # ERROR: RootMismatch: non-leaf node 5 had 0 children. ... +rootatnode!(net7taxa, "A2"); # ERROR: RootMismatch (again) +rootonedge!(net7taxa, 10); # ERROR: RootMismatch (again) +``` + +In this case, however, it is possible to root the network on either parent edge +of the hybrid node. These edges have numbers 12 and 5, based on the plot above. +We get these 2 rooted versions of the network: + +```@example dist_reroot +R"svg(name('reroot_net7taxa_2.svg'), width=7, height=4)"; # hide +R"layout(matrix(1:2,1,2))"; +R"par"(mar=[0,0,0.5,0]); # hide +rootonedge!(net7taxa, 11); +rotate!(net7taxa, -5) +plot(net7taxa, showgamma=true, tipoffset=0.2, shownodenumber=true); +R"mtext"("rooted on hybrid edge 11 (major)", line=-1) +rootonedge!(net7taxa, 5); +plot(net7taxa, showgamma=true, tipoffset=0.2); +R"mtext"("rooted on hybrid edge 5 (minor)", line=-1); +R"dev.off()"; # hide +nothing # hide +``` +![reroot net7taxa 2](../assets/figures/reroot_net7taxa_2.svg) + +On the second plot, the A clade does not *appear* to be an outgroup, +but this is just because the plot follows the major tree primarily, +based the major hybrid edges (those with γ>0.5). +We can display the exact same network differently, by changing +the γ inheritance values to invert the major/minor consideration of the hybrid edges. +```@example dist_reroot +net7taxa.edge[5] # just to check that it's one of the 2 hybrid edges of interest +setGamma!(net7taxa.edge[5], 0.501) # switch major/minor edges +R"svg(name('reroot_net7taxa_3.svg'), width=4, height=4)"; # hide +R"layout(matrix(1,1,1))"; # hide +R"par"(mar=[0,0,0,0]); # hide +plot(net7taxa, tipoffset=0.2); # not showing gamma values, because we changed them artificially +R"mtext"("rooted on hybrid edge 5 (considered major)", line=-1); +R"dev.off()"; # hide +nothing # hide +``` +![reroot net7taxa 3](../assets/figures/reroot_net7taxa_3.svg) + +Conclusion, in this particular example: it is possible to re-root the network to a place +where the A clade is indeed an outgroup. But it did require some care, +and we discovered that there are 2 acceptable rooting options. +The first is more plausible, if we think that the *species tree* is the *major tree*, +meaning that any gene flow or introgression event replaced less than 50% of the genes +in the recipient population. + +In other cases, it may not be possible to re-root the network with a known outgroup. +It would be the case if A1 was the only outgroup, and if A2 was an ingroup taxon. +In such a case, the outgroup knowledge tells us that our estimated network is wrong. +One (or more) reticulation in the network must be incorrect. +Its placement might be correct, but then its direction would be incorrect. +If the network was estimated via `snaq!`, check tips +about [Candidate networks compatible with a known outgroup](@ref). + +## Extracting the major tree + +We can also compare the networks estimated with h=0 (net0) and h=1 (net1): +```@repl dist_reroot +rootatnode!(net1, "O"); # the ; suppresses screen output +hardwiredClusterDistance(net0, net1, true) +``` +```@example dist_reroot +R"svg(name('net1_O.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(net1, showgamma=true); +R"dev.off()" # hide +nothing # hide +``` +![net1_O](../assets/figures/net1_O.svg) + +They differ by 2 clusters: that's because A is of hybrid descent +in net1, not in net0. + +To beyond this hybrid difference, +we can extract the major tree from the network with 1 hybridization, +that is, delete the hybrid edge supported by less than 50% of genes. +Then we can compare this tree with the ASTRAL/SNaQ tree net0. +```@repl dist_reroot +tree1 = majorTree(net1); # major tree from net1 +hardwiredClusterDistance(net0, tree1, true) +``` +They are identical (at distance 0), so here the species network +with 1 hybrid node is a refinement of the estimated species tree +(this needs not be the case always). + +Is the SNaQ network with 1 hybrid node the same as the true network, +the one that was initially used to simulate the data? + +(digression on the data: gene trees were simulated under the coalescent +along some "true" network, then 500 base-pair alignments were simulated +along each gene tree with the HKY model, +gene trees were estimated from each alignment with RAxML, and +these estimated gene trees served as input to both ASTRAL and SNaQ.) + +The true network is shown below, correctly rooted at the outgroup O, +and plotted with branch lengths proportional to their +values in coalescence units: +```@repl dist_reroot +truenet = readTopology("((((D:0.4,C:0.4):4.8,((A:0.8,B:0.8):2.2)#H1:2.2::0.7):4.0,(#H1:0::0.3,E:3.0):6.2):2.0,O:11.2);"); +hardwiredClusterDistance(net1, truenet, true) +``` +```@example dist_reroot +R"svg(name('truenet_sim.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(truenet, useedgelength=true, showgamma=true); +R"dev.off()" # hide +nothing # hide +``` +![truenet](../assets/figures/truenet_sim.svg) + +Our estimated network is not the same as the true network: +- the underlying tree is correctly estimated +- the origin of gene flow is correctly estimated: E +- the target of gene flow is *not* correctly estimated: it was + the lineage ancestral to (A,B), but it is estimated to be A only. + +For networks, the distance here is the hardwired cluster distance: +the number of hardwired clusters found in one network and not +in the other. The **hardwired cluster** associated with an edge is the +set of *all* tips descendant from that edge, i.e. all tips that +inherited at least *some* genetic material from that edge. + +## Displayed trees and subnetworks + +We can extract all trees displayed in a network. +These trees are obtained by picking one parent hybrid edge +at each hybrid node, and dropping the other parent hybrid edge. +We can choose to pick the "important" hybrid edges only, +with heritability γ at or above a threshold. +Below we use a γ threshold of 0, so we get all displayed trees: +```@repl dist_reroot +t = displayedTrees(net1, 0.0) # list of trees displayed in network +writeTopology(t[1], round=true) +writeTopology(t[2], round=true) +``` +If we decide to keep edges with γ>0.2 only, then we are +left with a single tree in the list (the major tree). +This is because our example has 1 hybrid node with minor γ=0.196. +```@repl dist_reroot +t = displayedTrees(net1, 0.2) +``` + +We can also delete all "non-important" reticulations, +those with a minor heritability γ below some threshold. +The function below changes our network `net1`, +as indicated by its name ending with a `!`. + +```@repl dist_reroot +deleteHybridThreshold!(net1, 0.1) +``` +Nothing happened to our network: because its γ is above 0.1. +But if we set the threshold to 0.3, then our reticulation disappears: +```@repl dist_reroot +deleteHybridThreshold!(net1, 0.3) +``` +See also function `displayedNetworkAt!` to get the network with +a single reticulation of interest, and eliminate all other +reticulations. diff --git a/docs/src/man/expectedCFs.md b/docs/src/man/expectedCFs.md new file mode 100644 index 0000000..44ceefb --- /dev/null +++ b/docs/src/man/expectedCFs.md @@ -0,0 +1,111 @@ +```@setup expCFs +using PhyloNetworks, SNaQ +mkpath("../assets/figures") +exampledir = joinpath(dirname(pathof(SNaQ)), "..","examples") +raxmltrees = joinpath(exampledir,"raxmltrees.tre") +raxmlCF = readTableCF(writeTableCF(countquartetsintrees(readMultiTopology(raxmltrees), showprogressbar=false)...)) +truenet = readTopology("((((D:0.4,C:0.4):4.8,((A:0.8,B:0.8):2.2)#H1:2.2::0.7):4.0,(#H1:0::0.3,E:3.0):6.2):2.0,O:11.2);"); +``` + +# Extract Expected CFs + +A good way to visualize the "goodness-of-fit" of a given estimated network to the data +is to plot the observed CF versus the expected CF. If the network is a good fit, then the dots +in the plot will be close to the diagonal (x=y line). +The following function will create a dataframe with the observed and expected CFs, +which are all saved in the DataCF object after running snaq: +```@repl expCFs +topologyMaxQPseudolik!(truenet, raxmlCF); +df_wide = fittedQuartetCF(raxmlCF) # same as fittedQuartetCF(raxmlCF, :wide) +df_long = fittedQuartetCF(raxmlCF, :long) +``` +It is important to have run `snaq!`, `topologyQPseudolik!` or `topologyMaxQPseudolik!` +before making these tables, or the result would be meaningless. +These functions update the fitted concordance factors (those expected under the network) +inside the DataCF object `raxmlCF`. + +Here is one way to plot them, via R again, and using the R package `ggplot2`. + +```@example expCFs +using RCall +obsCF = df_long[!,:obsCF]; expCF = df_long[!,:expCF]; # hide +R"name <- function(x) file.path('..', 'assets', 'figures', x)"; # hide +R"svg(name('expCFs_obsvsfitted.svg'), width=5, height=4)"; # hide +R"par"(mar=[2.5,2.6,.5,.5], mgp=[1.5,.4,0], tck=-0.01, las=1, pty="s"); # hide +R"plot(0:1, 0:1, type='l', bty='L', lwd=0.3, col='#008080', xlab='quartet CF observed in gene trees', ylab='quartet CF expected from network')"; # hide +R"set.seed"(1234); # hide +R"points(jitter($obsCF,amount=0.005),jitter($expCF,amount=0.005),col='#008080',bg='#00808090',pch=21)"; # hide +R"dev.off()"; # hide +nothing # hide +``` +To install ggplot2 if not installed already, do: +`R"install.packages('ggplot2', dep=TRUE)"` + +```julia +@rlibrary ggplot2 +ggplot(df_long, aes(x=:obsCF,y=:expCF)) + theme_classic() + + geom_segment(x=0,y=0,xend=1,yend=1, color="#008080", size=0.3) + # diagonal line + geom_point(alpha=0.5, color="#008080", position=position_jitter(width=0.005, height=0.005)) + + ylab("quartet CF expected from network") + xlab("quartet CF observed in gene trees") + coord_equal(ratio=1); +# if needed, save with: +ggsave("expCFs_obsvsfitted.svg", scale=1, width=6, height=5); +``` + +![obsvsfitted](../assets/figures/expCFs_obsvsfitted.svg) + +Many points are overlapping, so they were "jittered" a little to see them all better. +There are always many points overlapping on the bottom-left corner: +concordance factors of 0.0 for quartet resolutions not observed, and not expected. +To export the table of quartet CFs and explore the fit of the network with other tools: + +```julia +using CSV +CSV.write("fittedCF.csv", df_long) +``` +alternative code to get a similar plot with [Gadfly](http://gadflyjl.org/): +```julia +using Gadfly +plot(layer(df_long, Geom.point, x="obsCF", y="expCF"), + layer(x=0:1,y=0:1, Geom.line), # diagonal line + Guide.xlabel("CF observed in gene trees"), Guide.ylabel("CF expected from network")) +``` + +We could highlight quartets that include taxon A, say, +if we suspect that it is an unrecognized hybrid. +Many points are overlapping, like before, so they are again "jittered" a bit. + +```@example expCFs +using DataFrames +df_long[!,:has_A] .= "no"; # add a column to our data, to indicate which 4-taxon sets have A or not +for r in eachrow(df_long) + if "A" ∈ [r[:tx1], r[:tx2], r[:tx3], r[:tx4]] + r[:has_A]="yes" + end +end +has_A = df_long.has_A # hide +nq = length(has_A); # hide +R"colA=rep('#008080',$nq); bgA=rep('#00808090',$nq);"; # hide +R"colA[$has_A=='yes']='#F8766D'; bgA[$has_A=='yes']='#F8766D90'"; # hide +R"svg(name('expCFs_obsvsfitted_A.svg'), width=5, height=4)"; # hide +R"par"(mar=[2.5,2.6,.5,.5], mgp=[1.5,.4,0], tck=-0.01, las=1, pty="s"); # hide +R"plot(0:1, 0:1, type='l', bty='L', lwd=0.3, col='black', xlab='quartet CF observed in gene trees', ylab='quartet CF expected from network')"; # hide +R"set.seed"(2345); # hide +R"points(jitter($obsCF,amount=0.005),jitter($expCF,amount=0.005),col=colA,bg=bgA,pch=21)"; # hide +R"legend(x=0.7,y=0.3,pch=21,col=c('#008080','#F8766D'),legend=c('no','yes'),title='has A?', bty='n',bg=c('#00808090','#F8766D90'))"; # hide +R"dev.off()"; # hide +nothing # hide +``` +```@repl expCFs +first(df_long, 7) # first 7 rows +``` + +```julia +ggplot(df_long, aes(x=:obsCF, y=:expCF, color=:has_A)) + theme_classic() + + geom_segment(x=0,y=0,xend=1,yend=1, color="black", size=0.3) + # diagonal line + geom_point(alpha=0.5, position=position_jitter(width=0.005, height=0.005)) + + ylab("quartet CF expected from network") + xlab("quartet CF observed in gene trees") + coord_equal(ratio=1); +# can be saved: +ggsave("expCFs_obsvsfitted_A.svg", width=6, height=5); +``` + +![obsvsfitted A present or not](../assets/figures/expCFs_obsvsfitted_A.svg) diff --git a/docs/src/man/fixednetworkoptim.md b/docs/src/man/fixednetworkoptim.md new file mode 100644 index 0000000..35cc0ae --- /dev/null +++ b/docs/src/man/fixednetworkoptim.md @@ -0,0 +1,183 @@ +# Candidate Networks + +## Optimizing parameters for a given network + +For a given network topology, we can optimize the branch lengths and +inheritance probabilities (γ) with the pseudolikelihood. +This is useful if we have a few candidate networks to compare. +Each network can be optimized individually, and the network with the best +pseudolikelihood can be chosen. + +The score being optimized is a pseudo-deviance, i.e. +a multiple of the negative log pseudo-likelihood up to an additive constant +(the lower the better; a pseudo-deviance of 0 corresponds to a perfect fit). + +Following our example in [Getting a Network](@ref), +we can optimize parameters on the true network +(the one originally used to simulate the data): + +```@setup fixednetworkoptim +using PhyloNetworks, SNaQ +using Logging # to suppress info messages below +baselogger = global_logger() +mkpath("../assets/figures") +exampledir = joinpath(dirname(pathof(SNaQ)), "..","examples") +raxmltrees = joinpath(exampledir,"raxmltrees.tre") +raxmlCF = readTableCF(writeTableCF(countquartetsintrees(readMultiTopology(raxmltrees), showprogressbar=false)...)) +``` + +```@repl fixednetworkoptim +truenet = readTopology("((((D:0.4,C:0.4):4.8,((A:0.8,B:0.8):2.2)#H1:2.2::0.7):4.0,(#H1:0::0.3,E:3.0):6.2):2.0,O:11.2);"); +net1alt = topologyMaxQPseudolik!(truenet, raxmlCF); +writeTopology(net1alt, round=true) +net1alt.loglik # pseudo deviance actually: the lower the better +``` +```@example fixednetworkoptim +using PhyloPlots, RCall +R"name <- function(x) file.path('..', 'assets', 'figures', x)" # hide +R"svg(name('truenet_opt.svg'), width=4, height=4)" # hide +R"par"(mar=[0,0,0,0]) +plot(net1alt, showgamma=true); +R"dev.off()" # hide +nothing # hide +``` +![truenet_opt](../assets/figures/truenet_opt.svg) + +We get a score of 29.941, +which is comparable to the score of the SNaQ network (net1: 28.315), +especially compared to the score of the best tree (net0: 53.532). +This begs the question: is the true network within the "range" of uncertainty? +We can run a [Bootstrap](@ref) analysis to measure uncertainty +in our network inference. + +For a more thorough optimization, we may increase the requirements before +the search stops (but the optimization will take longer). +It makes no difference on this small data set. +```julia +net1par = topologyMaxQPseudolik!(truenet, raxmlCF, ftolRel=1e-10, xtolAbs=1e-10) +net1par.loglik # pseudo deviance, actually: the lower the better +``` + +## Network Score with no optimization + +For a network with given branch lengths and γ heritabilies, +we can compute the pseudolikelihood (well, a pseudo-deviance) with: +```@repl fixednetworkoptim +topologyQPseudolik!(truenet,raxmlCF); +truenet.loglik # again, pseudo deviance +``` +This function is not maximizing the pseudolikelihood, it is simply computing the +pseudolikelihood (or deviance) for the given branch lengths and probabilities of +inheritance. At the moment, both of these functions require that the +given network is of level 1 (cycles don't overlap). + +## Candidate networks compatible with a known outgroup + +If the network was estimated via `snaq!`, it might turn out to be impossible +to root our estimated network with a known outgroup (see section +[What if the root conflicts with the direction of a reticulation?](@ref).) +At this time, `snaq!` does not impose any rooting constraint on the network: +the search for the lowest score considers all level-1 networks, including those +that are incompatible with a known outgroup. +(The monophyly of outgroups is not imposed either, like in many other methods.) + +If the estimated network cannot be rooted with the known outgroup, +we can check the `.networks` output file. +It has a list of networks that are slight modifications of the best network, +where the modifications changed the direction of one reticulation at a time. +For each modified network, the score was calculated. So if we find in this list +a modified network that has a score close to that of the best network, +and that can be re-rooted with our known root position, then this modified network +is a better candidate than the network with the best score. + +Below is what the `net1.networks` file looks like, after performing +the analysis in the section [Network Estimation](@ref). +Scroll to the right to see the scores. + + (C,D,((O,(E,#H7:::0.19558838614943078):0.31352437658618976):0.6640664399202987,(B,(A)#H7:::0.8044116138505693):10.0):10.0);, with -loglik 28.31506721890958 (best network found, remaining sorted by log-pseudolik; the smaller, the better) + (C,D,((O,(E)#H7:::0.8150784689693145):0.9336405757682176,(B,(A,#H7:::0.18492153103068557):0.25386142779877724):1.8758156446611114):10.0);, with -loglik 31.535560380783814 + (B,#H7:9.90999345612101::0.2555404440833535,(A,(E,(O,((C,D):10.0)#H7:0.3419231810962026::0.7444595559166465):0.19994859441332047):2.5014911511063644):0.7957621793330066);, with -loglik 56.64548310161462 + (C,D,((O,(E,((B)#H7:::0.7957543284159452,A):4.786202415937916):0.004527712280136759):1.7952610454570868,#H7:::0.20424567158405482):10.0);, with -loglik 67.17775727492258 + (C,D,(#H7:::0.32947301811471164,(B,(A,(E,(O)#H7:::0.6705269818852884):1.371799259141243):0.0):6.397073999864152):7.677245926003807);, with -loglik 199.11401961057143 + +We can read this file and look at its list of networks like this: + +```@repl fixednetworkoptim +file = "net1.networks"; +# or use the example file available with the package: +file = joinpath(dirname(pathof(SNaQ)), "..","examples","net1.networks"); +netlist = readMultiTopology(file) # read the full list of networks in that file +``` +Next, we would like to extract the network scores from the file. +Below is a one-liner to do this +(we make Julia send a `sed` command to the shell --sorry, Mac or Linux for this.) +```@repl fixednetworkoptim +scoresInString = read(`sed -E 's/.+with -loglik ([0-9]+.[0-9]+).+/\1/' $file`, String) +scores = parse.(Float64, split(scoresInString)) +# next: update the "loglik" of each network with the score read from the file +for i in eachindex(netlist) + netlist[i].loglik = scores[i] + println("net $i in the list: score = ",scores[i]) +end +``` +The first network in the list is the best network returned by `snaq!`. +We see that the second network has a score that's not too far, but the other networks +have worse scores. The best network and its best modification (second network in the +list) are shown below. We chose to show edge numbers, to use them later +to re-root the networks. + +```@example fixednetworkoptim +R"svg(name('fixednetworkoptim_othernets1.svg'), width=7, height=4)" # hide +R"layout(matrix(1:2,1,2))"; # hide +R"par"(mar=[0,0,0,0]) # hide +plot(netlist[1], showgamma=true, showedgenumber=true, tipoffset=0.1); +R"mtext"("best net, score=28.3", line=-1); +plot(netlist[2], showgamma=true, showedgenumber=true, tipoffset=0.1); +R"mtext"("direction modified, score=31.5", line=-1); +R"dev.off()"; # hide +nothing # hide +``` +![othernets before reroot](../assets/figures/fixednetworkoptim_othernets1.svg) + +Now imagine that our outgroup is taxon A. +- best network: we would get a "RootMismatch" error if we tried to set + the root on the external edge 9 to A, with `rootatnode!(netlist[1], "A")` + (see section + [What if the root conflicts with the direction of a reticulation?](@ref)). + But we could root the best network on the major parent edge to A, edge 10 + (rooted network on the left below). +- For the second best network in our list, there are 2 ways to root it + with A: on the external edge 8 to A (top right), or on its parent edge 10 + These 2 options give quite different rooted versions + of the network, one of which requires the existence of an unsampled taxon, + sister to BOECD, that would have contributed to introgression into + an ancestor of E. The second rooted version says that an ancestor of + (or sister to) A contributed to the introgression into the ancestor of E. + A is an outgroup in both cases, but the second case is more parsimonious, + in the sense that it does not require the existence of an unsampled taxon. + +```@example fixednetworkoptim +R"svg(name('fixednetworkoptim_othernets2.svg'), width=7, height=7)" # hide +R"layout(matrix(c(1,4,2,3),2,2))"; # hide +R"par"(mar=[0,0,0.5,0]) # hide +rootonedge!(netlist[1], 10); # root best net to make A outgroup +rotate!(netlist[1], -4); # to 'un-cross' edges +rotate!(netlist[1], -6); +rotate!(netlist[1], -5); +plot(netlist[1], showgamma=true, tipoffset=0.1); +R"mtext"("best net, score=28.3", line=-1); +global_logger(NullLogger()); # hide +rootatnode!(netlist[2], "A"); # net with modified direction: first way to make A outgroup +global_logger(baselogger); # hide +rotate!(netlist[2], -4) # to 'un-cross' edges +rotate!(netlist[2], -6) +plot(netlist[2], showgamma=true, tipoffset=0.1); +R"mtext"("second best in list, score=31.5\nrequires unsampled population", line=-2); +rootonedge!(netlist[2], 10) # net with modified direction: second way to make A outgroup +for i in [9,-7] rotate!(netlist[2], i); end; # to 'un-cross' edges +plot(netlist[2], showgamma=true, tipoffset=0.1); +R"mtext"("second best in list, score=31.5\ndifferent root position", line=-2); +R"dev.off()"; # hide +nothing # hide +``` +![othernets after reroot](../assets/figures/fixednetworkoptim_othernets2.svg) diff --git a/docs/src/man/inputdata.md b/docs/src/man/inputdata.md new file mode 100644 index 0000000..eb40b3a --- /dev/null +++ b/docs/src/man/inputdata.md @@ -0,0 +1,187 @@ +# Input for SNaQ + +SNaQ is a method implemented in the package to estimate a phylogenetic network +from multiple molecular sequence alignments. There are two alternatives for the input data: + +1. A list of estimated gene trees for each locus, which can be obtained using [MrBayes](http://mrbayes.sourceforge.net) or [RAxML](http://sco.h-its.org/exelixis/software.html). Or: +2. A table of concordance factors (CF), i.e. gene tree frequencies, for each 4-taxon subset. This table can be obtained from [BUCKy](http://www.stat.wisc.edu/~ane/bucky/), to account for gene tree uncertainty + +This [pipeline](https://github.com/nstenz/TICR) can be used to obtain the table of +quartet CF needed as input for SNaQ +(see also the [wiki](https://github.com/juliaphylo/PhyloNetworks.jl/wiki/TICR:-from-alignments-to-quartet-concordance-factors).) +It starts from the sequence alignments, +runs MrBayes and then BUCKy (both parallelized), producing the +table of estimated CFs and their credibility intervals. +Additional details on this [TICR pipeline](@ref) +describe how to insert data at various stages (e.g. after running MrBayes on each locus). + +## Tutorial data: gene trees + +We suggest that you create a special directory for running these examples, +where input files can be downloaded and where output files will be +created (with estimated networks for instance). Enter this directory +and run Julia from there. + +Suppose you have a file with a list of gene trees in parenthetical +format called `raxmltrees.tre`. +You can access the example file of input trees +[here](https://github.com/juliaphylo/SNaQ/blob/main/examples/raxmltrees.tre) +or +[here](https://raw.githubusercontent.com/juliaphylo/SNaQ/main/examples/raxmltrees.tre) +for easier download. + +Do not copy-paste into a "smart" text-editor. Instead, save the file +directly into your working directory using "save link as" or "download linked file as". +This file contains 30 gene trees, each in parenthetical format on 6 taxa +like this (with rounded branch lengths): + +`(E:0.038,((A:0.014,B:0.010):0.010,(C:0.008,D:0.002):0.010):0.025,O:0.078);` + +If `raxmltrees.tre` is in your working directory, you can view its content +within Julia: +```julia +less("raxmltrees.tre") +``` +or like this, to view the version downloaded with the package: +```julia +raxmltrees = joinpath(dirname(pathof(SNaQ)), "..","examples","raxmltrees.tre") +less(raxmltrees) +``` +Just type `q` to quit viewing this file. +You could read in these 30 trees and visualize the third one (say) like this: +```@example qcf +using PhyloNetworks, SNaQ +raxmltrees = joinpath(dirname(pathof(SNaQ)), "..","examples","raxmltrees.tre"); +nothing # hide +``` +```@repl qcf +genetrees = readMultiTopology(raxmltrees); +genetrees[3] +``` +To visualize any of these input trees, use the +[PhyloPlots](https://github.com/juliaphylo/PhyloPlots.jl) package: +```@example qcf +using PhyloPlots +using RCall # hide +mkpath("../assets/figures") # hide +R"name <- function(x) file.path('..', 'assets', 'figures', x)" # hide +R"svg(name('inputdata_gene3.svg'), width=4, height=3)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(genetrees[3]); # tree for 3rd gene +R"dev.off()" # hide +nothing # hide +``` +![gene3](../assets/figures/inputdata_gene3.svg) + +To read in all gene trees and directly summarize them by a list +of quartet CFs (proportion of input trees with a given quartet): +```@repl qcf +q,t = countquartetsintrees(genetrees); # read in trees, calculate quartet CFs +df = writeTableCF(q,t) # data frame with observed CFs: gene frequencies +using CSV +CSV.write("tableCF.csv", df); # to save the data frame to a file +raxmlCF = readTableCF("tableCF.csv") # read in the file and produces a "DataCF" object +rm("tableCF.csv") # hide +``` +`less("tableCF.csv")` lets you see the content of the newly created +file "tableCF.csv", within Julia. Again, type `q` to quit viewing this file. + +In this table, each 4-taxon set is listed in one row. +The 3 "CF" columns gives the proportion of genes that has +each of the 3 possible trees on these 4 taxa. + +For more help on any function, type `?` to enter the help mode, +then type the name of the function. For example: type `?` then `countquartetsintrees` +for information on the various options of that function. + +When there are many more taxa, the number of quartets +might be very large and we might want to use a subset to speed things up. +Here, if we wanted to use a random sample of 10 quartets +instead of all quartets, we could do: + +`raxmlCF = readTrees2CF(raxmltrees, whichQ="rand", numQ=10, CFfile="tableCF10.txt")` + +Be careful to use a numQ value smaller than the total number of possible +4-taxon subsets, which is *n choose 4* on *n* taxa (e.g. 15 on 6 taxa). +To get a predictable random sample, you may set the seed with +`using Random; Random.seed!(12321)` +(for instance) prior to sampling the quartets as above. +The `readTrees2CF` is *much* slower than the function `countquartetsintrees` +to read in trees and calculate the quartet CFs observed in the trees, +when we want to get *all* quartet CFs. But for a small sample of quartets, +then `readTrees2CF` is available. + +## Tutorial data: quartet CFs + +If we already have a table of quartet concordance factor (CF) values +in a file `buckyCF.csv` in this format + +| Taxon1 | Taxon2 | Taxon3 | Taxon4 | CF12_34 | CF13_24 | CF14_23 +|:-------|:-------|:-------|:-------|:--------|:--------|:------- +| D | A| E | O| 0.565 | 0.0903 | 0.3447 +| ... | | | | | | ... + +we would read it in one step like this: `readTableCF("buckyCF.csv")`. +An example file comes with the package, available +[here](https://github.com/juliaphylo/SNaQ/blob/main/examples/buckyCF.csv) +or +[here](https://raw.githubusercontent.com/juliaphylo/SNaQ/main/examples/buckyCF.csv). + +```@repl qcf +buckyCFfile = joinpath(dirname(pathof(SNaQ)), "..","examples","buckyCF.csv"); +buckyCF = readTableCF(buckyCFfile) +``` +The same thing could be done in 2 steps: +first to read the file and convert it to a 'DataFrame' object, +and then to convert this DataFrame into a DataCF object. +```@repl qcf +using CSV, DataFrames +dat = DataFrame(CSV.File(buckyCFfile); copycols=false); +first(dat, 6) # to see the first 6 rows +buckyCF = readTableCF(dat) +writeTableCF(buckyCF) +``` +In the input file, columns need to be in the right order: +with the first 4 columns giving the names of the taxa in each 4-taxon set. +The CF values are assumed to be in columns named "CF12_34", etc., +or else in columns 5,6,7. +If available, a column named "ngenes" will be taken to have the +the number of genes for each 4-taxon subset. + +## Tutorial data: starting tree + +If we have a tree for the data set at hand, +it can be used as a starting point for the optimization. +From our gene trees, we estimated a species tree with +[ASTRAL](https://github.com/smirarab/ASTRAL/blob/master/astral-tutorial.md). +This tree comes with the package in file `astral.tre` +[here](https://github.com/juliaphylo/SNaQ/blob/main/examples/astral.tre). +This file has 102 trees: 100 bootstrap species trees, +followed by their greedy consensus, +followed by the best tree on the original data. +It's this last tree that we are most interested in. +We can read it with +```@example qcf +astralfile = joinpath(dirname(pathof(SNaQ)), "..","examples","astral.tre"); +astraltree = readMultiTopology(astralfile)[102] # 102th tree: last tree here +R"svg(name('inputdata_astraltree.svg'), width=4, height=3)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(astraltree, showedgelength=true); +R"dev.off()"; # hide +nothing # hide +``` +![astraltree](../assets/figures/inputdata_astraltree.svg) + +To start its search, SNaQ will need a network of "level 1". +All trees and all networks with 1 hybridization are of level 1. +To make sure that a network with 2 or more hybridizations is of level 1, +we can read it in with +`readTopologyLevel1` (which also unroots the tree, resolves polytomies, +replaces missing branch lengths by 1 for starting values etc.): +```julia +T=readTopologyLevel1("startNetwork.txt") +``` +(here `startNetwork.txt` is a hypothetical file: replace this by +the name of a file that contains your network of interest.) + +Next: [Getting a Network](@ref) diff --git a/docs/src/man/multiplealleles.md b/docs/src/man/multiplealleles.md new file mode 100644 index 0000000..fcbadcc --- /dev/null +++ b/docs/src/man/multiplealleles.md @@ -0,0 +1,147 @@ +```@setup multialleles +using PhyloNetworks, SNaQ +``` + +# Multiple alleles per species + +## between-species 4-taxon sets + +The default setting for SNaQ considers that each allele in a gene tree corresponds +to a taxon (a tip) in the network. If instead each allele/individual can be mapped confidently +to a species, and if only the species-level network needs to be estimated, +then the following functions can be used: + +```@repl multialleles +using CSV, DataFrames +mappingfile = joinpath(dirname(pathof(SNaQ)), "..","examples","mappingIndividuals.csv"); +tm = CSV.read(mappingfile, DataFrame) # taxon map as a data frame +taxonmap = Dict(row[:individual] => row[:species] for row in eachrow(tm)) # taxon map as a dictionary +``` + +The [mapping file](https://github.com/juliaphylo/SNaQ/blob/main/examples/mappingIndividuals.csv) +can be a text (or `csv`) file with two columns (at least): +one for the individuals, named `allele` or `individual`, +and one column containing the species names, named `species`. +Each row should map an allele name to a species name. +Next, read in the [gene trees](https://github.com/juliaphylo/SNaQ/blob/main/examples/genetrees_alleletips.tre) +and calculate the quartet CFs at the species level: + + +```@repl multialleles +genetreefile = joinpath(dirname(pathof(SNaQ)), "..","examples","genetrees_alleletips.tre"); +genetrees = readMultiTopology(genetreefile); +sort(tipLabels(genetrees[1])) # multiple tips in species S1 +df_sp = writeTableCF(countquartetsintrees(genetrees, taxonmap, showprogressbar=false)...) +``` + +Now `df_sp` is a data frame containing the quartet concordance factors +at the species level only, that is, considering sets made of 4 distinct species, +even if the gene trees may have multiple alleles from the same species. +For 4 distinct species `A,B,C,D`, all alleles from each species (`A` etc.) +will be used to calculate the quartet CF. If a given gene tree has +`n_a` alleles from `a`, `n_b` alleles from `b` etc., then +each set of 4 alleles is given a weight of `1/(n_a n_b n_c n_d)` +to calculated of the CF for `A,B,C,D` (such that the total weight from +this particular gene trees is 1). +It is safe to save this data frame, then use it for `snaq!` like this: + +```@repl multialleles +CSV.write("tableCF_species.csv", df_sp); # to save the data frame to a file +d_sp = readTableCF("tableCF_species.csv"); # to get a "DataCF" object for use in snaq! +summarizeDataCF(d_sp) +``` + +## within-species 4-taxon sets + +Four-taxon sets involving 2 individuals per species can provide more +information about the underlying network, including external branch +length in coalescent units. However, `snaq!` runs more slowly when +using this extra information. To get quartet CFs from sets of 4 individuals +in which 2 individuals are from the same species, the following functions +should be used: + +```@repl multialleles +df_ind = writeTableCF(countquartetsintrees(genetrees, showprogressbar=false)...); # no mapping: CFs across individuals +first(df_ind, 4) # to see the first 4 rows +CSV.write("tableCF_individuals.csv", df_ind); # to save to a file +df_sp = mapAllelesCFtable(mappingfile, "tableCF_individuals.csv"); +d_sp = readTableCF!(df_sp, mergerows=true); +``` +where the mapping file can be a text (or `csv`) file with two columns +named `allele` (or `individual`) and `species`, mapping each allele name to a species name. +The data in `df_ind` is the table of concordance factors at the level of individuals. +In other words, it lists CFs using one row for each set of 4 alleles/individuals. + +`mapAllelesCFtable` creates a new data frame `df_sp` of quartet concordance factors at the +species level: with the allele names replaced by the appropriate species names. + +**Warnings**: +- This procedure requires that all alleles from the same + individual are given the same name (the individual's 'name') across + all genes for which that individual was sequenced. +- For a four-taxon set `A,B,C,D`, all the individuals from `A`, `B`, `C` and `D` + are considered, say `(a1,b1,c1,d1)`, `(a2,b1,c1,d1)`, `(a1,b2,c1,d1)`, `(a2,b2,c1,d1)` + and so on. The CFs of these 4-taxon sets are averaged together to obtain the + CFs at the species level. This procedures gives more weight to genes that have + many alleles (because they contribute to more sets of 4 individuals) and less + weight to genes that have few alleles. + +The last command modifies this data frame `df_sp` by deleting rows that are uninformative +about between-species relationships, such as rows corresponding to 4 individuals from the +same species. The output `d_sp` of this second command is an object of type `DataCF` at the +species level, which can be used as input for networks estimation with `snaq!`. +But before, it is safe to save the concordance factor of quartets of species, +which can be calculated by averaging the CFs of quartets of individuals +from the associated species: + +```@repl multialleles +df_sp = writeTableCF(d_sp) # data frame, quartet CFs averaged across individuals of same species +CSV.write("CFtable_species.csv", df_sp); # save to file +``` + +Some quartets have the same species repeated twice, +representing cases when 2 of the 4 individuals came from the same species. +These quartets, with repeated species, are informative about the population +size of extant populations, i.e. about the lengths of external branches in +coalescent units. + +The main difference between this section compared to the previous section +("between-species 4-taxon sets") is that quartets with 2 individuals from +the same species are included here, such as `a1,a2,b1,c1`. +Also, the weighting of quartets is different. Here, genes with more alleles +are given more weight. + +now we can run snaq: + +```julia +net = snaq!(T_sp, d_sp); +``` +where `T_sp` should be a starting topology with one tip per species, +labelled with the same species names as the names used in the mapping file. + +If `snaq!` takes too long that way, we can try a less ambitious estimation +that does not estimate the external branch lengths, that is, +*without* using quartets that have 2 individuals from the same species. +To do so, we can use the quartet concordance factors at the species level, +but filter out the quartets with one (or more) species repeated. +This can be done as in the first section ("between-species 4-taxon sets") +to give equal weight to all genes, +or as shown below to give more weight to genes that have more alleles: + +```@repl multialleles +first(df_sp, 3) # some quartets have the same species twice +function hasrep(row) # see if a row (4-taxon set) has a species name ending with "__2": repeated species + occursin(r"__2$", row[:t1]) || occursin(r"__2$", row[:t2]) || # replace :t1 :t2 etc. by appropriate column names in your data, + occursin(r"__2$", row[:t3]) || occursin(r"__2$", row[:t4]) # e.g. by :taxon1 :taxon2 etc. +end +df_sp_reduced = filter(!hasrep, df_sp) # removes rows with repeated species +CSV.write("CFtable_species_norep.csv", df_sp_reduced); # to save to file +d_sp_reduced = readTableCF(df_sp_reduced) # DataCF object, for input to snaq! +``` + +and now we can run `snaq!` on the reduced set of quartets without repeats, +which should be faster: + +```julia +net = snaq!(T_sp, d_sp_reduced); +``` diff --git a/docs/src/man/snaq_plot.md b/docs/src/man/snaq_plot.md new file mode 100644 index 0000000..e8e4237 --- /dev/null +++ b/docs/src/man/snaq_plot.md @@ -0,0 +1,390 @@ +```@setup snaqplot +using PhyloNetworks, SNaQ +mkpath("../assets/figures") +exampledir = joinpath(dirname(pathof(SNaQ)), "..","examples") +raxmltrees = joinpath(exampledir,"raxmltrees.tre") +raxmlCF = readTrees2CF(raxmltrees, writeTab=false, writeSummary=false) +astralfile = joinpath(exampledir,"astral.tre") +astraltree = readMultiTopology(astralfile)[102] # 102th tree = last tree here +net0 = readTopology(joinpath(exampledir,"net0.out")) +net1 = readTopology(joinpath(exampledir,"net1.out")) +rotate!(net1, -6) +net2 = readTopology(joinpath(exampledir,"net2.out")) +net3 = readTopology(joinpath(exampledir,"net3.out")) +net0.loglik = 53.53150526187732 +net1.loglik = 28.31506721890958 +net2.loglik = 28.31506721890957 +net3.loglik = 28.315067218909626 +``` +# Getting a Network + +## Network Estimation + +SNaQ implements the statistical inference method in +[Solís-Lemus & Ané 2016](http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1005896). +The procedure involves a numerical optimization of branch lengths and inheritance +probabilities and a heuristic search in the space of phylogenetic networks. + +After [Input for SNaQ](@ref), we can estimate the network using the +input data `raxmlCF` and starting from tree (or network) `astraltree`. +We first impose the constraint of at most 0 hybrid node, +that is, we ask for a tree. +```julia +net0 = snaq!(astraltree,raxmlCF, hmax=0, filename="net0", seed=1234) +``` +Part of the screen output shows this: + + MaxNet is (C,D,((B,A):1.395762055180493,(O,E):0.48453400554506426):10.0); + with -loglik 53.53150526187732 + +This parenthetical (extended Newick) description is not very +human-friendly, so we plot the tree +(more about plotting networks below: [Network Visualization](@ref) ). + +```@example snaqplot +using PhyloPlots +using RCall # hide +R"name <- function(x) file.path('..', 'assets', 'figures', x)" # hide +R"svg(name('snaqplot_net0_1.svg'), width=4, height=3)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(net0); +R"dev.off()"; # hide +nothing # hide +``` +![net0_1](../assets/figures/snaqplot_net0_1.svg) + +We can use this tree as a starting point to search for the best +network allowing for at most `hmax=1` hybrid node (which is the default). +```julia +net1 = snaq!(net0, raxmlCF, hmax=1, filename="net1", seed=2345) +``` +part of screen output: + + best network and networks with different hybrid/gene flow directions printed to .networks file + MaxNet is (C,D,((O,(E,#H7:::0.19558838614943078):0.31352437658618976):0.6640664399202987,(B,(A)#H7:::0.8044116138505693):10.0):10.0); + with -loglik 28.31506721890958 + +We can visualize the estimated network and its inheritance values γ, which +measure the proportion of genes inherited via each parent at a reticulation event +(e.g. proportion of genes inherited via gene flow). +```@example snaqplot +R"svg(name('snaqplot_net1_1.svg'), width=4, height=3)"; # hide +R"par"(mar=[0,0,0,0]); # hide +plot(net1, showgamma=true); +R"dev.off()"; # hide +nothing # hide +``` +![net1_1](../assets/figures/snaqplot_net1_1.svg) + +This network has A as a hybrid, 80.4% sister to B, +and 19.6% sister to E (which is otherwise sister to O). +C & D are sister to each other. +We can also check the output files created by `snaq!`: +```julia +less("net1.err") # would provide info about errors, if any +less("net1.out") # main output file with the estimated network from each run +less("net1.networks") # extra info +``` +when viewing these result files with `less` +within Julia, use arrows to scroll down and type `q` to quit viewing the files. +The file `net1.networks` contains a list of networks that are slight modifications +of the best (estimated) network `net1`. The modifications changed the direction +of one reticulation at a time, by moving the placement of one hybrid node to another +node inside the same cycle. +For each modified network, the pseudolikelihood score was calculated +(the `loglik` or `-Ploglik` values give a pseudo deviance actually). + +The function name `snaq!` ends with ! because it modifies the argument `raxmlCF` +by including the expected CF. Type `?` then `snaq!` to get help on that function. + +The main output file, here `net1.out` (or `snaq.out` by default) has the estimated +network in parenthetical format, but we can also print it directly to the screen: +```@repl snaqplot +net1 +writeTopology(net1) # writes to screen, full precision for branch lengths and γ +writeTopology(net1, round=true, digits=2) +writeTopology(net1,di=true) # γ omitted: for dendroscope +writeTopology(net1, "bestnet_h1.tre") # writes to file: creates or overwrites file +rm("bestnet_h1.tre") # hide +``` +The option `di=true` is for the parenthetical format used by +[Dendroscope](http://dendroscope.org/) (without reticulation heritabilities). +Copy this parenthetical description and paste it into Dendroscope, +or use the plotting function described below. + +We can go on and let the network have up to 2 or 3 hybrid nodes: +```julia +net2 = snaq!(net1,raxmlCF, hmax=2, filename="net2", seed=3456) +net3 = snaq!(net0,raxmlCF, hmax=3, filename="net3", seed=4567) +``` +and plot them (they are identical and they both have a single reticulation): +```@example snaqplot +R"svg(name('snaqplot_net23.svg'), width=7, height=3)" # hide +using RCall # to be able to tweak our plot within R +R"layout(matrix(1:2, 1, 2))" # to get 2 plots into a single figure: 1 row, 2 columns +R"par"(mar=[0,0,1,0]) # for smaller margins +plot(net2, showgamma=true); +R"mtext"("hmax=2") # add text annotation: title here +plot(net3, showgamma=true); +R"mtext"("hmax=3") +R"dev.off()"; # hide +nothing # hide +``` +![net23](../assets/figures/snaqplot_net23.svg) + +with this screen output for net2 (only 1 hybrid node found): + + MaxNet is (C,D,((B,(A)#H7:::0.804411606649347):10.0,(O,(#H7:::0.19558839335065303,E):0.3135243143217013):0.664066456871298):10.0); + with -loglik 28.31506721890957 + +and this output for net3 (again, only 1 hybrid found): + + MaxNet is (D,C,((O,(E,#H7:::0.19558839257941849):0.3135243301652981):0.6640664138384673,(B,(A)#H7:::0.8044116074205815):10.0):10.0); + with -loglik 28.315067218909626 + +## parallel computations + +For network estimation, multiple runs can done in parallel. +For example, if your machine has 4 or more processors (or cores), +you can tell julia to use 4 processors by starting julia with `julia -p 4`, +or by starting julia the usual way (`julia`) and then adding processors with: + +```julia +using Distributed +addprocs(4) +``` + +If we load a package (`using SNaQ`) before adding processors, +then we need to re-load it again so that all processors have access to it: + +```julia +@everywhere using PhyloNetworks, SNaQ +``` + +After that, running any of the `snaq!(...)` command will use +different cores for different runs, as processors become available. +Fewer details are printed to the log file when multiple cores +are used in parallel. + +When running `bootsnaq`, the analysis of each bootstrap replicate +will use multiple cores to parallelize separate runs of that particular +bootstrap replicate. You may parallelize things further by running +`bootsnaq` multiple times (on separate machines for instance), each time +for a small subset of bootstrap replicates, and with a different seed each time. + +We may tell julia to add more processors than our machine has, +but we will not receive any performance benefits. +At any time during the julia session, `nworkers()` tells us how many +worker processors julia has access to. + +Below is an example of how to use a cluster, to run many independent +`snaq!` searches in parallel on a cluster running the +[slurm](https://slurm.schedmd.com) job manager +(other managers would require a different, but similar submit file). +This example uses 2 files: +1. a julia script file, to do many runs of `snaq!` in parallel, + asking for many cores (default: 10 runs, asking for 10 cores). + This julia script can take arguments: the maximum allowed + number of hybridizations `hmax`, and the number of runs + (to run 50 runs instead of 10, say). +2. a submit file, to launch the julia script. + +**First**: the example julia script, below, is assumed (by the submit file) +to be called `runSNaQ.jl`. It uses a starting tree that +is assumed to be available in a file named `astraltree.tre`, but that +could be modified +(to use a network with h=1 to start the search with hmax=2 for instance). +It also assumes that the quartet concordance factor data are in file +`tableCF_speciesNames.csv`. Again, this file name should be adjusted. +To run this julia script for 50 runs and hmax=3, do `julia runSNaQ.jl 3 50`. + +```julia +#!/usr/bin/env julia + +# file "runSNaQ.jl". run in the shell like this in general: +# julia runSNaQ.jl hvalue nruns +# example for h=2 and default 10 runs: +# julia runSNaQ.jl 2 +# or example for h=3 and 50 runs: +# julia runSNaQ.jl 3 50 + +length(ARGS) > 0 || + error("need 1 or 2 arguments: # reticulations (h) and # runs (optional, 10 by default)") +h = parse(Int, ARGS[1]) +nruns = 10 +if length(ARGS) > 1 + nruns = parse(Int, ARGS[2]) +end +outputfile = string("net", h, "_", nruns, "runs") # example: "net2_10runs" +seed = 1234 + h # change as desired! Best to have it different for different h +@info "will run SNaQ with h=$h, # of runs=$nruns, seed=$seed, output will go to: $outputfile" + +using Distributed +addprocs(nruns) +@everywhere using SNaQ +net0 = readTopology("astraltree.tre"); +using DataFrames, CSV +df_sp = CSV.read("tableCF_speciesNames.csv", DataFrame; pool=false); +d_sp = readTableCF!(df_sp); +net = snaq!(net0, d_sp, hmax=h, filename=outputfile, seed=seed, runs=nruns) +``` + +When julia is called on a script, whatever comes after "julia scriptname" +is given to julia in an array of values. This array is called `ARGS`. +So if we call a script like this: `julia runSNaQ.jl 2` +then the script will know the arguments through `ARGS`, +which would contain a single element, `"2"`. +This first element is just a string, at this stage. We want to use it as a number, +so we ask julia to parse the string into an integer. + +**Second**: we need a "submit" file to ask a job scheduler like +[slurm](https://slurm.schedmd.com) to submit our julia script to a cluster. +In the submit file below, the first 5 lines set things up for slurm. +They are most likely to be specific to your cluster. +The main idea here is to use a slurm "array" from 0 to 3, to run our +julia script multiple times, 4 times actually: from hmax=0 to hmax=3. +Each would do 30 runs +(and each would be allocated 30 cores in the submit script below). +Then log out of the cluster and go for coffee. + +```bash +#!/bin/bash +#SBATCH -o path/to/slurm/log/file/runsnaq_slurm%a.log +#SBATCH -J runsnaq +#SBATCH --array=0-3 +#SBATCH -c 30 +## --array: to run multiple instances of this script, +## one for each value in the array. +## 1 instance = 1 task +## -J job name +## -c number of cores (CPUs) per task + +echo "slurm task ID = $SLURM_ARRAY_TASK_ID used as hmax" +echo "start of SNaQ parallel runs on $(hostname)" +# finally: launch the julia script, using Julia executable appropriate for slurm, with full paths: +/workspace/software/bin/julia --history-file=no -- runSNaQ.jl $SLURM_ARRAY_TASK_ID 30 > net${SLURM_ARRAY_TASK_ID}_30runs.screenlog 2>&1 +echo "end of SNaQ run ..." +``` + +## choosing the number of hybridizations + +Each network has a `loglik` attribute, which is its pseudo deviance: +a multiple of the negative log-likelihood up to a constant (the constant is +such that the score is 0 if the network fits the data perfectly). +The lower the better. We can plot these scores across hybrid values: +```@example snaqplot +scores = [net0.loglik, net1.loglik, net2.loglik, net3.loglik] +hmax = collect(0:3) +R"svg(name('snaqplot_scores_heuristic.svg'), width=4, height=3)" # hide +R"par"(mar=[2.5,2.5,.5,.5], mgp=[1.4,.4,0], tck=-0.02, las=1, lab=[3,5,7]); # hide +R"plot"(hmax, scores, type="b", ylab="network score", xlab="hmax", col="blue"); +R"dev.off()"; # hide +nothing # hide +``` +![scores_heuristic](../assets/figures/snaqplot_scores_heuristic.svg) + +Here the slope heuristic suggests a single hybrid node: +the score does not get much better beyond h=1. + +We made the plot via R above. A more Julian way would use a Julia plotting +package such as [Gadfly](http://gadflyjl.org/stable/) or +[Plots](http://docs.juliaplots.org/latest/ecosystem/), like this for instance: +```julia +using Gadfly +plot(x=collect(0:3), y=scores, Geom.point, Geom.line) +``` +(btw, cool [blog](http://avt.im/blog/2018/03/23/R-packages-ggplot-in-julia) about using ggplot within julia) + +## Network Visualization + +To visualize the estimated network, we can use the companion package +[PhyloPlots](https://github.com/juliaphylo/PhyloPlots.jl). +In the example below, julia creates and sends the plot to R +via [RCall](https://github.com/JuliaInterop/RCall.jl), +so we can tweak the plot in various ways via commands sent to R. +To save the plot in a file: we first tell R to create an image file, +then we send the plot of the network, +then we tell R to wrap up and save its image file. + +```@example snaqplot +using PhyloPlots # to visualize networks +using RCall # to send additional commands to R like this: R"..." +imagefilename = "../assets/figures/snaqplot_net1_2.svg" +R"svg"(imagefilename, width=4, height=3) # starts image file +R"par"(mar=[0,0,0,0]) # to reduce margins (no margins at all here) +plot(net1, showgamma=true, showedgenumber=true); # network is plotted & sent to file +R"dev.off()"; # wrap up and save image file +nothing # hide +``` +![net1_2](../assets/figures/snaqplot_net1_2.svg) + +The plot function has many options, to annotate nodes and edges. In the +example above, hybrid edges were annotated with their γ inheritance values +(in blue: light blue for the minor edge with γ<0.5, and dark blue for the +major edge with γ>0.5), and edges were annotated with their internal numbers. + +Type `?` to switch to the help mode +of Julia, then type the name of the function, here `plot`. +Below are two visualizations. +The first uses the default style (`:fulltree`) and modified edge colors. +The second uses the `:majortree` style. +That style doesn't have an arrow by default for minor hybrid edges, +but we can ask for one by specifying a positive arrow length. +```@example snaqplot +R"svg(name('snaqplot_net1_3.svg'), width=7, height=3)" # hide +R"par"(mar=[0,0,0,0]) # hide +R"layout(matrix(1:2,1,2))"; +plot(net1, showedgelength=true, minorhybridedgecolor="tan"); +plot(net1, style=:majortree, arrowlen=0.07); +R"dev.off()"; # hide +nothing # hide +``` +![net1_3](../assets/figures/snaqplot_net1_3.svg) + +Edge lengths are shown, too. They were estimated in coalescent units: +number of generations / effective population size. +Some edge lengths are not identifiable, hence not shown. + +Below is another example, where space was added between the network and +the taxon names via the `tipoffset` option. +Also, edge colors were changed, and the nodes numbers are shown (used internally) + +```@example snaqplot +R"svg(name('snaqplot_net1_4.svg'), width=4, height=3)" # hide +R"par"(mar=[0,0,0,0]) # hide +plot(net1, tipoffset=0.5, shownodenumber=true, edgecolor="tomato4", + minorhybridedgecolor="skyblue", majorhybridedgecolor="tan"); +R"dev.off()"; # hide +nothing # hide +``` +![net1_4](../assets/figures/snaqplot_net1_4.svg) + +## Re-rooting networks + +SNaQ infers an unrooted semi-directed network. +The direction of hybrid edges can be inferred, +but the direction of tree edges cannot be inferred. +To obtain a representative visualization, +it is best to root the network first, using one or more outgroup. +Go to [Re-rooting trees and networks](@ref) for this. +If your outgroup conflicts with the direction of reticulations +in the estimated network, see section +[Candidate networks compatible with a known outgroup](@ref). + +## Candidate Network Evaluation + +From a set of candidate networks, one might simply need to score of each network +to pick the best. Here, the score is the negative log pseudo-likelihood, and the +lower the better. See the section to get the score of [Candidate Networks](@ref). + +## SNaQ error reporting + +Please report any bugs and errors by opening an +[issue](https://github.com/juliaphylo/SNaQ.jl/issues/new). +The easiest way to provide information on the error is by checking the +`.err` file, which will show the number of runs that +failed and the corresponding seed to replicate the run. +In case of an error, the `.err` file might look like: +`Total errors: 1 in seeds [4545]`. +This file and any information that will help replicating the error +will be immensely helpful to fix the error/bug. diff --git a/docs/src/man/ticr_howtogetQuartetCFs.md b/docs/src/man/ticr_howtogetQuartetCFs.md new file mode 100644 index 0000000..a531360 --- /dev/null +++ b/docs/src/man/ticr_howtogetQuartetCFs.md @@ -0,0 +1,260 @@ +# TICR pipeline + +PhyloNetworks' [wiki](https://github.com/juliaphylo/PhyloNetworks.jl/wiki/TICR:-from-alignments-to-quartet-concordance-factors) +has a step-by-step tutorial, +to go from multiple sequence alignments +to a table of quartet gene frequencies (concordance factors: CFs), +through BUCKy (to integrate out gene tree uncertainty) or through RAxML. +To get the `raxml.pl` perl script to run RAxML on each gene, +download the content of that wiki with + +`git clone https://github.com/juliaphylo/PhyloNetworks.jl.wiki.git` + +then go to the `script/` folder. +Full information and code is [here](https://github.com/nstenz/TICR). +Below is more detail to insert data into the pipeline at various stages, +using one of **two pipelines**, depending on your machine configuration: + +- "no-scheduler" pipeline: original set of Perl scripts from the + TICR pipeline, well suited for a machine or a cluster of machines + without a job scheduler. + The scripts automatically parallelize the work across the available cores. + +- "slurm" pipeline: well suited for a cluster where users submit jobs + via a job scheduler like [SLURM](https://slurm.schedmd.com/) or + [SGE](https://en.wikipedia.org/wiki/Oracle_Grid_Engine). The job scheduler + does the work of parallelizing the work across available cores. + The scripts, in this second pipeline, were created to take full advantage + of job scheduler capabilities. They were developed for a cluster running SLURM. + Adjustments to the submit scripts will be needed, to adapt to your own + SLURM configuration or to the syntax that your job scheduler wants. + +## To run MrBayes: we already have alignments + +### no-scheduler pipeline + +We don't need to run `mdl.pl` if we already have aligned gene sequences +from separate loci. To run MrBayes on each locus, we can simply +create a tarball of the Nexus files we wish to use (fasta won't work at this stage). +The command below assumes that we want to use all the files ending with ".nex" +in the current directory, one file per locus: + +```bash +tar czf my-genes.tar.gz *.nex +``` + +Once the tarball has been successfully generated, we can then specify this +file as input for [mb.pl](https://github.com/nstenz/TICR/blob/master/scripts/mb.pl) +assuming we have a valid MrBayes block located in the file "bayes.txt": + +```bash +mb.pl my-genes.tar.gz -m bayes.txt -o my-genes-mb +``` + +If we get an error message like `mb.pl: Command not found`, it might be because +`mb.pl` has no execute permission, or the current directory is not in our "path". +An easy fix is to run this command instead: + +```bash +perl mb.pl my-genes.tar.gz -m bayes.txt -o my-genes-mb +``` + +The resulting output tarball would now be located in `my-genes-mb/my-genes.mb.tar`, +and can be used normally with +[bucky.pl](https://github.com/nstenz/TICR/blob/master/scripts/bucky.pl), +that is, like this: + +```bash +bucky.pl my-genes-mb/my-genes.mb.tar -o mygenes-bucky +``` + +The output, with the table of concordance factors for all sets of 4 taxa, +will be in a file named `my-genes.CFs.csv` inside a directory named `mygenes-bucky`. +That's the file containing the quartet concordance factors to give to SNaQ as input. +There is *no* need to do any of the steps below: they are already done by `bucky.pl`. + +### slurm pipeline + +SLURM will parallelize the MrBayes runs across genes. + +1. Navigate in some "working" directory where you place: + - a folder containing all nexus files, which we will call "nexusfolder" below + - a text file named `mb-block.txt` with the MrBayes block to be used + for all the genes (containing the options for MrBayes: model of sequence + evolution, number of generations etc.). + If we want a different MrBayes block for different genes, step 2 should be skipped, + and we should instead find some other way to put the specific MrBayes block + at the end of each nexus file. + +2. In the "working" directory above, run the julia script + [`paste-mb-block.jl`](https://github.com/nstenz/TICR/blob/master/scripts-cluster/paste-mb-block.jl) + with "nexusfolder" as argument, to tell the script where to find all the nexus files: + + ```bash + julia path/to/paste-mb-block.jl nexusfolder + ``` + This script will read all the nexus files in the directory `nexusfolder`, + will create a new directory `nexusfolder-block`, + and will create new nexus files (containing the MrBayes block found in file `mb-block.txt`) + as `1.nex, 2.nex, ...` in the new directory. A `translate.txt` file will also be created + to map the original gene file names to the new (numbered) file names. + If we named our MrBayes block file differently: we can edit the script and modify it + to replace `mb-block.txt` by our actual file name for the MrBayes block. + +3. Modify the submit script + [`mb-slurm-submit.sh`](https://github.com/nstenz/TICR/blob/master/scripts-cluster/mb-slurm-submit.sh), + which will parallelize all the individual-gene MrBayes runs with SLURM: + + - change `--array` to the correct number of genes + - change `--mail-user` to the user's email (if this is an option for your job scheduler) + - replace the `/workspace/software/bin` in `PATH="/workspace/software/bin:$PATH"` + to the path where the `mb` executable is located or put the whole path in the command: + `/s/mrbayes-3.2.6-1/bin/mb` + + In slurm, we can then submit the MrBayes array job with: + + ```bash + sbatch mb-slurm-submit.sh + ``` + + With this slurm pipeline, the steps below are needed: keep reading. + +## To run mbsum on the output of MrBayes for each gene + +If we have the output of MrBayes and want to run BUCKy, +we must first run `mbsum` on the output from MrBayes, separately for each gene. + +For a gene with output tree files named `gene1.run1.t`, `gene1.run2.t` and `gene1.run3.t`, +and a desired burnin of 1000 trees per tree file, we do this: + +```bash +mbsum -n 1000 -o gene1.in gene1.run1.t gene1.run2.t gene1.run3.t +``` + +This `mbsum` command will need to be executed for each gene. +Then we can continue to the next section to run bucky. + +Alternatively, we can use the julia script +[`mbsum-t-files.jl`](https://github.com/nstenz/TICR/blob/master/scripts-cluster/mbsum-t-files.jl), +and give it as argument the directory that has the output tree files from MrBayes, +to run mbsum for *all* the genes. +`mbsum` is fast, so there is no attempt to parallelize the various mbsum commands. + +```bash +julia mbsum-t-files.jl mbfolder outputfolder burnin # or +julia --color=yes -- mbsum-t-files.jl mbfolder outputfolder burnin # for colorized messages to the screen +``` +where `burnin` is replaced by the number of trees to ignore in each tree file +for burnin. This `burnin` argument is optional (default: 2501). +The `outputfolder` will contain the output of `mbsum`. + +## To run bucky on all 4-taxon sets: we already have the mbsum output + +### no-scheduler pipeline + +If we already ran `mbsum` on the output from MrBayes, for each individual gene, +we can simply create a tarball containing all the mbsum output files. +So if we had mbsum output in files named `gene1.in`, `gene2.in`, ... , `gene100.in`, +we would want to run something similar to the following command to create the tarball: + +```bash +tar czf my-genes-mbsum.tar.gz gene*.in +``` + +We can now use this tarball along with the `-s` option in +[bucky.pl](https://github.com/nstenz/TICR/blob/master/scripts/bucky.pl) like this: + +```bash +bucky.pl my-genes-mbsum.tar.gz -s -o mygenes-bucky +``` + +Again, if we get an error like `bucky.pl: Command not found`, we could run instead + +```bash +perl bucky.pl my-genes-mbsum.tar.gz -s -o mygenes-bucky +``` + +The output, with the table of concordance factors for all sets of 4 taxa, +will be in a file named `my-genes.CFs.csv` inside directory `mygenes-bucky`. +That's the file containing the quartet concordance factors to give to SNaQ as input. + +### slurm pipeline + +We want to run `bucky` on every 4-taxon set. +SLURM will parallelize these jobs with the submit script +[`bucky-slurm-submit.sh`](https://github.com/nstenz/TICR/blob/master/scripts-cluster/bucky-slurm-submit.sh), +which calls the perl script +[`bucky-slurm.pl`](https://github.com/nstenz/TICR/blob/master/scripts-cluster/bucky-slurm.pl). + +The perl script +[`bucky-slurm.pl`](https://github.com/nstenz/TICR/blob/master/scripts-cluster/bucky-slurm.pl) +runs `bucky` on a single 4-taxon set. +It takes the following arguments, which must be modified in the submit script +[`bucky-slurm-submit.sh`](https://github.com/nstenz/TICR/blob/master/scripts-cluster/bucky-slurm-submit.sh): +- name of the folder containing the `mbsum` output files (one per locus) from previous step. + This folder is named `mbsum` in the submit script: adapt if needed. +- output name: `-o` or `--out-dir` name of the directory to store output files in. + This option is not used in the default submit script +- bucky arguments: `-a` or `--alpha` for the prior alpha value, + and `-n` or `--ngen` number of generations. These options are not used either, + in the script: the defaults are used then (α=1, 1 million generations) +- integer for the given quartet, via option `-q`. + The quartet ID is specified by SLURM with its own array ID: `$SLURM_ARRAY_TASK_ID`. + +In the submit script that gives instructions to the job scheduler: +- adapt the name of the `$SLURM_ARRAY_TASK_ID` variable, which captures the task number + in the array of tasks, to your scheduler syntax +- change `--array` to the correct number of 4-taxon sets. + For example, if there are 15 taxa in the dataset, there are `1365` 4-taxon sets. + To get this number, if you are unsure, use `choose(15,4)` in R or `binomial(15,4)` in Julia, + but replace 15 by your actual number of individuals. +- change `--mail-user` to the user's email (if this is an option for your job scheduler) +- replace the `/workspace/software/bin` in `PATH="/workspace/software/bin:$PATH"` + by the path where the `bucky` executable is located. + Also, replace `/workspace/claudia/software/TICR/scripts/` by the full path where the + `bucky-slurm.pl` script is located. + + +In slurm, we would submit the BUCKy array job with: + +```bash +sbatch bucky-slurm-submit.sh +``` + +At the end, the array job will produce +- a `.concordance` file for every 4-taxon set +- a `.cf` file with the parsed output for that same 4-taxon set, + in the format needed for the final CF table. + +The `.cf` files can be concatenated to produce the file containing +the quartet concordance factors across all 4-taxon sets, to give to SNaQ as input: + +```bash +cat *.cf > CFtable.csv +``` + +Alternatively, if the list of `.cf` files is not easily captured by `*.cf` +(because the list is too long for a shell command), the following julia script +can do the concatenation. Just copy-paste the commands below within a Julia session, +started from the directory that contains the `.cf` files: + +```julia +files = String[] # empty vector of strings: will contain the .cf file names later +for f in filter(x -> endswith(x, ".cf"), readdir()) + push!(files,f) +end +println("found $(length(files)) cf files") # to check how many .cf output files were found +open("CFtable.csv","w") do f_out + # write the header: + write(f_out, "taxon1,taxon2,taxon3,taxon4,CF12_34,CF12_34_lo,CF12_34_hi,CF13_24,CF13_24_lo,CF13_24_hi,CF14_23,CF14_23_lo,CF14_23_hi,ngenes\n") + for file in files + @show file # to see the .cf file name: comment this out if that's too much screen output + open(file) do f_in + line = read(f_in, String) + write(f_out, string(line,"\n")) + end # closes "file" safely + end +end # closes "CFtable.csv" safely +``` +When this is done, we will have a file `CFtable.csv` containing the +quartet concordance factors, to give to SNaQ as input :smiley: diff --git a/src/auxiliary.jl b/src/auxiliary.jl index a80e365..89325cb 100644 --- a/src/auxiliary.jl +++ b/src/auxiliary.jl @@ -6,8 +6,8 @@ function setCHECKNET(b::Bool) global CHECKNET CHECKNET = b - CHECKNET && @warn "PhyloNetworks.CHECKNET is true: will slow snaq! down." - b || @info "PhyloNetworks.CHECKNET set to false" + CHECKNET && @warn "SNaQ.CHECKNET is true: will slow snaq! down." + b || @info "SNaQ.CHECKNET set to false" end # ----- aux general functions --------------- @@ -127,7 +127,7 @@ end Set the length of `edge`, and set `edge.y` and `edge.z` accordingly. Warning: specific to `SNaQ.jl`. -Consider `PhyloNetworks.setlengths!` for a more generic tool. +Consider [`setlengths!`](@ref) from `PhyloNetworks` for a more generic tool. - The new length is censored to 10: if the new length is above 10, the edge's length will be set to 10. Lengths are interpreted in coalescent