From 5463a25ddd880300a52d735536c8ce8df9101eab Mon Sep 17 00:00:00 2001
From: Cecile Ane <cecileane@users.noreply.github.com>
Date: Fri, 1 Nov 2024 14:44:19 -0500
Subject: [PATCH] doc: added man and lib pages from PhyloNetworks

---
 docs/make.jl                            |  26 +-
 docs/src/index.md                       |  42 ++-
 docs/src/lib/internals.md               |  21 ++
 docs/src/lib/public.md                  |  17 ++
 docs/src/man/bootstrap.md               | 339 ++++++++++++++++++++
 docs/src/man/dist_reroot.md             | 261 ++++++++++++++++
 docs/src/man/expectedCFs.md             | 111 +++++++
 docs/src/man/fixednetworkoptim.md       | 183 +++++++++++
 docs/src/man/inputdata.md               | 187 ++++++++++++
 docs/src/man/multiplealleles.md         | 147 +++++++++
 docs/src/man/snaq_plot.md               | 390 ++++++++++++++++++++++++
 docs/src/man/ticr_howtogetQuartetCFs.md | 260 ++++++++++++++++
 src/auxiliary.jl                        |   6 +-
 13 files changed, 1977 insertions(+), 13 deletions(-)
 create mode 100644 docs/src/lib/internals.md
 create mode 100644 docs/src/lib/public.md
 create mode 100644 docs/src/man/bootstrap.md
 create mode 100644 docs/src/man/dist_reroot.md
 create mode 100644 docs/src/man/expectedCFs.md
 create mode 100644 docs/src/man/fixednetworkoptim.md
 create mode 100644 docs/src/man/inputdata.md
 create mode 100644 docs/src/man/multiplealleles.md
 create mode 100644 docs/src/man/snaq_plot.md
 create mode 100644 docs/src/man/ticr_howtogetQuartetCFs.md

diff --git a/docs/make.jl b/docs/make.jl
index 951d04e..5eda64d 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -5,19 +5,41 @@ DocMeta.setdocmeta!(SNaQ, :DocTestSetup, :(using SNaQ, PhyloNetworks); recursive
 
 makedocs(;
     modules=[SNaQ],
-    authors="Claudia Solis-Lemus <crsl4@users.noreply.github.com>, Paul Bastide <pbastide@users.noreply.github.com>, Cecile Ane <cecileane@users.noreply.github.com>, and contributors",
+    authors="Claudia Solis-Lemus <crsl4@users.noreply.github.com>, Cécile Ané <cecileane@users.noreply.github.com>, and contributors",
     sitename="SNaQ.jl",
     format=Documenter.HTML(;
         canonical="https://JuliaPhylo.github.io/SNaQ.jl",
         edit_link="dev",
         assets=String[],
+        prettyurls = get(ENV, "CI", nothing) == "true", # easier local build
+        size_threshold = 600 * 2^10,
+        size_threshold_warn = 500 * 2^10, # 600 KiB
     ),
+    # exception, so warning-only for :missing_docs. List all others:
+    warnonly = Documenter.except(:autodocs_block, :cross_references, :docs_block,
+        :doctest, :eval_block, :example_block, :footnote, :linkcheck_remotes,
+        :linkcheck, :meta_block, :parse_error, :setup_block),
     pages=[
         "Home" => "index.md",
+        "Manual" => [
+            "Input Data for SNaQ" => "man/inputdata.md",
+            "TICR pipeline" => "man/ticr_howtogetQuartetCFs.md",
+            "Network estimation and display" => "man/snaq_plot.md",
+            "Network comparison and manipulation" => "man/dist_reroot.md",
+            "Candidate Networks" => "man/fixednetworkoptim.md",
+            "Extract Expected CFs" => "man/expectedCFs.md",
+            "Bootstrap" => "man/bootstrap.md",
+            "Multiple Alleles" => "man/multiplealleles.md",
+        ],
+        "Library" => [
+            "Public" => "lib/public.md",
+            "Internals" => "lib/internals.md",
+        ]
     ],
 )
 
 deploydocs(;
     repo="github.com/JuliaPhylo/SNaQ.jl",
-    devbranch="dev",
+    push_preview = true,
+    devbranch="main",
 )
diff --git a/docs/src/index.md b/docs/src/index.md
index b88be7e..2f5cfb2 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,14 +1,40 @@
-```@meta
-CurrentModule = SNaQ
-```
-
 # SNaQ
 
-Documentation for [SNaQ](https://github.com/JuliaPhylo/SNaQ.jl).
+[SNaQ](https://github.com/JuliaPhylo/SNaQ.jl) is a [Julia](http://julialang.org)
+package that implements the SNaQ method by
+[Solís-Lemus & Cécile Ané (2016)](https://doi.org/10.1371/journal.pgen.1005896)
+to estimate a phylogenetic network from quartet concordance factors.
+See the [PhyloNetworks](https://github.com/JuliaPhylo/PhyloNetworks.jl)
+package, which SNaQ depends on, for background on phylogenetic networks
+and for how to get help.
+
+## references
 
-```@index
+See them in
+[bibtex format](https://github.com/juliaphylo/SNaQ.jl/blob/master/CITATION.bib).
+
+## Manual
+
+```@contents
+Pages = [
+    "man/inputdata.md",
+    "man/ticr_howtogetQuartetCFs.md",
+    "man/snaq_plot.md",
+    "man/dist_reroot.md",
+    "man/fixednetworkoptim.md",
+    "man/expectedCFs.md",
+    "man/bootstrap.md",
+    "man/multiplealleles.md",
+]
+Depth = 3
 ```
 
-```@autodocs
-Modules = [SNaQ]
+For help on individual functions, see the library:
+
+```@contents
+Pages = [
+    "lib/public.md",
+    "lib/internal.md",
+]
+Depth = 3
 ```
diff --git a/docs/src/lib/internals.md b/docs/src/lib/internals.md
new file mode 100644
index 0000000..54347f7
--- /dev/null
+++ b/docs/src/lib/internals.md
@@ -0,0 +1,21 @@
+# internal documentation
+
+Documentation for `SNaQ`'s internal functions.
+These functions are not exported and their access (API) should not be
+considered stable. But they can still be used, like this for example:
+`SNaQ.foo()` for a function named `foo()`.
+
+
+## functions & types
+
+```@autodocs
+Modules = [SNaQ]
+Public  = false
+Order   = [:type,:function,:constant]
+```
+
+## index
+
+```@index
+Pages = ["internal.md"]
+```
diff --git a/docs/src/lib/public.md b/docs/src/lib/public.md
new file mode 100644
index 0000000..072872f
--- /dev/null
+++ b/docs/src/lib/public.md
@@ -0,0 +1,17 @@
+# public documentation
+
+Documentation for `SNaQ`'s public (exported) interface.
+
+## functions & types
+
+```@autodocs
+Modules = [SNaQ]
+Private = false
+Order   = [:function,:type]
+```
+
+## index
+
+```@index
+Pages = ["public.md"]
+```
diff --git a/docs/src/man/bootstrap.md b/docs/src/man/bootstrap.md
new file mode 100644
index 0000000..713a549
--- /dev/null
+++ b/docs/src/man/bootstrap.md
@@ -0,0 +1,339 @@
+```@setup bootstrap
+using PhyloNetworks, SNaQ
+mkpath("../assets/figures")
+```
+# Bootstrap
+
+## Running a bootstrap analysis
+
+There are two ways to do a bootstrap analysis.
+
+First, we can use a table of quartet CFs estimated with credibility intervals,
+such as if we used BUCKy.
+The [TICR pipeline](@ref) outputs a CF table with extra columns for credibility
+intervals. We could then read that table and get bootstrap networks like this,
+and tweak options as needed:
+
+```julia
+using DataFrames, CSV
+df = CSV.read("tableCF_withCI.csv", DataFrame)
+bootnet = bootsnaq(startnetwork, df, hmax=1, filename="bootstrap")
+```
+
+Alternatively, we can use bootstrap gene trees: one file of bootstrap trees per gene.
+Here, the input is a text file that lists all the bootstrap files (one per gene).
+We demonstrate this second option here.
+
+The names of all our bootstrap files are listed in "BSlistfiles".
+(ASTRAL can use the same file to do its own bootstrap, see the
+[wiki](https://github.com/juliaphylo/PhyloNetworks.jl/wiki/Gene-Trees:-RAxML)
+for more details).
+The function `readBootstrapTrees` can read this list of file names, then
+read each bootstrap file to get the bootstrap sample for each gene.
+We can use them to sample input gene trees at random, one per gene,
+and estimate a network from them. We ask the `bootsnaq` function
+to repeat this resampling of bootstrap gene trees several times.
+
+```julia
+bootTrees = readBootstrapTrees("BSlistfiles");
+bootnet = bootsnaq(net0, bootTrees, hmax=1, nrep=10, runs=3,
+                   filename="bootsnaq", seed=4321)
+```
+
+The bootstrap networks are saved in the `boostrap.out` file, so they
+can be read in a new session with
+`bootnet = readMultiTopology("bootsnaq.out")`. To save the bootstrap networks to
+a different file (perhaps after having re-rooted them with an
+outgroup), we could do this: `writeMultiTopology(bootnet, "bootstrapNets.tre")`.
+
+The example above asks for 10 bootstrap replicates,
+which is definitely too few, to make the example run faster.
+We might also increase the number of optimization runs (`runs`)
+done for each bootstrap replicate. This bootstrap was run with the
+default 10 runs per replicate, and 100 bootstrap replicates,
+and the 100 bootstrap networks come with the package:
+
+```@example bootstrap
+bootnet = readMultiTopology(joinpath(dirname(pathof(SNaQ)), "..","examples","bootsnaq.out"));
+length(bootnet)
+```
+
+If we used a specified list of quartets on the original data, we
+should use that same list for the bootstrap analysis through the
+option `quartetfile`.
+
+## support for tree edges
+
+Now that we have 100 bootstrap networks, we need to summarize
+what they have in common (highly supported features) and what they
+don't (areas of uncertainty).
+
+Before summarizing these bootstrap networks on the best network,
+it is best to re-read this network to get a reproducible internal numbering
+of its nodes and edges, used later for mapping bootstrap support to edges.
+```@example bootstrap
+net1 = readTopology(joinpath(dirname(pathof(SNaQ)), "..","examples","net1.out"))
+```
+
+It turns out that the direction of gene flow is quite uncertain
+in this example (see below) with a wrong direction inferred sometimes,
+so we re-root our best network net1 to the base of O,E, for the figures
+to be less confusing later.
+
+```@setup bootstrap
+rootonedge!(net1, 7)
+```
+```@example bootstrap
+using PhyloPlots, RCall
+R"name <- function(x) file.path('..', 'assets', 'figures', x)" # hide
+R"svg(name('net1_rotate1_1.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(net1, showedgenumber=true); # edge 7 leads to O+E
+R"dev.off()" # hide
+rootonedge!(net1, 7) # makes (O,E) outgroup clade
+R"svg(name('net1_rotate1_2.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(net1, shownodenumber=true);
+R"dev.off()" # hide
+nothing # hide
+```
+![net1_rotate1 1](../assets/figures/net1_rotate1_1.svg)
+![net1_rotate1 2](../assets/figures/net1_rotate1_2.svg)
+
+Edges cross: but rotating at node -6 should remove this crossing
+of edges
+```@example bootstrap
+rotate!(net1, -6)
+```
+```@example bootstrap
+R"svg(name('net1_rotate2.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(net1, showgamma=true);
+R"dev.off()" # hide
+nothing # hide
+```
+![net1_rotate2](../assets/figures/net1_rotate2.svg)
+
+We can now summarize our bootstrap networks.
+The functions `treeEdgesBootstrap` and `hybridBootstrapSupport`
+read all bootstrap networks and map the edges / nodes
+onto a reference network: here net1.
+```@example bootstrap
+BSe_tree, tree1 = treeEdgesBootstrap(bootnet,net1);
+```
+This calculates the major tree `tree1` displayed in `net1`, that is,
+the tree obtained by following the major parent (γ>0.5) of each hybrid node.
+This tree can be visualized like this, with edge numbers shown for later use.
+```@example bootstrap
+R"svg(name('major_tree.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(tree1, showedgenumber=true);
+R"dev.off()" # hide
+nothing # hide
+```
+![major_tree](../assets/figures/major_tree.svg)
+
+Next, we can look at bootstrap table `BSe_tree`, which has one row for
+each tree edge in `net1`. One column contains the edge number
+(same as shown in the plot) and another column contains the edge
+bootstrap support: the proportion of bootstrap replicates in which this edge was
+found in the major tree of the inferred network.
+We can see the full bootstrap table and see
+which tree edges have bootstrap support lower than 100% (none here) with
+```@repl bootstrap
+using DataFrames # for showall() below
+show(BSe_tree, allrows=true, allcols=true)
+filter(row -> row[:proportion] < 100, BSe_tree)
+```
+Finally, we can map the bootstrap proportions onto the network or its main tree
+by passing the bootstrap table to the `edgelabel` option of `plot`:
+```@example bootstrap
+R"svg(name('boot_tree_net_1.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(tree1, edgelabel=BSe_tree);
+R"dev.off()" # hide
+R"svg(name('boot_tree_net_2.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(net1, edgelabel=BSe_tree);
+R"dev.off()" # hide
+nothing # hide
+```
+![boot_tree_net 1](../assets/figures/boot_tree_net_1.svg)
+![boot_tree_net 2](../assets/figures/boot_tree_net_2.svg)
+
+(Here, it is important that the numbers assigned to edges when building the boostrap
+table --those in `net1` at the time-- correspond to the current edge numbers
+in `tree1` and `net1`. That was the purpose of reading the network from the
+output file of `snaq!` earlier, for consistency across different Julia sessions.)
+
+If we wanted to plot only certain bootstrap values, like those below 100% (1.0),
+we could do this:
+```julia
+plot(net1, edgelabel=filter(row -> row[:proportion] < 100, BSe_tree));
+```
+
+## support for hybrid edges and hybrid nodes
+
+Summarizing the placement of reticulations is not standard.
+The function `hybridBootstrapSupport` attempts to do so.
+The descendants of a given hybrid node form the "recipient" or "hybrid" clade,
+and is obtained after removing all other reticulations.
+If reticulation is due to gene flow or introgression, the minor hybrid edge (with γ<0.5)
+represents this event. The descendants of the lineage from which gene flow originated
+is then a second "sister" of the hybrid clade. Because of the reticulation event,
+the hybrid clade has 2 sister clades, not 1: the major sister (through the major hybrid edge
+with γ>0.5) and the minor sister (through the minor hybrid edge with γ<0.5).
+Note that the network says *nothing* about the process: its shows the *relationships* only.
+We can calculate the frequency that each clade is a hybrid clade, or a major or minor sister
+for some other hybrid, in the bootstrap networks:
+```@example bootstrap
+BSn, BSe, BSc, BSgam, BSedgenum = hybridBootstrapSupport(bootnet, net1);
+```
+Let's look at the results.
+We can list all the clades and the percentage of bootstrap networks (bootstrap support)
+in which each clade is a hybrid or sister to a hybrid:
+```@repl bootstrap
+BSn
+```
+If a clade contains a single taxon, it is listed with its taxon name.
+The clade found in the best network is listed with its tag, starting with H (e.g. "H7").
+The name of other clades start with "c_" followed by their number in the best network, if they
+do appear in the best network.
+The node numbers, as used internally in the best network, are listed in a separate column.
+They can be used later to display the bootstrap support values onto the network.
+Various columns give the bootstrap support that each clade is a hybrid, or a (major/minor) sister
+to a hybrid. The last column gives the bootstrap support for the full relationship in the
+best network: same hybrid with same two sisters.
+These bootstrap values are associated with nodes (or possibly, their parent edges).
+
+To see what is the clade named "H7", for instance:
+```@repl bootstrap
+BSc # this might be too big
+show(BSc, allrows=true, allcols=true)
+# BSc[BSc[!,:H7], :taxa] # just a different syntax to subset the data in the same way
+filter(row -> row[:H7], BSc).taxa
+```
+We can also get bootstrap values associated with edges, to describe the support that a given
+hybrid clade has a given sister clade.
+```@repl bootstrap
+BSe
+```
+Here, each row describes a pair of 2 clades: one being the hybrid, the other being its sister,
+connected by a hybrid edge. The first rows corresponds to hybrid edges in the best network. Other
+rows correspond to edges seen in bootstrap networks but not in the reference network.
+```@repl bootstrap
+BSedgenum
+```
+lists all the hybrid edges in the best network, two for each hybrid node:
+the major parent edge and then the minor parent edge.
+In our case, there is only one reticulation, so only 2 hybrid edges.
+
+We can plot the bootstrap values of the 2 hybrid edges in the best network:
+```@example bootstrap
+R"svg(name('boot_net_net.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(net1, edgelabel=BSe[:,[:edge,:BS_hybrid_edge]]);
+R"dev.off()" # hide
+nothing # hide
+```
+![boot_net_net](../assets/figures/boot_net_net.svg)
+
+This is showing the bootstrap support each hybrid edge: percentage of bootstrap trees with an
+edge from the same sister clade to the same hybrid clade.
+Alternatively, we could show the bootstrap support for the full reticulation relationships in
+the network, one at each hybrid node (support for same hybrid with same sister clades).
+Here, we find that A received gene flow from E (and is sister to B otherwise) in just 32%
+of bootstrap networks. In another 1% bootstrap, A received gene flow from another source.
+```@example bootstrap
+R"svg(name('boot_net_ret.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(net1, nodelabel=BSn[!,[:hybridnode,:BS_hybrid_samesisters]]);
+R"dev.off()" # hide
+nothing # hide
+```
+![boot_net_ret](../assets/figures/boot_net_ret.svg)
+
+Below is example code to place tree edge support and hybrid edge support
+on the same plot.
+
+```julia
+tmp = filter(row -> !ismissing(row[:edge]), BSe) # filter rows
+select!(tmp, [:edge,:BS_hybrid_edge])            # select 2 columns only
+rename!(tmp, :BS_hybrid_edge => :proportion)     # rename those columns, to match names in BSe_tree
+rename!(tmp, :edge => :edgeNumber)
+tmp = vcat(BSe_tree, tmp)
+plot(net1, edgelabel=tmp, nodelabel=BSn[:, [:hybridnode,:BS_hybrid_samesisters]])
+```
+
+### Who are the hybrids in bootstrap networks?
+
+On a different plot, we can show the bootstrap support for hybrid clades,
+first mapped to each node with positive hybrid support,
+and then mapped on the parent edge of these nodes.
+A is estimated as a hybrid in only 33% of our bootstrap networks.
+In another 44%, it is the lineage to (E,O) that is estimated as
+being of hybrid origin.
+```@example bootstrap
+R"svg(name('boot_net_hyb_1.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(net1, nodelabel=filter(row->row[:BS_hybrid]>0, BSn)[!,[:hybridnode,:BS_hybrid]]);
+R"dev.off()" # hide
+nothing # hide
+R"svg(name('boot_net_hyb_2.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(net1, edgelabel=filter(row->row[:BS_hybrid]>0, BSn)[!,[:edge,:BS_hybrid]]);
+R"dev.off()" # hide
+nothing # hide
+```
+![boot_net_hyb 1](../assets/figures/boot_net_hyb_1.svg)
+![boot_net_hyb 2](../assets/figures/boot_net_hyb_2.svg)
+
+### Where is the origin of gene flow?
+
+We can plot the support for the various placements
+of the gene flow origin (minor sister clade),
+first mapped to each node with positive support for being the origin of gene flow,
+and then mapped along the parent edge of these nodes.
+We filtered clades to show those with sister support > 5%:
+```@example bootstrap
+R"svg(name('boot_net_clade_1.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(net1, nodelabel=filter(r->r[:BS_minor_sister]>5, BSn)[!,[:node,:BS_minor_sister]]);
+R"dev.off()" # hide
+nothing # hide
+R"svg(name('boot_net_clade_2.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(net1, edgelabel=filter(r->r[:BS_minor_sister]>5, BSn)[!,[:edge,:BS_minor_sister]]);
+R"dev.off()" # hide
+nothing # hide
+```
+![boot_net_clade 1](../assets/figures/boot_net_clade_1.svg)
+![boot_net_clade 2](../assets/figures/boot_net_clade_2.svg)
+
+In our best network, the lineage to E is estimated as the origin
+of gene flow, but this is recovered in only 41% of our bootstrap networks.
+In another 49%, it is the lineage to A that is estimated as the *origin*
+of gene flow: so gene flow is estimated in the opposite direction.
+In this example, there is support for gene flow between (A,B) and (E,O),
+but there is much uncertainty about its exact placement and about its direction.
+
+Mapping the support for major sister clades might be interesting too:
+```julia
+plot(net1, nodelabel=filter(r->r[:BS_major_sister]>5, BSn)[!,[:node,:BS_major_sister]])
+```
+
+The estimated heritability γ on hybrid edges in the reference network, when present in a
+bootstrap network, was also extracted:
+```@repl bootstrap
+BSgam[1:3,:] # first 3 rows only
+```
+γ=0 values are for bootstrap replicates that did not have the edge in their network.
+Basic summaries on γ values for a given edge, say the minor parent,
+could be obtained like this:
+```@repl bootstrap
+minimum(BSgam[:,2])
+maximum(BSgam[:,2])
+using Statistics # for functions like mean and std (standard deviation)
+mean(BSgam[:,2])
+std(BSgam[:,2])
+```
diff --git a/docs/src/man/dist_reroot.md b/docs/src/man/dist_reroot.md
new file mode 100644
index 0000000..d2d37cc
--- /dev/null
+++ b/docs/src/man/dist_reroot.md
@@ -0,0 +1,261 @@
+```@setup dist_reroot
+using PhyloNetworks, SNaQ
+mkpath("../assets/figures")
+exampledir = joinpath(dirname(pathof(SNaQ)), "..","examples")
+raxmltrees = joinpath(exampledir,"raxmltrees.tre")
+raxmlCF = readTableCF(writeTableCF(countquartetsintrees(readMultiTopology(raxmltrees), showprogressbar=false)...))
+astralfile = joinpath(exampledir,"astral.tre")
+astraltree = readMultiTopology(astralfile)[102] # 102th tree = last tree here
+net0 = readTopology(joinpath(exampledir,"net0.out"))
+net1 = readTopology(joinpath(exampledir,"net1.out"))
+net0.loglik = 53.53150526187732
+net1.loglik = 28.31506721890958
+```
+# Comparing and manipulating networks
+
+Examples below follow those in [Getting a Network](@ref).
+
+## Comparing networks / trees
+
+Is the SNaQ tree (network with h=0) the same as the ASTRAL tree?
+We can calculate their Robinson-Foulds distance:
+
+```@repl dist_reroot
+hardwiredClusterDistance(astraltree, net0, false)
+```
+The last option `false` is to consider topologies as unrooted.
+The RF distance is 0, so the two unrooted topologies are the same.
+If we had considered them as rooted, with whatever root they
+currently have in their internal representation,
+we would find a difference:
+```@repl dist_reroot
+hardwiredClusterDistance(astraltree, net0, true)
+```
+
+## Re-rooting trees and networks
+
+We can re-root our networks with the outgroup, O,
+and then re-compare the ASTRAL tree and the SNaQ tree
+as rooted topologies (and find no difference):
+```@repl dist_reroot
+rootatnode!(astraltree, "O")
+rootatnode!(net0, "O")
+hardwiredClusterDistance(astraltree, net0, true)
+```
+```@example dist_reroot
+using PhyloPlots, RCall
+R"name <- function(x) file.path('..', 'assets', 'figures', x)"
+R"svg(name('net0_O.svg'), width=4, height=4)"
+R"par"(mar=[0,0,0,0])
+plot(net0);
+R"dev.off()"
+nothing # hide
+```
+![net0_O](../assets/figures/net0_O.svg)
+
+Note that, as in previous chapters, we use the possibilities of `RCall`
+to save the plot. We only show this commands once, but they will be run
+behind the scene each time a plot is called.
+
+After trees/networks are rooted with a correct outgroup,
+their visualization is more meaningful.
+
+Networks can be re-rooted at a given node or along a given edge.
+Get help (type `?`) on the functions `rootatnode!` and `rootonedge!`
+for more info. There are examples in the [Bootstrap](@ref) section.
+
+If the network is plotted with crossing edges, you may identify
+ways to rotate the children edges at some nodes to untangle some crossing edges.
+This can be done using the function `rotate!`.
+See an example in the [Bootstrap](@ref) section, or type `?` then `rotate!`.
+
+## What if the root conflicts with the direction of a reticulation?
+
+With 1 hybridization or more, the direction of hybrid edges
+constrain the position of the root. The root cannot be downstream of hybrid edges.
+Any hybrid node has to be younger than, or of the same age as both of its parents.
+So time has to flow "downwards" of any hybrid node, and the root cannot be
+placed "below" a hybrid node.
+An attempt to re-root the network at a position incompatible with hybrid edges
+will fail, with a `RootMismatch` error. To show an example, let's use the
+network below. We plotted the edge numbers, because we will want to use them
+later to place the root.
+
+```@example dist_reroot
+net7taxa = readTopology("(C,D,((O,(E,#H7:::0.196):0.314):0.664,(((A1,A2))#H7:::0.804,B):10.0):10.0);")
+R"svg(name('reroot_net7taxa_1.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(net7taxa, showgamma=true, showedgenumber=true, tipoffset=0.2);
+R"dev.off()"; # hide
+nothing # hide
+```
+![reroot net7taxa 1](../assets/figures/reroot_net7taxa_1.svg)
+
+Let's imagine that the A1 and A2 are our outgroups, and we estimated the network above.
+According to this network, time must flow from the hybrid node towards A1 and A2.
+So any attempt to reroot the network with A1 as outgroup, or with A2 as outgroup,
+or with the A clade (on edge 11), will fail with a `RootMismatch` error:
+
+```julia
+rootatnode!(net7taxa, "A1"); # ERROR: RootMismatch: non-leaf node 5 had 0 children. ...
+rootatnode!(net7taxa, "A2"); # ERROR: RootMismatch (again)
+rootonedge!(net7taxa, 10);   # ERROR: RootMismatch (again)
+```
+
+In this case, however, it is possible to root the network on either parent edge
+of the hybrid node. These edges have numbers 12 and 5, based on the plot above.
+We get these 2 rooted versions of the network:
+
+```@example dist_reroot
+R"svg(name('reroot_net7taxa_2.svg'), width=7, height=4)"; # hide
+R"layout(matrix(1:2,1,2))";
+R"par"(mar=[0,0,0.5,0]); # hide
+rootonedge!(net7taxa, 11);
+rotate!(net7taxa, -5)
+plot(net7taxa, showgamma=true, tipoffset=0.2, shownodenumber=true);
+R"mtext"("rooted on hybrid edge 11 (major)", line=-1)
+rootonedge!(net7taxa, 5);
+plot(net7taxa, showgamma=true, tipoffset=0.2);
+R"mtext"("rooted on hybrid edge 5 (minor)", line=-1);
+R"dev.off()"; # hide
+nothing # hide
+```
+![reroot net7taxa 2](../assets/figures/reroot_net7taxa_2.svg)
+
+On the second plot, the A clade does not *appear* to be an outgroup,
+but this is just because the plot follows the major tree primarily,
+based the major hybrid edges (those with γ>0.5).
+We can display the exact same network differently, by changing
+the γ inheritance values to invert the major/minor consideration of the hybrid edges.
+```@example dist_reroot
+net7taxa.edge[5] # just to check that it's one of the 2 hybrid edges of interest
+setGamma!(net7taxa.edge[5], 0.501) # switch major/minor edges
+R"svg(name('reroot_net7taxa_3.svg'), width=4, height=4)"; # hide
+R"layout(matrix(1,1,1))"; # hide
+R"par"(mar=[0,0,0,0]); # hide
+plot(net7taxa, tipoffset=0.2); # not showing gamma values, because we changed them artificially
+R"mtext"("rooted on hybrid edge 5 (considered major)", line=-1);
+R"dev.off()"; # hide
+nothing # hide
+```
+![reroot net7taxa 3](../assets/figures/reroot_net7taxa_3.svg)
+
+Conclusion, in this particular example: it is possible to re-root the network to a place
+where the A clade is indeed an outgroup. But it did require some care,
+and we discovered that there are 2 acceptable rooting options.
+The first is more plausible, if we think that the *species tree* is the *major tree*,
+meaning that any gene flow or introgression event replaced less than 50% of the genes
+in the recipient population.
+
+In other cases, it may not be possible to re-root the network with a known outgroup.
+It would be the case if A1 was the only outgroup, and if A2 was an ingroup taxon.
+In such a case, the outgroup knowledge tells us that our estimated network is wrong.
+One (or more) reticulation in the network must be incorrect.
+Its placement might be correct, but then its direction would be incorrect.
+If the network was estimated via `snaq!`, check tips
+about [Candidate networks compatible with a known outgroup](@ref).
+
+## Extracting the major tree
+
+We can also compare the networks estimated with h=0 (net0) and h=1 (net1):
+```@repl dist_reroot
+rootatnode!(net1, "O"); # the ; suppresses screen output
+hardwiredClusterDistance(net0, net1, true)
+```
+```@example dist_reroot
+R"svg(name('net1_O.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(net1, showgamma=true);
+R"dev.off()" # hide
+nothing # hide
+```
+![net1_O](../assets/figures/net1_O.svg)
+
+They differ by 2 clusters: that's because A is of hybrid descent
+in net1, not in net0.
+
+To beyond this hybrid difference,
+we can extract the major tree from the network with 1 hybridization,
+that is, delete the hybrid edge supported by less than 50% of genes.
+Then we can compare this tree with the ASTRAL/SNaQ tree net0.
+```@repl dist_reroot
+tree1 = majorTree(net1); # major tree from net1
+hardwiredClusterDistance(net0, tree1, true)
+```
+They are identical (at distance 0), so here the species network
+with 1 hybrid node is a refinement of the estimated species tree
+(this needs not be the case always).
+
+Is the SNaQ network with 1 hybrid node the same as the true network,
+the one that was initially used to simulate the data?
+
+(digression on the data: gene trees were simulated under the coalescent
+along some "true" network, then 500 base-pair alignments were simulated
+along each gene tree with the HKY model,
+gene trees were estimated from each alignment with RAxML, and
+these estimated gene trees served as input to both ASTRAL and SNaQ.)
+
+The true network is shown below, correctly rooted at the outgroup O,
+and plotted with branch lengths proportional to their
+values in coalescence units:
+```@repl dist_reroot
+truenet = readTopology("((((D:0.4,C:0.4):4.8,((A:0.8,B:0.8):2.2)#H1:2.2::0.7):4.0,(#H1:0::0.3,E:3.0):6.2):2.0,O:11.2);");
+hardwiredClusterDistance(net1, truenet, true)
+```
+```@example dist_reroot
+R"svg(name('truenet_sim.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(truenet, useedgelength=true, showgamma=true);
+R"dev.off()" # hide
+nothing # hide
+```
+![truenet](../assets/figures/truenet_sim.svg)
+
+Our estimated network is not the same as the true network:
+- the underlying tree is correctly estimated
+- the origin of gene flow is correctly estimated: E
+- the target of gene flow is *not* correctly estimated: it was
+  the lineage ancestral to (A,B), but it is estimated to be A only.
+
+For networks, the distance here is the hardwired cluster distance:
+the number of hardwired clusters found in one network and not
+in the other. The **hardwired cluster** associated with an edge is the
+set of *all* tips descendant from that edge, i.e. all tips that
+inherited at least *some* genetic material from that edge.
+
+## Displayed trees and subnetworks
+
+We can extract all trees displayed in a network.
+These trees are obtained by picking one parent hybrid edge
+at each hybrid node, and dropping the other parent hybrid edge.
+We can choose to pick the "important" hybrid edges only,
+with heritability γ at or above a threshold.
+Below we use a γ threshold of 0, so we get all displayed trees:
+```@repl dist_reroot
+t = displayedTrees(net1, 0.0) # list of trees displayed in network
+writeTopology(t[1], round=true)
+writeTopology(t[2], round=true)
+```
+If we decide to keep edges with γ>0.2 only, then we are
+left with a single tree in the list (the major tree).
+This is because our example has 1 hybrid node with minor γ=0.196.
+```@repl dist_reroot
+t = displayedTrees(net1, 0.2)
+```
+
+We can also delete all "non-important" reticulations,
+those with a minor heritability γ below some threshold.
+The function below changes our network `net1`,
+as indicated by its name ending with a `!`.
+
+```@repl dist_reroot
+deleteHybridThreshold!(net1, 0.1)
+```
+Nothing happened to our network: because its γ is above 0.1.
+But if we set the threshold to 0.3, then our reticulation disappears:
+```@repl dist_reroot
+deleteHybridThreshold!(net1, 0.3)
+```
+See also function `displayedNetworkAt!` to get the network with
+a single reticulation of interest, and eliminate all other
+reticulations.
diff --git a/docs/src/man/expectedCFs.md b/docs/src/man/expectedCFs.md
new file mode 100644
index 0000000..44ceefb
--- /dev/null
+++ b/docs/src/man/expectedCFs.md
@@ -0,0 +1,111 @@
+```@setup expCFs
+using PhyloNetworks, SNaQ
+mkpath("../assets/figures")
+exampledir = joinpath(dirname(pathof(SNaQ)), "..","examples")
+raxmltrees = joinpath(exampledir,"raxmltrees.tre")
+raxmlCF = readTableCF(writeTableCF(countquartetsintrees(readMultiTopology(raxmltrees), showprogressbar=false)...))
+truenet = readTopology("((((D:0.4,C:0.4):4.8,((A:0.8,B:0.8):2.2)#H1:2.2::0.7):4.0,(#H1:0::0.3,E:3.0):6.2):2.0,O:11.2);");
+```
+
+# Extract Expected CFs
+
+A good way to visualize the "goodness-of-fit" of a given estimated network to the data
+is to plot the observed CF versus the expected CF. If the network is a good fit, then the dots
+in the plot will be close to the diagonal (x=y line).
+The following function will create a dataframe with the observed and expected CFs,
+which are all saved in the DataCF object after running snaq:
+```@repl expCFs
+topologyMaxQPseudolik!(truenet, raxmlCF);
+df_wide = fittedQuartetCF(raxmlCF) # same as fittedQuartetCF(raxmlCF, :wide)
+df_long = fittedQuartetCF(raxmlCF, :long)
+```
+It is important to have run `snaq!`, `topologyQPseudolik!` or `topologyMaxQPseudolik!`
+before making these tables, or the result would be meaningless.
+These functions update the fitted concordance factors (those expected under the network)
+inside the DataCF object `raxmlCF`.
+
+Here is one way to plot them, via R again, and using the R package `ggplot2`.
+
+```@example expCFs
+using RCall
+obsCF = df_long[!,:obsCF]; expCF = df_long[!,:expCF]; # hide
+R"name <- function(x) file.path('..', 'assets', 'figures', x)"; # hide
+R"svg(name('expCFs_obsvsfitted.svg'), width=5, height=4)"; # hide
+R"par"(mar=[2.5,2.6,.5,.5], mgp=[1.5,.4,0], tck=-0.01, las=1, pty="s"); # hide
+R"plot(0:1, 0:1, type='l', bty='L', lwd=0.3, col='#008080', xlab='quartet CF observed in gene trees', ylab='quartet CF expected from network')"; # hide
+R"set.seed"(1234); # hide
+R"points(jitter($obsCF,amount=0.005),jitter($expCF,amount=0.005),col='#008080',bg='#00808090',pch=21)"; # hide
+R"dev.off()"; # hide
+nothing # hide
+```
+To install ggplot2 if not installed already, do:
+`R"install.packages('ggplot2', dep=TRUE)"`
+
+```julia
+@rlibrary ggplot2
+ggplot(df_long, aes(x=:obsCF,y=:expCF)) + theme_classic() +
+    geom_segment(x=0,y=0,xend=1,yend=1, color="#008080", size=0.3) + # diagonal line
+    geom_point(alpha=0.5, color="#008080", position=position_jitter(width=0.005, height=0.005)) +
+    ylab("quartet CF expected from network") + xlab("quartet CF observed in gene trees") + coord_equal(ratio=1);
+# if needed, save with:
+ggsave("expCFs_obsvsfitted.svg", scale=1, width=6, height=5);
+```
+
+![obsvsfitted](../assets/figures/expCFs_obsvsfitted.svg)
+
+Many points are overlapping, so they were "jittered" a little to see them all better.
+There are always many points overlapping on the bottom-left corner:
+concordance factors of 0.0 for quartet resolutions not observed, and not expected.  
+To export the table of quartet CFs and explore the fit of the network with other tools:
+
+```julia
+using CSV
+CSV.write("fittedCF.csv", df_long)
+```
+alternative code to get a similar plot with [Gadfly](http://gadflyjl.org/):
+```julia
+using Gadfly
+plot(layer(df_long, Geom.point, x="obsCF", y="expCF"),
+     layer(x=0:1,y=0:1, Geom.line), # diagonal line
+     Guide.xlabel("CF observed in gene trees"), Guide.ylabel("CF expected from network"))
+```
+
+We could highlight quartets that include taxon A, say,
+if we suspect that it is an unrecognized hybrid.
+Many points are overlapping, like before, so they are again "jittered" a bit.
+
+```@example expCFs
+using DataFrames
+df_long[!,:has_A] .= "no"; # add a column to our data, to indicate which 4-taxon sets have A or not
+for r in eachrow(df_long)
+    if "A" ∈ [r[:tx1], r[:tx2], r[:tx3], r[:tx4]]
+       r[:has_A]="yes"
+    end
+end
+has_A = df_long.has_A # hide
+nq = length(has_A); # hide
+R"colA=rep('#008080',$nq); bgA=rep('#00808090',$nq);"; # hide
+R"colA[$has_A=='yes']='#F8766D'; bgA[$has_A=='yes']='#F8766D90'"; # hide
+R"svg(name('expCFs_obsvsfitted_A.svg'), width=5, height=4)"; # hide
+R"par"(mar=[2.5,2.6,.5,.5], mgp=[1.5,.4,0], tck=-0.01, las=1, pty="s"); # hide
+R"plot(0:1, 0:1, type='l', bty='L', lwd=0.3, col='black', xlab='quartet CF observed in gene trees', ylab='quartet CF expected from network')"; # hide
+R"set.seed"(2345); # hide
+R"points(jitter($obsCF,amount=0.005),jitter($expCF,amount=0.005),col=colA,bg=bgA,pch=21)"; # hide
+R"legend(x=0.7,y=0.3,pch=21,col=c('#008080','#F8766D'),legend=c('no','yes'),title='has A?', bty='n',bg=c('#00808090','#F8766D90'))"; # hide
+R"dev.off()"; # hide
+nothing # hide
+```
+```@repl expCFs
+first(df_long, 7) # first 7 rows
+```
+
+```julia
+ggplot(df_long, aes(x=:obsCF, y=:expCF, color=:has_A)) + theme_classic() +
+    geom_segment(x=0,y=0,xend=1,yend=1, color="black", size=0.3) + # diagonal line
+    geom_point(alpha=0.5, position=position_jitter(width=0.005, height=0.005)) +
+    ylab("quartet CF expected from network") + xlab("quartet CF observed in gene trees") + coord_equal(ratio=1);
+# can be saved:
+ggsave("expCFs_obsvsfitted_A.svg", width=6, height=5);
+```
+
+![obsvsfitted A present or not](../assets/figures/expCFs_obsvsfitted_A.svg)
diff --git a/docs/src/man/fixednetworkoptim.md b/docs/src/man/fixednetworkoptim.md
new file mode 100644
index 0000000..35cc0ae
--- /dev/null
+++ b/docs/src/man/fixednetworkoptim.md
@@ -0,0 +1,183 @@
+# Candidate Networks
+
+## Optimizing parameters for a given network
+
+For a given network topology, we can optimize the branch lengths and
+inheritance probabilities (γ) with the pseudolikelihood.
+This is useful if we have a few candidate networks to compare.
+Each network can be optimized individually, and the network with the best
+pseudolikelihood can be chosen.
+
+The score being optimized is a pseudo-deviance, i.e.
+a multiple of the negative log pseudo-likelihood up to an additive constant
+(the lower the better; a pseudo-deviance of 0 corresponds to a perfect fit).
+
+Following our example in [Getting a Network](@ref),
+we can optimize parameters on the true network
+(the one originally used to simulate the data):
+
+```@setup fixednetworkoptim
+using PhyloNetworks, SNaQ
+using Logging # to suppress info messages below
+baselogger = global_logger()
+mkpath("../assets/figures")
+exampledir = joinpath(dirname(pathof(SNaQ)), "..","examples")
+raxmltrees = joinpath(exampledir,"raxmltrees.tre")
+raxmlCF = readTableCF(writeTableCF(countquartetsintrees(readMultiTopology(raxmltrees), showprogressbar=false)...))
+```
+
+```@repl fixednetworkoptim
+truenet = readTopology("((((D:0.4,C:0.4):4.8,((A:0.8,B:0.8):2.2)#H1:2.2::0.7):4.0,(#H1:0::0.3,E:3.0):6.2):2.0,O:11.2);");
+net1alt = topologyMaxQPseudolik!(truenet, raxmlCF);
+writeTopology(net1alt, round=true)
+net1alt.loglik # pseudo deviance actually: the lower the better
+```
+```@example fixednetworkoptim
+using PhyloPlots, RCall
+R"name <- function(x) file.path('..', 'assets', 'figures', x)" # hide
+R"svg(name('truenet_opt.svg'), width=4, height=4)" # hide
+R"par"(mar=[0,0,0,0])
+plot(net1alt, showgamma=true);
+R"dev.off()" # hide
+nothing # hide
+```
+![truenet_opt](../assets/figures/truenet_opt.svg)
+
+We get a score of 29.941,
+which is comparable to the score of the SNaQ network (net1: 28.315),
+especially compared to the score of the best tree (net0: 53.532).
+This begs the question: is the true network within the "range" of uncertainty?
+We can run a [Bootstrap](@ref) analysis to measure uncertainty
+in our network inference.
+
+For a more thorough optimization, we may increase the requirements before
+the search stops (but the optimization will take longer).
+It makes no difference on this small data set.
+```julia
+net1par = topologyMaxQPseudolik!(truenet, raxmlCF, ftolRel=1e-10, xtolAbs=1e-10)
+net1par.loglik # pseudo deviance, actually: the lower the better
+```
+
+## Network Score with no optimization
+
+For a network with given branch lengths and γ heritabilies,
+we can compute the pseudolikelihood (well, a pseudo-deviance) with:
+```@repl fixednetworkoptim
+topologyQPseudolik!(truenet,raxmlCF);
+truenet.loglik # again, pseudo deviance
+```
+This function is not maximizing the pseudolikelihood, it is simply computing the
+pseudolikelihood (or deviance) for the given branch lengths and probabilities of
+inheritance. At the moment, both of these functions require that the
+given network is of level 1 (cycles don't overlap).
+
+## Candidate networks compatible with a known outgroup
+
+If the network was estimated via `snaq!`, it might turn out to be impossible
+to root our estimated network with a known outgroup (see section
+[What if the root conflicts with the direction of a reticulation?](@ref).)
+At this time, `snaq!` does not impose any rooting constraint on the network:
+the search for the lowest score considers all level-1 networks, including those
+that are incompatible with a known outgroup.
+(The monophyly of outgroups is not imposed either, like in many other methods.)
+
+If the estimated network cannot be rooted with the known outgroup,
+we can check the `.networks` output file.
+It has a list of networks that are slight modifications of the best network,
+where the modifications changed the direction of one reticulation at a time.
+For each modified network, the score was calculated. So if we find in this list
+a modified network that has a score close to that of the best network,
+and that can be re-rooted with our known root position, then this modified network
+is a better candidate than the network with the best score.
+
+Below is what the `net1.networks` file looks like, after performing
+the analysis in the section [Network Estimation](@ref).
+Scroll to the right to see the scores.
+
+    (C,D,((O,(E,#H7:::0.19558838614943078):0.31352437658618976):0.6640664399202987,(B,(A)#H7:::0.8044116138505693):10.0):10.0);, with -loglik 28.31506721890958 (best network found, remaining sorted by log-pseudolik; the smaller, the better)
+    (C,D,((O,(E)#H7:::0.8150784689693145):0.9336405757682176,(B,(A,#H7:::0.18492153103068557):0.25386142779877724):1.8758156446611114):10.0);, with -loglik 31.535560380783814
+    (B,#H7:9.90999345612101::0.2555404440833535,(A,(E,(O,((C,D):10.0)#H7:0.3419231810962026::0.7444595559166465):0.19994859441332047):2.5014911511063644):0.7957621793330066);, with -loglik 56.64548310161462
+    (C,D,((O,(E,((B)#H7:::0.7957543284159452,A):4.786202415937916):0.004527712280136759):1.7952610454570868,#H7:::0.20424567158405482):10.0);, with -loglik 67.17775727492258
+    (C,D,(#H7:::0.32947301811471164,(B,(A,(E,(O)#H7:::0.6705269818852884):1.371799259141243):0.0):6.397073999864152):7.677245926003807);, with -loglik 199.11401961057143
+
+We can read this file and look at its list of networks like this:
+
+```@repl fixednetworkoptim
+file = "net1.networks";
+# or use the example file available with the package:
+file = joinpath(dirname(pathof(SNaQ)), "..","examples","net1.networks");
+netlist = readMultiTopology(file) # read the full list of networks in that file
+```
+Next, we would like to extract the network scores from the file.
+Below is a one-liner to do this
+(we make Julia send a `sed` command to the shell --sorry, Mac or Linux for this.)
+```@repl fixednetworkoptim
+scoresInString = read(`sed -E 's/.+with -loglik ([0-9]+.[0-9]+).+/\1/' $file`, String)
+scores = parse.(Float64, split(scoresInString))
+# next: update the "loglik" of each network with the score read from the file
+for i in eachindex(netlist)
+   netlist[i].loglik = scores[i]
+   println("net $i in the list: score = ",scores[i])
+end
+```
+The first network in the list is the best network returned by `snaq!`.
+We see that the second network has a score that's not too far, but the other networks
+have worse scores. The best network and its best modification (second network in the
+list) are shown below. We chose to show edge numbers, to use them later
+to re-root the networks.
+
+```@example fixednetworkoptim
+R"svg(name('fixednetworkoptim_othernets1.svg'), width=7, height=4)" # hide
+R"layout(matrix(1:2,1,2))"; # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(netlist[1], showgamma=true, showedgenumber=true, tipoffset=0.1);
+R"mtext"("best net, score=28.3", line=-1);
+plot(netlist[2], showgamma=true, showedgenumber=true, tipoffset=0.1);
+R"mtext"("direction modified, score=31.5", line=-1);
+R"dev.off()"; # hide
+nothing # hide
+```
+![othernets before reroot](../assets/figures/fixednetworkoptim_othernets1.svg)
+
+Now imagine that our outgroup is taxon A.
+- best network: we would get a "RootMismatch" error if we tried to set
+  the root on the external edge 9 to A, with `rootatnode!(netlist[1], "A")`
+  (see section
+  [What if the root conflicts with the direction of a reticulation?](@ref)).
+  But we could root the best network on the major parent edge to A, edge 10
+  (rooted network on the left below).
+- For the second best network in our list, there are 2 ways to root it
+  with A: on the external edge 8 to A (top right), or on its parent edge 10
+  These 2 options give quite different rooted versions
+  of the network, one of which requires the existence of an unsampled taxon,
+  sister to BOECD, that would have contributed to introgression into
+  an ancestor of E. The second rooted version says that an ancestor of
+  (or sister to) A contributed to the introgression into the ancestor of E.
+  A is an outgroup in both cases, but the second case is more parsimonious,
+  in the sense that it does not require the existence of an unsampled taxon.
+
+```@example fixednetworkoptim
+R"svg(name('fixednetworkoptim_othernets2.svg'), width=7, height=7)" # hide
+R"layout(matrix(c(1,4,2,3),2,2))"; # hide
+R"par"(mar=[0,0,0.5,0]) # hide
+rootonedge!(netlist[1], 10); # root best net to make A outgroup
+rotate!(netlist[1], -4); # to 'un-cross' edges
+rotate!(netlist[1], -6);
+rotate!(netlist[1], -5);
+plot(netlist[1], showgamma=true, tipoffset=0.1);
+R"mtext"("best net, score=28.3", line=-1);
+global_logger(NullLogger()); # hide
+rootatnode!(netlist[2], "A"); # net with modified direction: first way to make A outgroup
+global_logger(baselogger);   # hide
+rotate!(netlist[2], -4) # to 'un-cross' edges
+rotate!(netlist[2], -6)
+plot(netlist[2], showgamma=true, tipoffset=0.1);
+R"mtext"("second best in list, score=31.5\nrequires unsampled population", line=-2);
+rootonedge!(netlist[2], 10) # net with modified direction: second way to make A outgroup
+for i in [9,-7] rotate!(netlist[2], i); end; # to 'un-cross' edges
+plot(netlist[2], showgamma=true, tipoffset=0.1);
+R"mtext"("second best in list, score=31.5\ndifferent root position", line=-2);
+R"dev.off()"; # hide
+nothing # hide
+```
+![othernets after reroot](../assets/figures/fixednetworkoptim_othernets2.svg)
diff --git a/docs/src/man/inputdata.md b/docs/src/man/inputdata.md
new file mode 100644
index 0000000..eb40b3a
--- /dev/null
+++ b/docs/src/man/inputdata.md
@@ -0,0 +1,187 @@
+# Input for SNaQ
+
+SNaQ is a method implemented in the package to estimate a phylogenetic network
+from multiple molecular sequence alignments. There are two alternatives for the input data:
+
+1. A list of estimated gene trees for each locus, which can be obtained using [MrBayes](http://mrbayes.sourceforge.net) or [RAxML](http://sco.h-its.org/exelixis/software.html). Or:
+2. A table of concordance factors (CF), i.e. gene tree frequencies, for each 4-taxon subset. This table can be obtained from [BUCKy](http://www.stat.wisc.edu/~ane/bucky/), to account for gene tree uncertainty
+
+This [pipeline](https://github.com/nstenz/TICR) can be used to obtain the table of
+quartet CF needed as input for SNaQ
+(see also the [wiki](https://github.com/juliaphylo/PhyloNetworks.jl/wiki/TICR:-from-alignments-to-quartet-concordance-factors).)
+It starts from the sequence alignments,
+runs MrBayes and then BUCKy (both parallelized), producing the
+table of estimated CFs and their credibility intervals.
+Additional details on this [TICR pipeline](@ref)
+describe how to insert data at various stages (e.g. after running MrBayes on each locus).
+
+## Tutorial data: gene trees
+
+We suggest that you create a special directory for running these examples,
+where input files can be downloaded and where output files will be
+created (with estimated networks for instance). Enter this directory
+and run Julia from there.
+
+Suppose you have a file with a list of gene trees in parenthetical
+format called `raxmltrees.tre`.
+You can access the example file of input trees
+[here](https://github.com/juliaphylo/SNaQ/blob/main/examples/raxmltrees.tre)
+or
+[here](https://raw.githubusercontent.com/juliaphylo/SNaQ/main/examples/raxmltrees.tre)
+for easier download.
+
+Do not copy-paste into a "smart" text-editor. Instead, save the file
+directly into your working directory using "save link as" or "download linked file as".
+This file contains 30 gene trees, each in parenthetical format on 6 taxa
+like this (with rounded branch lengths):
+
+`(E:0.038,((A:0.014,B:0.010):0.010,(C:0.008,D:0.002):0.010):0.025,O:0.078);`
+
+If `raxmltrees.tre` is in your working directory, you can view its content
+within Julia:
+```julia
+less("raxmltrees.tre")
+```
+or like this, to view the version downloaded with the package:
+```julia
+raxmltrees = joinpath(dirname(pathof(SNaQ)), "..","examples","raxmltrees.tre")
+less(raxmltrees)
+```
+Just type `q` to quit viewing this file.
+You could read in these 30 trees and visualize the third one (say) like this:
+```@example qcf
+using PhyloNetworks, SNaQ
+raxmltrees = joinpath(dirname(pathof(SNaQ)), "..","examples","raxmltrees.tre");
+nothing # hide
+```
+```@repl qcf
+genetrees = readMultiTopology(raxmltrees);
+genetrees[3]
+```
+To visualize any of these input trees, use the
+[PhyloPlots](https://github.com/juliaphylo/PhyloPlots.jl) package:
+```@example qcf
+using PhyloPlots
+using RCall # hide
+mkpath("../assets/figures") # hide
+R"name <- function(x) file.path('..', 'assets', 'figures', x)" # hide
+R"svg(name('inputdata_gene3.svg'), width=4, height=3)" # hide
+R"par"(mar=[0,0,0,0])                          # hide
+plot(genetrees[3]); # tree for 3rd gene
+R"dev.off()"                                   # hide
+nothing # hide
+```
+![gene3](../assets/figures/inputdata_gene3.svg)
+
+To read in all gene trees and directly summarize them by a list
+of quartet CFs (proportion of input trees with a given quartet):
+```@repl qcf
+q,t = countquartetsintrees(genetrees); # read in trees, calculate quartet CFs
+df = writeTableCF(q,t)   # data frame with observed CFs: gene frequencies
+using CSV
+CSV.write("tableCF.csv", df); # to save the data frame to a file
+raxmlCF = readTableCF("tableCF.csv") # read in the file and produces a "DataCF" object
+rm("tableCF.csv") # hide
+```
+`less("tableCF.csv")` lets you see the content of the newly created
+file "tableCF.csv", within Julia. Again, type `q` to quit viewing this file.
+
+In this table, each 4-taxon set is listed in one row.
+The 3 "CF" columns gives the proportion of genes that has
+each of the 3 possible trees on these 4 taxa.
+
+For more help on any function, type `?` to enter the help mode,
+then type the name of the function. For example: type `?` then `countquartetsintrees`
+for information on the various options of that function.
+
+When there are many more taxa, the number of quartets
+might be very large and we might want to use a subset to speed things up.
+Here, if we wanted to use a random sample of 10 quartets
+instead of all quartets, we could do:
+
+`raxmlCF = readTrees2CF(raxmltrees, whichQ="rand", numQ=10, CFfile="tableCF10.txt")`
+
+Be careful to use a numQ value smaller than the total number of possible
+4-taxon subsets, which is *n choose 4* on *n* taxa (e.g. 15 on 6 taxa).
+To get a predictable random sample, you may set the seed with
+`using Random; Random.seed!(12321)`
+(for instance) prior to sampling the quartets as above.
+The `readTrees2CF` is *much* slower than the function `countquartetsintrees`
+to read in trees and calculate the quartet CFs observed in the trees,
+when we want to get *all* quartet CFs. But for a small sample of quartets,
+then `readTrees2CF` is available.
+
+## Tutorial data: quartet CFs
+
+If we already have a table of quartet concordance factor (CF) values
+in a file `buckyCF.csv` in this format
+
+| Taxon1 | Taxon2 | Taxon3 | Taxon4 | CF12_34 | CF13_24 | CF14_23
+|:-------|:-------|:-------|:-------|:--------|:--------|:-------
+| D      | A| E | O|   0.565 |       0.0903 |       0.3447
+| ...    |  |   |  |         |              |       ...
+
+we would read it in one step like this: `readTableCF("buckyCF.csv")`.
+An example file comes with the package, available
+[here](https://github.com/juliaphylo/SNaQ/blob/main/examples/buckyCF.csv)
+or
+[here](https://raw.githubusercontent.com/juliaphylo/SNaQ/main/examples/buckyCF.csv).
+
+```@repl qcf
+buckyCFfile = joinpath(dirname(pathof(SNaQ)), "..","examples","buckyCF.csv");
+buckyCF = readTableCF(buckyCFfile)
+```
+The same thing could be done in 2 steps:
+first to read the file and convert it to a 'DataFrame' object,
+and then to convert this DataFrame into a DataCF object.
+```@repl qcf
+using CSV, DataFrames
+dat = DataFrame(CSV.File(buckyCFfile); copycols=false);
+first(dat, 6) # to see the first 6 rows
+buckyCF = readTableCF(dat)
+writeTableCF(buckyCF)
+```
+In the input file, columns need to be in the right order:
+with the first 4 columns giving the names of the taxa in each 4-taxon set.
+The CF values are assumed to be in columns named "CF12_34", etc.,
+or else in columns 5,6,7.
+If available, a column named "ngenes" will be taken to have the
+the number of genes for each 4-taxon subset.
+
+## Tutorial data: starting tree
+
+If we have a tree for the data set at hand,
+it can be used as a starting point for the optimization.
+From our gene trees, we estimated a species tree with
+[ASTRAL](https://github.com/smirarab/ASTRAL/blob/master/astral-tutorial.md).
+This tree comes with the package in file `astral.tre`
+[here](https://github.com/juliaphylo/SNaQ/blob/main/examples/astral.tre).
+This file has 102 trees: 100 bootstrap species trees,
+followed by their greedy consensus,
+followed by the best tree on the original data.
+It's this last tree that we are most interested in.
+We can read it with
+```@example qcf
+astralfile = joinpath(dirname(pathof(SNaQ)), "..","examples","astral.tre");
+astraltree = readMultiTopology(astralfile)[102] # 102th tree: last tree here
+R"svg(name('inputdata_astraltree.svg'), width=4, height=3)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(astraltree, showedgelength=true);
+R"dev.off()"; # hide
+nothing # hide
+```
+![astraltree](../assets/figures/inputdata_astraltree.svg)
+
+To start its search, SNaQ will need a network of "level 1".
+All trees and all networks with 1 hybridization are of level 1.
+To make sure that a network with 2 or more hybridizations is of level 1,
+we can read it in with
+`readTopologyLevel1` (which also unroots the tree, resolves polytomies,
+replaces missing branch lengths by 1 for starting values etc.):
+```julia
+T=readTopologyLevel1("startNetwork.txt")
+```
+(here `startNetwork.txt` is a hypothetical file: replace this by
+the name of a file that contains your network of interest.)
+
+Next: [Getting a Network](@ref)
diff --git a/docs/src/man/multiplealleles.md b/docs/src/man/multiplealleles.md
new file mode 100644
index 0000000..fcbadcc
--- /dev/null
+++ b/docs/src/man/multiplealleles.md
@@ -0,0 +1,147 @@
+```@setup multialleles
+using PhyloNetworks, SNaQ
+```
+
+# Multiple alleles per species
+
+## between-species 4-taxon sets
+
+The default setting for SNaQ considers that each allele in a gene tree corresponds
+to a taxon (a tip) in the network. If instead each allele/individual can be mapped confidently
+to a species, and if only the species-level network needs to be estimated,
+then the following functions can be used:
+
+```@repl multialleles
+using CSV, DataFrames
+mappingfile = joinpath(dirname(pathof(SNaQ)), "..","examples","mappingIndividuals.csv");
+tm = CSV.read(mappingfile, DataFrame) # taxon map as a data frame
+taxonmap = Dict(row[:individual] => row[:species] for row in eachrow(tm)) # taxon map as a dictionary
+```
+
+The [mapping file](https://github.com/juliaphylo/SNaQ/blob/main/examples/mappingIndividuals.csv)
+can be a text (or `csv`) file with two columns (at least):
+one for the individuals, named `allele` or `individual`,
+and one column containing the species names, named `species`.
+Each row should map an allele name to a species name.
+Next, read in the [gene trees](https://github.com/juliaphylo/SNaQ/blob/main/examples/genetrees_alleletips.tre)
+and calculate the quartet CFs at the species level:
+
+
+```@repl multialleles
+genetreefile = joinpath(dirname(pathof(SNaQ)), "..","examples","genetrees_alleletips.tre");
+genetrees = readMultiTopology(genetreefile);
+sort(tipLabels(genetrees[1])) # multiple tips in species S1
+df_sp = writeTableCF(countquartetsintrees(genetrees, taxonmap, showprogressbar=false)...)
+```
+
+Now `df_sp` is a data frame containing the quartet concordance factors
+at the species level only, that is, considering sets made of 4 distinct species,
+even if the gene trees may have multiple alleles from the same species.
+For 4 distinct species `A,B,C,D`, all alleles from each species (`A` etc.)
+will be used to calculate the quartet CF. If a given gene tree has
+`n_a` alleles from `a`, `n_b` alleles from `b` etc., then
+each set of 4 alleles is given a weight of `1/(n_a n_b n_c n_d)`
+to calculated of the CF for `A,B,C,D` (such that the total weight from
+this particular gene trees is 1).
+It is safe to save this data frame, then use it for `snaq!` like this:
+
+```@repl multialleles
+CSV.write("tableCF_species.csv", df_sp);   # to save the data frame to a file
+d_sp = readTableCF("tableCF_species.csv"); # to get a "DataCF" object for use in snaq!
+summarizeDataCF(d_sp)
+```
+
+## within-species 4-taxon sets
+
+Four-taxon sets involving 2 individuals per species can provide more
+information about the underlying network, including external branch
+length in coalescent units. However, `snaq!` runs more slowly when
+using this extra information. To get quartet CFs from sets of 4 individuals
+in which 2 individuals are from the same species, the following functions
+should be used:
+
+```@repl multialleles
+df_ind = writeTableCF(countquartetsintrees(genetrees, showprogressbar=false)...); # no mapping: CFs across individuals
+first(df_ind, 4) # to see the first 4 rows
+CSV.write("tableCF_individuals.csv", df_ind);  # to save to a file
+df_sp = mapAllelesCFtable(mappingfile, "tableCF_individuals.csv");
+d_sp = readTableCF!(df_sp, mergerows=true);
+```
+where the mapping file can be a text (or `csv`) file with two columns
+named `allele` (or `individual`) and `species`, mapping each allele name to a species name.
+The data in `df_ind` is the table of concordance factors at the level of individuals.
+In other words, it lists CFs using one row for each set of 4 alleles/individuals.
+
+`mapAllelesCFtable` creates a new data frame `df_sp` of quartet concordance factors at the
+species level: with the allele names replaced by the appropriate species names.
+
+**Warnings**:
+- This procedure requires that all alleles from the same
+  individual are given the same name (the individual's 'name') across
+  all genes for which that individual was sequenced.
+- For a four-taxon set `A,B,C,D`, all the individuals from `A`, `B`, `C` and `D`
+  are considered, say `(a1,b1,c1,d1)`, `(a2,b1,c1,d1)`, `(a1,b2,c1,d1)`, `(a2,b2,c1,d1)`
+  and so on. The CFs of these 4-taxon sets are averaged together to obtain the
+  CFs at the species level. This procedures gives more weight to genes that have
+  many alleles (because they contribute to more sets of 4 individuals) and less
+  weight to genes that have few alleles.
+
+The last command modifies this data frame `df_sp` by deleting rows that are uninformative
+about between-species relationships, such as rows corresponding to 4 individuals from the
+same species. The output `d_sp` of this second command is an object of type `DataCF` at the
+species level, which can be used as input for networks estimation with `snaq!`.
+But before, it is safe to save the concordance factor of quartets of species,
+which can be calculated by averaging the CFs of quartets of individuals
+from the associated species:
+
+```@repl multialleles
+df_sp = writeTableCF(d_sp) # data frame, quartet CFs averaged across individuals of same species
+CSV.write("CFtable_species.csv", df_sp); # save to file
+```
+
+Some quartets have the same species repeated twice,
+representing cases when 2 of the 4 individuals came from the same species.
+These quartets, with repeated species, are informative about the population
+size of extant populations, i.e. about the lengths of external branches in
+coalescent units.
+
+The main difference between this section compared to the previous section
+("between-species 4-taxon sets") is that quartets with 2 individuals from
+the same species are included here, such as `a1,a2,b1,c1`.
+Also, the weighting of quartets is different. Here, genes with more alleles
+are given more weight.
+
+now we can run snaq:
+
+```julia
+net = snaq!(T_sp, d_sp);
+```
+where `T_sp` should be a starting topology with one tip per species,
+labelled with the same species names as the names used in the mapping file.
+
+If `snaq!` takes too long that way, we can try a less ambitious estimation
+that does not estimate the external branch lengths, that is,
+*without* using quartets that have 2 individuals from the same species.
+To do so, we can use the quartet concordance factors at the species level,
+but filter out the quartets with one (or more) species repeated.
+This can be done as in the first section ("between-species 4-taxon sets")
+to give equal weight to all genes,
+or as shown below to give more weight to genes that have more alleles:
+
+```@repl multialleles
+first(df_sp, 3) # some quartets have the same species twice
+function hasrep(row) # see if a row (4-taxon set) has a species name ending with "__2": repeated species
+    occursin(r"__2$", row[:t1]) || occursin(r"__2$", row[:t2]) || # replace :t1 :t2 etc. by appropriate column names in your data,
+    occursin(r"__2$", row[:t3]) || occursin(r"__2$", row[:t4])    # e.g. by :taxon1 :taxon2 etc.
+end
+df_sp_reduced = filter(!hasrep, df_sp) # removes rows with repeated species
+CSV.write("CFtable_species_norep.csv", df_sp_reduced); # to save to file
+d_sp_reduced = readTableCF(df_sp_reduced) # DataCF object, for input to snaq!
+```
+
+and now we can run `snaq!` on the reduced set of quartets without repeats,
+which should be faster:
+
+```julia
+net = snaq!(T_sp, d_sp_reduced);
+```
diff --git a/docs/src/man/snaq_plot.md b/docs/src/man/snaq_plot.md
new file mode 100644
index 0000000..e8e4237
--- /dev/null
+++ b/docs/src/man/snaq_plot.md
@@ -0,0 +1,390 @@
+```@setup snaqplot
+using PhyloNetworks, SNaQ
+mkpath("../assets/figures")
+exampledir = joinpath(dirname(pathof(SNaQ)), "..","examples")
+raxmltrees = joinpath(exampledir,"raxmltrees.tre")
+raxmlCF = readTrees2CF(raxmltrees, writeTab=false, writeSummary=false)
+astralfile = joinpath(exampledir,"astral.tre")
+astraltree = readMultiTopology(astralfile)[102] # 102th tree = last tree here
+net0 = readTopology(joinpath(exampledir,"net0.out"))
+net1 = readTopology(joinpath(exampledir,"net1.out"))
+rotate!(net1, -6)
+net2 = readTopology(joinpath(exampledir,"net2.out"))
+net3 = readTopology(joinpath(exampledir,"net3.out"))
+net0.loglik = 53.53150526187732
+net1.loglik = 28.31506721890958
+net2.loglik = 28.31506721890957
+net3.loglik = 28.315067218909626
+```
+# Getting a Network
+
+## Network Estimation
+
+SNaQ implements the statistical inference method in
+[Solís-Lemus & Ané 2016](http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1005896).
+The procedure involves a numerical optimization of branch lengths and inheritance
+probabilities and a heuristic search in the space of phylogenetic networks.
+
+After [Input for SNaQ](@ref), we can estimate the network using the
+input data `raxmlCF` and starting from tree (or network) `astraltree`.
+We first impose the constraint of at most 0 hybrid node,
+that is, we ask for a tree.
+```julia
+net0 = snaq!(astraltree,raxmlCF, hmax=0, filename="net0", seed=1234)
+```
+Part of the screen output shows this:
+
+    MaxNet is (C,D,((B,A):1.395762055180493,(O,E):0.48453400554506426):10.0);
+    with -loglik 53.53150526187732
+
+This parenthetical (extended Newick) description is not very
+human-friendly, so we plot the tree
+(more about plotting networks below: [Network Visualization](@ref) ).
+
+```@example snaqplot
+using PhyloPlots
+using RCall # hide
+R"name <- function(x) file.path('..', 'assets', 'figures', x)" # hide
+R"svg(name('snaqplot_net0_1.svg'), width=4, height=3)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(net0);
+R"dev.off()"; # hide
+nothing # hide
+```
+![net0_1](../assets/figures/snaqplot_net0_1.svg)
+
+We can use this tree as a starting point to search for the best
+network allowing for at most `hmax=1` hybrid node (which is the default).
+```julia
+net1 = snaq!(net0, raxmlCF, hmax=1, filename="net1", seed=2345)
+```
+part of screen output:
+
+    best network and networks with different hybrid/gene flow directions printed to .networks file
+    MaxNet is (C,D,((O,(E,#H7:::0.19558838614943078):0.31352437658618976):0.6640664399202987,(B,(A)#H7:::0.8044116138505693):10.0):10.0);
+    with -loglik 28.31506721890958
+
+We can visualize the estimated network and its inheritance values γ, which
+measure the proportion of genes inherited via each parent at a reticulation event
+(e.g. proportion of genes inherited via gene flow).
+```@example snaqplot
+R"svg(name('snaqplot_net1_1.svg'), width=4, height=3)"; # hide
+R"par"(mar=[0,0,0,0]); # hide
+plot(net1, showgamma=true);
+R"dev.off()"; # hide
+nothing # hide
+```
+![net1_1](../assets/figures/snaqplot_net1_1.svg)
+
+This network has A as a hybrid, 80.4% sister to B,
+and 19.6% sister to E (which is otherwise sister to O).
+C & D are sister to each other.
+We can also check the output files created by `snaq!`:
+```julia
+less("net1.err") # would provide info about errors, if any
+less("net1.out") # main output file with the estimated network from each run
+less("net1.networks") # extra info
+```
+when viewing these result files with `less`
+within Julia, use arrows to scroll down and type `q` to quit viewing the files.
+The file `net1.networks` contains a list of networks that are slight modifications
+of the best (estimated) network `net1`. The modifications changed the direction
+of one reticulation at a time, by moving the placement of one hybrid node to another
+node inside the same cycle.
+For each modified network, the pseudolikelihood score was calculated
+(the `loglik` or `-Ploglik` values give a pseudo deviance actually).
+
+The function name `snaq!` ends with ! because it modifies the argument `raxmlCF`
+by including the expected CF. Type `?` then `snaq!` to get help on that function.
+
+The main output file, here `net1.out` (or `snaq.out` by default) has the estimated
+network in parenthetical format, but we can also print it directly to the screen:
+```@repl snaqplot
+net1
+writeTopology(net1)  # writes to screen, full precision for branch lengths and γ
+writeTopology(net1, round=true, digits=2)
+writeTopology(net1,di=true) # γ omitted: for dendroscope
+writeTopology(net1, "bestnet_h1.tre") # writes to file: creates or overwrites file
+rm("bestnet_h1.tre") # hide
+```
+The option `di=true` is for the parenthetical format used by
+[Dendroscope](http://dendroscope.org/) (without reticulation heritabilities).
+Copy this parenthetical description and paste it into Dendroscope,
+or use the plotting function described below.
+
+We can go on and let the network have up to 2 or 3 hybrid nodes:
+```julia
+net2 = snaq!(net1,raxmlCF, hmax=2, filename="net2", seed=3456)
+net3 = snaq!(net0,raxmlCF, hmax=3, filename="net3", seed=4567)
+```
+and plot them (they are identical and they both have a single reticulation):
+```@example snaqplot
+R"svg(name('snaqplot_net23.svg'), width=7, height=3)" # hide
+using RCall                  # to be able to tweak our plot within R
+R"layout(matrix(1:2, 1, 2))" # to get 2 plots into a single figure: 1 row, 2 columns
+R"par"(mar=[0,0,1,0])        # for smaller margins
+plot(net2, showgamma=true);
+R"mtext"("hmax=2")           # add text annotation: title here
+plot(net3, showgamma=true);
+R"mtext"("hmax=3")
+R"dev.off()"; # hide
+nothing # hide
+```
+![net23](../assets/figures/snaqplot_net23.svg)
+
+with this screen output for net2 (only 1 hybrid node found):
+
+    MaxNet is (C,D,((B,(A)#H7:::0.804411606649347):10.0,(O,(#H7:::0.19558839335065303,E):0.3135243143217013):0.664066456871298):10.0);
+    with -loglik 28.31506721890957
+
+and this output for net3 (again, only 1 hybrid found):
+
+    MaxNet is (D,C,((O,(E,#H7:::0.19558839257941849):0.3135243301652981):0.6640664138384673,(B,(A)#H7:::0.8044116074205815):10.0):10.0);
+    with -loglik 28.315067218909626
+
+## parallel computations
+
+For network estimation, multiple runs can done in parallel.
+For example, if your machine has 4 or more processors (or cores),
+you can tell julia to use 4 processors by starting julia with `julia -p 4`,
+or by starting julia the usual way (`julia`) and then adding processors with:
+
+```julia
+using Distributed
+addprocs(4)
+```
+
+If we load a package (`using SNaQ`) before adding processors,
+then we need to re-load it again so that all processors have access to it:
+
+```julia
+@everywhere using PhyloNetworks, SNaQ
+```
+
+After that, running any of the `snaq!(...)` command will use
+different cores for different runs, as processors become available.
+Fewer details are printed to the log file when multiple cores
+are used in parallel.
+
+When running `bootsnaq`, the analysis of each bootstrap replicate
+will use multiple cores to parallelize separate runs of that particular
+bootstrap replicate. You may parallelize things further by running
+`bootsnaq` multiple times (on separate machines for instance), each time
+for a small subset of bootstrap replicates, and with a different seed each time.
+
+We may tell julia to add more processors than our machine has,
+but we will not receive any performance benefits.
+At any time during the julia session, `nworkers()` tells us how many
+worker processors julia has access to.
+
+Below is an example of how to use a cluster, to run many independent
+`snaq!` searches in parallel on a cluster running the
+[slurm](https://slurm.schedmd.com) job manager
+(other managers would require a different, but similar submit file).
+This example uses 2 files:
+1. a julia script file, to do many runs of `snaq!` in parallel,
+   asking for many cores (default: 10 runs, asking for 10 cores).
+   This julia script can take arguments: the maximum allowed
+   number of hybridizations `hmax`, and the number of runs
+   (to run 50 runs instead of 10, say).
+2. a submit file, to launch the julia script.
+
+**First**: the example julia script, below, is assumed (by the submit file)
+to be called `runSNaQ.jl`. It uses a starting tree that
+is assumed to be available in a file named `astraltree.tre`, but that
+could be modified
+(to use a network with h=1 to start the search with hmax=2 for instance).
+It also assumes that the quartet concordance factor data are in file
+`tableCF_speciesNames.csv`. Again, this file name should be adjusted.
+To run this julia script for 50 runs and hmax=3, do `julia runSNaQ.jl 3 50`.
+
+```julia
+#!/usr/bin/env julia
+
+# file "runSNaQ.jl". run in the shell like this in general:
+# julia runSNaQ.jl hvalue nruns
+# example for h=2 and default 10 runs:
+# julia runSNaQ.jl 2
+# or example for h=3 and 50 runs:
+# julia runSNaQ.jl 3 50
+
+length(ARGS) > 0 ||
+    error("need 1 or 2 arguments: # reticulations (h) and # runs (optional, 10 by default)")
+h = parse(Int, ARGS[1])
+nruns = 10
+if length(ARGS) > 1
+    nruns = parse(Int, ARGS[2])
+end
+outputfile = string("net", h, "_", nruns, "runs") # example: "net2_10runs"
+seed = 1234 + h # change as desired! Best to have it different for different h
+@info "will run SNaQ with h=$h, # of runs=$nruns, seed=$seed, output will go to: $outputfile"
+
+using Distributed
+addprocs(nruns)
+@everywhere using SNaQ
+net0 = readTopology("astraltree.tre");
+using DataFrames, CSV
+df_sp = CSV.read("tableCF_speciesNames.csv", DataFrame; pool=false);
+d_sp = readTableCF!(df_sp);
+net = snaq!(net0, d_sp, hmax=h, filename=outputfile, seed=seed, runs=nruns)
+```
+
+When julia is called on a script, whatever comes after "julia scriptname"
+is given to julia in an array of values. This array is called `ARGS`.
+So if we call a script like this: `julia runSNaQ.jl 2`
+then the script will know the arguments through `ARGS`,
+which would contain a single element, `"2"`.
+This first element is just a string, at this stage. We want to use it as a number,
+so we ask julia to parse the string into an integer.
+
+**Second**: we need a "submit" file to ask a job scheduler like
+[slurm](https://slurm.schedmd.com) to submit our julia script to a cluster.
+In the submit file below, the first 5 lines set things up for slurm.
+They are most likely to be specific to your cluster.
+The main idea here is to use a slurm "array" from 0 to 3, to run our
+julia script multiple times, 4 times actually: from hmax=0 to hmax=3.
+Each would do 30 runs
+(and each would be allocated 30 cores in the submit script below).
+Then log out of the cluster and go for coffee.
+
+```bash
+#!/bin/bash
+#SBATCH -o path/to/slurm/log/file/runsnaq_slurm%a.log
+#SBATCH -J runsnaq
+#SBATCH --array=0-3
+#SBATCH -c 30
+## --array: to run multiple instances of this script,
+##          one for each value in the array.
+##          1 instance = 1 task
+## -J job name
+## -c number of cores (CPUs) per task
+
+echo "slurm task ID = $SLURM_ARRAY_TASK_ID used as hmax"
+echo "start of SNaQ parallel runs on $(hostname)"
+# finally: launch the julia script, using Julia executable appropriate for slurm, with full paths:
+/workspace/software/bin/julia --history-file=no -- runSNaQ.jl $SLURM_ARRAY_TASK_ID 30 > net${SLURM_ARRAY_TASK_ID}_30runs.screenlog 2>&1
+echo "end of SNaQ run ..."
+```
+
+## choosing the number of hybridizations
+
+Each network has a `loglik` attribute, which is its pseudo deviance:
+a multiple of the negative log-likelihood up to a constant (the constant is
+such that the score is 0 if the network fits the data perfectly).
+The lower the better. We can plot these scores across hybrid values:
+```@example snaqplot
+scores = [net0.loglik, net1.loglik, net2.loglik, net3.loglik]
+hmax = collect(0:3)
+R"svg(name('snaqplot_scores_heuristic.svg'), width=4, height=3)" # hide
+R"par"(mar=[2.5,2.5,.5,.5], mgp=[1.4,.4,0], tck=-0.02, las=1, lab=[3,5,7]);  # hide
+R"plot"(hmax, scores, type="b", ylab="network score", xlab="hmax", col="blue");
+R"dev.off()"; # hide
+nothing # hide
+```
+![scores_heuristic](../assets/figures/snaqplot_scores_heuristic.svg)
+
+Here the slope heuristic suggests a single hybrid node:
+the score does not get much better beyond h=1.
+
+We made the plot via R above. A more Julian way would use a Julia plotting
+package such as [Gadfly](http://gadflyjl.org/stable/) or
+[Plots](http://docs.juliaplots.org/latest/ecosystem/), like this for instance:
+```julia
+using Gadfly
+plot(x=collect(0:3), y=scores, Geom.point, Geom.line)
+```
+(btw, cool [blog](http://avt.im/blog/2018/03/23/R-packages-ggplot-in-julia) about using ggplot within julia)
+
+## Network Visualization
+
+To visualize the estimated network, we can use the companion package
+[PhyloPlots](https://github.com/juliaphylo/PhyloPlots.jl).
+In the example below, julia creates and sends the plot to R
+via [RCall](https://github.com/JuliaInterop/RCall.jl),
+so we can tweak the plot in various ways via commands sent to R.
+To save the plot in a file: we first tell R to create an image file,
+then we send the plot of the network,
+then we tell R to wrap up and save its image file.
+
+```@example snaqplot
+using PhyloPlots # to visualize networks
+using RCall      # to send additional commands to R like this: R"..."
+imagefilename = "../assets/figures/snaqplot_net1_2.svg"
+R"svg"(imagefilename, width=4, height=3) # starts image file
+R"par"(mar=[0,0,0,0]) # to reduce margins (no margins at all here)
+plot(net1, showgamma=true, showedgenumber=true); # network is plotted & sent to file
+R"dev.off()"; # wrap up and save image file
+nothing # hide
+```
+![net1_2](../assets/figures/snaqplot_net1_2.svg)
+
+The plot function has many options, to annotate nodes and edges. In the
+example above, hybrid edges were annotated with their γ inheritance values
+(in blue: light blue for the minor edge with γ<0.5, and dark blue for the
+major edge with γ>0.5), and edges were annotated with their internal numbers.
+
+Type `?` to switch to the help mode
+of Julia, then type the name of the function, here `plot`.
+Below are two visualizations.
+The first uses the default style (`:fulltree`) and modified edge colors.
+The second uses the `:majortree` style.
+That style doesn't have an arrow by default for minor hybrid edges,
+but we can ask for one by specifying a positive arrow length.
+```@example snaqplot
+R"svg(name('snaqplot_net1_3.svg'), width=7, height=3)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+R"layout(matrix(1:2,1,2))";
+plot(net1, showedgelength=true, minorhybridedgecolor="tan");
+plot(net1, style=:majortree, arrowlen=0.07);
+R"dev.off()"; # hide
+nothing # hide
+```
+![net1_3](../assets/figures/snaqplot_net1_3.svg)
+
+Edge lengths are shown, too. They were estimated in coalescent units:
+number of generations / effective population size.
+Some edge lengths are not identifiable, hence not shown.
+
+Below is another example, where space was added between the network and
+the taxon names via the `tipoffset` option.
+Also, edge colors were changed, and the nodes numbers are shown (used internally)
+
+```@example snaqplot
+R"svg(name('snaqplot_net1_4.svg'), width=4, height=3)" # hide
+R"par"(mar=[0,0,0,0]) # hide
+plot(net1, tipoffset=0.5, shownodenumber=true, edgecolor="tomato4",
+     minorhybridedgecolor="skyblue", majorhybridedgecolor="tan");
+R"dev.off()"; # hide
+nothing # hide
+```
+![net1_4](../assets/figures/snaqplot_net1_4.svg)
+
+## Re-rooting networks
+
+SNaQ infers an unrooted semi-directed network.
+The direction of hybrid edges can be inferred,
+but the direction of tree edges cannot be inferred.
+To obtain a representative visualization,
+it is best to root the network first, using one or more outgroup.
+Go to [Re-rooting trees and networks](@ref) for this.
+If your outgroup conflicts with the direction of reticulations
+in the estimated network, see section
+[Candidate networks compatible with a known outgroup](@ref).
+
+## Candidate Network Evaluation
+
+From a set of candidate networks, one might simply need to score of each network
+to pick the best. Here, the score is the negative log pseudo-likelihood, and the
+lower the better. See the section to get the score of [Candidate Networks](@ref).
+
+## SNaQ error reporting
+
+Please report any bugs and errors by opening an
+[issue](https://github.com/juliaphylo/SNaQ.jl/issues/new).
+The easiest way to provide information on the error is by checking the
+`.err` file, which will show the number of runs that
+failed and the corresponding seed to replicate the run.
+In case of an error, the `.err` file might look like:
+`Total errors: 1 in seeds [4545]`.
+This file and any information that will help replicating the error
+will be immensely helpful to fix the error/bug.
diff --git a/docs/src/man/ticr_howtogetQuartetCFs.md b/docs/src/man/ticr_howtogetQuartetCFs.md
new file mode 100644
index 0000000..a531360
--- /dev/null
+++ b/docs/src/man/ticr_howtogetQuartetCFs.md
@@ -0,0 +1,260 @@
+# TICR pipeline
+
+PhyloNetworks' [wiki](https://github.com/juliaphylo/PhyloNetworks.jl/wiki/TICR:-from-alignments-to-quartet-concordance-factors)
+has a step-by-step tutorial,
+to go from multiple sequence alignments
+to a table of quartet gene frequencies (concordance factors: CFs),
+through BUCKy (to integrate out gene tree uncertainty) or through RAxML.
+To get the `raxml.pl` perl script to run RAxML on each gene,
+download the content of that wiki with
+
+`git clone https://github.com/juliaphylo/PhyloNetworks.jl.wiki.git`
+
+then go to the `script/` folder.  
+Full information and code is [here](https://github.com/nstenz/TICR).
+Below is more detail to insert data into the pipeline at various stages,
+using one of **two pipelines**, depending on your machine configuration:
+
+- "no-scheduler" pipeline: original set of Perl scripts from the
+  TICR pipeline, well suited for a machine or a cluster of machines
+  without a job scheduler.
+  The scripts automatically parallelize the work across the available cores.
+
+- "slurm" pipeline: well suited for a cluster where users submit jobs
+  via a job scheduler like [SLURM](https://slurm.schedmd.com/) or
+  [SGE](https://en.wikipedia.org/wiki/Oracle_Grid_Engine). The job scheduler
+  does the work of parallelizing the work across available cores.
+  The scripts, in this second pipeline, were created to take full advantage
+  of job scheduler capabilities. They were developed for a cluster running SLURM.
+  Adjustments to the submit scripts will be needed, to adapt to your own
+  SLURM configuration or to the syntax that your job scheduler wants.
+
+## To run MrBayes: we already have alignments
+
+### no-scheduler pipeline
+
+We don't need to run `mdl.pl` if we already have aligned gene sequences
+from separate loci. To run MrBayes on each locus, we can simply
+create a tarball of the Nexus files we wish to use (fasta won't work at this stage).
+The command below assumes that we want to use all the files ending with ".nex"
+in the current directory, one file per locus:
+
+```bash
+tar czf my-genes.tar.gz *.nex
+```
+
+Once the tarball has been successfully generated, we can then specify this
+file as input for [mb.pl](https://github.com/nstenz/TICR/blob/master/scripts/mb.pl)
+assuming we have a valid MrBayes block located in the file "bayes.txt":
+
+```bash
+mb.pl my-genes.tar.gz -m bayes.txt -o my-genes-mb
+```
+
+If we get an error message like `mb.pl: Command not found`, it might be because
+`mb.pl` has no execute permission, or the current directory is not in our "path".
+An easy fix is to run this command instead:
+
+```bash
+perl mb.pl my-genes.tar.gz -m bayes.txt -o my-genes-mb
+```
+
+The resulting output tarball would now be located in `my-genes-mb/my-genes.mb.tar`,
+and can be used normally with
+[bucky.pl](https://github.com/nstenz/TICR/blob/master/scripts/bucky.pl),
+that is, like this:
+
+```bash
+bucky.pl my-genes-mb/my-genes.mb.tar -o mygenes-bucky
+```
+
+The output, with the table of concordance factors for all sets of 4 taxa,
+will be in a file named `my-genes.CFs.csv` inside a directory named `mygenes-bucky`.
+That's the file containing the quartet concordance factors to give to SNaQ as input.
+There is *no* need to do any of the steps below: they are already done by `bucky.pl`.
+
+### slurm pipeline
+
+SLURM will parallelize the MrBayes runs across genes.
+
+1. Navigate in some "working" directory where you place:
+   - a folder containing all nexus files, which we will call "nexusfolder" below
+   - a text file named `mb-block.txt` with the MrBayes block to be used
+   for all the genes (containing the options for MrBayes: model of sequence
+   evolution, number of generations etc.).
+   If we want a different MrBayes block for different genes, step 2 should be skipped,
+   and we should instead find some other way to put the specific MrBayes block
+   at the end of each nexus file.
+
+2. In the "working" directory above, run the julia script
+   [`paste-mb-block.jl`](https://github.com/nstenz/TICR/blob/master/scripts-cluster/paste-mb-block.jl)
+   with "nexusfolder" as argument, to tell the script where to find all the nexus files:
+
+   ```bash
+   julia path/to/paste-mb-block.jl nexusfolder
+   ```
+   This script will read all the nexus files in the directory `nexusfolder`,
+   will create a new directory `nexusfolder-block`,
+   and will create new nexus files (containing the MrBayes block found in file `mb-block.txt`)
+   as `1.nex, 2.nex, ...` in the new directory. A `translate.txt` file will also be created
+   to map the original gene file names to the new (numbered) file names.
+   If we named our MrBayes block file differently: we can edit the script and modify it
+   to replace `mb-block.txt` by our actual file name for the MrBayes block.
+
+3. Modify the submit script
+   [`mb-slurm-submit.sh`](https://github.com/nstenz/TICR/blob/master/scripts-cluster/mb-slurm-submit.sh),
+   which will parallelize all the individual-gene MrBayes runs with SLURM:
+
+   - change `--array` to the correct number of genes
+   - change `--mail-user` to the user's email (if this is an option for your job scheduler)
+   - replace the `/workspace/software/bin` in `PATH="/workspace/software/bin:$PATH"`
+     to the path where the `mb` executable is located or put the whole path in the command:
+     `/s/mrbayes-3.2.6-1/bin/mb`
+
+   In slurm, we can then submit the MrBayes array job with:
+
+   ```bash
+   sbatch mb-slurm-submit.sh
+   ```
+
+  With this slurm pipeline, the steps below are needed: keep reading.
+
+## To run mbsum on the output of MrBayes for each gene
+
+If we have the output of MrBayes and want to run BUCKy,
+we must first run `mbsum` on the output from MrBayes, separately for each gene.
+
+For a gene with output tree files named `gene1.run1.t`, `gene1.run2.t` and `gene1.run3.t`,
+and a desired burnin of 1000 trees per tree file, we do this:
+
+```bash
+mbsum -n 1000 -o gene1.in gene1.run1.t gene1.run2.t gene1.run3.t
+```
+
+This `mbsum` command will need to be executed for each gene.
+Then we can continue to the next section to run bucky.
+
+Alternatively, we can use the julia script
+[`mbsum-t-files.jl`](https://github.com/nstenz/TICR/blob/master/scripts-cluster/mbsum-t-files.jl),
+and give it as argument the directory that has the output tree files from MrBayes,
+to run mbsum for *all* the genes.
+`mbsum` is fast, so there is no attempt to parallelize the various mbsum commands.
+
+```bash
+julia mbsum-t-files.jl mbfolder outputfolder burnin                # or
+julia --color=yes -- mbsum-t-files.jl mbfolder outputfolder burnin # for colorized messages to the screen
+```
+where `burnin` is replaced by the number of trees to ignore in each tree file
+for burnin. This `burnin` argument is optional (default: 2501).
+The `outputfolder` will contain the output of `mbsum`.
+
+## To run bucky on all 4-taxon sets: we already have the mbsum output
+
+### no-scheduler pipeline
+
+If we already ran `mbsum` on the output from MrBayes, for each individual gene,
+we can simply create a tarball containing all the mbsum output files.
+So if we had mbsum output in files named `gene1.in`, `gene2.in`, ... , `gene100.in`,
+we would want to run something similar to the following command to create the tarball:
+
+```bash
+tar czf my-genes-mbsum.tar.gz gene*.in
+```
+
+We can now use this tarball along with the `-s` option in
+[bucky.pl](https://github.com/nstenz/TICR/blob/master/scripts/bucky.pl) like this:
+
+```bash
+bucky.pl my-genes-mbsum.tar.gz -s -o mygenes-bucky
+```
+
+Again, if we get an error like `bucky.pl: Command not found`, we could run instead
+
+```bash
+perl bucky.pl my-genes-mbsum.tar.gz -s -o mygenes-bucky
+```
+
+The output, with the table of concordance factors for all sets of 4 taxa,
+will be in a file named `my-genes.CFs.csv` inside directory `mygenes-bucky`.
+That's the file containing the quartet concordance factors to give to SNaQ as input.
+
+### slurm pipeline
+
+We want to run `bucky` on every 4-taxon set.
+SLURM will parallelize these jobs with the submit script
+[`bucky-slurm-submit.sh`](https://github.com/nstenz/TICR/blob/master/scripts-cluster/bucky-slurm-submit.sh),
+which calls the perl script
+[`bucky-slurm.pl`](https://github.com/nstenz/TICR/blob/master/scripts-cluster/bucky-slurm.pl).
+
+The perl script
+[`bucky-slurm.pl`](https://github.com/nstenz/TICR/blob/master/scripts-cluster/bucky-slurm.pl)
+runs `bucky` on a single 4-taxon set.
+It takes the following arguments, which must be modified in the submit script
+[`bucky-slurm-submit.sh`](https://github.com/nstenz/TICR/blob/master/scripts-cluster/bucky-slurm-submit.sh):
+- name of the folder containing the `mbsum` output files (one per locus) from previous step.
+  This folder is named `mbsum` in the submit script: adapt if needed.
+- output name: `-o` or `--out-dir` name of the directory to store output files in.
+  This option is not used in the default submit script
+- bucky arguments: `-a` or `--alpha` for the prior alpha value,
+  and `-n` or `--ngen` number of generations. These options are not used either,
+  in the script: the defaults are used then (α=1, 1 million generations)
+- integer for the given quartet, via option `-q`.
+  The quartet ID is specified by SLURM with its own array ID: `$SLURM_ARRAY_TASK_ID`.
+
+In the submit script that gives instructions to the job scheduler:
+- adapt the name of the `$SLURM_ARRAY_TASK_ID` variable, which captures the task number
+  in the array of tasks, to your scheduler syntax
+- change `--array` to the correct number of 4-taxon sets.
+  For example, if there are 15 taxa in the dataset, there are `1365` 4-taxon sets.
+  To get this number, if you are unsure, use `choose(15,4)` in R or `binomial(15,4)` in Julia,
+  but replace 15 by your actual number of individuals.
+- change `--mail-user` to the user's email (if this is an option for your job scheduler)
+- replace the `/workspace/software/bin` in `PATH="/workspace/software/bin:$PATH"`
+  by the path where the `bucky` executable is located.
+  Also, replace `/workspace/claudia/software/TICR/scripts/` by the full path where the
+  `bucky-slurm.pl` script is located.
+
+
+In slurm, we would submit the BUCKy array job with:
+
+```bash
+sbatch bucky-slurm-submit.sh
+```
+
+At the end, the array job will produce
+- a `.concordance` file for every 4-taxon set
+- a `.cf` file with the parsed output for that same 4-taxon set,
+  in the format needed for the final CF table.
+
+The `.cf` files can be concatenated to produce the file containing
+the quartet concordance factors across all 4-taxon sets, to give to SNaQ as input:
+
+```bash
+cat *.cf > CFtable.csv
+```
+
+Alternatively, if the list of `.cf` files is not easily captured by `*.cf`
+(because the list is too long for a shell command), the following julia script
+can do the concatenation. Just copy-paste the commands below within a Julia session,
+started from the directory that contains the `.cf` files:
+
+```julia
+files = String[] # empty vector of strings: will contain the .cf file names later
+for f in filter(x -> endswith(x, ".cf"), readdir())
+    push!(files,f)
+end
+println("found $(length(files)) cf files") # to check how many .cf output files were found
+open("CFtable.csv","w") do f_out
+  # write the header:
+  write(f_out, "taxon1,taxon2,taxon3,taxon4,CF12_34,CF12_34_lo,CF12_34_hi,CF13_24,CF13_24_lo,CF13_24_hi,CF14_23,CF14_23_lo,CF14_23_hi,ngenes\n")
+  for file in files
+    @show file # to see the .cf file name: comment this out if that's too much screen output
+    open(file) do f_in
+        line = read(f_in, String)
+        write(f_out, string(line,"\n"))
+    end # closes "file" safely
+  end
+end # closes "CFtable.csv" safely
+```
+When this is done, we will have a file `CFtable.csv` containing the
+quartet concordance factors, to give to SNaQ as input :smiley:
diff --git a/src/auxiliary.jl b/src/auxiliary.jl
index a80e365..89325cb 100644
--- a/src/auxiliary.jl
+++ b/src/auxiliary.jl
@@ -6,8 +6,8 @@
 function setCHECKNET(b::Bool)
     global CHECKNET
     CHECKNET = b
-    CHECKNET && @warn "PhyloNetworks.CHECKNET is true: will slow snaq! down."
-    b || @info "PhyloNetworks.CHECKNET set to false"
+    CHECKNET && @warn "SNaQ.CHECKNET is true: will slow snaq! down."
+    b || @info "SNaQ.CHECKNET set to false"
 end
 
 # ----- aux general functions ---------------
@@ -127,7 +127,7 @@ end
 
 Set the length of `edge`, and set `edge.y` and `edge.z` accordingly.
 Warning: specific to `SNaQ.jl`.
-Consider `PhyloNetworks.setlengths!` for a more generic tool.
+Consider [`setlengths!`](@ref) from `PhyloNetworks` for a more generic tool.
 
 - The new length is censored to 10: if the new length is above 10,
   the edge's length will be set to 10. Lengths are interpreted in coalescent