diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py
index 7c416216..0d38638d 100644
--- a/PopPUNK/__init__.py
+++ b/PopPUNK/__init__.py
@@ -7,5 +7,5 @@
# Minimum sketchlib version
SKETCHLIB_MAJOR = 1
-SKETCHLIB_MINOR = 5
-SKETCHLIB_PATCH = 3
+SKETCHLIB_MINOR = 6
+SKETCHLIB_PATCH = 0
diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
index 9ac13439..c927ac52 100644
--- a/PopPUNK/__main__.py
+++ b/PopPUNK/__main__.py
@@ -111,12 +111,13 @@ def get_options():
type=float, default = None)
refinementGroup.add_argument('--manual-start', help='A file containing information for a start point. '
'See documentation for help.', default=None)
- refinementGroup.add_argument('--indiv-refine', help='Also run refinement for core and accessory individually', default=False,
- action='store_true')
+ refinementGroup.add_argument('--indiv-refine', help='Also run refinement for core and accessory individually',
+ choices=['both', 'core', 'accessory'], default = False)
refinementGroup.add_argument('--no-local', help='Do not perform the local optimization step (speed up on very large datasets)',
default=False, action='store_true')
refinementGroup.add_argument('--model-dir', help='Directory containing model to use for assigning queries '
'to clusters [default = reference database directory]', type = str)
+ refinementGroup.add_argument('--core-only', help='Save the core distance fit (with ')
# lineage clustering within strains
lineagesGroup = parser.add_argument_group('Lineage analysis options')
@@ -175,6 +176,7 @@ def main():
from .network import printClusters
from .plot import writeClusterCsv
+ from .plot import plot_scatter
from .prune_db import prune_distance_matrix
@@ -287,6 +289,11 @@ def main():
dists_out = args.output + "/" + os.path.basename(args.output) + ".dists"
storePickle(refList, queryList, True, distMat, dists_out)
+ # Plot results
+ plot_scatter(distMat,
+ args.output + "/" + os.path.basename(args.output) + "_distanceDistribution",
+ args.output + " distances")
+
#******************************#
#* *#
#* model fit and network *#
@@ -434,7 +441,6 @@ def main():
overall_lineage,
output_format = 'phandango',
epiCsv = None,
- queryNames = refList,
suffix = '_Lineage')
genomeNetwork = indivNetworks[min(rank_list)]
@@ -469,10 +475,10 @@ def main():
output + "/" + os.path.basename(output) + \
"_" + dist_type + '_graph.gt', fmt = 'gt')
- if args.core_only:
+ if args.indiv_refine == 'core':
fit_type = 'core'
genomeNetwork = indivNetworks['core']
- elif args.accessory_only:
+ elif args.indiv_refine == 'accessory':
fit_type = 'accessory'
genomeNetwork = indivNetworks['accessory']
diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 24b97dbe..8b8283c9 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -21,8 +21,6 @@
import pp_sketchlib
-from .plot import plot_scatter
-
# BGMM
from .bgmm import fit2dMultiGaussian
from .bgmm import assign_samples
@@ -126,7 +124,6 @@ def fit(self, X = None):
'''Initial steps for all fit functions.
Creates output directory. If preprocess is set then subsamples passed X
- and draws a scatter plot from result using :func:`~PopPUNK.plot.plot_scatter`.
Args:
X (numpy.array)
@@ -159,12 +156,6 @@ def fit(self, X = None):
self.scale = np.amax(self.subsampled_X, axis = 0)
self.subsampled_X /= self.scale
- # Show clustering
- plot_scatter(self.subsampled_X,
- self.scale,
- self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_distanceDistribution",
- self.outPrefix + " distances")
-
def plot(self, X=None):
'''Initial steps for all plot functions.
@@ -474,7 +465,7 @@ def plot(self, X=None, y=None):
if not hasattr(self, 'subsampled_X'):
self.subsampled_X = utils.shuffle(X, random_state=random.randint(1,10000))[0:self.max_samples,]
- non_noise = np.sum(np.where(self.labels != -1))
+ non_noise = np.sum(self.labels != -1)
sys.stderr.write("Fit summary:\n" + "\n".join(["\tNumber of clusters\t" + str(self.n_clusters),
"\tNumber of datapoints\t" + str(self.subsampled_X.shape[0]),
"\tNumber of assignments\t" + str(non_noise)]) + "\n\n")
diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py
index 0427b378..2ba936bf 100644
--- a/PopPUNK/plot.py
+++ b/PopPUNK/plot.py
@@ -6,6 +6,7 @@
import sys
import os
import subprocess
+import random
import numpy as np
import matplotlib as mpl
mpl.use('Agg')
@@ -18,7 +19,7 @@
import pandas as pd
from collections import defaultdict
from scipy import spatial
-from sklearn import manifold
+from sklearn import manifold, utils
try: # sklearn >= 0.22
from sklearn.neighbors import KernelDensity
except ImportError:
@@ -27,7 +28,7 @@
from .utils import isolateNameToLabel
-def plot_scatter(X, scale, out_prefix, title, kde = True):
+def plot_scatter(X, out_prefix, title, kde = True):
"""Draws a 2D scatter plot (png) of the core and accessory distances
Also draws contours of kernel density estimare
@@ -35,8 +36,6 @@ def plot_scatter(X, scale, out_prefix, title, kde = True):
Args:
X (numpy.array)
n x 2 array of core and accessory distances for n samples.
- scale (numpy.array)
- Scaling factor from :class:`~PopPUNK.models.BGMMFit`
out_prefix (str)
Prefix for output plot file (.png will be appended)
title (str)
@@ -46,6 +45,15 @@ def plot_scatter(X, scale, out_prefix, title, kde = True):
(default = True)
"""
+ # Plot results - max 1M for speed
+ max_plot_samples = 1000000
+ if X.shape[0] > max_plot_samples:
+ X = utils.shuffle(X, random_state=random.randint(1,10000))[0:max_plot_samples,]
+
+ # Kernel estimate uses scaled data 0-1 on each axis
+ scale = np.amax(X, axis = 0)
+ X /= scale
+
plt.figure(figsize=(11, 8), dpi= 160, facecolor='w', edgecolor='k')
if kde:
xx, yy, xy = get_grid(0, 1, 100)
@@ -58,11 +66,13 @@ def plot_scatter(X, scale, out_prefix, title, kde = True):
z = z.reshape(xx.shape).T
levels = np.linspace(z.min(), z.max(), 10)
+ # Rescale contours
plt.contour(xx*scale[0], yy*scale[1], z, levels=levels[1:], cmap='plasma')
scatter_alpha = 1
else:
scatter_alpha = 0.1
+ # Plot on correct scale
plt.scatter(X[:,0]*scale[0].flat, X[:,1]*scale[1].flat, s=1, alpha=scatter_alpha)
plt.title(title)
diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py
index 251cda5b..21945677 100644
--- a/PopPUNK/visualise.py
+++ b/PopPUNK/visualise.py
@@ -108,7 +108,7 @@ def get_options():
faGroup.add_argument('--cytoscape', help='Generate network output files for Cytoscape', default=False, action='store_true')
faGroup.add_argument('--phandango', help='Generate phylogeny and TSV for Phandango visualisation', default=False, action='store_true')
faGroup.add_argument('--grapetree', help='Generate phylogeny and CSV for grapetree visualisation', default=False, action='store_true')
- faGroup.add_argument('--rapidnj', help='Path to rapidNJ binary to build NJ tree for Microreact', default=None)
+ faGroup.add_argument('--rapidnj', help='Path to rapidNJ binary to build NJ tree for Microreact', default='rapidnj')
faGroup.add_argument('--perplexity',
type=float, default = 20.0,
help='Perplexity used to calculate t-SNE projection (with --microreact) [default=20.0]')
@@ -135,6 +135,9 @@ def get_options():
if arg is not None:
arg = arg.rstrip('\\')
+ if args.rapidnj == "":
+ args.rapidnj = None
+
return args
def generate_visualisations(query_db,
diff --git a/docs/api.rst b/docs/api.rst
index e46582fa..dc3ff110 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -6,6 +6,13 @@ Documentation for module functions (for developers)
.. contents::
:local:
+assign.py
+---------
+``poppunk_assign`` main function
+
+.. automodule:: PopPUNK.assign
+ :members:
+
bgmm.py
--------
@@ -24,12 +31,6 @@ Functions used to fit DBSCAN to a database. Access using
.. automodule:: PopPUNK.dbscan
:members:
-mash.py
---------
-
-.. automodule:: PopPUNK.mash
- :members:
-
models.py
---------
@@ -80,6 +81,13 @@ utils.py
.. automodule:: PopPUNK.utils
:members:
+visualise.py
+------------
+``poppunk_visualise`` main function
+
+.. automodule:: PopPUNK.visualise
+ :members:
+
web.py
--------
diff --git a/docs/best_practises.rst b/docs/best_practises.rst
new file mode 100644
index 00000000..593bab24
--- /dev/null
+++ b/docs/best_practises.rst
@@ -0,0 +1,72 @@
+Best practises guide
+====================
+This page details the way in which we would advise that you *should* use and
+run PopPUNK, if possible.
+
+.. image:: images/poppunk_flowchart.png
+ :alt: Flowchart for choosing how to use PopPUNK
+ :align: center
+
+Use an online interface
+-----------------------
+If available, you may want to use one of the browser-based interfaces to
+PopPUNK. These include `PopPUNK-web <https://web.poppunk.net/>`__ and
+`pathogen.watch <https://pathogen.watch/>`__
+(*S. pneumoniae* only). See the :doc:`online` page for full details.
+
+Using these interfaces requires nothing to be installed or set up, doesn't require any
+genome data to be shared with us, and will return interactive visualisations. If your
+species isn't available, or you have large batches of genomes to cluster you will
+likely want to use the command line interface instead.
+
+Use the command line interface
+------------------------------
+
+Installation and version
+^^^^^^^^^^^^^^^^^^^^^^^^
+Install via conda if possible. Please use at least v2.3.0 of PopPUNK
+and v1.6.0 of ``pp-sketchlib``.
+
+Use query assignment mode
+^^^^^^^^^^^^^^^^^^^^^^^^^
+If a database is available for your species (see https://poppunk.net/pages/databases.html)
+we would strongly recommend downloading it to use to cluster your genomes. This
+has many advantages:
+
+- No need to run through the potentially complex model fitting.
+- Assured model performance.
+- Considerably faster run times.
+- Use existing cluster definitions.
+- Use the context of large, high quality reference populations to interpret your
+ genomes' clusters.
+
+See :doc:`query_assignment` for instructions on how to use this mode.
+
+You can think of this as being similar to using an existing MLST/cgMLST/wgMLST scheme
+to define your sample's strains.
+
+Fit your own model
+^^^^^^^^^^^^^^^^^^
+If a database isn't available for your species, you can fit your own. Details
+on how to do this can be found on :doc:`model_fitting`.
+
+After getting a good fit, you may want to share it with others so that they can
+use it to assign queries. See :doc:`model_distribution` for advice. We would also
+be interested to hear from you if you'd like to add your new model to the
+pre-fit databases above -- please contact poppunk@poppunk.net.
+
+Create visualisations
+^^^^^^^^^^^^^^^^^^^^^
+A number of plots are created by default. You can also
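+create files for further visualisation in `microreact <https://microreact.org/>`__,
+`cytoscape <https://cytoscape.org/>`__,
+`grapetree <https://github.com/achtman-lab/GrapeTree>`__ and
+`phandango <https://jameshadfield.github.io/phandango/>`_. We have found that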
+looking at the appearance of clusters on a tree is always very helpful, and would
+recommend this for any fit.
+
+Older versions of PopPUNK required visualisations to be requested as part of the
+main analysis, and later through the ``--generate-viz`` mode. They are now created
+separately, after the main analysis, with ``poppunk_visualise``.
+
+See :doc:`visualisation` for details on options.
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index 941f2c50..61baf7f4 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -38,7 +38,8 @@
# Causes a problem with rtd: https://github.com/pypa/setuptools/issues/1694
autodoc_mock_imports = ["hdbscan",
"numpy",
- "graph-tool",
+ "graph_tool.all",
+ "graph_tool",
"pandas",
"scipy",
"sklearn",
@@ -65,16 +66,16 @@
# General information about the project.
project = 'PopPUNK'
copyright = '2018-2020, John Lees and Nicholas Croucher'
-author = 'John Lees and Nicholas Croucher'
+author = 'John Lees, Daniel Anderson and Nicholas Croucher'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
-version = '2.2.0'
+version = '2.3.0'
# The full version, including alpha/beta/rc tags.
-release = '2.2.0'
+release = '2.3.0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
diff --git a/docs/images/13mer_hist.png b/docs/images/13mer_hist.png
new file mode 100644
index 00000000..11980a25
Binary files /dev/null and b/docs/images/13mer_hist.png differ
diff --git a/docs/images/DPGMM_fit_K2.png b/docs/images/DPGMM_fit_K2.png
deleted file mode 100644
index e1d65c50..00000000
Binary files a/docs/images/DPGMM_fit_K2.png and /dev/null differ
diff --git a/docs/images/DPGMM_fit_K3.png b/docs/images/DPGMM_fit_K3.png
deleted file mode 100644
index 36760e40..00000000
Binary files a/docs/images/DPGMM_fit_K3.png and /dev/null differ
diff --git a/docs/images/assign_network.png b/docs/images/assign_network.png
new file mode 100644
index 00000000..3b184785
Binary files /dev/null and b/docs/images/assign_network.png differ
diff --git a/docs/images/DPGMM_fit_K10.png b/docs/images/bgmm_fit_K10.png
similarity index 100%
rename from docs/images/DPGMM_fit_K10.png
rename to docs/images/bgmm_fit_K10.png
diff --git a/docs/images/bgmm_k2_fit.png b/docs/images/bgmm_k2_fit.png
new file mode 100644
index 00000000..f46c2ff4
Binary files /dev/null and b/docs/images/bgmm_k2_fit.png differ
diff --git a/docs/images/bgmm_k4_boundary.png b/docs/images/bgmm_k4_boundary.png
new file mode 100644
index 00000000..37351d45
Binary files /dev/null and b/docs/images/bgmm_k4_boundary.png differ
diff --git a/docs/images/bgmm_k4_fit.png b/docs/images/bgmm_k4_fit.png
new file mode 100644
index 00000000..f96d487a
Binary files /dev/null and b/docs/images/bgmm_k4_fit.png differ
diff --git a/docs/images/cytoscape.png b/docs/images/cytoscape.png
index 7c9cdbab..fb129c8e 100644
Binary files a/docs/images/cytoscape.png and b/docs/images/cytoscape.png differ
diff --git a/docs/images/cytoscape_gpsc.png b/docs/images/cytoscape_gpsc.png
new file mode 100644
index 00000000..e12a3ad9
Binary files /dev/null and b/docs/images/cytoscape_gpsc.png differ
diff --git a/docs/images/dbscan_fit.png b/docs/images/dbscan_fit.png
index bde3f7a0..cf7f5d3d 100644
Binary files a/docs/images/dbscan_fit.png and b/docs/images/dbscan_fit.png differ
diff --git a/docs/images/dbscan_fit_min_prop.png b/docs/images/dbscan_fit_min_prop.png
new file mode 100644
index 00000000..782f5577
Binary files /dev/null and b/docs/images/dbscan_fit_min_prop.png differ
diff --git a/docs/images/fit_example_fixed.png b/docs/images/fit_example_fixed.png
deleted file mode 100644
index 4682bfc5..00000000
Binary files a/docs/images/fit_example_fixed.png and /dev/null differ
diff --git a/docs/images/fit_example_wrong.png b/docs/images/fit_example_wrong.png
deleted file mode 100644
index 782ecce7..00000000
Binary files a/docs/images/fit_example_wrong.png and /dev/null differ
diff --git a/docs/images/flu_phased.png b/docs/images/flu_phased.png
new file mode 100644
index 00000000..909662b5
Binary files /dev/null and b/docs/images/flu_phased.png differ
diff --git a/docs/images/flu_unphased.png b/docs/images/flu_unphased.png
new file mode 100644
index 00000000..d9f257c6
Binary files /dev/null and b/docs/images/flu_unphased.png differ
diff --git a/docs/images/grapetree.png b/docs/images/grapetree.png
new file mode 100644
index 00000000..ac69875e
Binary files /dev/null and b/docs/images/grapetree.png differ
diff --git a/docs/images/grapetree_collapse.png b/docs/images/grapetree_collapse.png
new file mode 100644
index 00000000..213170ad
Binary files /dev/null and b/docs/images/grapetree_collapse.png differ
diff --git a/docs/images/indiv_refine.png b/docs/images/indiv_refine.png
index ceb62541..00ca03bb 100644
Binary files a/docs/images/indiv_refine.png and b/docs/images/indiv_refine.png differ
diff --git a/docs/images/kmer_fit.png b/docs/images/kmer_fit.png
new file mode 100644
index 00000000..16efe919
Binary files /dev/null and b/docs/images/kmer_fit.png differ
diff --git a/docs/images/listeria_dists.png b/docs/images/listeria_dists.png
new file mode 100644
index 00000000..30005b58
Binary files /dev/null and b/docs/images/listeria_dists.png differ
diff --git a/docs/images/listeria_lineage_rank_1.png b/docs/images/listeria_lineage_rank_1.png
new file mode 100644
index 00000000..4604e04c
Binary files /dev/null and b/docs/images/listeria_lineage_rank_1.png differ
diff --git a/docs/images/listeria_lineage_rank_1_histogram.png b/docs/images/listeria_lineage_rank_1_histogram.png
new file mode 100644
index 00000000..9c951e09
Binary files /dev/null and b/docs/images/listeria_lineage_rank_1_histogram.png differ
diff --git a/docs/images/listeria_lineage_rank_3.png b/docs/images/listeria_lineage_rank_3.png
new file mode 100644
index 00000000..85d9f395
Binary files /dev/null and b/docs/images/listeria_lineage_rank_3.png differ
diff --git a/docs/images/listeria_lineage_rank_3_histogram.png b/docs/images/listeria_lineage_rank_3_histogram.png
new file mode 100644
index 00000000..0b7650c5
Binary files /dev/null and b/docs/images/listeria_lineage_rank_3_histogram.png differ
diff --git a/docs/images/listeria_microreact.png b/docs/images/listeria_microreact.png
new file mode 100644
index 00000000..05296a1a
Binary files /dev/null and b/docs/images/listeria_microreact.png differ
diff --git a/docs/images/listeria_refined.png b/docs/images/listeria_refined.png
new file mode 100644
index 00000000..f34daccc
Binary files /dev/null and b/docs/images/listeria_refined.png differ
diff --git a/docs/images/listeria_threshold.png b/docs/images/listeria_threshold.png
new file mode 100644
index 00000000..a2fc3ea4
Binary files /dev/null and b/docs/images/listeria_threshold.png differ
diff --git a/docs/images/lm_GMM_K2.png b/docs/images/lm_GMM_K2.png
deleted file mode 100644
index a4376508..00000000
Binary files a/docs/images/lm_GMM_K2.png and /dev/null differ
diff --git a/docs/images/lm_GMM_K4.png b/docs/images/lm_GMM_K4.png
deleted file mode 100644
index a6e248a7..00000000
Binary files a/docs/images/lm_GMM_K4.png and /dev/null differ
diff --git a/docs/images/lm_dbscan.png b/docs/images/lm_dbscan.png
deleted file mode 100644
index 9ba337bb..00000000
Binary files a/docs/images/lm_dbscan.png and /dev/null differ
diff --git a/docs/images/lm_distance_dist.png b/docs/images/lm_distance_dist.png
deleted file mode 100644
index 9966f27e..00000000
Binary files a/docs/images/lm_distance_dist.png and /dev/null differ
diff --git a/docs/images/lm_fit.png b/docs/images/lm_fit.png
deleted file mode 100644
index 93f2f2b1..00000000
Binary files a/docs/images/lm_fit.png and /dev/null differ
diff --git a/docs/images/lm_microreact.png b/docs/images/lm_microreact.png
deleted file mode 100644
index fede300d..00000000
Binary files a/docs/images/lm_microreact.png and /dev/null differ
diff --git a/docs/images/phandango.png b/docs/images/phandango.png
new file mode 100644
index 00000000..e056db73
Binary files /dev/null and b/docs/images/phandango.png differ
diff --git a/docs/images/poppipe_dag.png b/docs/images/poppipe_dag.png
new file mode 100644
index 00000000..6a2f3cb6
Binary files /dev/null and b/docs/images/poppipe_dag.png differ
diff --git a/docs/images/poppunk_flowchart.png b/docs/images/poppunk_flowchart.png
new file mode 100644
index 00000000..4a71d8d8
Binary files /dev/null and b/docs/images/poppunk_flowchart.png differ
diff --git a/docs/images/web_cyto.png b/docs/images/web_cyto.png
new file mode 100644
index 00000000..c8d0edb0
Binary files /dev/null and b/docs/images/web_cyto.png differ
diff --git a/docs/images/web_home.png b/docs/images/web_home.png
new file mode 100644
index 00000000..3819c9fd
Binary files /dev/null and b/docs/images/web_home.png differ
diff --git a/docs/images/web_micro.png b/docs/images/web_micro.png
new file mode 100644
index 00000000..24ce9ab9
Binary files /dev/null and b/docs/images/web_micro.png differ
diff --git a/docs/images/web_micro_assigned.png b/docs/images/web_micro_assigned.png
new file mode 100644
index 00000000..1ff40f78
Binary files /dev/null and b/docs/images/web_micro_assigned.png differ
diff --git a/docs/images/web_micro_change.png b/docs/images/web_micro_change.png
new file mode 100644
index 00000000..bc90b8f6
Binary files /dev/null and b/docs/images/web_micro_change.png differ
diff --git a/docs/images/web_phylo.png b/docs/images/web_phylo.png
new file mode 100644
index 00000000..9dd00833
Binary files /dev/null and b/docs/images/web_phylo.png differ
diff --git a/docs/images/web_prevs.png b/docs/images/web_prevs.png
new file mode 100644
index 00000000..deb1aafa
Binary files /dev/null and b/docs/images/web_prevs.png differ
diff --git a/docs/images/web_prevs_zoomed.png b/docs/images/web_prevs_zoomed.png
new file mode 100644
index 00000000..fa62659e
Binary files /dev/null and b/docs/images/web_prevs_zoomed.png differ
diff --git a/docs/images/web_stats.png b/docs/images/web_stats.png
new file mode 100644
index 00000000..db2c5947
Binary files /dev/null and b/docs/images/web_stats.png differ
diff --git a/docs/index.rst b/docs/index.rst
index 7817e761..a032d3dd 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -9,57 +9,62 @@ PopPUNK documentation
:alt: PopPUNK (Population Partitioning Using Nucleotide K-mers)
:align: center
-In straightforward cases, usage can be as simple as::
+PopPUNK is a tool for clustering genomes. The first version was targeted specifically
+at bacterial genomes, but the current version has also been used for viruses
+(e.g. enterovirus, influenza, SARS-CoV-2) and eukaryotes (e.g. *Candida* sp.,
+*P. falciparum*). Under the hood, PopPUNK uses
+`pp-sketchlib <https://github.com/johnlees/pp-sketchlib>`__ to rapidly calculate
+core and accessory distances, and machine learning tools written in python to
+use these to cluster genomes. A detailed description of the method can be found
+in the `paper <https://doi.org/10.1101/gr.241455.118>`_.
- poppunk --easy-run --r-files references.txt --output poppunk_db
+If you are new to PopPUNK, we'd recommend starting on :doc:`installation`, then
+by reading the :doc:`best_practises`.
-Where ``references.txt`` is a list of assembly fasta files, one per line. See
-:doc:`quickstart` and the :doc:`tutorial` for full details.
+.. important::
+ Looking for older versions of the documentation? For previous versions with
+ the old API (``--assign-query``, ``--refine-fit`` etc) see `v2.2.0 <https://poppunk.readthedocs.io/en/v2.2.0/>`__.
+ For older versions which used mash, see `v1.2.0 <https://poppunk.readthedocs.io/en/v1.2.0/>`__.
.. toctree::
- :maxdepth: 2
+ :maxdepth: 1
:caption: Contents:
self
installation.rst
- options.rst
- quickstart.rst
- tutorial.rst
+ best_practises.rst
+ online.rst
+ query_assignment.rst
+ sketching.rst
+ qc.rst
+ model_fitting.rst
+ model_distribution.rst
+ visualisation.rst
+ subclustering.rst
troubleshooting.rst
+ options.rst
scripts.rst
api.rst
miscellaneous.rst
-Details
--------
-A full description of the method can be found in the `paper `_.
-
-``PopPUNK`` uses the fast k-mer distance estimation enabled by `mash `_
-to calculate core and accessory distances between all pairs of isolates of bacteria in a collection. By clustering
-these distances into 'within-strain' and 'between-strain' distances a network
-of within-strain comparisons can be constructed. The use of a network has
-a number of convenient properties, the first being that the connected
-components represent a cluster of strains.
-
-As well as identifying strains, the pairwise distance distribution also helps
-with assembly quality control (particularly in the case of contaminated
-contigs) and may be informative of the level of recombination in the
-population. The network representation also allows definition of representative isolates by
-sampling one example from each clique, and calculation of various statistics
-which can show how good the clustering is.
-
-The advantages of this approach are broadly that:
+Why use PopPUNK?
+----------------
+The advantages of PopPUNK are broadly that:
-- It is fast, and scalable to :math:`10^{4}` genomes in a single run.
+- It is fast, and scalable to over :math:`10^{5}` genomes in a single run.
- Assigning new query sequences to a cluster using an existing database is scalable even beyond this.
+- Cluster names remain consistent between studies, and other cluster labels such as MLST
+ can be appended.
- Databases can be updated online (as sequences arrive).
- Online updating is equivalent to building databases from scratch.
- Databases can be kept small and managable by only keeping representative isolates.
+- Databases naturally allow in-depth analysis of single clusters,
+ but keeping the full context of the whole database.
- There is no bin cluster. Outlier isolates will be in their own cluster.
- Pre-processing, such as generation of an alignment, is not required.
-- The definition of clusters is biologically relevant to how bacteria evolve.
-- There is a lot of quantitative and graphical output to assist with
- clustering.
+- Raw sequence reads can be used as input, while being filtered for sequencing errors.
+- The definition of clusters is biologically relevant.
+- Many quantitative and graphical outputs are provided.
- A direct import into `microreact `_ is
available, as well as `cytoscape `_,
`grapetree `_ and
diff --git a/docs/installation.rst b/docs/installation.rst
index 645ccd92..e6ca66a8 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -31,12 +31,26 @@ Then run::
conda install poppunk
+If you are having conflict issues with conda, our advice would be:
+
+- Remove and reinstall miniconda.
+- Never install anything in the base environment
+- Create a new environment for PopPUNK with ``conda create -n pp_env poppunk``
+
+If you have an older version of PopPUNK, you can upgrade using this method -- you
+may also wish to specify the version explicitly, for example
+``conda install poppunk==2.3.0``.
+
+conda-forge also has some helpful tips: https://conda-forge.org/docs/user/tipsandtricks.html
+
Installing with pip
-------------------
If you do not have conda, you can also install through pip::
python3 -m pip install poppunk
+This may not deal with all necessary :ref:`dependencies`.
+
Clone the code
--------------
You can also clone the github to run the latest version, which is executed by::
@@ -56,6 +70,7 @@ Dependencies
We tested PopPUNK with the following packages:
* python3 (3.8.2)
+* ``pp-sketchlib`` (1.5.1)
* ``DendroPy`` (4.3.0)
* ``hdbscan`` (0.8.13)
* ``matplotlib`` (2.1.2)
diff --git a/docs/model_distribution.rst b/docs/model_distribution.rst
new file mode 100644
index 00000000..4d6ff0cf
--- /dev/null
+++ b/docs/model_distribution.rst
@@ -0,0 +1,74 @@
+Distributing PopPUNK models
+===========================
+If you have fitted a model yourself, you may be interested in distributing it so that
+others can use it for your species. This will give consistent cluster names across datasets,
+mean the high-quality tested fit can be reused, and speed up future analysis.
+
+Please contact us at poppunk@poppunk.net. We would be happy to add your sketches and
+fitted model to our other `databases <https://poppunk.net/pages/databases.html>`__.
+
+Database contents
+-----------------
+A database requires the following files:
+
+- ``.h5``. The sketch database, an HDF5 file.
+- ``.dists.pkl`` and ``.dists.npy`` files. Distances for all vs all samples in the sketch database.
+- ``_fit.npz`` and ``_fit.pkl`` files. Python files which describe the model fit.
+- ``_graph.gt``. The network relating distances, fit and strain assignment for all samples in the sketch database.
+- ``_clusters.csv``. The strain assignment of all samples in the sketch database.
+
+If you used a :ref:`lineage-fit` you will also need:
+
+- ``rank_k_fit.npz``. Distances for each rank :math:`k` fit.
+- ``_lineages.csv``. Combined lineage assignments for each rank.
+
+You may also have ``.refs`` versions of these files, which are pruned to contain just the
+reference samples (see below). We would highly recommend including the ``output/output.refs`` file
+with any database, even though it is not strictly required, as it will speed up query assignment.
+Lineage models do not use references.
+
+.. note::
+ If the database is very large, you may consider just distributing the ``.refs`` files. This will
+ enable query assignment, but visualisation and subclustering within strains will no longer be
+ possible, as full information within each strain will be missing.
+
+Picking references
+------------------
+PopPUNK automatically prunes redundant sequence information from databases by removing
+samples from cliques (where every sample is in the same strain as every other sample). This
+algorithm has changed slightly from the originally published one:
+
+#. Split the graph into connected components (strains), which are analysed in parallel.
+#. Identify a clique. If no samples in the clique are already references, add one sample as a reference.
+#. Prune the clique from the graph, creating a subgraph.
+#. Recursively apply steps 2-3 until only two samples or fewer remain.
+#. Add the remaining samples as references
+#. Create the reference graph, and find connected components again.
+#. For any samples which are no longer in the same connected component, find a minimum path
+ between them in the full graph, and add all samples in this path as references.
+
+This makes the algorithm scale better, and ensures clusters remain connected. You may find
+that more references are picked than before using this method, which is a small cost for the
+increased robustness.
+
+This process occurs automatically after the model fit. In the *Listeria* example::
+
+ Removing 97 sequences
+
+31 strains are represented by :math:`128 - 97 = 31` references, exactly one reference
+per cluster, which is the minimum. The refined fit removes 93 sequences with 29 strains,
+so some larger clusters need to be represented by multiple references. The names of the chosen
+references are written to the .refs file. In addition, the distances, sketch database and graph
+have the non-reference sequences pruned and saved with .refs suffixes. This gives a complete database
+suitable for assignment with references only, should the full database be prohibitively large.
+
+.. note::
+ Previous fans (users) of PopPUNK may remember the ``--full-db`` option which switched off
+ reference picking. This was useful, as reference-only databases always lost information. This
+ option has now been removed, and reference picking will always be run. Both full and reference
+ databases are always produced (apart from in lineage mode). The default assignment uses
+ just references, but has the full database available for strain visualisation and subclustering.
+
+If you interrupt the reference picking the output will still be valid. If you wish to
+run reference picking on a database where it is missing (due to being from an older version,
+or interrupted) you can do this with the ``poppunk_references`` script.
diff --git a/docs/model_fitting.rst b/docs/model_fitting.rst
new file mode 100644
index 00000000..d1239b04
--- /dev/null
+++ b/docs/model_fitting.rst
@@ -0,0 +1,760 @@
+Fitting new models
+==================
+
+.. |nbsp| unicode:: 0xA0
+ :trim:
+
+If you cannot find an existing model for your species in the
+`list `__ you will want to fit your own.
+This process is flexible, and there are five different models you can use depending
+on the population structure of your dataset.
+
+.. note::
+ After fitting a model to a new species we would like to share it on our website,
+ so others can use it for assigning queries. If you are open to this, please read
+ :doc:`model_distribution` after this page.
+
+.. contents::
+ :local:
+
+Overview
+--------
+
+First, use ``poppunk --create-db`` to sketch your input data and calculate distances
+between all samples. This is detailed in :doc:`sketching`.
+
+Then, use ``poppunk --fit-model <model>`` with one of the following model names:
+
+- ``bgmm`` -- Bayesian Gaussian Mixture Model. Best for small sample collections
+ with strain-structure. Works best when distance distribution components are clearly
+ separated.
+- ``dbscan`` -- HDBSCAN. A good general method for larger sample collections with
+ strain-structure. Some points will always be designated as noise, so a subsequent run
+ of model refinement may help improve the fit.
+- ``refine`` -- Model refinement. Requires a model already fitted with ``bgmm`` or ``dbscan``
+ and attempts to improve it by maximising the network score. Particularly useful when
+ components overlap significantly (often due to recombination), or when the strain boundary
+ is thought to lie somewhere within a component.
+- ``threshold`` -- Apply a given core or accessory distance threshold to define clusters. Useful if
+ a cutoff threshold is already known/calculated, is estimated from a plot, or to compare a threshold
+ between datasets or species.
+- ``lineage`` -- Lineage clustering. To find lineages within a strain (subclustering), or
+ find clusters in a population without strain structure. Uses a simple nearest neighbour approach
+ so is more of a heuristic. Network scores are not meaningful in this mode.
+
+The most useful guide to deciding which model to use is the ``_distanceDistribution.png`` file
+showing the core and accessory distances. More details on each of these models are given
+further down this page.
+
+A completed fit will consist of:
+
+- A ``_clusters.csv`` file, which gives the strain (cluster) for each sample in the database.
+- ``_fit.npz`` and ``_fit.pkl`` files, which contain numeric data and metadata for the fit.
+- A ``_graph.gt`` file, which is the network defining the fit in graph-tool format.
+- Some plots of the fit, which depend on the specific model used.
+- A ``.refs`` file, which lists the samples kept as 'references' for assigning
+ future samples (see :doc:`model_distribution` for more details).
+
+This page will use 128 *Listeria*\ |nbsp| \ *monocytogenes* genomes from `Kremer et al `__,
+which can be downloaded from `figshare `__. The distribution of
+core and accessory distances from the ``--create-db`` step is as follows:
+
+.. image:: images/listeria_dists.png
+ :alt: Core and accessory distances for the example data
+ :align: center
+
+We also show some examples with 616 *Streptococcus*\ |nbsp| \ *pneumoniae* genomes, which are more complex.
+These genomes were collected from Massachusetts,
+first reported `here `__ and can be accessed
+`here `__.
+
+Common arguments
+----------------
+- ``--ref-db``: the output prefix used with ``--create-db`` i.e. the directory where the .h5 file is located
+- ``--output``: where to save the model. If not specified this defaults to ``ref-db``.
+- ``--overwrite``: overwrite any existing files in the output directory.
+- ``--external-clustering``: any additional labels to add to the cluster output.
+- ``--graph-weights``: save the edges weights in the network as their Euclidean core-accessory
+ distances, rather than as 0 or 1 (useful for visualising the network).
+
+External clusters may be other cluster names, such as serotype, sequence type, cgMLST etc.
+PopPUNK clusters are mapped as one-to-many, so that each strain is labelled with all of
+the clusters any of its members is assigned to in this file. This input file must
+be comma separated, one sample per line, with the sample name as the first column, and
+other clusters as subsequent columns. A header line with 'sample' and the names of other cluster
+types is required. Output is to ``output/output_external_clusters.csv``.
+
+How good is my fit?
+-------------------
+We have found the best way to assess this is to use :doc:`visualisation` on your output
+and look at your assigned clusters against a tree, to determine whether they have
+the specificity required.
+
+You can also compare models with their network score, and
+whether the output plots look as expected. Typically the key thing is that
+**your spatial component nearest the origin is accurate**. More detail is given for each model below.
+
+Interpreting the network summary
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+All fits will output a network summary which looks similar to this::
+
+ Network summary:
+ Components 31
+ Density 0.0897
+ Transitivity 1.0000
+ Score 0.9103
+
+- Components are the number of strains (clusters) found using this model.
+- Density is the proportion of distances assigned as 'within-strain'. Generally
+ smaller is better as this gives more specific clusters, but too close to zero
+ may be an over-specific model.
+- Transitivity measures whether every member of each strain is connected to every
+ other member. Closer to 1 is better, but this can be achieved with very loose fits.
+- Score synthesises the above as :math:`(1 - \mathrm{density}) * \mathrm{transitivity}`,
+ which gives a single number between 0 (bad) and 1 (good) which in many cases is
+ at a maximum when it accurately describes strains in the data.
+
+.. _bgmm:
+
+bgmm
+----
+This mode fits a `Bayesian Gaussian mixture model `__
+to the core and accessory distances. With few points, methods such as DBSCAN may struggle to find
+clusters due to the sparsity, whereas a BGMM can often find a good fit. A further advantage
+is that the equation for the posterior is known, so all points will have an assignment and a non-linear
+boundary found exactly.
+
+However, when there are a very large number of points the likelihood has a tendency
+to totally override the prior in the estimated posterior, meaning many overlapping components
+may be fitted, which may give poor clusters, and is less robust to adding more data. It is possible
+for this mode to fail to converge, but it is more likely to produce a bad fit in difficult cases.
+
+The key parameter to specify is the maximum number of components ``--K``. You should
+choose a number based on the number of components you can see on your distance plot. This
+may be automatically reduced if there is insufficient evidence for this many components. As a rule of thumb,
+if you have under 150 samples or under 1000 samples and clear components then this mode should give
+a good fit.
+
+A better network score is evidence of a better fit, but the output files should also be used to
+judge this. With the test dataset, four components are visible::
+
+ poppunk --fit-model bgmm --ref-db listeria --K 4
+ PopPUNK (POPulation Partitioning Using Nucleotide Kmers)
+ (with backend: sketchlib v1.6.0
+ sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so)
+
+ Graph-tools OpenMP parallelisation enabled: with 1 threads
+ Mode: Fitting bgmm model to reference database
+
+ Fit summary:
+ Avg. entropy of assignment 0.0042
+ Number of components used 4
+
+ Scaled component means:
+ [0.9415286 0.90320047]
+ [0.11542755 0.24570244]
+ [0.20966101 0.37694884]
+ [0.00527421 0.07043826]
+
+ Network summary:
+ Components 31
+ Density 0.0897
+ Transitivity 1.0000
+ Score 0.9103
+ Removing 97 sequences
+
+ Done
+
+In the output to the terminal:
+
+- The average entropy of assignment is a measure of the certainty of assignment
+ of each point. Lower is better. Higher values may indicate overlapping components,
+ perhaps due to high amounts of recombination between strains.
+- Number of components used is how many components from ``K`` were actually used
+ in the spatial fit. This is usually equal to ``K``, but may be reduced in small datasets.
+- Scaled component means are the centres of the fitted components in the model, where
+ the core and accessory distances have been rescaled between 0 and 1. These can be
+ used with :ref:`manual-start`.
+
+The fit actually just uses the component closest to the origin -- any distances
+assigned to this component are within-strain. This is the most important part of the
+fit in this mode.
+
+You can see that this gives a good network score, and fits the data well:
+
+.. image:: images/bgmm_k4_fit.png
+ :alt: BGMM fit with K = 4
+ :align: center
+
+The position of the boundary is also produced (in red), along with contours of
+the fitted mixture components:
+
+.. image:: images/bgmm_k4_boundary.png
+ :alt: BGMM fit with K = 4
+ :align: center
+
+If you make K too low, some components will be merged, resulting in a less-specific
+fit with fewer clusters, that do not fully delineate all of the strains (in this
+case just finding the two main lineages of *Listeria* in this data)::
+
+ poppunk --fit-model bgmm --ref-db listeria --K 2
+ PopPUNK (POPulation Partitioning Using Nucleotide Kmers)
+ (with backend: sketchlib v1.6.0
+ sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so)
+
+ Graph-tools OpenMP parallelisation enabled: with 1 threads
+ Mode: Fitting bgmm model to reference database
+
+ Fit summary:
+ Avg. entropy of assignment 0.0007
+ Number of components used 2
+
+ Scaled component means:
+ [0.11627304 0.2432584 ]
+ [0.9415286 0.90320047]
+
+ Network summary:
+ Components 2
+ Density 0.5405
+ Transitivity 1.0000
+ Score 0.4595
+ Removing 126 sequences
+
+ Done
+
+.. image:: images/bgmm_k2_fit.png
+ :alt: BGMM fit with K = 2
+ :align: center
+
+Too many components in a small dataset are automatically reduced to an
+appropriate number, obtaining the same good fit as above::
+
+ poppunk --fit-model bgmm --ref-db listeria --K 10
+ PopPUNK (POPulation Partitioning Using Nucleotide Kmers)
+ (with backend: sketchlib v1.6.0
+ sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so)
+
+ Graph-tools OpenMP parallelisation enabled: with 1 threads
+ Mode: Fitting bgmm model to reference database
+
+ Fit summary:
+ Avg. entropy of assignment 0.3195
+ Number of components used 4
+
+ Scaled component means:
+ [0.9415286 0.90320047]
+ [3.72458739e-07 4.73196248e-07]
+ [0.00527421 0.07043826]
+ [0.20966682 0.37695524]
+ [0.11542849 0.2457043 ]
+ [1.68940242e-11 2.14632815e-11]
+ [7.66987488e-16 9.74431443e-16]
+ [3.48211781e-20 4.42391191e-20]
+ [1.58087904e-24 2.00845290e-24]
+ [7.17717973e-29 9.11836205e-29]
+
+ Network summary:
+ Components 31
+ Density 0.0897
+ Transitivity 1.0000
+ Score 0.9103
+ Removing 97 sequences
+
+ Done
+
+In a dataset with more points, and less clear components, too many components can lead to
+a bad fit:
+
+.. image:: images/bgmm_fit_K10.png
+ :alt: BGMM fit with K = 10
+ :align: center
+
+This is clearly a poor fit. The real issue is that the component whose mean is nearest
+the origin is unclear, and doesn't include all of the smallest distances.
+
+.. _dbscan:
+
+dbscan
+------
+This mode uses `HDBSCAN `__ to find clusters
+in the core and accessory distances. This is a versatile clustering algorithm capable of
+finding non-linear structure in the data, and can represent irregularly shaped components
+well. Possible drawbacks are that a fit cannot always be found (this can happen
+for small datasets with sparse points, or for datasets without much structure in the core
+and accessory), and that some points are classified as 'noise' so not all of their
+edges are included in the network (these are the small black points).
+
+dbscan usually needs little modification to run::
+
+ poppunk --fit-model dbscan --ref-db listeria
+ PopPUNK (POPulation Partitioning Using Nucleotide Kmers)
+ (with backend: sketchlib v1.6.0
+ sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so)
+
+ Graph-tools OpenMP parallelisation enabled: with 1 threads
+ Mode: Fitting dbscan model to reference database
+
+ Fit summary:
+ Number of clusters 5
+ Number of datapoints 8128
+ Number of assignments 7804
+
+ Scaled component means
+ [0.94155383 0.90322459]
+ [0.00527493 0.07044794]
+ [0.20945986 0.37491995]
+ [0.12876077 0.34294888]
+ [0.11413982 0.24224743]
+
+ Network summary:
+ Components 31
+ Density 0.0897
+ Transitivity 1.0000
+ Score 0.9103
+ Removing 97 sequences
+
+ Done
+
+In the output to the terminal:
+
+- The number of clusters is the number of spatial components found in the data.
+- Number of datapoints is the number of points used (all-vs-all distances), which
+ may have been subsampled from the maximum.
+- Number of assignments is the number of points assigned to one of the spatial components,
+ so excluding noise points.
+- Scaled component means are the centres of the fitted components in the model, where
+ the core and accessory distances have been rescaled between 0 and 1. These can be
+ used with :ref:`manual-start`.
+
+The fit actually just uses the component closest to the origin -- any distances
+assigned to this component are within-strain. This is the most important part of the
+fit in this mode. In this case the identification of this component is identical to the bgmm
+fit, so they produce the same strains. Note there is a small yellow cluster which is poorly
+defined, but as it does not impact the within-strain cluster the fit is unaffected:
+
+.. image:: images/dbscan_fit.png
+ :alt: DBSCAN fit
+ :align: center
+
+You can alter the fit with ``--D``, which sets a maximum number of clusters, and
+``--min-cluster-prop`` which sets the minimum number of points a cluster can have (as
+a proportion of 'Number of datapoints'). If the means of both of the core and accessory are not
+strictly increasing between the within-strain and next further component, the clustering
+fails. In this case the minimum number of samples per cluster is halved, and the fit is
+tried again. If this goes below ten, no fit can be found.
+
+Increasing ``--min-cluster-prop`` or decreasing ``--D`` gets rid of the errant cluster above::
+
+ poppunk --fit-model dbscan --ref-db listeria --min-cluster-prop 0.01
+ PopPUNK (POPulation Partitioning Using Nucleotide Kmers)
+ (with backend: sketchlib v1.6.0
+ sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so)
+
+ Graph-tools OpenMP parallelisation enabled: with 1 threads
+ Mode: Fitting dbscan model to reference database
+
+ Fit summary:
+ Number of clusters 4
+ Number of datapoints 8128
+ Number of assignments 7805
+
+ Scaled component means
+ [0.94155383 0.90322459]
+ [0.00522549 0.06876396]
+ [0.11515678 0.24488282]
+ [0.21152104 0.37635505]
+
+ Network summary:
+ Components 31
+ Density 0.0886
+ Transitivity 0.9953
+ Score 0.9071
+ Removing 95 sequences
+
+ Done
+
+But note that a few more noise points are generated, and fewer samples are removed
+when pruning cliques:
+
+.. image:: images/dbscan_fit_min_prop.png
+ :alt: DBSCAN fit increasing assignments per cluster
+ :align: center
+
+Setting ``--min-cluster-prop`` too high or ``--D`` too low can cause the fit to fail::
+
+ poppunk --fit-model dbscan --ref-db listeria --min-cluster-prop 0.05
+ PopPUNK (POPulation Partitioning Using Nucleotide Kmers)
+ (with backend: sketchlib v1.6.0
+ sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so)
+
+ Graph-tools OpenMP parallelisation enabled: with 1 threads
+ Mode: Fitting dbscan model to reference database
+
+ Failed to find distinct clusters in this dataset
+
+refine
+------
+Model refinement is slightly different: it takes a model already fitted by :ref:`bgmm`
+or :ref:`dbscan` and tries to improve it by optimising the network score. This starts
+with a parallelised global optimisation step, followed by a serial local optimisation
+step (which can be turned off with ``--no-local``). Use of multiple ``--cpus`` is
+effective for these model fits.
+
+Briefly:
+
+* A line between the within- and between-strain means is constructed
+* The point on this line where samples go from being assigned as within-strain to between-strain is used as the starting point
+* A line normal to the first line, passing through this point is constructed. The triangle formed by this line and the x- and y-axes is now the decision boundary. Points inside this triangle are within-strain.
+* The starting point is shifted by a distance along the first line, and a new decision boundary formed in the same way. The network is reconstructed.
+* The shift of the starting point is optimised, as judged by the network score. First globally by a grid search, then locally near the global optimum.
+
+Applying this to the *Listeria* DBSCAN fit (noting that you may specify a separate
+directory to load the model from with ``--model-dir``, if multiple model fits are available)::
+
+ poppunk --fit-model refine --ref-db listeria --model-dir dbscan
+ PopPUNK (POPulation Partitioning Using Nucleotide Kmers)
+ (with backend: sketchlib v1.6.0
+ sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so)
+
+ Graph-tools OpenMP parallelisation enabled: with 1 threads
+ Mode: Fitting refine model to reference database
+
+ Loading DBSCAN model
+ Loaded previous model of type: dbscan
+ Initial model-based network construction based on DBSCAN fit
+ Initial boundary based network construction
+ Decision boundary starts at (0.63,0.62)
+ Trying to optimise score globally
+ Trying to optimise score locally
+
+ Optimization terminated successfully;
+ The returned value satisfies the termination criteria
+ (using xtol = 1e-05 )
+ Network summary:
+ Components 29
+ Density 0.0897
+ Transitivity 0.9984
+ Score 0.9088
+ Removing 93 sequences
+
+ Done
+
+As this model was already well fitted, this doesn't change much, and finds very similar
+clusters (though noise points are eliminated):
+
+.. image:: images/listeria_refined.png
+ :alt: A refine fit on Listeria
+ :align: center
+
+The default is to search along the entire range between the within- and between-strain clusters,
+but sometimes this can include undesired optima, particularly near the origin. To exclude these,
+use ``--pos-shift`` to alter the distance between the end of the search range and the origin
+and ``--neg-shift`` for the start of the search range.
+
+This mode is more useful in species with a relatively high recombination rate, where the distinction between
+the within- and between-strain distributions may be blurred in core and
+accessory space. This does not give the mixture model enough information to
+draw a good boundary as the likelihood is very flat in this region:
+
+.. image:: images/pneumo_unrefined.png
+ :alt: A bad DPGMM fit
+ :align: center
+
+Although the score of this fit looks ok (0.904), inspection of the network and
+microreact reveals that it is too liberal and clusters have been merged. This
+is because some of the blur between the origin and the central distribution has
+been included, and connected clusters together erroneously.
+
+The likelihood of the model fit and the decision boundary looks like this:
+
+.. image:: images/pneumo_likelihood.png
+ :alt: The likelihood and decision boundary of the above fit
+ :align: center
+
+Using the core and accessory distributions alone does not give much information
+about exactly where to put the boundary, and the only way to fix this would be
+by specifying strong priors on the weights of the distributions. Fortunately
+the network properties give information in the region, and we can use
+``--fit-model refine`` to tweak the existing fit and pick a better boundary.
+
+Here is the refined fit, which has a score of 0.939, and 62 rather than 32
+components:
+
+.. image:: images/pneumo_refined.png
+ :alt: The refined fit
+ :align: center
+
+Which, looking at the `microreact output `__, is much better:
+
+.. image:: images/refined_microreact.png
+ :alt: The refined fit, in microreact
+ :align: center
+
+.. _manual-start:
+
+Using fit refinement when mixture model totally fails
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+If the mixture model does not give any sort of reasonable fit to the points,
+you can manually provide a file with ``--manual-start`` to give the starting parameters to
+``--fit-model refine``. The format of this file is as follows::
+
+ mean0 0,0
+ mean1 0.5,0.6
+ start_point 0.3
+
+A key, followed by its value (space separated).
+
+``mean0`` and ``mean1`` define the points (x,y) to draw the line between, and
+``start_point`` is the distance along this line to draw the initial boundary
+(which is normal to the line). These define the three red points (and therefore the
+search range) in the output plot.
+
+.. _indiv-refine:
+
+Using core/accessory only
+^^^^^^^^^^^^^^^^^^^^^^^^^
+In some cases, such as analysis within a lineage, it may be desirable to use
+only core or accessory distances to classify further queries. This can be
+achieved by adding the ``--indiv-refine both`` option, which will allow these boundaries to be
+placed independently, allowing the best fit in each case::
+
+ poppunk --fit-model refine --ref-db listeria --model-dir dbscan --indiv-refine both
+ PopPUNK (POPulation Partitioning Using Nucleotide Kmers)
+ (with backend: sketchlib v1.6.0
+ sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so)
+
+ Graph-tools OpenMP parallelisation enabled: with 1 threads
+ Mode: Fitting refine model to reference database
+
+ Loading DBSCAN model
+ Loaded previous model of type: dbscan
+ Initial model-based network construction based on DBSCAN fit
+ Initial boundary based network construction
+ Decision boundary starts at (0.63,0.62)
+ Trying to optimise score globally
+ Trying to optimise score locally
+
+ Optimization terminated successfully;
+ The returned value satisfies the termination criteria
+ (using xtol = 1e-05 )
+ Refining core and accessory separately
+ Initial boundary based network construction
+ Decision boundary starts at (0.63,0.62)
+ Trying to optimise score globally
+ Trying to optimise score locally
+
+ Optimization terminated successfully;
+ The returned value satisfies the termination criteria
+ (using xtol = 1e-05 )
+ Initial boundary based network construction
+ Decision boundary starts at (0.63,0.62)
+ Trying to optimise score globally
+ Trying to optimise score locally
+
+ Optimization terminated successfully;
+ The returned value satisfies the termination criteria
+ (using xtol = 1e-05 )
+ Network summary:
+ Components 29
+ Density 0.0897
+ Transitivity 0.9984
+ Score 0.9088
+ Network summary:
+ Components 31
+ Density 0.0897
+ Transitivity 1.0000
+ Score 0.9103
+ Network summary:
+ Components 31
+ Density 0.0808
+ Transitivity 0.9862
+ Score 0.9064
+ Removing 93 sequences
+
+ Done
+
+There are three different networks, and the core and accessory boundaries will
+also be shown on the _refined_fit.png plot as dashed gray lines:
+
+.. image:: images/indiv_refine.png
+ :alt: Refining fit with core and accessory individuals independently
+ :align: center
+
+To use one of these for your saved model, rerun, but instead setting
+``--indiv-refine core`` or ``--indiv-refine accessory``.
+
+threshold
+---------
+In this mode no model is fitted. You provide the threshold at which the boundary between within- and
+between-strain distances is drawn. This can be useful if ``refine`` cannot find a boundary
+due to a poorly performing network score, but one can clearly be seen from the plot.
+It may also be useful to compare with other fits from related species where a boundary
+has been identified using one of the fitting procedures.
+
+Currently only a core-distance boundary is supported (if you would like an accessory or
+combined mode available, please `raise an issue `__).
+Provide the cutoff with ``--threshold``::
+
+ poppunk --fit-model threshold --ref-db listeria --threshold 0.003
+ PopPUNK (POPulation Partitioning Using Nucleotide Kmers)
+ (with backend: sketchlib v1.6.0
+ sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so)
+
+ Graph-tools OpenMP parallelisation enabled: with 1 threads
+ Mode: Fitting threshold model to reference database
+
+ Network summary:
+ Components 31
+ Density 0.0897
+ Transitivity 1.0000
+ Score 0.9103
+ Removing 97 sequences
+
+ Done
+
+.. image:: images/listeria_threshold.png
+ :alt: A threshold fit on Listeria
+ :align: center
+
+.. _lineage-fit:
+
+lineage
+-------
+This mode defines clusters by joining nearest neighbours. As this will typically
+define subclusters within strains, we refer to these as 'lineages'. This can be used
+to find subclusters in addition to one of the above models, or for species without
+strain-structure (e.g. some viruses, *Neisseria gonorrhoeae*, *Mycobacterium tuberculosis*).
+This is the highest resolution (most specific clusters) provided directly by PopPUNK. If it does
+not meet your needs, take a look at :doc:`subclustering` for other options.
+
+A model is not fitted, and a simple data-driven heuristic is used. For each sample, the
+nearest :math:`k` neighbours will be identified, and joined in the network. Connected components
+of the network define lineages, as in the other models. Only core distances are used (add ``--use-accessory`` to modify this),
+and in the case of ties all distances are included. Note that these are not necessarily
+expected to be transitive, so network scores are not as informative of the optimum.
+
+We refer to :math:`k` as the 'rank' of the model. Typically you won't know which rank
+to use beforehand, so you can provide multiple integer values to the ``--ranks`` option, comma separated.
+Clusters from all ranks will be output, and all used with :doc:`query_assignment`. :math:`k = 1` is the
+most specific rank, and higher values will form looser clusters. With the *Listeria* example::
+
+ poppunk --fit-model lineage --ref-db listeria --ranks 1,2,3,5
+ PopPUNK (POPulation Partitioning Using Nucleotide Kmers)
+ (with backend: sketchlib v1.6.0
+ sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so)
+
+ Graph-tools OpenMP parallelisation enabled: with 1 threads
+ Mode: Fitting lineage model to reference database
+
+ Network for rank 1
+ Network summary:
+ Components 26
+ Density 0.0271
+ Transitivity 0.1834
+ Score 0.1785
+ Network for rank 2
+ Network summary:
+ Components 12
+ Density 0.0428
+ Transitivity 0.3528
+ Score 0.3377
+ Network for rank 3
+ Network summary:
+ Components 6
+ Density 0.0589
+ Transitivity 0.4191
+ Score 0.3944
+ Network for rank 5
+ Network summary:
+ Components 2
+ Density 0.0904
+ Transitivity 0.5319
+ Score 0.4838
+ Parsed data, now writing to CSV
+
+ Done
+
+This has produced four fits, with ranks 1, 2, 3 and 5 (with fit information contained in
+the .pkl file, and a .npz file for each rank). The _clusters.csv will contain the clusters
+from the lowest rank. The _lineages.csv file contains all of the assignments, and a column
+with all of the ranks hyphen-separated (which will give clusters identical to the lowest rank)::
+
+ id,Rank_1_Lineage,Rank_2_Lineage,Rank_3_Lineage,Rank_5_Lineage,overall_Lineage
+ 12673_8#24,18,2,2,1,18-2-2-1
+ 12673_8#26,4,2,2,1,4-2-2-1
+ 12673_8#27,26,1,1,1,26-1-1-1
+ 12673_8#28,1,1,1,1,1-1-1-1
+ 12673_8#29,4,2,2,1,4-2-2-1
+ 12673_8#31,18,2,2,1,18-2-2-1
+ 12673_8#32,9,8,1,1,9-8-1-1
+ 12673_8#34,7,7,1,1,7-7-1-1
+ 12673_8#36,1,1,1,1,1-1-1-1
+
+The best way to assess the ranks is by visualising them (:doc:`visualisation`)::
+
+ poppunk_visualise --distances listeria/listeria.dists --ref-db listeria --microreact
+
+ Graph-tools OpenMP parallelisation enabled: with 1 threads
+ PopPUNK: visualise
+ Loading previously lineage cluster model
+ Writing microreact output
+ Parsed data, now writing to CSV
+ Building phylogeny
+ Running t-SNE
+
+ Done
+
+This can be loaded in microreact: https://microreact.org/project/dVNMftmK6VXRvDxBfrH2y.
+Rank 1 has the smallest clusters:
+
+.. image:: images/listeria_lineage_rank_1.png
+ :alt: Rank 1 lineage fit for Listeria
+ :align: center
+
+Rank 3 has larger clusters. Some of these clusters are polyphyletic on the core neighbour-joining
+tree:
+
+.. image:: images/listeria_lineage_rank_3.png
+ :alt: Rank 3 lineage fit for Listeria
+ :align: center
+
+At the model fit stage, you will also get histograms which show the distances included
+in the network, a useful comparison with the original distance distribution and between ranks:
+
+.. list-table::
+
+ * - .. figure:: images/listeria_lineage_rank_1_histogram.png
+
+ Rank 1
+
+ - .. figure:: images/listeria_lineage_rank_3_histogram.png
+
+ Rank 3
+
+Use an existing model with new data
+-----------------------------------
+
+There is also one further mode, ``--use-model``, which may be useful in limited circumstances. This
+applies any of the above models to a new dataset without refitting it. This may be useful if a reference
+dataset has changed (been added to or removed from) and you do not wish to refit the model, for example
+because it is already in use. However, typically you would use :doc:`query_assignment` with ``--update-db``
+to add to a model::
+
+ poppunk --use-model --ref-db new_db --model-dir old_db
+ PopPUNK (POPulation Partitioning Using Nucleotide Kmers)
+ (with backend: sketchlib v1.6.0
+ sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so)
+
+ Graph-tools OpenMP parallelisation enabled: with 1 threads
+ Mode: Using previous model with a reference database
+
+ Loading BGMM 2D Gaussian model
+ Loaded previous model of type: bgmm
+ Network summary:
+ Components 31
+ Density 0.0897
+ Transitivity 1.0000
+ Score 0.9103
+ Removing 97 sequences
+
+ Done
diff --git a/docs/online.rst b/docs/online.rst
new file mode 100644
index 00000000..72c3c8b0
--- /dev/null
+++ b/docs/online.rst
@@ -0,0 +1,103 @@
+PopPUNK-web
+=================
+This is the newest feature in the PopPUNK pipeline, available at https://web.poppunk.net/.
+PopPUNK-web has been designed for non-specialists and allows you to easily analyse and assign lineages to your query genomic sequences,
+using pre-optimised species-specific databases and default parameters.
+
+.. contents::
+ :local:
+
+How it works
+------------
+PopPUNK-web uses C++ code compiled to WebAssembly to sketch query sequences client side, then post this sketch to our Python backend running PopPUNK.
+This means your raw genomic sequences remain anonymous and secure as they never actually leave your machine.
+
+Using PopPUNK-web
+-----------------------
+PopPUNK-web allows you to upload a single isolate's genomic sequence through our user-friendly drag and drop interface (shown below).
+After sketching and lineage assignment, information is returned on the species and lineage your query has been assigned to, relative to the genomic sequences in our pre-built databases.
+We then use this assignment to generate and return a series of visualisations that provide additional information on the relationship of your sequence to those in our databases.
+Currently, PopPUNK-web only supports *S. pneumoniae* sequences but we hope to expand this in the near future.
+
+.. image:: images/web_home.png
+ :alt: PopPUNK-web homepage
+ :align: center
+
+Outputs
+-------
+The interactive outputs of PopPUNK-web are conveniently located in a single results page.
+You can navigate through the different outputs by using the tabs at the top of the results page.
+
+Sequence metrics
+^^^^^^^^^^^^^^^^
+This tab displays the species and lineage assigned to your query sequence, in addition to the prevalence of the lineage in our databases and information regarding the quality of your uploaded sequence.
+Sequence quality metrics indicate the length of your sequence in base pairs, the number of Ns and the frequencies of each base within the sequence.
+The frequency of A bases should be approximately equal to that of T bases, and the frequency of C bases approximately equal to that of G bases.
+
+.. image:: images/web_stats.png
+ :alt: PopPUNK-web sequence metrics
+ :align: center
+
+Cluster prevalences
+^^^^^^^^^^^^^^^^^^^
+PopPUNK-web uses the plotly.js package (https://plotly.com) to plot the prevalences of all clusters in our species-specific database, including the assigned query sequence.
+Cluster IDs are shown across the x-axis and the corresponding prevalence (%) in our database shown on the y-axis.
+Bars are organised by decreasing cluster prevalence.
+To visualise subsections of this plot, use your scroll wheel or double click an area to zoom in. Double click again to zoom back out.
+
+.. image:: images/web_prevs.png
+ :alt: PopPUNK-web prevalences
+ :align: center
+
+The bar corresponding to the assigned cluster of your query sequence is highlighted in orange.
+
+.. image:: images/web_prevs_zoomed.png
+ :alt: PopPUNK-web zoomed prevalences
+ :align: center
+
+Population phylogeny
+^^^^^^^^^^^^^^^^^^^^
+We make use of the Microreact REST API (https://microreact.org/showcase) to display a phylogeny indicating the relationships of all isolates in the assigned species database.
+By default, isolates are coloured based on their assigned cluster in our database.
+
+.. image:: images/web_micro.png
+ :alt: PopPUNK-web population phylogeny
+ :align: center
+
+This phylogeny does not include your query isolate; however, the cluster your query has been assigned to can be highlighted by selecting "Highlight_query" instead of "Cluster" (as shown below).
+
+.. image:: images/web_micro_change.png
+ :alt: PopPUNK-web population phylogeny change
+ :align: center
+
+.. image:: images/web_micro_assigned.png
+ :alt: PopPUNK-web population phylogeny assigned
+ :align: center
+
+Cluster phylogeny
+^^^^^^^^^^^^^^^^^
+We use Phylocanvas (http://phylocanvas.org) to display a phylogeny of your query sequence and its relationships to isolates assigned to the same cluster.
+Your query sequence is highlighted in red as shown below.
+
+.. image:: images/web_phylo.png
+ :alt: PopPUNK-web cluster phylogeny
+ :align: center
+
+Cluster network
+^^^^^^^^^^^^^^^
+Cytoscape.js (https://js.cytoscape.org) is used to display a network representing the cluster assigned to your query sequence.
+Your query sequence is highlighted in orange as shown below.
+Edge lengths are currently arbitrary and do not represent evolutionary distances.
+
+.. image:: images/web_cyto.png
+ :alt: PopPUNK-web network
+ :align: center
+
+New features
+------------
+Do you have a feature you would like to see in PopPUNK-web? File an issue on the PopPUNK-web GitHub repository (https://github.com/johnlees/PopPUNK-web/issues).
+
+Issues/Bugs
+-----------
+As PopPUNK-web is a new feature, it is possible there are bugs or issues we have not come across yet.
+If you identify a problem with PopPUNK-web, please file an issue on the PopPUNK-web GitHub repository (https://github.com/johnlees/PopPUNK-web/issues).
diff --git a/docs/options.rst b/docs/options.rst
index 78ca1e69..cac51239 100644
--- a/docs/options.rst
+++ b/docs/options.rst
@@ -1,131 +1,317 @@
Options
=======
+**Contents**:
+
+.. contents::
+ :local:
+
+poppunk
+-------
+
Usage::
- usage: PopPUNK [-h]
- (--easy-run | --create-db | --fit-model | --refine-model | --assign-query | --use-model | --generate-viz)
- [--ref-db REF_DB] [--r-files R_FILES] [--q-files Q_FILES]
+ poppunk [-h]
+ (--create-db | --fit-model {bgmm,dbscan,refine,lineage,threshold} | --use-model)
+ [--ref-db REF_DB] [--r-files R_FILES]
[--distances DISTANCES]
- [--external-clustering EXTERNAL_CLUSTERING] --output OUTPUT
- [--plot-fit PLOT_FIT] [--full-db] [--update-db] [--overwrite]
- [--min-k MIN_K] [--max-k MAX_K] [--k-step K_STEP]
- [--sketch-size SKETCH_SIZE] [--max-a-dist MAX_A_DIST]
- [--ignore-length] [--K K] [--dbscan] [--D D]
- [--min-cluster-prop MIN_CLUSTER_PROP] [--pos-shift POS_SHIFT]
+ [--external-clustering EXTERNAL_CLUSTERING]
+ [--output OUTPUT] [--plot-fit PLOT_FIT] [--overwrite]
+ [--graph-weights] [--min-k MIN_K] [--max-k MAX_K]
+ [--k-step K_STEP] [--sketch-size SKETCH_SIZE]
+ [--codon-phased] [--min-kmer-count MIN_KMER_COUNT]
+ [--exact-count] [--strand-preserved]
+ [--qc-filter {stop,prune,continue}] [--retain-failures]
+ [--max-a-dist MAX_A_DIST] [--length-sigma LENGTH_SIGMA]
+ [--length-range LENGTH_RANGE LENGTH_RANGE]
+ [--prop-n PROP_N] [--upper-n UPPER_N] [--K K] [--D D]
+ [--min-cluster-prop MIN_CLUSTER_PROP]
+ [--threshold THRESHOLD] [--pos-shift POS_SHIFT]
[--neg-shift NEG_SHIFT] [--manual-start MANUAL_START]
[--indiv-refine] [--no-local] [--model-dir MODEL_DIR]
- [--previous-clustering PREVIOUS_CLUSTERING] [--core-only]
- [--accessory-only] [--subset SUBSET] [--microreact]
- [--cytoscape] [--phandango] [--grapetree] [--rapidnj RAPIDNJ]
- [--perplexity PERPLEXITY] [--info-csv INFO_CSV] [--mash MASH]
- [--threads THREADS] [--no-stream] [--version]
-
-Command line options
-
- optional arguments:
- -h, --help show this help message and exit
-
- Mode of operation:
- --easy-run Create clusters from assemblies with default settings
- --create-db Create pairwise distances database between reference
- sequences
- --fit-model Fit a mixture model to a reference database
- --refine-model Refine the accuracy of a fitted model
- --assign-query Assign the cluster of query sequences without re-
- running the whole mixture model
- --generate-viz Generate files for a visualisation from an existing
- database
- --use-model Apply a fitted model to a reference database to
- restore database files
-
-
- Input files:
- --ref-db REF_DB Location of built reference database
- --r-files R_FILES File listing reference input assemblies
- --q-files Q_FILES File listing query input assemblies
- --distances DISTANCES
- Prefix of input pickle of pre-calculated distances
- --external-clustering EXTERNAL_CLUSTERING
- File with cluster definitions or other labels
- generated with any other method.
-
- Output options:
- --output OUTPUT Prefix for output files (required)
- --plot-fit PLOT_FIT Create this many plots of some fits relating k-mer to
- core/accessory distances [default = 0]
- --full-db Keep full reference database, not just representatives
- --update-db Update reference database with query sequences
- --overwrite Overwrite any existing database files
-
- Kmer comparison options:
- --min-k MIN_K Minimum kmer length [default = 9]
- --max-k MAX_K Maximum kmer length [default = 29]
- --k-step K_STEP K-mer step size [default = 4]
- --sketch-size SKETCH_SIZE
- Kmer sketch size [default = 10000]
-
- Quality control options:
- --max-a-dist MAX_A_DIST
- Maximum accessory distance to permit [default = 0.5]
- --ignore-length Ignore outliers in terms of assembly length [default =
- False]
-
- Model fit options:
- --K K Maximum number of mixture components [default = 2]
- --dbscan Use DBSCAN rather than mixture model
- --D D Maximum number of clusters in DBSCAN fitting [default
- = 100]
- --min-cluster-prop MIN_CLUSTER_PROP
- Minimum proportion of points in a cluster in DBSCAN
- fitting [default = 0.0001]
-
- Refine model options:
- --pos-shift POS_SHIFT
- Maximum amount to move the boundary away from origin
- [default = 0.2]
- --neg-shift NEG_SHIFT
- Maximum amount to move the boundary towards the origin
- [default = 0.4]
- --manual-start MANUAL_START
- A file containing information for a start point. See
- documentation for help.
- --indiv-refine Also run refinement for core and accessory
- individually
- --no-local Do not perform the local optimization step (speed up
- on very large datasets)
-
- Database querying options:
- --model-dir MODEL_DIR
- Directory containing model to use for assigning
- queries to clusters [default = reference database
- directory]
- --previous-clustering PREVIOUS_CLUSTERING
- Directory containing previous cluster definitions and
- network [default = use that in the directory
- containing the model]
- --core-only Use a core-distance only model for assigning queries
- [default = False]
- --accessory-only Use an accessory-distance only model for assigning
- queries [default = False]
-
- Further analysis options:
- --subset SUBSET File with list of sequences to include in
- visualisation (with --generate-viz only)
- --microreact Generate output files for microreact visualisation
- --cytoscape Generate network output files for Cytoscape
- --phandango Generate phylogeny and TSV for Phandango visualisation
- --grapetree Generate phylogeny and CSV for grapetree visualisation
- --rapidnj RAPIDNJ Path to rapidNJ binary to build NJ tree for Microreact
- --perplexity PERPLEXITY
- Perplexity used to calculate t-SNE projection (with
- --microreact) [default=20.0]
- --info-csv INFO_CSV Epidemiological information CSV formatted for
- microreact (can be used with other outputs)
-
- Other options:
- --mash MASH Location of mash executable
- --threads THREADS Number of threads to use [default = 1]
- --no-stream Use temporary files for mash dist interfacing. Reduce
- memory use/increase disk use for large datasets
- --version show program's version number and exit
+ [--ranks RANKS] [--use-accessory] [--threads THREADS]
+ [--gpu-sketch] [--gpu-dist] [--deviceid DEVICEID]
+ [--version]
+
+Command line options::
+
+ optional arguments:
+ -h, --help show this help message and exit
+
+ Mode of operation:
+ --create-db Create pairwise distances database between
+ reference sequences
+ --fit-model {bgmm,dbscan,refine,lineage,threshold}
+ Fit a mixture model to a reference database
+ --use-model Apply a fitted model to a reference database to
+ restore database files
+
+ Input files:
+ --ref-db REF_DB Location of built reference database
+ --r-files R_FILES File listing reference input assemblies
+ --distances DISTANCES
+ Prefix of input pickle of pre-calculated distances
+ --external-clustering EXTERNAL_CLUSTERING
+ File with cluster definitions or other labels
+ generated with any other method.
+
+ Output options:
+ --output OUTPUT Prefix for output files
+ --plot-fit PLOT_FIT Create this many plots of some fits relating k-mer
+ to core/accessory distances [default = 0]
+ --overwrite Overwrite any existing database files
+ --graph-weights Save within-strain Euclidean distances into the
+ graph
+
+ Create DB options:
+ --min-k MIN_K Minimum kmer length [default = 13]
+ --max-k MAX_K Maximum kmer length [default = 29]
+ --k-step K_STEP K-mer step size [default = 4]
+ --sketch-size SKETCH_SIZE
+ Kmer sketch size [default = 10000]
+ --codon-phased Used codon phased seeds X--X--X [default = False]
+ --min-kmer-count MIN_KMER_COUNT
+ Minimum k-mer count when using reads as input
+ [default = 0]
+ --exact-count Use the exact k-mer counter with reads [default =
+ use countmin counter]
+ --strand-preserved Treat input as being on the same strand, and
+ ignore reverse complement k-mers [default = use
+ canonical k-mers]
+
+ Quality control options:
+ --qc-filter {stop,prune,continue}
+ Behaviour following sequence QC step: "stop"
+ [default], "prune" (analyse data passing QC), or
+ "continue" (analyse all data)
+ --retain-failures Retain sketches of genomes that do not pass QC
+ filters in separate database [default = False]
+ --max-a-dist MAX_A_DIST
+ Maximum accessory distance to permit [default =
+ 0.5]
+ --length-sigma LENGTH_SIGMA
+ Number of standard deviations of length
+ distribution beyond which sequences will be
+ excluded [default = 5]
+ --length-range LENGTH_RANGE LENGTH_RANGE
+ Allowed length range, outside of which sequences
+ will be excluded [two values needed - lower and
+ upper bounds]
+ --prop-n PROP_N Threshold ambiguous base proportion above which
+ sequences will be excluded [default = 0.1]
+ --upper-n UPPER_N Threshold ambiguous base count above which
+ sequences will be excluded
+
+ Model fit options:
+ --K K Maximum number of mixture components [default = 2]
+ --D D Maximum number of clusters in DBSCAN fitting
+ [default = 100]
+ --min-cluster-prop MIN_CLUSTER_PROP
+ Minimum proportion of points in a cluster in
+ DBSCAN fitting [default = 0.0001]
+ --threshold THRESHOLD
+ Cutoff if using --fit-model threshold
+
+ Refine model options:
+ --pos-shift POS_SHIFT
+ Maximum amount to move the boundary away from
+ origin [default = to between-strain mean]
+ --neg-shift NEG_SHIFT
+ Maximum amount to move the boundary towards the
+ origin [default = to within-strain mean]
+ --manual-start MANUAL_START
+ A file containing information for a start point.
+ See documentation for help.
+ --indiv-refine {both,core,accessory}
+ Also run refinement for core and accessory
+ individually
+ --no-local Do not perform the local optimization step (speed
+ up on very large datasets)
+ --model-dir MODEL_DIR
+ Directory containing model to use for assigning
+ queries to clusters [default = reference database
+ directory]
+
+ Lineage analysis options:
+ --ranks RANKS Comma separated list of ranks used in lineage
+ clustering [default = 1,2,3]
+ --use-accessory Use accessory distances for lineage definitions
+ [default = use core distances]
+
+ Other options:
+ --threads THREADS Number of threads to use [default = 1]
+ --gpu-sketch Use a GPU when calculating sketches (read data
+ only) [default = False]
+ --gpu-dist Use a GPU when calculating distances [default =
+ False]
+ --deviceid DEVICEID CUDA device ID, if using GPU [default = 0]
+ --version show program's version number and exit
+
+poppunk_assign
+--------------
+
+Usage::
+
+ poppunk_assign [-h] --db DB --query QUERY [--distances DISTANCES]
+ [--external-clustering EXTERNAL_CLUSTERING] --output
+ OUTPUT [--plot-fit PLOT_FIT] [--write-references]
+ [--update-db] [--overwrite] [--graph-weights]
+ [--min-kmer-count MIN_KMER_COUNT] [--exact-count]
+ [--strand-preserved] [--max-a-dist MAX_A_DIST]
+ [--model-dir MODEL_DIR]
+ [--previous-clustering PREVIOUS_CLUSTERING]
+ [--core-only] [--accessory-only] [--threads THREADS]
+ [--gpu-sketch] [--gpu-dist] [--deviceid DEVICEID]
+ [--version]
+
+Command line options::
+
+ optional arguments:
+ -h, --help show this help message and exit
+
+ Input files:
+ --db DB Location of built reference database
+ --query QUERY File listing query input assemblies
+ --distances DISTANCES
+ Prefix of input pickle of pre-calculated distances
+ (if not in --db)
+ --external-clustering EXTERNAL_CLUSTERING
+ File with cluster definitions or other labels
+ generated with any other method.
+
+ Output options:
+ --output OUTPUT Prefix for output files (required)
+ --plot-fit PLOT_FIT Create this many plots of some fits relating k-mer
+ to core/accessory distances [default = 0]
+ --write-references Write reference database isolates' cluster
+ assignments out too
+ --update-db Update reference database with query sequences
+ --overwrite Overwrite any existing database files
+ --graph-weights Save within-strain Euclidean distances into the
+ graph
+
+ Kmer comparison options:
+ --min-kmer-count MIN_KMER_COUNT
+ Minimum k-mer count when using reads as input
+ [default = 0]
+ --exact-count Use the exact k-mer counter with reads [default =
+ use countmin counter]
+ --strand-preserved Treat input as being on the same strand, and
+ ignore reverse complement k-mers [default = use
+ canonical k-mers]
+
+ Quality control options:
+ --max-a-dist MAX_A_DIST
+ Maximum accessory distance to permit [default =
+ 0.5]
+
+ Database querying options:
+ --model-dir MODEL_DIR
+ Directory containing model to use for assigning
+ queries to clusters [default = reference database
+ directory]
+ --previous-clustering PREVIOUS_CLUSTERING
+ Directory containing previous cluster definitions
+ and network [default = use that in the directory
+ containing the model]
+ --core-only (with a 'refine' model) Use a core-distance only
+ model for assigning queries [default = False]
+ --accessory-only (with a 'refine' or 'lineage' model) Use an
+ accessory-distance only model for assigning
+ queries [default = False]
+
+ Other options:
+ --threads THREADS Number of threads to use [default = 1]
+ --gpu-sketch Use a GPU when calculating sketches (read data
+ only) [default = False]
+ --gpu-dist Use a GPU when calculating distances [default =
+ False]
+ --deviceid DEVICEID CUDA device ID, if using GPU [default = 0]
+ --version show program's version number and exit
+
+poppunk_visualise
+-----------------
+
+Usage::
+
+ poppunk_visualise [-h] --ref-db REF_DB [--query-db QUERY_DB]
+ [--distances DISTANCES]
+ [--include-files INCLUDE_FILES]
+ [--external-clustering EXTERNAL_CLUSTERING]
+ [--model-dir MODEL_DIR]
+ [--previous-clustering PREVIOUS_CLUSTERING]
+ [--previous-query-clustering PREVIOUS_QUERY_CLUSTERING]
+ --output OUTPUT [--overwrite] [--core-only]
+ [--accessory-only] [--microreact] [--cytoscape]
+ [--phandango] [--grapetree] [--rapidnj RAPIDNJ]
+ [--perplexity PERPLEXITY] [--info-csv INFO_CSV]
+ [--threads THREADS] [--gpu-dist]
+ [--deviceid DEVICEID] [--strand-preserved]
+ [--version]
+
+Command line options::
+
+ optional arguments:
+ -h, --help show this help message and exit
+
+ Input files:
+ --ref-db REF_DB Location of built reference database
+ --query-db QUERY_DB Location of query database, if distances are from
+ ref-query
+ --distances DISTANCES
+ Prefix of input pickle of pre-calculated distances
+ --include-files INCLUDE_FILES
+ File with list of sequences to include in
+ visualisation. Default is to use all sequences in
+ database.
+ --external-clustering EXTERNAL_CLUSTERING
+ File with cluster definitions or other labels
+ generated with any other method.
+ --model-dir MODEL_DIR
+ Directory containing model to use for assigning
+ queries to clusters [default = reference database
+ directory]
+ --previous-clustering PREVIOUS_CLUSTERING
+ Directory containing previous cluster definitions
+ and network [default = use that in the directory
+ containing the model]
+ --previous-query-clustering PREVIOUS_QUERY_CLUSTERING
+ Directory containing previous cluster definitions
+ from poppunk_assign [default = use that in the
+ directory containing the model]
+
+ Output options:
+ --output OUTPUT Prefix for output files (required)
+ --overwrite Overwrite any existing visualisation files
+
+ Database querying options:
+ --core-only (with a 'refine' model) Use a core-distance only
+ model for assigning queries [default = False]
+ --accessory-only (with a 'refine' or 'lineage' model) Use an
+ accessory-distance only model for assigning
+ queries [default = False]
+
+ Visualisation options:
+ --microreact Generate output files for microreact visualisation
+ --cytoscape Generate network output files for Cytoscape
+ --phandango Generate phylogeny and TSV for Phandango
+ visualisation
+ --grapetree Generate phylogeny and CSV for grapetree
+ visualisation
+ --rapidnj RAPIDNJ Path to rapidNJ binary to build NJ tree for
+ Microreact
+ --perplexity PERPLEXITY
+ Perplexity used to calculate t-SNE projection
+ (with --microreact) [default=20.0]
+ --info-csv INFO_CSV Epidemiological information CSV formatted for
+ microreact (can be used with other outputs)
+
+ Other options:
+ --threads THREADS Number of threads to use [default = 1]
+ --gpu-dist Use a GPU when calculating distances [default =
+ False]
+ --deviceid DEVICEID CUDA device ID, if using GPU [default = 0]
+ --strand-preserved If distances being calculated, treat strand as
+ known when calculating random match chances
+ [default = False]
+ --version show program's version number and exit
diff --git a/docs/qc.rst b/docs/qc.rst
new file mode 100644
index 00000000..600e9f7e
--- /dev/null
+++ b/docs/qc.rst
@@ -0,0 +1,110 @@
+Data quality control
+====================
+PopPUNK now comes with some basic quality control options which are applied by
+default when running ``--create-db``:
+
+- Outlying genome length (calculated during sketching, for assemblies or reads).
+- Too many 'N's.
+- Outlying accessory distance.
+
+Accessory distance is also used with ``poppunk_assign``.
+
+You can set the behaviour if any failing samples are found when creating the
+database by setting ``--qc-filter`` to one of the following three values:
+
+- ``stop`` (default) -- Do not proceed past the sketching step, and throw an error.
+- ``prune`` -- Remove failing samples from the sketch database before continuing.
+- ``continue`` -- Ignore failing samples and run anyway.
+
+In all cases a file will be written at ``qcreport.txt`` which lists the failing samples, and the
+reasons why they failed. If running with either prune or continue, you may also add ``--retain-failures``
+to write a separate sketch database with the failed samples.
+
+Random match chances in PopPUNK are only calculated and added to the database after the chosen
+QC step. If you use ``poppunk_sketch`` directly, they will be added without any automated QC.
+
+You can change the genome length cutoff with ``--length-sigma`` which sets the maximum number
+of standard deviations from the mean, and ``--length-range`` which sets an absolute range of
+allowable sizes.
+
+Ambiguous bases are controlled by ``--prop-n`` which gives the maximum proportion of Ns,
+and ``--upper-n`` which gives the absolute maximum value.
+
+The maximum allowed accessory distance is 0.5 to ensure you check for contamination. However,
+many species do really have high accessory values above this range, in which case you
+should increase the value of ``--max-a-dist``.
+
+Removing samples from an existing database
+------------------------------------------
+You can use the ``poppunk_prune`` command to remove samples from a database,
+for example those found to be of poor quality. Create a file
+``remove.txt`` with the names of the samples you wish to remove, one per line,
+and run::
+
+ poppunk_prune --remove remove.txt --distances strain_db/strain_db.dists --output pruned_db
+
+This will remove the samples from the ``strain_db.dists`` files, from which
+``--fit-model`` can be run again.
+
+Dealing with poor quality data
+------------------------------
+In this example we analyse 76 *Haemophilus influenzae* isolates. One isolate, 14412_4_15,
+is contaminated with 12% of reads being *Haemophilus parainfluenzae* and a total
+assembly length of 3.8Mb. It would be removed before input, but its presence
+can also be found with ``PopPUNK --qc-filter continue``.
+
+With the distances
+^^^^^^^^^^^^^^^^^^
+A fit with three mixture components overestimates the number of between strain
+links, and gives a network with a poor score (0.6849) and only five components:
+
+.. image:: images/contam_DPGMM_fit.png
+ :alt: A bad fit to pairwise distances
+ :align: center
+
+Distances in the top left of the plot, with low core distances and high
+accessory distances, are due to the contaminated contigs in the isolate.
+Finding which isolates contribute to these distances reveals a clear culprit::
+
+ awk '$3<0.02 && $4 > 0.3 {print $1}' contam_db/contam_db.search.out | cut -f 1 | sort | uniq -c
+ 1 14412_3_81
+ 1 14412_3_82
+ 1 14412_3_83
+ 1 14412_3_84
+ 1 14412_3_88
+ 1 14412_3_89
+ 1 14412_3_91
+ 1 14412_3_92
+ 1 14412_4_1
+ 1 14412_4_10
+ 28 14412_4_15
+
+In this case it is sufficient to increase the number of mixture components to four,
+which no longer includes these inflated distances. This gives a score of 0.9401 and 28 components:
+
+.. image:: images/contam_DPGMM_better_fit.png
+ :alt: A better fit to pairwise distances
+ :align: center
+
+The best thing to do is to remove the poor quality isolate, or if possible
+remove the contaminated reads/contigs from the assembly.
+
+With the network
+^^^^^^^^^^^^^^^^
+Alternatively, the network itself can be inspected with ``--cytoscape``. Using
+the approach detailed in :ref:`cytoscape-view` gives the following view:
+
+.. image:: images/cytoscape_contaminant.png
+ :alt: Cytoscape view of the network containing the contaminated isolate
+ :align: center
+
+The contaminated node appears when ordering by ClusteringCoefficient, Stress or
+TopologicalCoefficient, and its edges appear when ordering by EdgeBetweenness.
+It can be seen highlighted in the top right component, connecting two clusters
+which otherwise have no links. It can be removed, and components recalculated in
+cytoscape directly, though removal from the PopPUNK database is best.
+
+The second largest cluster is also suspicious, where there are few triangles
+(low transitivity) and the nodes involved have high Stress. This is indicative
+of a bad fit overall, rather than a single problem sample.
+
diff --git a/docs/query_assignment.rst b/docs/query_assignment.rst
new file mode 100644
index 00000000..c9fcf673
--- /dev/null
+++ b/docs/query_assignment.rst
@@ -0,0 +1,237 @@
+Query assignment
+================
+This is the recommended mode to use PopPUNK, as long as a database is available for
+your species. If there is no DB available, you can fit your own (:doc:`model_fitting`).
+
+Briefly, `download your reference database <https://poppunk.net/pages/databases.html>`__ and run::
+
+ poppunk_assign --ref-db database --distances database/database.dists \
+ --q-files qfile.txt --output poppunk_clusters --threads 8
+
+.. contents::
+ :local:
+
+Downloading a database
+----------------------
+Current PopPUNK databases can be found here: https://poppunk.net/pages/databases.html
+
+We refer to sequences in the database as references, and those being added
+as queries.
+
+A database called ``database`` will contain the following files, in ``database/``:
+
+- ``database.h5`` -- the sketches of the reference sequences generated by ``pp-sketchlib``.
+- ``database.dists.npy`` and ``database.dists.pkl`` -- the core and accessory distances for
+ all pairwise comparisons in the sketch database.
+- ``database.fit.npy`` and ``database.fit.pkl`` -- the model fit to the core and accessory distances.
+- ``database_graph.gt`` -- the network defining the fit (loadable with ``graph_tool``).
+- ``database_clusters.csv`` -- the PopPUNK clusters for the reference sequences.
+- ``database_references.refs`` -- a minimal list of references needed to produce correct clusters.
+
+If the ``.refs`` file is missing, all of the samples in the sketch database will be
+used in the distance calculations.
+
+You can use the following arguments to individually target these items if necessary,
+for example when using an alternative fit, or if split across different directories. The
+examples below refer to the default database name:
+
+- (required) ``--ref-db database`` -- the name of the directory containing the ``.h5`` file.
+- (required) ``--distances database/database.dists`` -- prefix of the distances.
+- ``--model-dir database`` -- directory containing the model fit and network (dists + fit define the network).
+- ``--previous-clustering database`` -- directory containing the PopPUNK clusters for the references.
+
+Clustering your genomes
+-----------------------
+Create a file which lists your sample names and paths to their sequence data. This file
+has no header, is tab separated, and contains the sample name in the first column. Subsequent
+columns may contain paths to either assembled or raw read data (the type will automatically
+be inferred by checking for the presence of quality scores). Data may be gzipped or uncompressed::
+
+ MS1 ms1_assembled.fa
+ MS2 ms2_assembled.fa
+ SM14 SM14_1.fq.gz SM14_2.fq.gz
+
+Save this as ``qfile.txt``. You're now ready to cluster them!
+Run the following command::
+
+ poppunk_assign --ref-db database --distances database/database.dists \
+ --q-files qfile.txt --output poppunk_clusters --threads 8
+
+This will first of all sketch your input genomes, saving them in ``poppunk_clusters/poppunk_clusters.h5``.
+If you need to rerun part of the analysis with different options this will automatically be picked up
+and loaded.
+
+.. note::
+ :doc:`qc` does not apply to query sequences. A test for maximum accessory distance
+ will be made, but the program will only emit warnings and will run with all genomes
+ anyway. Most options for sketching will be taken from the reference database, but you
+ can still specify error filtering options from read input (``--min-kmer-count`` and
+ ``--exact-count``) and specify your input as ``--strand-preserved``. See :doc:`sketching` for
+ more information on these options.
+
+Next, core and accessory distances between your input sketches and those in the database
+will be computed. This has complexity :math:`O(RQ)` where :math:`R` is the number of
+samples in ``database_references.refs`` and :math:`Q` is the number in ``qfile.txt``. These distances
+are then fed into the model and used to update the network, and therefore clusters.
+
+The output will look something like this::
+
+ Graph-tools OpenMP parallelisation enabled: with 4 threads
+ PopPUNK (POPulation Partitioning Using Nucleotide Kmers)
+ (with backend: sketchlib v1.5.1
+ sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so)
+ Mode: Assigning clusters of query sequences
+
+ Sketching genomes using 8 thread(s)
+ Calculating distances using 8 thread(s)
+ Loading previously refined model
+ Network loaded: 2007 samples
+ Found novel query clusters. Calculating distances between them.
+ Could not find random match chances in database, calculating assuming equal base frequencies
+ Calculating distances using 8 thread(s)
+
+Your clusters will be written to ``poppunk_clusters/poppunk_clusters_clusters.csv``::
+
+ Taxon,Cluster
+ 21946_6_66,9
+ 22695_3_148,9
+ 22984_8_88,9
+ 21946_6_245,116
+ 21946_6_189,814
+ 22695_3_73,814
+ 21946_6_50,422
+ 21903_8_95,148
+ 21903_8_250,301
+ 22984_8_47,70
+
+These names are identical to those used in the reference database, so retain
+the same meaning between studies. If new clusters are found they will be numbered
+in ascending order from largest to smallest, beginning from the end of the reference
+clusters.
+
+.. note::
+ You may observe clusters merging (but never splitting). If your genomes
+ do cause clusters to merge this will be noted in the output, and the new
+ clusters will be named using the old ones. For example, if clusters 23 and 38
+ merged, the new cluster would be called 23_38.
+
+By default, only the query genome clusters are included here. The reference genome
+clusters are considered unchanged from the input. If there are many merges and you
+wish to know their new cluster IDs, use ``--update-db`` (:ref:`update-db`).
+
+You can use ``poppunk_visualise`` to look at your results. Here's an example output
+to cytoscape, showing the clusters as colours, reference genomes as circles and
+queries as triangles (open in a new tab to zoom on detail):
+
+.. image:: images/assign_network.png
+ :alt: Network produced after query assignment
+ :align: center
+
+Adding external cluster labels (MLST, CC etc)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Add the ``--external-clustering`` argument to add a CSV file of cluster definitions
+which the output will be additionally labelled with, and output to ``database/database_external_clusters.csv``.
+These can be any cluster definitions you wish, with as many columns as you like. A header row is required::
+
+ sample,GPSC,MLST
+ 23430_1_186,1,22
+ 17794_6_29,23,43
+ 12291_4_13,1,2
+
+For each PopPUNK cluster, all the samples found in said cluster will be accumulated.
+From these accumulated samples the external clusters will be collected, and assigned
+to all of these samples. This may give you a one-to-one mapping between PopPUNK clusters
+and your external cluster, or you may find multiple external clusters refer to the
+PopPUNK cluster giving output such as ``227;811;763;824``.
+
+Using a model fitted with lineage assignment mode
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+You will need to add ``--assign-lineages`` to pick up the correct model. Additionally,
+you can add the options ``--rank`` to choose the rank to assign from (default is the lowest
+available) and ``--use-accessory`` to use the accessory distances rather than the core
+for clustering. You will find extra model files with the ranks listed in their name if
+this model type is available.
+
+Using a model fitted with ``--indiv-refine``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+If the database was fitted with the refine fit mode and ``--indiv-refine``, you may have
+a core distance boundary, accessory boundary and combined core-accessory boundary fit. The
+default is to use the combined boundary; to use the others, add ``--core-only`` or
+``--accessory-only``.
+
+Increasing speed
+----------------
+Query assignment is the most efficient mode, typically requiring :math:`Q` sketches and
+:math:`RQ` distances. If you are updating the database, this increases to :math:`Q^2 + RQ`
+distances. If you are assigning a very large number of queries you can run ``poppunk_assign``
+with ``--update-db`` repeatedly for batches of query input, as the :math:`Q^2` term will
+be reduced by clique-pruning at each iteration.
+
+Straightforward ways to increase speed include:
+
+- Add ``--gpu-dist``, if you have a GPU available.
+- Add ``--gpu-sketch``, if your input is all reads, and you have a GPU available. If
+ your input is a mix of assemblies and reads, run in two separate batches, with
+ the batch of reads using this option.
+- Increase ``--threads``.
+
+.. _update-db:
+
+Updating the database
+---------------------
+If you want to add your query genomes into the reference database so that they
+can be used to inform future cluster assignment, this is as simple as adding the
+``--update-db`` option to the command above. This is particularly useful when novel
+query clusters have been found -- they will then keep a consistent name in future assignments::
+
+ poppunk_assign --ref-db database --distances database/database.dists \
+ --q-files qfile.txt --output poppunk_clusters --threads 8 --update-db
+
+ Graph-tools OpenMP parallelisation enabled: with 4 threads
+ PopPUNK (POPulation Partitioning Using Nucleotide Kmers)
+ (with backend: sketchlib v1.5.1
+ sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so)
+ Mode: Assigning clusters of query sequences
+
+ Sketching 28 genomes using 4 thread(s)
+ Writing sketches to file
+ Calculating distances using 4 thread(s)
+ Loading BGMM 2D Gaussian model
+ Network loaded: 18 samples
+ Calculating all query-query distances
+ Could not find random match chances in database, calculating assuming equal base frequencies
+ Calculating distances using 4 thread(s)
+ Updating reference database to poppunk_clusters
+ Removing 27 sequences
+
+ Done
+
+The new database contains all of the reference sequences, and all of your query sequences.
+The ``poppunk_clusters`` folder will now contain all of the files of a reference
+database listed above, except for the model. You can use ``--model-dir`` to target
+this for future assignment, or copy it over yourself. Alternatively, if you run
+with the same ``--output`` folder as ``--ref-db``, adding ``--overwrite``, the original
+input folder will contain the updated database containing everything needed.
+
+.. note::
+ This mode can take longer to run with large numbers of input query genomes,
+ as it will calculate all :math:`Q^2` query-query distances, rather than
+ just those found in novel query clusters.
+
+Visualising results
+-------------------
+If you wish to produce visualisations from query assignment results the best
+way to do this is to run with ``--update-db``, and then run ``poppunk_visualise``
+on the output directory, as if visualising a full reference fit.
+
+However, it is possible to run directly on the outputs by adding a ``--ref-db``
+as used in the assign command, and a ``--query-db`` which points to the ``--output``
+directory used in the assign command. In this mode isolates will be annotated
+depending on whether they were a query or reference input.
+
+.. warning::
+ Without ``--update-db``, visualisation is required to recalculate all query-query distances
+ each time it is called. If your query set is large and you want repeated visualisations,
+ run ``poppunk_assign`` with ``--update-db``.
+
+See :doc:`visualisation` for more details.
\ No newline at end of file
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
deleted file mode 100644
index 8a5b071a..00000000
--- a/docs/quickstart.rst
+++ /dev/null
@@ -1,229 +0,0 @@
-Quickstart
-==========
-
-.. |nbsp| unicode:: 0xA0
- :trim:
-
-This guide briefly explains how PopPUNK can be run on a set of genomes.
-For a more detailed example see the :doc:`tutorial`.
-
-We will work with 128 *Listeria monocytogenes* genomes from `Kremer
-et al `_ which can be downloaded
-from `figshare `__.
-
-.. contents::
- :local:
-
-Running PopPUNK
----------------
-First download the example set above, then extract the assemblies and create a
-file with a list of their names and locations::
-
- tar xf listeria_example.tar.bz2
- paste <(ls *.contigs_velvet.fa) <(ls *.contigs_velvet.fa) > reference_list.txt
-
-The second command here generates two columns, where the names are the same as the
-filenames, but you can define whatever name you want in the first column.
-
-Now run PopPUNK::
-
- poppunk --easy-run --r-files reference_list.txt --output lm_example --threads 4 --plot-fit 5 --min-k 13 --full-db
-
-This will:
-
-1. Create a database of mash sketches
-2. Use these to calculate core and accessory distances between samples (which
- are also stored as part of the database).
-3. Fit a two-component Gaussian mixture model to these distances to attempt to
- find within-strain distances.
-4. Use this fit to construct a network, from which clusters are defined
-
-where the additional options:
-
-* ``--threads 4`` increase speed by using more CPUs
-* ``--min-k 13`` ensures the distances are not biased by random matches at
- lower k-mer lengths.
-* ``--plot-fit 5`` plots five examples of the linear fit, to ensure ``--min-k``
- was set high enough.
-* ``--full-db`` does not remove redundant references at the end, so the model
- fit can be re-run.
-
- .. important::
- The key step for getting good clusters is to get the right model fit to
- the distances. The algorithm is robust to most other parameters settings.
- See :ref:`model-refit` for details.
-
-The cluster definitions are output to ``lm_example/lm_example_clusters.csv``.
-
-Check the distance fits
-^^^^^^^^^^^^^^^^^^^^^^^
-The first thing to do is check the relation between mash distances and core and
-accessory distances are correct::
-
- Creating mash database for k = 13
- Random 13-mer probability: 0.04
- Creating mash database for k = 21
- Random 21-mer probability: 0.00
- Creating mash database for k = 17
- Random 17-mer probability: 0.00
- Creating mash database for k = 25
- Random 25-mer probability: 0.00
- Creating mash database for k = 29
- Random 29-mer probability: 0.00
-
-This shows ``--min-k`` was set appropriately, as no random probabilities were
-greater than 0.05. Looking at one of the plots ``lm_example/fit_example_1.pdf``
-shows a straight line fit, with the left most point not significantly above the
-fitted relationship:
-
-.. image:: images/lm_fit.png
- :alt: Straight line fit between log(Jaccard distance) and k-mer length
- :align: center
-
-Check the distance plot
-^^^^^^^^^^^^^^^^^^^^^^^
-A plot of core and accessory distances contains information about population structure,
-and about the evolution of core and accessory elements. Open
-``lm_example/lm_example_distanceDistribution.png``:
-
-.. image:: images/lm_distance_dist.png
- :alt: Plot of pairwise core and accessory distances
- :align: center
-
-Each point is the distance between a pair of isolates in the collection. The
-x-axis shows core distances, the y-axis accessory distances. Lines are contours
-of density in regions where points overlap, running from blue (low density) to
-yellow (high density). Usually the highest density will be observed in the
-top-right most blob, where isolates from different clusters are being compared.
-
-In this sample collection the top-right blob represents comparisons between lineage I and
-lineage II strains. The blob nearest the origin represents comparisons within
-the same strain. The other two blobs are comparisons between different strains
-within the same lineage. Overall there is a correlation between core and
-accessory divergence, and accessory divergence within a cluster is higher than
-the core divergence.
-
-Check the model fit
-^^^^^^^^^^^^^^^^^^^
-A summary of the fit and model is output to ``STDERR``::
-
- Fit summary:
- Avg. entropy of assignment 0.0004
- Number of components used 2
- Warning: trying to create very large network
- Network summary:
- Components 2
- Density 0.5405
- Transitivity 1.0000
- Score 0.4595
-
-This is a bad network score -- a value of at least 0.8 would be expected for
-a good fit. A high density suggests the fit was not specific enough, and too
-many points in the core-accessory plot have been included as within strain.
-Looking at the fit this proves to be true:
-
-.. image:: images/lm_GMM_K2.png
- :alt: Initial fit using two components
- :align: center
-
-As only two components were used, the separate blobs on the plots were not able
-to be captured. The blob closest to the origin must be separated from the
-others for a good high-specificity fit. Inclusion of even a small number of
-points between different clusters rapidly increases cluster size and decreases
-number of clusters. In this example the initial fit clusters lineage I and
-lineage II separately, but merges sub-lineages (which we refer to as strains).
-
-PopPUNK offers three ways to achieve this -- two are discussed below.
-
-.. _model-refit:
-
-Re-fitting the model
---------------------
-To achieve a better model fit which finds the strains within the main lineages
-the blob of points near the origin needs to be separated from the other
-clusters. One can use the existing database to refit the model with minimal
-extra computation.
-
-The first way to do this is to increase the number of mixture components to the
-number of blobs you roughly judge to be in the plot. In this case there are
-four::
-
- poppunk --fit-model --distances lm_example/lm_example.dists --ref-db lm_example --output lm_example --full-db --K 4
-
-This correctly separates the blob at the origin -- the 'within-strain'
-distances:
-
-.. image:: images/lm_GMM_K4.png
- :alt: Improved fit using two components
- :align: center
-
-Which gives more clusters (network components) and a lower density, higher
-scoring network::
-
- Fit summary:
- Avg. entropy of assignment 0.0076
- Number of components used 4
- Network summary:
- Components 31
- Density 0.0897
- Transitivity 1.0000
- Score 0.9103
-
-Alternatively `DBSCAN `__ can be used, which doesn't require the number of
-clusters to be specified::
-
- poppunk --fit-model --distances lm_example/lm_example.dists --ref-db lm_example --output lm_example --full-db --dbscan
-
-This gives a very similar result:
-
-.. image:: images/lm_dbscan.png
- :alt: Improved fit using dbscan
- :align: center
-
-with an almost identical network producing identical clusters::
-
- Fit summary:
- Number of clusters 4
- Number of datapoints 8128
- Number of assignments 8128
- Network summary:
- Components 31
- Density 0.0896
- Transitivity 0.9997
- Score 0.9103
-
-The slight discrepancy is due to one within-strain point being classified as
-noise (small, black point on the plot). For datasets with more noise points from
-DBSCAN then model refinement should always be run after this step (see :ref:`refine-model`).
-
-Creating interactive output
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Now that a good, high-specificity fit has been obtained you can add some extra
-flags to create output files for visualisation:
-
-* ``--microreact`` -- Files for `Microreact `__ (see
- below).
-* ``--rapidnj rapidnj`` -- Perform core NJ tree construction using rapidnj,
- which is much faster than the default implementation. The argument points to
- the rapidnj binary.
-* ``--cytoscape`` -- Files to view the network in `Cytoscape `__.
-* ``--phandango`` -- Files to view the clustering in `phandango `__.
-* ``--grapetree`` -- Files to view the clustering in `GrapeTree `__.
-
-As a brief example, in the ``lm_example`` folder find the files:
-
-* ``lm_example_phandango_clusters.csv``
-* ``lm_example_perplexity20.0_accessory_tsne.dot``
-* ``lm_example_core_NJ.nwk``
-
-And drag-and-drop these into the browser at https://microreact.org/upload.
-This will produce a visualisation with a core genome phylogeny on the left, and
-an embedding of the accessory distances on the right. Each sample is coloured
-by its cluster:
-
-.. image:: images/lm_microreact.png
- :alt: Microreact of Listeria monoscytogenes
- :align: center
-
-The interactive version can be found at https://microreact.org/project/rJJ-cXOum.
-
diff --git a/docs/scripts.rst b/docs/scripts.rst
index 9e5a6099..77d9efc1 100644
--- a/docs/scripts.rst
+++ b/docs/scripts.rst
@@ -6,11 +6,31 @@ Scripts
Brief documentation on the helper scripts included in the package in the ``/scripts`` directory.
To use these scripts you will need to have a clone of the git repository, or they should also be
installed with the prefix 'poppunk' (e.g to run ``extract_distances.py``, run the command
-``poppunk_extract_distances.py``).
+``poppunk_extract_distances.py``).
.. contents::
:local:
+Easy run mode
+-------------
+Previous versions of the software had an ``--easy-run`` mode which would run a pipeline of:
+
+- ``--create-db`` to sketch genomes
+- ``--fit-model --dbscan`` to fit a flexible model
+- ``--refine-model`` to improve this model
+
+This is now available as ``poppunk_easy_run.py`` which will chain calls to ``poppunk``
+and ``poppunk_visualise`` to replicate this functionality.
+
+Adding weights to the network
+-----------------------------
+Converts binary within-cluster edge weights to the Euclidean core-accessory distance.
+This is equivalent to running with ``--graph-weights``::
+
+ poppunk_add_weights <name>_graph.gt <name>.dists