diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py index 7c416216..0d38638d 100644 --- a/PopPUNK/__init__.py +++ b/PopPUNK/__init__.py @@ -7,5 +7,5 @@ # Minimum sketchlib version SKETCHLIB_MAJOR = 1 -SKETCHLIB_MINOR = 5 -SKETCHLIB_PATCH = 3 +SKETCHLIB_MINOR = 6 +SKETCHLIB_PATCH = 0 diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 9ac13439..c927ac52 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -111,12 +111,13 @@ def get_options(): type=float, default = None) refinementGroup.add_argument('--manual-start', help='A file containing information for a start point. ' 'See documentation for help.', default=None) - refinementGroup.add_argument('--indiv-refine', help='Also run refinement for core and accessory individually', default=False, - action='store_true') + refinementGroup.add_argument('--indiv-refine', help='Also run refinement for core and accessory individually', + choices=['both', 'core', 'accessory'], default = False) refinementGroup.add_argument('--no-local', help='Do not perform the local optimization step (speed up on very large datasets)', default=False, action='store_true') refinementGroup.add_argument('--model-dir', help='Directory containing model to use for assigning queries ' 'to clusters [default = reference database directory]', type = str) + refinementGroup.add_argument('--core-only', help='Save the core distance fit (with ') # lineage clustering within strains lineagesGroup = parser.add_argument_group('Lineage analysis options') @@ -175,6 +176,7 @@ def main(): from .network import printClusters from .plot import writeClusterCsv + from .plot import plot_scatter from .prune_db import prune_distance_matrix @@ -287,6 +289,11 @@ def main(): dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" storePickle(refList, queryList, True, distMat, dists_out) + # Plot results + plot_scatter(distMat, + args.output + "/" + os.path.basename(args.output) + "_distanceDistribution", + args.output + " distances") + #******************************# #* *# #* model fit and network *# @@ -434,7 +441,6 @@ def main(): overall_lineage, output_format = 'phandango', epiCsv = None, - queryNames = refList, suffix = '_Lineage') genomeNetwork = indivNetworks[min(rank_list)] @@ -469,10 +475,10 @@ def main(): output + "/" + os.path.basename(output) + \ "_" + dist_type + '_graph.gt', fmt = 'gt') - if args.core_only: + if args.indiv_refine == 'core': fit_type = 'core' genomeNetwork = indivNetworks['core'] - elif args.accessory_only: + elif args.indiv_refine == 'accessory': fit_type = 'accessory' genomeNetwork = indivNetworks['accessory'] diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 24b97dbe..8b8283c9 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -21,8 +21,6 @@ import pp_sketchlib -from .plot import plot_scatter - # BGMM from .bgmm import fit2dMultiGaussian from .bgmm import assign_samples @@ -126,7 +124,6 @@ def fit(self, X = None): '''Initial steps for all fit functions. Creates output directory. If preprocess is set then subsamples passed X - and draws a scatter plot from result using :func:`~PopPUNK.plot.plot_scatter`. Args: X (numpy.array) @@ -159,12 +156,6 @@ def fit(self, X = None): self.scale = np.amax(self.subsampled_X, axis = 0) self.subsampled_X /= self.scale - # Show clustering - plot_scatter(self.subsampled_X, - self.scale, - self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_distanceDistribution", - self.outPrefix + " distances") - def plot(self, X=None): '''Initial steps for all plot functions. @@ -474,7 +465,7 @@ def plot(self, X=None, y=None): if not hasattr(self, 'subsampled_X'): self.subsampled_X = utils.shuffle(X, random_state=random.randint(1,10000))[0:self.max_samples,] - non_noise = np.sum(np.where(self.labels != -1)) + non_noise = np.sum(self.labels != -1) sys.stderr.write("Fit summary:\n" + "\n".join(["\tNumber of clusters\t" + str(self.n_clusters), "\tNumber of datapoints\t" + str(self.subsampled_X.shape[0]), "\tNumber of assignments\t" + str(non_noise)]) + "\n\n") diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index 0427b378..2ba936bf 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -6,6 +6,7 @@ import sys import os import subprocess +import random import numpy as np import matplotlib as mpl mpl.use('Agg') @@ -18,7 +19,7 @@ import pandas as pd from collections import defaultdict from scipy import spatial -from sklearn import manifold +from sklearn import manifold, utils try: # sklearn >= 0.22 from sklearn.neighbors import KernelDensity except ImportError: @@ -27,7 +28,7 @@ from .utils import isolateNameToLabel -def plot_scatter(X, scale, out_prefix, title, kde = True): +def plot_scatter(X, out_prefix, title, kde = True): """Draws a 2D scatter plot (png) of the core and accessory distances Also draws contours of kernel density estimare @@ -35,8 +36,6 @@ def plot_scatter(X, scale, out_prefix, title, kde = True): Args: X (numpy.array) n x 2 array of core and accessory distances for n samples. - scale (numpy.array) - Scaling factor from :class:`~PopPUNK.models.BGMMFit` out_prefix (str) Prefix for output plot file (.png will be appended) title (str) @@ -46,6 +45,15 @@ def plot_scatter(X, scale, out_prefix, title, kde = True): (default = True) """ + # Plot results - max 1M for speed + max_plot_samples = 1000000 + if X.shape[0] > max_plot_samples: + X = utils.shuffle(X, random_state=random.randint(1,10000))[0:max_plot_samples,] + + # Kernel estimate uses scaled data 0-1 on each axis + scale = np.amax(X, axis = 0) + X /= scale + plt.figure(figsize=(11, 8), dpi= 160, facecolor='w', edgecolor='k') if kde: xx, yy, xy = get_grid(0, 1, 100) @@ -58,11 +66,13 @@ def plot_scatter(X, scale, out_prefix, title, kde = True): z = z.reshape(xx.shape).T levels = np.linspace(z.min(), z.max(), 10) + # Rescale contours plt.contour(xx*scale[0], yy*scale[1], z, levels=levels[1:], cmap='plasma') scatter_alpha = 1 else: scatter_alpha = 0.1 + # Plot on correct scale plt.scatter(X[:,0]*scale[0].flat, X[:,1]*scale[1].flat, s=1, alpha=scatter_alpha) plt.title(title) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 251cda5b..21945677 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -108,7 +108,7 @@ def get_options(): faGroup.add_argument('--cytoscape', help='Generate network output files for Cytoscape', default=False, action='store_true') faGroup.add_argument('--phandango', help='Generate phylogeny and TSV for Phandango visualisation', default=False, action='store_true') faGroup.add_argument('--grapetree', help='Generate phylogeny and CSV for grapetree visualisation', default=False, action='store_true') - faGroup.add_argument('--rapidnj', help='Path to rapidNJ binary to build NJ tree for Microreact', default=None) + faGroup.add_argument('--rapidnj', help='Path to rapidNJ binary to build NJ tree for Microreact', default='rapidnj') faGroup.add_argument('--perplexity', type=float, default = 20.0, help='Perplexity used to calculate t-SNE projection (with --microreact) [default=20.0]') @@ -135,6 +135,9 @@ def get_options(): if arg is not None: arg = arg.rstrip('\\') + if args.rapidnj == "": + args.rapidnj = None + return args def generate_visualisations(query_db, diff --git a/docs/api.rst b/docs/api.rst index e46582fa..dc3ff110 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -6,6 +6,13 @@ Documentation for module functions (for developers) .. contents:: :local: +assign.py +--------- +``poppunk_assign`` main function + +.. automodule:: PopPUNK.assign + :members: + bgmm.py -------- @@ -24,12 +31,6 @@ Functions used to fit DBSCAN to a database. Access using .. automodule:: PopPUNK.dbscan :members: -mash.py --------- - -.. automodule:: PopPUNK.mash - :members: - models.py --------- @@ -80,6 +81,13 @@ utils.py .. automodule:: PopPUNK.utils :members: +visualise.py +------------ +``poppunk_visualise`` main function + +.. automodule:: PopPUNK.visualise + :members: + web.py -------- diff --git a/docs/best_practises.rst b/docs/best_practises.rst new file mode 100644 index 00000000..593bab24 --- /dev/null +++ b/docs/best_practises.rst @@ -0,0 +1,72 @@ +Best practises guide +==================== +This page details the way in which we would advise that you *should* use and +run PopPUNK, if possible. + +.. image:: images/poppunk_flowchart.png + :alt: Flowchart for choosing how to use PopPUNK + :align: center + +Use an online interface +----------------------- +If available, you may want to use one of the browser-based interfaces to +PopPUNK. These include `PopPUNK-web `__ and +`pathogen.watch `__ +(*S. pneumoniae* only). See the :doc:`online` page for full details. + +Using these interfaces requires nothing to be installed or set up, doesn't require any +genome data to be shared with us, and will return interactive visualisations. If your +species isn't available, or you have large batches of genomes to cluster you will +likely want to use the command line interface instead. + +Use the command line interface +------------------------------ + +Installation and version +^^^^^^^^^^^^^^^^^^^^^^^^ +Install via conda if possible. Please use at least version v2.3.0 of PopPUNK +and v1.5.1 of ``pp-sketchlib``. + +Use query assignment mode +^^^^^^^^^^^^^^^^^^^^^^^^^ +If a database is available for your species (see https://poppunk.net/pages/databases.html) +we would strongly recommend downloading it to use to cluster your genomes. This +has many advantages: + +- No need to run through the potentially complex model fitting. +- Assured model performance. +- Considerable faster run times. +- Use existing cluster definitions. +- Use the context of large, high quality reference populations to interpret your + genomes' clusters. + +See :doc:`query_assignment` for instructions on how to use this mode. + +You can think of this as being similar to using an existing MLST/cgMLST/wgMLST scheme +to define your sample's strains. + +Fit your own model +^^^^^^^^^^^^^^^^^^ +If a database isn't available for your species, you can fit your own. Details +on how to do this can be found on :doc:`model_fitting`. + +After getting a good fit, you may want to share it with others so that they can +use it to assign queries. See :doc:`model_distribution` for advice. We would also +be interested to hear from you if you'd like to add your new model to the +pre-fit databases above -- please contact poppunk@poppunk.net. + +Create visualisations +^^^^^^^^^^^^^^^^^^^^^ +A number of plots are created by default. You can also +create files for further visualisation in `microreact `__, +`cytoscape `__, +`grapetree `__ and +`phandango `_. We have found that +looking at the appearance of clusters on a tree is always very helpful, and would +recommend this for any fit. + +Older versions of PopPUNK mandated this be chosen as part of the main analysis, +and then with ``--generate-viz`` mode. This is now run separately, after the +main analysis, with ``poppunk_visualise``. + +See :doc:`visualisation` for details on options. \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 941f2c50..61baf7f4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -38,7 +38,8 @@ # Causes a problem with rtd: https://github.com/pypa/setuptools/issues/1694 autodoc_mock_imports = ["hdbscan", "numpy", - "graph-tool", + "graph_tool.all", + "graph_tool", "pandas", "scipy", "sklearn", @@ -65,16 +66,16 @@ # General information about the project. project = 'PopPUNK' copyright = '2018-2020, John Lees and Nicholas Croucher' -author = 'John Lees and Nicholas Croucher' +author = 'John Lees, Daniel Anderson and Nicholas Croucher' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '2.2.0' +version = '2.3.0' # The full version, including alpha/beta/rc tags. -release = '2.2.0' +release = '2.3.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/images/13mer_hist.png b/docs/images/13mer_hist.png new file mode 100644 index 00000000..11980a25 Binary files /dev/null and b/docs/images/13mer_hist.png differ diff --git a/docs/images/DPGMM_fit_K2.png b/docs/images/DPGMM_fit_K2.png deleted file mode 100644 index e1d65c50..00000000 Binary files a/docs/images/DPGMM_fit_K2.png and /dev/null differ diff --git a/docs/images/DPGMM_fit_K3.png b/docs/images/DPGMM_fit_K3.png deleted file mode 100644 index 36760e40..00000000 Binary files a/docs/images/DPGMM_fit_K3.png and /dev/null differ diff --git a/docs/images/assign_network.png b/docs/images/assign_network.png new file mode 100644 index 00000000..3b184785 Binary files /dev/null and b/docs/images/assign_network.png differ diff --git a/docs/images/DPGMM_fit_K10.png b/docs/images/bgmm_fit_K10.png similarity index 100% rename from docs/images/DPGMM_fit_K10.png rename to docs/images/bgmm_fit_K10.png diff --git a/docs/images/bgmm_k2_fit.png b/docs/images/bgmm_k2_fit.png new file mode 100644 index 00000000..f46c2ff4 Binary files /dev/null and b/docs/images/bgmm_k2_fit.png differ diff --git a/docs/images/bgmm_k4_boundary.png b/docs/images/bgmm_k4_boundary.png new file mode 100644 index 00000000..37351d45 Binary files /dev/null and b/docs/images/bgmm_k4_boundary.png differ diff --git a/docs/images/bgmm_k4_fit.png b/docs/images/bgmm_k4_fit.png new file mode 100644 index 00000000..f96d487a Binary files /dev/null and b/docs/images/bgmm_k4_fit.png differ diff --git a/docs/images/cytoscape.png b/docs/images/cytoscape.png index 7c9cdbab..fb129c8e 100644 Binary files a/docs/images/cytoscape.png and b/docs/images/cytoscape.png differ diff --git a/docs/images/cytoscape_gpsc.png b/docs/images/cytoscape_gpsc.png new file mode 100644 index 00000000..e12a3ad9 Binary files /dev/null and b/docs/images/cytoscape_gpsc.png differ diff --git a/docs/images/dbscan_fit.png b/docs/images/dbscan_fit.png index bde3f7a0..cf7f5d3d 100644 Binary files a/docs/images/dbscan_fit.png and b/docs/images/dbscan_fit.png differ diff --git a/docs/images/dbscan_fit_min_prop.png b/docs/images/dbscan_fit_min_prop.png new file mode 100644 index 00000000..782f5577 Binary files /dev/null and b/docs/images/dbscan_fit_min_prop.png differ diff --git a/docs/images/fit_example_fixed.png b/docs/images/fit_example_fixed.png deleted file mode 100644 index 4682bfc5..00000000 Binary files a/docs/images/fit_example_fixed.png and /dev/null differ diff --git a/docs/images/fit_example_wrong.png b/docs/images/fit_example_wrong.png deleted file mode 100644 index 782ecce7..00000000 Binary files a/docs/images/fit_example_wrong.png and /dev/null differ diff --git a/docs/images/flu_phased.png b/docs/images/flu_phased.png new file mode 100644 index 00000000..909662b5 Binary files /dev/null and b/docs/images/flu_phased.png differ diff --git a/docs/images/flu_unphased.png b/docs/images/flu_unphased.png new file mode 100644 index 00000000..d9f257c6 Binary files /dev/null and b/docs/images/flu_unphased.png differ diff --git a/docs/images/grapetree.png b/docs/images/grapetree.png new file mode 100644 index 00000000..ac69875e Binary files /dev/null and b/docs/images/grapetree.png differ diff --git a/docs/images/grapetree_collapse.png b/docs/images/grapetree_collapse.png new file mode 100644 index 00000000..213170ad Binary files /dev/null and b/docs/images/grapetree_collapse.png differ diff --git a/docs/images/indiv_refine.png b/docs/images/indiv_refine.png index ceb62541..00ca03bb 100644 Binary files a/docs/images/indiv_refine.png and b/docs/images/indiv_refine.png differ diff --git a/docs/images/kmer_fit.png b/docs/images/kmer_fit.png new file mode 100644 index 00000000..16efe919 Binary files /dev/null and b/docs/images/kmer_fit.png differ diff --git a/docs/images/listeria_dists.png b/docs/images/listeria_dists.png new file mode 100644 index 00000000..30005b58 Binary files /dev/null and b/docs/images/listeria_dists.png differ diff --git a/docs/images/listeria_lineage_rank_1.png b/docs/images/listeria_lineage_rank_1.png new file mode 100644 index 00000000..4604e04c Binary files /dev/null and b/docs/images/listeria_lineage_rank_1.png differ diff --git a/docs/images/listeria_lineage_rank_1_histogram.png b/docs/images/listeria_lineage_rank_1_histogram.png new file mode 100644 index 00000000..9c951e09 Binary files /dev/null and b/docs/images/listeria_lineage_rank_1_histogram.png differ diff --git a/docs/images/listeria_lineage_rank_3.png b/docs/images/listeria_lineage_rank_3.png new file mode 100644 index 00000000..85d9f395 Binary files /dev/null and b/docs/images/listeria_lineage_rank_3.png differ diff --git a/docs/images/listeria_lineage_rank_3_histogram.png b/docs/images/listeria_lineage_rank_3_histogram.png new file mode 100644 index 00000000..0b7650c5 Binary files /dev/null and b/docs/images/listeria_lineage_rank_3_histogram.png differ diff --git a/docs/images/listeria_microreact.png b/docs/images/listeria_microreact.png new file mode 100644 index 00000000..05296a1a Binary files /dev/null and b/docs/images/listeria_microreact.png differ diff --git a/docs/images/listeria_refined.png b/docs/images/listeria_refined.png new file mode 100644 index 00000000..f34daccc Binary files /dev/null and b/docs/images/listeria_refined.png differ diff --git a/docs/images/listeria_threshold.png b/docs/images/listeria_threshold.png new file mode 100644 index 00000000..a2fc3ea4 Binary files /dev/null and b/docs/images/listeria_threshold.png differ diff --git a/docs/images/lm_GMM_K2.png b/docs/images/lm_GMM_K2.png deleted file mode 100644 index a4376508..00000000 Binary files a/docs/images/lm_GMM_K2.png and /dev/null differ diff --git a/docs/images/lm_GMM_K4.png b/docs/images/lm_GMM_K4.png deleted file mode 100644 index a6e248a7..00000000 Binary files a/docs/images/lm_GMM_K4.png and /dev/null differ diff --git a/docs/images/lm_dbscan.png b/docs/images/lm_dbscan.png deleted file mode 100644 index 9ba337bb..00000000 Binary files a/docs/images/lm_dbscan.png and /dev/null differ diff --git a/docs/images/lm_distance_dist.png b/docs/images/lm_distance_dist.png deleted file mode 100644 index 9966f27e..00000000 Binary files a/docs/images/lm_distance_dist.png and /dev/null differ diff --git a/docs/images/lm_fit.png b/docs/images/lm_fit.png deleted file mode 100644 index 93f2f2b1..00000000 Binary files a/docs/images/lm_fit.png and /dev/null differ diff --git a/docs/images/lm_microreact.png b/docs/images/lm_microreact.png deleted file mode 100644 index fede300d..00000000 Binary files a/docs/images/lm_microreact.png and /dev/null differ diff --git a/docs/images/phandango.png b/docs/images/phandango.png new file mode 100644 index 00000000..e056db73 Binary files /dev/null and b/docs/images/phandango.png differ diff --git a/docs/images/poppipe_dag.png b/docs/images/poppipe_dag.png new file mode 100644 index 00000000..6a2f3cb6 Binary files /dev/null and b/docs/images/poppipe_dag.png differ diff --git a/docs/images/poppunk_flowchart.png b/docs/images/poppunk_flowchart.png new file mode 100644 index 00000000..4a71d8d8 Binary files /dev/null and b/docs/images/poppunk_flowchart.png differ diff --git a/docs/images/web_cyto.png b/docs/images/web_cyto.png new file mode 100644 index 00000000..c8d0edb0 Binary files /dev/null and b/docs/images/web_cyto.png differ diff --git a/docs/images/web_home.png b/docs/images/web_home.png new file mode 100644 index 00000000..3819c9fd Binary files /dev/null and b/docs/images/web_home.png differ diff --git a/docs/images/web_micro.png b/docs/images/web_micro.png new file mode 100644 index 00000000..24ce9ab9 Binary files /dev/null and b/docs/images/web_micro.png differ diff --git a/docs/images/web_micro_assigned.png b/docs/images/web_micro_assigned.png new file mode 100644 index 00000000..1ff40f78 Binary files /dev/null and b/docs/images/web_micro_assigned.png differ diff --git a/docs/images/web_micro_change.png b/docs/images/web_micro_change.png new file mode 100644 index 00000000..bc90b8f6 Binary files /dev/null and b/docs/images/web_micro_change.png differ diff --git a/docs/images/web_phylo.png b/docs/images/web_phylo.png new file mode 100644 index 00000000..9dd00833 Binary files /dev/null and b/docs/images/web_phylo.png differ diff --git a/docs/images/web_prevs.png b/docs/images/web_prevs.png new file mode 100644 index 00000000..deb1aafa Binary files /dev/null and b/docs/images/web_prevs.png differ diff --git a/docs/images/web_prevs_zoomed.png b/docs/images/web_prevs_zoomed.png new file mode 100644 index 00000000..fa62659e Binary files /dev/null and b/docs/images/web_prevs_zoomed.png differ diff --git a/docs/images/web_stats.png b/docs/images/web_stats.png new file mode 100644 index 00000000..db2c5947 Binary files /dev/null and b/docs/images/web_stats.png differ diff --git a/docs/index.rst b/docs/index.rst index 7817e761..a032d3dd 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,57 +9,62 @@ PopPUNK documentation :alt: PopPUNK (Population Partitioning Using Nucleotide K-mers) :align: center -In straightforward cases, usage can be as simple as:: +PopPUNK is a tool for clustering genomes. The first version was targeted specifically +as bacterial genomes, but the current version has also been used for viruses +(e.g. enterovirus, influenza, SARS-CoV-2) and eukaryotes (e.g. *Candida* sp., +*P. falciparum*). Under the hood, PopPUNK uses +`pp-sketchlib `__ to rapidly calculate +core and accessory distances, and machine learning tools written in python to +use these to cluster genomes. A detailed description of the method can be found +in the `paper `_. - poppunk --easy-run --r-files references.txt --output poppunk_db +If you are new to PopPUNK, we'd recommend starting on :doc:`installation`, then +by reading the :doc:`best_practises`. -Where ``references.txt`` is a list of assembly fasta files, one per line. See -:doc:`quickstart` and the :doc:`tutorial` for full details. +.. important:: + Looking for older versions of the documentation? For previous versions with + the old API (``--assign-query``, ``--refine-fit`` etc) see `v2.2.0 `__. + For older versions which used mash, see `v1.2.0 `__. .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Contents: self installation.rst - options.rst - quickstart.rst - tutorial.rst + best_practises.rst + online.rst + query_assignment.rst + sketching.rst + qc.rst + model_fitting.rst + model_distribution.rst + visualisation.rst + subclustering.rst troubleshooting.rst + options.rst scripts.rst api.rst miscellaneous.rst -Details -------- -A full description of the method can be found in the `paper `_. - -``PopPUNK`` uses the fast k-mer distance estimation enabled by `mash `_ -to calculate core and accessory distances between all pairs of isolates of bacteria in a collection. By clustering -these distances into 'within-strain' and 'between-strain' distances a network -of within-strain comparisons can be constructed. The use of a network has -a number of convenient properties, the first being that the connected -components represent a cluster of strains. - -As well as identifying strains, the pairwise distance distribution also helps -with assembly quality control (particularly in the case of contaminated -contigs) and may be informative of the level of recombination in the -population. The network representation also allows definition of representative isolates by -sampling one example from each clique, and calculation of various statistics -which can show how good the clustering is. - -The advantages of this approach are broadly that: +Why use PopPUNK? +---------------- +The advantages of PopPUNK are broadly that: -- It is fast, and scalable to :math:`10^{4}` genomes in a single run. +- It is fast, and scalable to over :math:`10^{5}` genomes in a single run. - Assigning new query sequences to a cluster using an existing database is scalable even beyond this. +- Cluster names remain consistent between studies, and other cluster labels such as MLST + can be appended. - Databases can be updated online (as sequences arrive). - Online updating is equivalent to building databases from scratch. - Databases can be kept small and managable by only keeping representative isolates. +- Databases naturally allow in-depth analysis of single clusters, + but keeping the full context of the whole database. - There is no bin cluster. Outlier isolates will be in their own cluster. - Pre-processing, such as generation of an alignment, is not required. -- The definition of clusters is biologically relevant to how bacteria evolve. -- There is a lot of quantitative and graphical output to assist with - clustering. +- Raw sequence reads can be used as input, while being filtered for sequencing errors. +- The definition of clusters are biologically relevant. +- Many quantitative and graphical outputs are provided. - A direct import into `microreact `_ is available, as well as `cytoscape `_, `grapetree `_ and diff --git a/docs/installation.rst b/docs/installation.rst index 645ccd92..e6ca66a8 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -31,12 +31,26 @@ Then run:: conda install poppunk +If you are having conflict issues with conda, our advice would be: + +- Remove and reinstall miniconda. +- Never install anything in the base environment +- Create a new environment for PopPUNK with ``conda create -n pp_env poppunk`` + +If you have an older version of PopPUNK, you can upgrade using this method -- you +may also wish to specify the version, for example ``conda install poppunk==2.3.0`` if you +wish to upgrade. + +conda-forge also has some helpful tips: https://conda-forge.org/docs/user/tipsandtricks.html + Installing with pip ------------------- If you do not have conda, you can also install through pip:: python3 -m pip install poppunk +This may not deal with all necessary :ref:`dependencies`. + Clone the code -------------- You can also clone the github to run the latest version, which is executed by:: @@ -56,6 +70,7 @@ Dependencies We tested PopPUNK with the following packages: * python3 (3.8.2) +* ``pp-sketchlib`` (1.5.1) * ``DendroPy`` (4.3.0) * ``hdbscan`` (0.8.13) * ``matplotlib`` (2.1.2) diff --git a/docs/model_distribution.rst b/docs/model_distribution.rst new file mode 100644 index 00000000..4d6ff0cf --- /dev/null +++ b/docs/model_distribution.rst @@ -0,0 +1,74 @@ +Distributing PopPUNK models +=========================== +If you have fitted a model yourself, you may be interested in distributing it so that +others can use it for your species. This will give consistent cluster names across datasets, +mean the high-quality tested fit can be reused, and speeds up future analysis. + +Please contact us at poppunk@poppunk.net. We would be happy to add your sketches and +fitted model to our other `databases `__. + +Database contents +----------------- +A database requires the following files: + +- ``.h5``. The sketch database, a HDF5 file. +- ``.dists.pkl`` and ``.dists.npy`` files. Distances for all vs all samples in the sketch database. +- ``_fit.npz`` and ``_fit.pkl`` files. Python files which describe the model fit. +- ``_graph.gt``. The network relating distances, fit and strain assignment for all samples in the sketch database. +- ``_clusters.csv``. The strain assignment of all samples in the sketch database. + +If you used a :ref:`lineage-fit` you will also need: + +- ``rank_k_fit.npz``. Distances for each rank :math:`k` fit. +- ``_lineages.csv``. Combined lineage assignments for each rank. + +You may also have ``.refs`` versions of these files, which are pruned to contain just the +reference samples (see below). We would highly recommend including the ``output/output.refs`` file +with any database, even though it is not strictly required, as it will speed up query assignment. +Lineage models do not use references. + +.. note:: + If the database is very large, you may consider just distributing the ``.refs`` files. This will + enable query assignment, but visualisation and subclustering within strains will no longer be + possible, as full information within each strain will be missing. + +Picking references +------------------ +PopPUNK automatically prunes redundant sequence information from databases by removing +samples from cliques (where every sample is in the same strain as every other sample). This +algorithm has changed slightly from the originally published one: + +#. Split the graph into connected components (strains), which are analysed in parallel. +#. Identify a clique. If no samples in the clique are already references, add one sample as a reference. +#. Prune the clique from the graph, creating a subgraph. +#. Recursively apply steps 2-3 until only two samples or fewer remain. +#. Add the remaining samples as references +#. Create the reference graph, and find connected components again. +#. For any samples which are no longer in the same connected component, find a minimum path + between them in the full graph, and add all samples in this path as references. + +This makes the algorithm scale better, and ensures clusters remain connected. You may find +that more references are picked than before using this method, which is a small cost for the +increase robustness. + +This process occurs automatically after the model fit. In the *Listeria* example:: + + Removing 97 sequences + +31 strains are represented by :math:`128 - 97 = 31` references, exactly one reference +per cluster, which is the minimum. The refined fit removes 93 sequences with 29 strains, +so some larger clusters need to be represented by multiple references. The names of the chosen +references are written to the .refs file. In addition, the distances, sketch database and graph +have the non-reference sequences pruned and saved with .refs suffixes. This gives a complete database +suitable for assignment with references only, should the full database be prohibitively large. + +.. note:: + Previous fans (users) of PopPUNK may remember the ``--full-db`` option which switched off + reference picking. This was useful, as reference-only databases always lost information. This + option has now been removed, and reference picking will always be run. Both full and reference + databases are always produced (apart from in lineage mode). The default assignment uses + just references, but has the full database available for strain visualisation and subclustering. + +If you interrupt the reference picking the output will still be valid. If you wish to +run reference picking on a database where it is missing (due to being from an older version, +or interrupted) you can do this with the ``poppunk_references`` script. diff --git a/docs/model_fitting.rst b/docs/model_fitting.rst new file mode 100644 index 00000000..d1239b04 --- /dev/null +++ b/docs/model_fitting.rst @@ -0,0 +1,760 @@ +Fitting new models +================== + +.. |nbsp| unicode:: 0xA0 + :trim: + +If you cannot find an existing model for your species in the +`list `__ you will want to fit your own. +This process is flexible, and there are five different models you can use depending +on the population structure of your dataset. + +.. note:: + After fitting a model to a new species we would like to share it on our website, + so others can use it for assigning queries. If you are open to this, please read + :doc:`model_distribution` after this page. + +.. contents:: + :local: + +Overview +-------- + +First, use ``poppunk --create-db`` to sketch your input data and calculate distances +between all samples. This is detailed in :doc:`sketching`. + +Then, use ``poppunk --fit-model `` with one of the following model names: + +- ``bgmm`` -- Bayesian Gaussian Mixture Model. Best for small sample collections + with strain-structure. Works best when distance distribution components are clearly + separated. +- ``dbscan`` -- HDBSCAN. A good general method for larger sample collections with + strain-structure. Some points will always be designated as noise, so a subsequent run + of model refinement may help improve the fit. +- ``refine`` -- Model refinement. Requires a model already fitted with ``bgmm`` or ``dbscan`` + and attempts to improve it by maximising the network score. Particularly useful when + components overlap significantly (often due to recombination), or when the strain boundary + is thought to lie somewhere within a component. +- ``threshold`` -- Apply a given core or accessory distance threshold to define clusters. Useful if + a cutoff threshold is already known/calculated, is estimated from a plot, or to compare a threshold + between datasets or species. +- ``lineage`` -- Lineage clustering. To find lineages within a strain (subclustering), or + find clusters in a population without strain structure. Uses a simple nearest neighbour approach + so is more of a heuristic. Network scores are not meaningful in this mode. + +The most useful guide to deciding which model to use is the ``_distanceDistribution.png`` file +showing the core and accessory distances. More details on each of these models is given +further down this page. + +A completed fit will consist of: + +- A ``_clusters.csv`` file, which gives the strain (cluster) for each sample in the database. +- ``_fit.npz`` and ``_fit.pkl`` files, which contain numeric data and metadata for the fit. +- A ``_graph.gt`` file, which is the network defining the fit in graph-tool format. +- Some plots of the fit, which depend on the specific model used. +- A ``.refs`` file, which lists the samples kept as 'references' for assigning + future samples (see :doc:`model_distribution` for more details). + +This page will use 128 *Listeria*\ |nbsp| \ *monocytogenes* genomes from `Kremer et al `__, +which can be downloaded from `figshare `__. The distribution of +core and accessory distances from the ``--create-db`` step is as follows: + +.. image:: images/listeria_dists.png + :alt: Core and accessory distances for the example data + :align: center + +We also show some examples with 616 *Streptococcus*\ |nbsp| \ *pneumoniae* genomes, which are more complex. +These genomes were collected from Massachusetts, +first reported `here `__ and can be accessed +`here `__. + +Common arguments +---------------- +- ``--ref-db``: the output prefix used with ``--create-db`` i.e. the directory where the .h5 file is located +- ``--output``: where to save the model. If not specified this defaults to ``ref-db``. +- ``--overwrite``: overwrite any existing files in the output directory. +- ``--external-clustering``: any additional labels to add to the cluster output. +- ``--graph-weights``: save the edges weights in the network as their Euclidean core-accessory + distances, rather than as 0 or 1 (useful for visualising the network). + +External clusters may be other cluster names, such as serotype, sequence type, cgMLST etc. +PopPUNK clusters are mapped as one-to-many, so that each strain is labelled with all of +the clusters any of its members is assigned to in this file. This input file must +be comma separated, one sample per line, with the sample name as the first column, and +other clusters as subsequent columns. A header line with 'sample' and the names of other cluster +types is required. Output is to ``output/output_external_clusters.csv``. + +How good is my fit? +------------------- +We have found the best way to assess this is to use :doc:`visualisation` on your output +and look at your assigned clusters against a tree, to determine whether they have +the specificity required. + +You can also compare models with their network score, and +whether the output plots look as expected. Typically the key thing is that +**your spatial component nearest the origin is accurate**. More detail is given for each model below. + +Interpreting the network summary +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +All fits will output a network summary which looks similar to this:: + + Network summary: + Components 31 + Density 0.0897 + Transitivity 1.0000 + Score 0.9103 + +- Components are the number of strains (clusters) found using this model. +- Density is the proportion of distances assigned as 'within-strain'. Generally + smaller is better as this gives more specific clusters, but too close to zero + may be an over-specific model. +- Transitivity measures whether every member of each strain is connected to every + other member. Closer to 1 is better, but this can be achieved with very loose fits. +- Score synthesises the above as :math:`(1 - \mathrm{density}) * \mathrm{transitivity}`, + which gives a single number between 0 (bad) and 1 (good) which in many cases is + at a maximum when it accurately describes strains in the data. + +.. _bgmm: + +bgmm +---- +This mode fits a `Bayesian Gaussian mixture model `__ +to the core and accessory distances. With few points, methods such as DBSCAN may struggle to find +clusters due to the sparsity, whereas a BGMM can often find a good fit. A further advantage +is that the equation for the posterior is known, so all points will have an assignment and a non-linear +boundary found exactly. + +However, when there are a very large number of points the likelihood has a tendency +to totally override the prior in the estimated posterior, meaning many overlapping components +may be fitted, which may give poor clusters, and is less robust to adding more data. It is possible +for this mode to fail to converge, but it is more likely to produce a bad fit in difficult cases. + +The key parameter to specify is the maximum number of components ``--K``. You should +choose a number based on the number of components you can see on your distance plot. This +may be automatically reduced if there is insufficent evidence for this many components. As a rule of thumb, +if you have under 150 samples or under 1000 samples and clear components then this mode should give +a good fit. + +A better network score is evidence of a better fit, but the output files should also be used to +judge this. With the test dataset, four components are visible:: + + poppunk --fit-model bgmm --ref-db listeria --K 4 + PopPUNK (POPulation Partitioning Using Nucleotide Kmers) + (with backend: sketchlib v1.6.0 + sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so) + + Graph-tools OpenMP parallelisation enabled: with 1 threads + Mode: Fitting bgmm model to reference database + + Fit summary: + Avg. entropy of assignment 0.0042 + Number of components used 4 + + Scaled component means: + [0.9415286 0.90320047] + [0.11542755 0.24570244] + [0.20966101 0.37694884] + [0.00527421 0.07043826] + + Network summary: + Components 31 + Density 0.0897 + Transitivity 1.0000 + Score 0.9103 + Removing 97 sequences + + Done + +In the output to the terminal: + +- The average entropy of assignment is a measure of the certainty of assignment + of each point. Lower is better. Higher values may indicate overlapping components, + perhaps due to high amounts of recombination between strains. +- Number of components used is how many components from ``K`` were actually used + in the spatial fit. This is usually equal to ``K``, but may be reduced in small datasets. +- Scaled component means are the centres of the fitted components in the model, where + the core and accessory distances have been rescaled between 0 and 1. These can be + used with :ref:`manual-start`. + +The fit actually just uses the component closest to the origin -- any distances +assigned to this component are within-strain. This is the most important part of the +fit in this mode. + +You can see that this gives a good network score, and fits the data well: + +.. image:: images/bgmm_k4_fit.png + :alt: BGMM fit with K = 4 + :align: center + +The position of the boundary is also produced (in red), along with contours of +the fitted mixture components: + +.. image:: images/bgmm_k4_boundary.png + :alt: BGMM fit with K = 4 + :align: center + +If you make K too low, some components will be merged, resulting in a less-specific +fit with fewer clusters, that do not fully delineate all of the strains (in this +case just finding the two main lineages of *Listeria* in this data):: + + poppunk --fit-model bgmm --ref-db listeria --K 2 + PopPUNK (POPulation Partitioning Using Nucleotide Kmers) + (with backend: sketchlib v1.6.0 + sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so) + + Graph-tools OpenMP parallelisation enabled: with 1 threads + Mode: Fitting bgmm model to reference database + + Fit summary: + Avg. entropy of assignment 0.0007 + Number of components used 2 + + Scaled component means: + [0.11627304 0.2432584 ] + [0.9415286 0.90320047] + + Network summary: + Components 2 + Density 0.5405 + Transitivity 1.0000 + Score 0.4595 + Removing 126 sequences + + Done + +.. image:: images/bgmm_k2_fit.png + :alt: BGMM fit with K = 2 + :align: center + +Too many components in a small dataset are automatically reduced to an +appropriate number, obtaining the same good fit as above:: + + poppunk --fit-model bgmm --ref-db listeria --K 10 + PopPUNK (POPulation Partitioning Using Nucleotide Kmers) + (with backend: sketchlib v1.6.0 + sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so) + + Graph-tools OpenMP parallelisation enabled: with 1 threads + Mode: Fitting bgmm model to reference database + + Fit summary: + Avg. entropy of assignment 0.3195 + Number of components used 4 + + Scaled component means: + [0.9415286 0.90320047] + [3.72458739e-07 4.73196248e-07] + [0.00527421 0.07043826] + [0.20966682 0.37695524] + [0.11542849 0.2457043 ] + [1.68940242e-11 2.14632815e-11] + [7.66987488e-16 9.74431443e-16] + [3.48211781e-20 4.42391191e-20] + [1.58087904e-24 2.00845290e-24] + [7.17717973e-29 9.11836205e-29] + + Network summary: + Components 31 + Density 0.0897 + Transitivity 1.0000 + Score 0.9103 + Removing 97 sequences + + Done + +In a dataset with more points, and less clear components, too many components can lead to +a bad fit: + +.. image:: images/bgmm_fit_K10.png + :alt: BGMM fit with K = 10 + :align: center + +This is clearly a poor fit. The real issue is that the component whose mean is nearest +the origin is unclear, and doesn't include all of the smallest distances. + +.. _dbscan: + +dbscan +------ +This mode uses `HDBSCAN `__ to find clusters +in the core and accessory distances. This is a versatile clustering algorithm capable of +finding non-linear structure in the data, and can represent irregularly shaped components +well. Possible drawbacks are that a fit cannot always be found (this can happen +for small datasets with sparse points, or for datasets without much structure in the core +and accessory), and that some points are classified as 'noise' so not all of their +edges are included in the network (these are the small black points). + +dbscan usually needs little modification to run:: + + poppunk --fit-model dbscan --ref-db listeria + PopPUNK (POPulation Partitioning Using Nucleotide Kmers) + (with backend: sketchlib v1.6.0 + sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so) + + Graph-tools OpenMP parallelisation enabled: with 1 threads + Mode: Fitting dbscan model to reference database + + Fit summary: + Number of clusters 5 + Number of datapoints 8128 + Number of assignments 7804 + + Scaled component means + [0.94155383 0.90322459] + [0.00527493 0.07044794] + [0.20945986 0.37491995] + [0.12876077 0.34294888] + [0.11413982 0.24224743] + + Network summary: + Components 31 + Density 0.0897 + Transitivity 1.0000 + Score 0.9103 + Removing 97 sequences + + Done + +In the output to the terminal: + +- The number of clusters is the number of spatial components found in the data. +- Number of datapoints is the number of points used (all-vs-all distances), which + may have been subsampled from the maximum. +- Number of assignments is the number of points assign to one of the spatial components, + so excluding noise points. +- Scaled component means are the centres of the fitted components in the model, where + the core and accessory distances have been rescaled between 0 and 1. These can be + used with :ref:`manual-start`. + +The fit actually just uses the component closest to the origin -- any distances +assigned to this component are within-strain. This is the most important part of the +fit in this mode. In this case the identification of this component is identical to the bgmm +fit, so they produce the same strains. Note there is a small yellow cluster which is poorly +defined, but as it does not impact the within-strain cluster the fit is unaffected: + +.. image:: images/dbscan_fit.png + :alt: DBSCAN fit + :align: center + +You can alter the fit with ``--D``, which sets a maximum number of clusters, and +``--min-cluster-prop`` which sets the minimum number of points a cluster can have (as +a proportion of 'Number of datapoints). If the means of both of the core and accessory are not +strictly increasing between the within-strain and next further component, the clustering +fails. In this case the minimum number of samples per cluster is halved, and the fit is +tried again. If this goes below ten, no fit can be found. + +Increasing ``--min-cluster-prop`` or decreasing ``--D`` gets rid of the errant cluster above:: + + poppunk --fit-model dbscan --ref-db listeria --min-cluster-prop 0.01 + PopPUNK (POPulation Partitioning Using Nucleotide Kmers) + (with backend: sketchlib v1.6.0 + sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so) + + Graph-tools OpenMP parallelisation enabled: with 1 threads + Mode: Fitting dbscan model to reference database + + Fit summary: + Number of clusters 4 + Number of datapoints 8128 + Number of assignments 7805 + + Scaled component means + [0.94155383 0.90322459] + [0.00522549 0.06876396] + [0.11515678 0.24488282] + [0.21152104 0.37635505] + + Network summary: + Components 31 + Density 0.0886 + Transitivity 0.9953 + Score 0.9071 + Removing 95 sequences + + Done + +But note that a few more noise points are generated, and fewer samples are removed +when pruning cliques: + +.. image:: images/dbscan_fit_min_prop.png + :alt: DBSCAN fit increasing assignments per cluster + :align: center + +Setting either ``--min-cluster-prop`` or ``--D`` too low can cause the fit to fail:: + + poppunk --fit-model dbscan --ref-db listeria --min-cluster-prop 0.05 + PopPUNK (POPulation Partitioning Using Nucleotide Kmers) + (with backend: sketchlib v1.6.0 + sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so) + + Graph-tools OpenMP parallelisation enabled: with 1 threads + Mode: Fitting dbscan model to reference database + + Failed to find distinct clusters in this dataset + +refine +------ +Model refinement is slightly different: it takes a model already fitted by :ref:`bgmm` +or :ref:`dbscan` and tries to improve it by optimising the network score. This starts +with a parallelised global optimisation step, followed by a serial local optimisation +step (which can be turned off with ``--no-local``). Use of multiple ``--cpus`` is +effective for these model fits. + +Briefly: + +* A line between the within- and between-strain means is constructed +* The point on this line where samples go from being assigned as within-strain to between-strain is used as the starting point +* A line normal to the first line, passing through this point is constructed. The triangle formed by this line and the x- and y-axes is now the decision boundary. Points within this line are within-strain. +* The starting point is shifted by a distance along the first line, and a new decision boundary formed in the same way. The network is reconstructed. +* The shift of the starting point is optimised, as judged by the network score. First globally by a grid search, then locally near the global optimum. + +Applying this to the *Listeria* DBSCAN fit (noting that you may specify a separate +directory to load the model from with ``--model-dir``, if multiple model fits are available):: + + poppunk --fit-model refine --ref-db listeria --model-dir dbscan + PopPUNK (POPulation Partitioning Using Nucleotide Kmers) + (with backend: sketchlib v1.6.0 + sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so) + + Graph-tools OpenMP parallelisation enabled: with 1 threads + Mode: Fitting refine model to reference database + + Loading DBSCAN model + Loaded previous model of type: dbscan + Initial model-based network construction based on DBSCAN fit + Initial boundary based network construction + Decision boundary starts at (0.63,0.62) + Trying to optimise score globally + Trying to optimise score locally + + Optimization terminated successfully; + The returned value satisfies the termination criteria + (using xtol = 1e-05 ) + Network summary: + Components 29 + Density 0.0897 + Transitivity 0.9984 + Score 0.9088 + Removing 93 sequences + + Done + +As this model was already well fitted, this doesn't change much, and finds very similar +clusters (though noise points are eliminated): + +.. image:: images/listeria_refined.png + :alt: A refine fit on Listeria + :align: center + +The default is to search along the entire range between the within- and between-strain clusters, +but sometimes this can include undesired optima, particularly near the origin. To exclude these, +use ``--pos-shift`` to alter the distance between the end of the search range and the origin +and ``--neg-shift`` for the start of the search range. + +This mode is more useful in species with a relatively high recombination rate the distinction between +the within- and between-strain distributions may be blurred in core and +accessory space. This does not give the mixture model enough information to +draw a good boundary as the likelihood is very flat in this region: + +.. image:: images/pneumo_unrefined.png + :alt: A bad DPGMM fit + :align: center + +Although the score of this fit looks ok (0.904), inspection of the network and +microreact reveals that it is too liberal and clusters have been merged. This +is because some of the blur between the origin and the central distribution has +been included, and connected clusters together erroneously. + +The likelihood of the model fit and the decision boundary looks like this: + +.. image:: images/pneumo_likelihood.png + :alt: The likelihood and decision boundary of the above fit + :align: center + +Using the core and accessory distributions alone does not give much information +about exactly where to put the boundary, and the only way to fix this would be +by specifying strong priors on the weights of the distributions. Fortunately +the network properties give information in the region, and we can use +``--refine-fit`` to tweak the existing fit and pick a better boundary. + +Here is the refined fit, which has a score of 0.939, and 62 rather than 32 +components: + +.. image:: images/pneumo_refined.png + :alt: The refined fit + :align: center + +Which, looking at the `microreact output `__, is much better: + +.. image:: images/refined_microreact.png + :alt: The refined fit, in microreact + :align: center + +.. _manual-start: + +Using fit refinement when mixture model totally fails +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +If the mixture model does not give any sort of reasonable fit to the points, +you can manually provide a file with ``--manual-start`` to give the starting parameters to +``--refine-fit`` mode. The format of this file is as follows:: + + mean0 0,0 + mean1 0.5,0.6 + start_point 0.3 + +A key, followed by its value (space separated). + +``mean0`` and ``mean1`` define the points (x,y) to draw the line between, and +``start_point`` is the distance along this line to draw the initial boundary +(which is normal to the line). These define the three red points (and therefore the +search range) in the output plot + +.. _indiv-refine: + +Using core/accessory only +^^^^^^^^^^^^^^^^^^^^^^^^^ +In some cases, such as analysis within a lineage, it may be desirable to use +only core or accessory distances to classify further queries. This can be +achieved by adding the ``--indiv-refine both`` option, which will allow these boundaries to be +placed independently, allowing the best fit in each case:: + + poppunk --fit-model refine --ref-db listeria --model-dir dbscan --indiv-refine both + PopPUNK (POPulation Partitioning Using Nucleotide Kmers) + (with backend: sketchlib v1.6.0 + sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so) + + Graph-tools OpenMP parallelisation enabled: with 1 threads + Mode: Fitting refine model to reference database + + Loading DBSCAN model + Loaded previous model of type: dbscan + Initial model-based network construction based on DBSCAN fit + Initial boundary based network construction + Decision boundary starts at (0.63,0.62) + Trying to optimise score globally + Trying to optimise score locally + + Optimization terminated successfully; + The returned value satisfies the termination criteria + (using xtol = 1e-05 ) + Refining core and accessory separately + Initial boundary based network construction + Decision boundary starts at (0.63,0.62) + Trying to optimise score globally + Trying to optimise score locally + + Optimization terminated successfully; + The returned value satisfies the termination criteria + (using xtol = 1e-05 ) + Initial boundary based network construction + Decision boundary starts at (0.63,0.62) + Trying to optimise score globally + Trying to optimise score locally + + Optimization terminated successfully; + The returned value satisfies the termination criteria + (using xtol = 1e-05 ) + Network summary: + Components 29 + Density 0.0897 + Transitivity 0.9984 + Score 0.9088 + Network summary: + Components 31 + Density 0.0897 + Transitivity 1.0000 + Score 0.9103 + Network summary: + Components 31 + Density 0.0808 + Transitivity 0.9862 + Score 0.9064 + Removing 93 sequences + + Done + +There are three different networks, and the core and accessory boundaries will +also be shown on the _refined_fit.png plot as dashed gray lines: + +.. image:: images/indiv_refine.png + :alt: Refining fit with core and accessory individuals independently + :align: center + +To use one of these for your saved model, rerun, but instead setting +``--indiv-refine core`` or ``--indiv-refine accessory``. + +threshold +--------- +In this mode no model is fitted. You provide the threshold at which within- and +between-strain distances is drawn. This can be useful if ``refine`` cannot find a boundary +due to a poorly performing network score, but one can clearly be seen from the plot. +It may also be useful to compare with other fits from related species where a boundary +has been identified using one of the fitting procedures. + +Currently only a core-distance boundary is supported (if you would like an accessory or +combined mode available, please `raise an issue `__). +Provide the cutoff with ``--threshold``:: + + poppunk --fit-model threshold --ref-db listeria --threshold 0.003 + PopPUNK (POPulation Partitioning Using Nucleotide Kmers) + (with backend: sketchlib v1.6.0 + sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so) + + Graph-tools OpenMP parallelisation enabled: with 1 threads + Mode: Fitting threshold model to reference database + + Network summary: + Components 31 + Density 0.0897 + Transitivity 1.0000 + Score 0.9103 + Removing 97 sequences + + Done + +.. image:: images/listeria_threshold.png + :alt: A threshold fit on Listeria + :align: center + +.. _lineage-fit: + +lineage +------- +This mode defines clusters by joining nearest neighbours. As this will typically +define subclusters within strains, we refer to these as 'lineages'. This can be used +to find subclusters in addition to one of the above models, or for species without +strain-structure (e.g. some viruses, *Neisseria gonorrhoeae*, *Mycobacterium tuberculosis*). +This is the highest resolution (most specific clusters) provided directly by PopPUNK. If it does +not meet your needs, take a look at :doc:`subclustering` for other options. + +A model is not fitted, and a simple data-driven heuristic is used. For each sample, the +nearest :math:`k` neighbours will be indentified, and joined in the network. Connected components +of the network define lineages, as in the other models. Only core distances are used (add ``--use-accessory`` to modify this), +and in the case of ties all distances are included. Note that these are not necessarily +expected to be transitive, so network scores are not as informative of the optimum. + +We refer to :math:`k` as the 'rank' of the model. Typically you won't know which rank +to use beforehand, so you can provide multiple integer values to the ``--rank`` option, comma separated. +Clusters from all ranks will be output, and all used with :doc:`query_assignment`. :math:`k = 1` is the +most specific rank, and higher values will form looser clusters. With the *Listeria* example:: + + poppunk --fit-model lineage --ref-db listeria --ranks 1,2,3,5 + PopPUNK (POPulation Partitioning Using Nucleotide Kmers) + (with backend: sketchlib v1.6.0 + sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so) + + Graph-tools OpenMP parallelisation enabled: with 1 threads + Mode: Fitting lineage model to reference database + + Network for rank 1 + Network summary: + Components 26 + Density 0.0271 + Transitivity 0.1834 + Score 0.1785 + Network for rank 2 + Network summary: + Components 12 + Density 0.0428 + Transitivity 0.3528 + Score 0.3377 + Network for rank 3 + Network summary: + Components 6 + Density 0.0589 + Transitivity 0.4191 + Score 0.3944 + Network for rank 5 + Network summary: + Components 2 + Density 0.0904 + Transitivity 0.5319 + Score 0.4838 + Parsed data, now writing to CSV + + Done + +This has produced four fits, with ranks 1, 2, 3 and 5 (with fit information contained in +the .pkl file, and a .npz file for each rank). The _clusters.csv will contain the clusters +from the lowest rank. The _lineages.csv file contains all of the assignments, a column +with all of the ranks hyphen-separated (which will give clusters indentical to the lowest rank):: + + id,Rank_1_Lineage,Rank_2_Lineage,Rank_3_Lineage,Rank_5_Lineage,overall_Lineage + 12673_8#24,18,2,2,1,18-2-2-1 + 12673_8#26,4,2,2,1,4-2-2-1 + 12673_8#27,26,1,1,1,26-1-1-1 + 12673_8#28,1,1,1,1,1-1-1-1 + 12673_8#29,4,2,2,1,4-2-2-1 + 12673_8#31,18,2,2,1,18-2-2-1 + 12673_8#32,9,8,1,1,9-8-1-1 + 12673_8#34,7,7,1,1,7-7-1-1 + 12673_8#36,1,1,1,1,1-1-1-1 + +The best way to assess the ranks is by visualising them (:doc:`visualisation`):: + + poppunk_visualise --distances listeria/listeria.dists --ref-db listeria --microreact + + Graph-tools OpenMP parallelisation enabled: with 1 threads + PopPUNK: visualise + Loading previously lineage cluster model + Writing microreact output + Parsed data, now writing to CSV + Building phylogeny + Running t-SNE + + Done + +This can be loaded in microreact: https://microreact.org/project/dVNMftmK6VXRvDxBfrH2y. +Rank 1 has the smallest clusters: + +.. image:: images/listeria_lineage_rank_1.png + :alt: Rank 1 lineage fit for Listeria + :align: center + +Rank 3 has larger clusters. Some of these clusters are polyphyletic on the core neighbour-joining +tree: + +.. image:: images/listeria_lineage_rank_3.png + :alt: Rank 3 lineage fit for Listeria + :align: center + +At the model fit stage, you will also get histograms which show the distances included +in the network, a useful comparison with the original distance distribution and between ranks: + +.. list-table:: + + * - .. figure:: images/listeria_lineage_rank_1_histogram.png + + Rank 1 + + - .. figure:: images/listeria_lineage_rank_3_histogram.png + + Rank 3 + +Use an existing model with new data +----------------------------------- + +There is also one further mode, ``--use-model``, which may be useful in limited circumstances. This +applies any of the above models to a new dataset without refitting it. This may be useful if a reference +dataset has changed (been added to or removed from) and you do not wish to refit the model, for example +because it is already in use. However, typically you would use :doc:`query_assignment` with ``--update-db`` +to add to a model:: + + poppunk --use-model --ref-db new_db --model-dir old_db + PopPUNK (POPulation Partitioning Using Nucleotide Kmers) + (with backend: sketchlib v1.6.0 + sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so) + + Graph-tools OpenMP parallelisation enabled: with 1 threads + Mode: Using previous model with a reference database + + Loading BGMM 2D Gaussian model + Loaded previous model of type: bgmm + Network summary: + Components 31 + Density 0.0897 + Transitivity 1.0000 + Score 0.9103 + Removing 97 sequences + + Done diff --git a/docs/online.rst b/docs/online.rst new file mode 100644 index 00000000..72c3c8b0 --- /dev/null +++ b/docs/online.rst @@ -0,0 +1,103 @@ +PopPUNK-web +================= +This is the newest feature in the PopPUNK pipeline, available at https://web.poppunk.net/. +PoPUNK-web has been designed for non-specialists and allows you to easily analyse and assign lineages to your query genomic sequences, +using pre-optimised species-specific databases and default parameters. + +.. contents:: + :local: + +How it works +------------ +PopPUNK-web uses C++ code compiled to WebAssembly to sketch query sequences client side, then post this sketch to our Python backend running PopPUNK. +This means your raw genomic sequences remain anonymous and secure as they never actually leave your machine. + +Using PopPUNK-web +----------------------- +PopPUNK-web allows you to upload a single isolate's genomic sequence through our user-friendly drag and drop interface (shown below). +After sketching and lineage assignment, information is returned on the species and lineage your query has been assigned to, relative to the genomic sequences in our pre-built databases. +We then use this assignment to generate and return a series of visualisations that provide additional information on the relationship of your sequence to those in our databases. +Currently, PopPUNK-web only supports *S. pneumoniae* sequences but we hope to expand this in the near future. + +.. image:: images/web_home.png + :alt: PopPUNK-web homepage + :align: center + +Outputs +------- +The interactive outputs of PopPUNK-web are conventiently located in a single results page. +You can navigate through the different outputs by using the tabs at the top of the results page. + +Sequence metrics +^^^^^^^^^^^^^^^^ +This tab displays the species and lineage assigned to your query sequence, in addition to the prevalence of the lineage in our databases and information regarding the quality of your uploaded sequence. +Sequence quality metrics indicate the length of your sequence in base pairs, the number of Ns and the frequencies of each base within the sequence. +The frequencies of A & T bases and C & G bases should be approximately equal. + +.. image:: images/web_stats.png + :alt: PopPUNK-web homepage + :align: center + +Cluster prevalences +^^^^^^^^^^^^^^^^^^^ +PopPUNK-web uses the plotly.js package (https://plotly.com) to plot the prevalences of all clusters in our species-specific database, including the assigned query sequence. +Cluster IDs are shown across the x-axis and the corresponding prevalence (%) in our database shown on the y-axis. +Bars are organised by decreasing cluster prevalence. +To visualise subsections of this plot, use your scroll wheel or double click an area to zoom in. Double click again to zoom back out. + +.. image:: images/web_prevs.png + :alt: PopPUNK-web prevalences + :align: center + +The bar corresponding to the assigned cluster of your query sequence is highlighted in orange. + +.. image:: images/web_prevs_zoomed.png + :alt: PopPUNK-web zoomed prevalences + :align: center + +Population phylogeny +^^^^^^^^^^^^^^^^^^^^ +We make use of the Microreact REST API (https://microreact.org/showcase) to display a phylogeny indicating the relationships of all isolates in the assigned species database. +By default, isolates are coloured based on their assigned cluster in our database. + +.. image:: images/web_micro.png + :alt: PopPUNK-web population phylogeny + :align: center + +This phylogeny does not include your query isolate however, the cluster your query has been assigned to can be highlighted by selecting "Highlight_query," instead of "Cluster" (as shown below). + +.. image:: images/web_micro_change.png + :alt: PopPUNK-web population phylogeny change + :align: center + +.. image:: images/web_micro_assigned.png + :alt: PopPUNK-web population phylogeny assigned + :align: center + +Cluster phylogeny +^^^^^^^^^^^^^^^^^ +We use Phylocanvas (http://phylocanvas.org) to display a phylogeny of your query sequence and its relationships to isolates assigned to the same cluster. +Your query sequence is highlighted in red as shown below. + +.. image:: images/web_phylo.png + :alt: PopPUNK-web cluster phylogeny + :align: center + +Cluster network +^^^^^^^^^^^^^^^ +Cytoscape.js (https://js.cytoscape.org) is used to display a network representing the cluster assigned to your query sequence. +Your query sequence is highlighted in orange as shown below. +Edge lengths are currently arbitrary and do not represent evolutionary distances. + +.. image:: images/web_cyto.png + :alt: PopPUNK-web network + :align: center + +New features +------------ +Do you have a feature you would like to see in PopPUNK-web? File an issue on the PopPUNK-web GitHub repository (https://github.com/johnlees/PopPUNK-web/issues). + +Issues/Bugs +----------- +As PopPUNK-web is a new feature, it is possible there are bugs or issues we have no come across yet. +If you identify a problem with PopPUNK-web, please file an issue on the PopPUNK-web GitHub repository (https://github.com/johnlees/PopPUNK-web/issues). diff --git a/docs/options.rst b/docs/options.rst index 78ca1e69..cac51239 100644 --- a/docs/options.rst +++ b/docs/options.rst @@ -1,131 +1,317 @@ Options ======= +**Contents**: + +.. contents:: + :local: + +poppunk +------- + Usage:: - usage: PopPUNK [-h] - (--easy-run | --create-db | --fit-model | --refine-model | --assign-query | --use-model | --generate-viz) - [--ref-db REF_DB] [--r-files R_FILES] [--q-files Q_FILES] + poppunk [-h] + (--create-db | --fit-model {bgmm,dbscan,refine,lineage,threshold} | --use-model) + [--ref-db REF_DB] [--r-files R_FILES] [--distances DISTANCES] - [--external-clustering EXTERNAL_CLUSTERING] --output OUTPUT - [--plot-fit PLOT_FIT] [--full-db] [--update-db] [--overwrite] - [--min-k MIN_K] [--max-k MAX_K] [--k-step K_STEP] - [--sketch-size SKETCH_SIZE] [--max-a-dist MAX_A_DIST] - [--ignore-length] [--K K] [--dbscan] [--D D] - [--min-cluster-prop MIN_CLUSTER_PROP] [--pos-shift POS_SHIFT] + [--external-clustering EXTERNAL_CLUSTERING] + [--output OUTPUT] [--plot-fit PLOT_FIT] [--overwrite] + [--graph-weights] [--min-k MIN_K] [--max-k MAX_K] + [--k-step K_STEP] [--sketch-size SKETCH_SIZE] + [--codon-phased] [--min-kmer-count MIN_KMER_COUNT] + [--exact-count] [--strand-preserved] + [--qc-filter {stop,prune,continue}] [--retain-failures] + [--max-a-dist MAX_A_DIST] [--length-sigma LENGTH_SIGMA] + [--length-range LENGTH_RANGE LENGTH_RANGE] + [--prop-n PROP_N] [--upper-n UPPER_N] [--K K] [--D D] + [--min-cluster-prop MIN_CLUSTER_PROP] + [--threshold THRESHOLD] [--pos-shift POS_SHIFT] [--neg-shift NEG_SHIFT] [--manual-start MANUAL_START] [--indiv-refine] [--no-local] [--model-dir MODEL_DIR] - [--previous-clustering PREVIOUS_CLUSTERING] [--core-only] - [--accessory-only] [--subset SUBSET] [--microreact] - [--cytoscape] [--phandango] [--grapetree] [--rapidnj RAPIDNJ] - [--perplexity PERPLEXITY] [--info-csv INFO_CSV] [--mash MASH] - [--threads THREADS] [--no-stream] [--version] - -Command line options - - optional arguments: - -h, --help show this help message and exit - - Mode of operation: - --easy-run Create clusters from assemblies with default settings - --create-db Create pairwise distances database between reference - sequences - --fit-model Fit a mixture model to a reference database - --refine-model Refine the accuracy of a fitted model - --assign-query Assign the cluster of query sequences without re- - running the whole mixture model - --generate-viz Generate files for a visualisation from an existing - database - --use-model Apply a fitted model to a reference database to - restore database files - - - Input files: - --ref-db REF_DB Location of built reference database - --r-files R_FILES File listing reference input assemblies - --q-files Q_FILES File listing query input assemblies - --distances DISTANCES - Prefix of input pickle of pre-calculated distances - --external-clustering EXTERNAL_CLUSTERING - File with cluster definitions or other labels - generated with any other method. - - Output options: - --output OUTPUT Prefix for output files (required) - --plot-fit PLOT_FIT Create this many plots of some fits relating k-mer to - core/accessory distances [default = 0] - --full-db Keep full reference database, not just representatives - --update-db Update reference database with query sequences - --overwrite Overwrite any existing database files - - Kmer comparison options: - --min-k MIN_K Minimum kmer length [default = 9] - --max-k MAX_K Maximum kmer length [default = 29] - --k-step K_STEP K-mer step size [default = 4] - --sketch-size SKETCH_SIZE - Kmer sketch size [default = 10000] - - Quality control options: - --max-a-dist MAX_A_DIST - Maximum accessory distance to permit [default = 0.5] - --ignore-length Ignore outliers in terms of assembly length [default = - False] - - Model fit options: - --K K Maximum number of mixture components [default = 2] - --dbscan Use DBSCAN rather than mixture model - --D D Maximum number of clusters in DBSCAN fitting [default - = 100] - --min-cluster-prop MIN_CLUSTER_PROP - Minimum proportion of points in a cluster in DBSCAN - fitting [default = 0.0001] - - Refine model options: - --pos-shift POS_SHIFT - Maximum amount to move the boundary away from origin - [default = 0.2] - --neg-shift NEG_SHIFT - Maximum amount to move the boundary towards the origin - [default = 0.4] - --manual-start MANUAL_START - A file containing information for a start point. See - documentation for help. - --indiv-refine Also run refinement for core and accessory - individually - --no-local Do not perform the local optimization step (speed up - on very large datasets) - - Database querying options: - --model-dir MODEL_DIR - Directory containing model to use for assigning - queries to clusters [default = reference database - directory] - --previous-clustering PREVIOUS_CLUSTERING - Directory containing previous cluster definitions and - network [default = use that in the directory - containing the model] - --core-only Use a core-distance only model for assigning queries - [default = False] - --accessory-only Use an accessory-distance only model for assigning - queries [default = False] - - Further analysis options: - --subset SUBSET File with list of sequences to include in - visualisation (with --generate-viz only) - --microreact Generate output files for microreact visualisation - --cytoscape Generate network output files for Cytoscape - --phandango Generate phylogeny and TSV for Phandango visualisation - --grapetree Generate phylogeny and CSV for grapetree visualisation - --rapidnj RAPIDNJ Path to rapidNJ binary to build NJ tree for Microreact - --perplexity PERPLEXITY - Perplexity used to calculate t-SNE projection (with - --microreact) [default=20.0] - --info-csv INFO_CSV Epidemiological information CSV formatted for - microreact (can be used with other outputs) - - Other options: - --mash MASH Location of mash executable - --threads THREADS Number of threads to use [default = 1] - --no-stream Use temporary files for mash dist interfacing. Reduce - memory use/increase disk use for large datasets - --version show program's version number and exit + [--ranks RANKS] [--use-accessory] [--threads THREADS] + [--gpu-sketch] [--gpu-dist] [--deviceid DEVICEID] + [--version] + +Command line options:: + + optional arguments: + -h, --help show this help message and exit + + Mode of operation: + --create-db Create pairwise distances database between + reference sequences + --fit-model {bgmm,dbscan,refine,lineage,threshold} + Fit a mixture model to a reference database + --use-model Apply a fitted model to a reference database to + restore database files + + Input files: + --ref-db REF_DB Location of built reference database + --r-files R_FILES File listing reference input assemblies + --distances DISTANCES + Prefix of input pickle of pre-calculated distances + --external-clustering EXTERNAL_CLUSTERING + File with cluster definitions or other labels + generated with any other method. + + Output options: + --output OUTPUT Prefix for output files + --plot-fit PLOT_FIT Create this many plots of some fits relating k-mer + to core/accessory distances [default = 0] + --overwrite Overwrite any existing database files + --graph-weights Save within-strain Euclidean distances into the + graph + + Create DB options: + --min-k MIN_K Minimum kmer length [default = 13] + --max-k MAX_K Maximum kmer length [default = 29] + --k-step K_STEP K-mer step size [default = 4] + --sketch-size SKETCH_SIZE + Kmer sketch size [default = 10000] + --codon-phased Used codon phased seeds X--X--X [default = False] + --min-kmer-count MIN_KMER_COUNT + Minimum k-mer count when using reads as input + [default = 0] + --exact-count Use the exact k-mer counter with reads [default = + use countmin counter] + --strand-preserved Treat input as being on the same strand, and + ignore reverse complement k-mers [default = use + canonical k-mers] + + Quality control options: + --qc-filter {stop,prune,continue} + Behaviour following sequence QC step: "stop" + [default], "prune" (analyse data passing QC), or + "continue" (analyse all data) + --retain-failures Retain sketches of genomes that do not pass QC + filters in separate database [default = False] + --max-a-dist MAX_A_DIST + Maximum accessory distance to permit [default = + 0.5] + --length-sigma LENGTH_SIGMA + Number of standard deviations of length + distribution beyond which sequences will be + excluded [default = 5] + --length-range LENGTH_RANGE LENGTH_RANGE + Allowed length range, outside of which sequences + will be excluded [two values needed - lower and + upper bounds] + --prop-n PROP_N Threshold ambiguous base proportion above which + sequences will be excluded [default = 0.1] + --upper-n UPPER_N Threshold ambiguous base count above which + sequences will be excluded + + Model fit options: + --K K Maximum number of mixture components [default = 2] + --D D Maximum number of clusters in DBSCAN fitting + [default = 100] + --min-cluster-prop MIN_CLUSTER_PROP + Minimum proportion of points in a cluster in + DBSCAN fitting [default = 0.0001] + --threshold THRESHOLD + Cutoff if using --fit-model threshold + + Refine model options: + --pos-shift POS_SHIFT + Maximum amount to move the boundary away from + origin [default = to between-strain mean] + --neg-shift NEG_SHIFT + Maximum amount to move the boundary towards the + origin [default = to within-strain mean] + --manual-start MANUAL_START + A file containing information for a start point. + See documentation for help. + --indiv-refine {both,core,accessory} + Also run refinement for core and accessory + individually + --no-local Do not perform the local optimization step (speed + up on very large datasets) + --model-dir MODEL_DIR + Directory containing model to use for assigning + queries to clusters [default = reference database + directory] + + Lineage analysis options: + --ranks RANKS Comma separated list of ranks used in lineage + clustering [default = 1,2,3] + --use-accessory Use accessory distances for lineage definitions + [default = use core distances] + + Other options: + --threads THREADS Number of threads to use [default = 1] + --gpu-sketch Use a GPU when calculating sketches (read data + only) [default = False] + --gpu-dist Use a GPU when calculating distances [default = + False] + --deviceid DEVICEID CUDA device ID, if using GPU [default = 0] + --version show program's version number and exit + +poppunk_assign +-------------- + +Usage:: + + poppunk_assign [-h] --db DB --query QUERY [--distances DISTANCES] + [--external-clustering EXTERNAL_CLUSTERING] --output + OUTPUT [--plot-fit PLOT_FIT] [--write-references] + [--update-db] [--overwrite] [--graph-weights] + [--min-kmer-count MIN_KMER_COUNT] [--exact-count] + [--strand-preserved] [--max-a-dist MAX_A_DIST] + [--model-dir MODEL_DIR] + [--previous-clustering PREVIOUS_CLUSTERING] + [--core-only] [--accessory-only] [--threads THREADS] + [--gpu-sketch] [--gpu-dist] [--deviceid DEVICEID] + [--version] + +Command line options:: + + optional arguments: + -h, --help show this help message and exit + + Input files: + --db DB Location of built reference database + --query QUERY File listing query input assemblies + --distances DISTANCES + Prefix of input pickle of pre-calculated distances + (if not in --db) + --external-clustering EXTERNAL_CLUSTERING + File with cluster definitions or other labels + generated with any other method. + + Output options: + --output OUTPUT Prefix for output files (required) + --plot-fit PLOT_FIT Create this many plots of some fits relating k-mer + to core/accessory distances [default = 0] + --write-references Write reference database isolates' cluster + assignments out too + --update-db Update reference database with query sequences + --overwrite Overwrite any existing database files + --graph-weights Save within-strain Euclidean distances into the + graph + + Kmer comparison options: + --min-kmer-count MIN_KMER_COUNT + Minimum k-mer count when using reads as input + [default = 0] + --exact-count Use the exact k-mer counter with reads [default = + use countmin counter] + --strand-preserved Treat input as being on the same strand, and + ignore reverse complement k-mers [default = use + canonical k-mers] + + Quality control options: + --max-a-dist MAX_A_DIST + Maximum accessory distance to permit [default = + 0.5] + + Database querying options: + --model-dir MODEL_DIR + Directory containing model to use for assigning + queries to clusters [default = reference database + directory] + --previous-clustering PREVIOUS_CLUSTERING + Directory containing previous cluster definitions + and network [default = use that in the directory + containing the model] + --core-only (with a 'refine' model) Use a core-distance only + model for assigning queries [default = False] + --accessory-only (with a 'refine' or 'lineage' model) Use an + accessory-distance only model for assigning + queries [default = False] + + Other options: + --threads THREADS Number of threads to use [default = 1] + --gpu-sketch Use a GPU when calculating sketches (read data + only) [default = False] + --gpu-dist Use a GPU when calculating distances [default = + False] + --deviceid DEVICEID CUDA device ID, if using GPU [default = 0] + --version show program's version number and exit + +poppunk_visualise +----------------- + +Usage:: + + poppunk_visualise [-h] --ref-db REF_DB [--query-db QUERY_DB] + [--distances DISTANCES] + [--include-files INCLUDE_FILES] + [--external-clustering EXTERNAL_CLUSTERING] + [--model-dir MODEL_DIR] + [--previous-clustering PREVIOUS_CLUSTERING] + [--previous-query-clustering PREVIOUS_QUERY_CLUSTERING] + --output OUTPUT [--overwrite] [--core-only] + [--accessory-only] [--microreact] [--cytoscape] + [--phandango] [--grapetree] [--rapidnj RAPIDNJ] + [--perplexity PERPLEXITY] [--info-csv INFO_CSV] + [--threads THREADS] [--gpu-dist] + [--deviceid DEVICEID] [--strand-preserved] + [--version] + +Command line options:: + + optional arguments: + -h, --help show this help message and exit + + Input files: + --ref-db REF_DB Location of built reference database + --query-db QUERY_DB Location of query database, if distances are from + ref-query + --distances DISTANCES + Prefix of input pickle of pre-calculated distances + --include-files INCLUDE_FILES + File with list of sequences to include in + visualisation. Default is to use all sequences in + database. + --external-clustering EXTERNAL_CLUSTERING + File with cluster definitions or other labels + generated with any other method. + --model-dir MODEL_DIR + Directory containing model to use for assigning + queries to clusters [default = reference database + directory] + --previous-clustering PREVIOUS_CLUSTERING + Directory containing previous cluster definitions + and network [default = use that in the directory + containing the model] + --previous-query-clustering PREVIOUS_QUERY_CLUSTERING + Directory containing previous cluster definitions + from poppunk_assign [default = use that in the + directory containing the model] + + Output options: + --output OUTPUT Prefix for output files (required) + --overwrite Overwrite any existing visualisation files + + Database querying options: + --core-only (with a 'refine' model) Use a core-distance only + model for assigning queries [default = False] + --accessory-only (with a 'refine' or 'lineage' model) Use an + accessory-distance only model for assigning + queries [default = False] + + Visualisation options: + --microreact Generate output files for microreact visualisation + --cytoscape Generate network output files for Cytoscape + --phandango Generate phylogeny and TSV for Phandango + visualisation + --grapetree Generate phylogeny and CSV for grapetree + visualisation + --rapidnj RAPIDNJ Path to rapidNJ binary to build NJ tree for + Microreact + --perplexity PERPLEXITY + Perplexity used to calculate t-SNE projection + (with --microreact) [default=20.0] + --info-csv INFO_CSV Epidemiological information CSV formatted for + microreact (can be used with other outputs) + + Other options: + --threads THREADS Number of threads to use [default = 1] + --gpu-dist Use a GPU when calculating distances [default = + False] + --deviceid DEVICEID CUDA device ID, if using GPU [default = 0] + --strand-preserved If distances being calculated, treat strand as + known when calculating random match chances + [default = False] + --version show program's version number and exit diff --git a/docs/qc.rst b/docs/qc.rst new file mode 100644 index 00000000..600e9f7e --- /dev/null +++ b/docs/qc.rst @@ -0,0 +1,110 @@ +Data quality control +==================== +PopPUNK now comes with some basic quality control options which are applied by +default when running ``--create-db``: + +- Outlying genome length (calculated during sketching, for assemblies or reads). +- Too many 'N's. +- Outlying accessory distance. + +Accessory distance is also used with ``poppunk_assign``. + +You can set the behaviour when creating the database by setting ``--qc-filter`` +to one of the following three values if any failing samples are found: + +- ``stop`` (default) -- Do not proceed past the sketching step, and throw an error. +- ``prune`` -- Remove failing samples from the sketch database before continuing. +- ``continue`` -- Ignore failing samples and run anyway. + +In all cases a file will be written at ``qcreport.txt`` which lists the failing samples, and the +reasons why they failed. If running with either prune or continue, you may also add ``--retain-failures`` +to write a separate sketch database with the failed samples. + +Random match chances in PopPUNK are only calculated and added to the database after the chosen +QC step. If you use ``poppunk_sketch`` directly, they will be added without any automated QC. + +You can change the genome length cutoff with ``--length-sigma`` which sets the maximum number +of standard deviations from the mean, and ``--length-range`` which sets an absolute range of +allowable sizes. + +Ambiguous bases are controlled by ``--prop-n`` which gives the maximum percentage of Ns, +and ``--upper-n`` which gives the absolute maximum value. + +The maximum allowed accessory distance is 0.5 to ensure you check for contamination. However, +many species do really have high accessory values above this range, in which case you +should increase the value of ``--max-a-dist``. + +Removing samples from an existing database +------------------------------------------ +You can use the ``poppunk_prune`` command to remove samples from a database, +for example those found to be of poor quality. Create a file +``remove.txt`` with the names of the samples you wish to remove, one per line, +and run:: + + poppunk_prune --remove remove.txt --distances strain_db/strain_db.dists --output pruned_db + +This will remove the samples from the ``strain_db.dists`` files, from which +``--model-fit`` can be run again. + +Dealing with poor quality data +------------------------------ +In this example we analyse 76 *Haemophilus influenzae* isolates. One isolate, 14412_4_15, +is contaminated with 12% of reads being *Haemophilus parainfluenzae* and a total +assembly length of 3.8Mb. It would be removed before input, but its presence +can also be found with ``PopPUNK --qc-filter continue``. + +With the distances +^^^^^^^^^^^^^^^^^^ +A fit with three mixture components overestimates the number of between strain +links, and gives a network with a poor score (0.6849) and only five components: + +.. image:: images/contam_DPGMM_fit.png + :alt: A bad fit to pairwise distances + :align: center + +Distances in the top left of the plot, with low core distances and high +accessory distances, are due to the contaminated contigs in the isolate. +Finding which isolates contribute to these distances reveals a clear culprit:: + + awk '$3<0.02 && $4 > 0.3 {print $1}' contam_db/contam_db.search.out | cut -f 1 | sort | uniq -c + 1 14412_3_81 + 1 14412_3_82 + 1 14412_3_83 + 1 14412_3_84 + 1 14412_3_88 + 1 14412_3_89 + 1 14412_3_91 + 1 14412_3_92 + 1 14412_4_1 + 1 14412_4_10 + 28 14412_4_15 + +In this case it is sufficient to increase the number of mixture components to four, +which no longer includes these inflated distances. This gives a score of 0.9401 and 28 components: + +.. image:: images/contam_DPGMM_better_fit.png + :alt: A better fit to pairwise distances + :align: center + +The best thing to do is to remove the poor quality isolate, or if possible +remove the contaminated reads/contigs from the assembly. + +With the network +^^^^^^^^^^^^^^^^ +Alternatively, the network itself can be inspected with ``--cytoscape``. Using +the approach detailed in :ref:`cytoscape-view` gives the following view: + +.. image:: images/cytoscape_contaminant.png + :alt: A better fit to pairwise distances + :align: center + +The contaminated node appears when ordering by ClusteringCoefficient, Stress or +TopologicalCoefficient, and its edges appear when ordering by EdgeBetweeness. +It can be seen highlighted in the top right component, connecting two clusters +which otherwise have no links. It can be removed, and components recalculated in +cytoscape directly, though removal from the PopPUNK database is best. + +The second largest cluster is also suspicious, where there are few triangles +(low transitivity) and the nodes involved have high Stress. This is indicative +of a bad fit overall, rather than a single problem sample. + diff --git a/docs/query_assignment.rst b/docs/query_assignment.rst new file mode 100644 index 00000000..c9fcf673 --- /dev/null +++ b/docs/query_assignment.rst @@ -0,0 +1,237 @@ +Query assignment +================ +This is the recommended mode to use PopPUNK, as long as a database is available for +your species. If there is no DB available, you can fit your own (:doc:`model_fitting`). + +Briefly, `download your reference database `__ and run:: + + poppunk_assign --ref-db database --distances database/database.dists \ + --q-files qfile.txt --output poppunk_clusters --threads 8 + +.. contents:: + :local: + +Downloading a database +---------------------- +Current PopPUNK databases can be found here: https://poppunk.net/pages/databases.html + +We refer to sequences in the database as references, and those being added +as queries. + +A database called ``database`` will contain the following files, in ``database/``: + +- ``database.h5`` -- the sketches of the reference sequences generated by ``pp-sketchlib``. +- ``database.dists.npy`` and ``database.dists.pkl`` -- the core and accessory distances for + all pairwise comparisons in the sketch database. +- ``database.fit.npy`` and ``database.fit.pkl`` -- the model fit to the core and accessory distances. +- ``database_graph.gt`` -- the network defining the fit (loadable with ``graph_tool``). +- ``database_clusters.csv`` -- the PopPUNK clusters for the reference sequences. +- ``database_references.refs`` -- a minimal list of references needed to produce correct clusters. + +If the ``.refs`` file is missing, all of the samples in the sketch database will be +used in the distance calculations. + +You can use the following arguments to individually target these items if necessary, +for example when using an alternative fit, or if split across different directories. The +examples below refer to the default database name: + +- (required) ``--ref-db database`` -- the name of directory containing the .h5 file. +- (required) ``--distances database/database.dists`` -- prefix of the distances. +- ``--model-dir database`` -- directory containing the model fit and network (dists + fit define the network). +- ``--previous-clustering database`` -- directory containing the PopPUNK clusters for the references. + +Clustering your genomes +----------------------- +Create a file which lists your sample names and paths to their sequence data. This file +has no header, is tab separated, and contains the sample name in the first column. Subsequent +columns may contain paths to either assembled or raw read data (the type will automatically +be inferred by checking for the presence of quality scores). Data may be gzipped or uncompressed:: + + MS1 ms1_assembled.fa + MS2 ms2_assembled.fa + SM14 SM14_1.fq.gz SM14_2.fq.gz + +Save this as ``qfile.txt``. You're now ready to cluster them! +Run the following command:: + + poppunk_assign --ref-db database --distances database/database.dists \ + --q-files qfile.txt --output poppunk_clusters --threads 8 + +This will first of all sketch your input genomes, saving them in ``poppunk_clusters/poppunk_clusters.h5``. +If you need to rerun part of the analysis with different options this will automatically be picked up +and loaded. + +.. note:: + :doc:`qc` does not apply to query sequences. A test for maximum accessory distance + will be made, but the program will only emit warnings and will run with all genomes + anyway. Most options for sketching will be taken from the reference database, but you + can still specify error filtering options from read input (``--min-kmer-count`` and + ``--exact-count``) and specify your input as ``--strand-preserved``. See :doc:`sketching` for + more information on these options. + +Next, core and accessory distances between your input sketches and those in the database +will be computed. This has complexity :math:`O(RQ)` where :math:`R` is the number of +samples in ``database_references.refs`` and :math:`Q` is the number in ``qfile.txt``. These distances +are then fed into the model and used to update the network, and therefore clusters. + +The output will look something like this:: + + Graph-tools OpenMP parallelisation enabled: with 4 threads + PopPUNK (POPulation Partitioning Using Nucleotide Kmers) + (with backend: sketchlib v1.5.1 + sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so) + Mode: Assigning clusters of query sequences + + Sketching genomes using 8 thread(s) + Calculating distances using 8 thread(s) + Loading previously refined model + Network loaded: 2007 samples + Found novel query clusters. Calculating distances between them. + Could not find random match chances in database, calculating assuming equal base frequencies + Calculating distances using 8 thread(s) + +Your clusters will be written to ``poppunk_clusters/poppunk_clusters_clusters.csv``:: + + Taxon,Cluster + 21946_6_66,9 + 22695_3_148,9 + 22984_8_88,9 + 21946_6_245,116 + 21946_6_189,814 + 22695_3_73,814 + 21946_6_50,422 + 21903_8_95,148 + 21903_8_250,301 + 22984_8_47,70 + +These names are identical to those used in the reference database, so retain +the same meaning between studies. If new clusters are found they will be numbered +in ascending order from largest to smallest, beginning from the end of the reference +clusters. + +.. note:: + You may observed clusters merging (but never splitting). If your genomes + do cause clusters to merge this will be noted in the output, and the new + clusters will be named using the old ones. For example, if clusters 23 and 38 + merged, the new cluster would be called 23_38. + +By default, only the query genome clusters are included here. The reference genome +clusters are considered unchanged from the input. If there are many merges and you +wish to know their new cluster IDs, use ``--update-db`` (:ref:`update-db`). + +You can use ``poppunk_visualise`` to look at your results. Here's an example output +to cytoscape, showing the clusters as colours, reference genomes as circles and +queries as triangles (open in a new tab to zoom on detail): + +.. image:: images/assign_network.png + :alt: Network produced after query assignment + :align: center + +Adding external cluster labels (MLST, CC etc) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Add the ``--external-clustering`` argument to add a CSV file of cluster definitions +which the output will be additionally labelled with, and output to ``database/database_external_clusters.csv``. +These can be any cluster definitions you wish, with as many columns as you like. A header row is required:: + + sample,GPSC,MLST + 23430_1_186,1,22 + 17794_6_29,23,43 + 12291_4_13,1,2 + +For each PopPUNK cluster, all the samples found in said cluster will be accumulated. +From these accumulated samples the external clusters will be collected, and assigned +to all of these examples. This may give you a one-to-one mapping between PopPUNK clusters +and your external cluster, or you may find multiple external clusters refer to the +PopPUNK cluster giving output such as ``227;811;763;824``. + +Using a model fitted with lineage assignment mode +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +You will need to add ``--assign-lineages`` to pick up the correct model, additionally +you can add options ``--rank`` to choose the rank to assign from (default is the lowest +available) and ``--use-accessory`` to use the accessory distances rather than the core +for clustering. You will find extra model files with the ranks listed in their name if +this model type is available. + +Using a model fitted with ``--indiv-refine`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +If the database was fitted with the refine fit mode, and ``indiv-refine`` you may have +a core distance boundary, accessory boundary and combined core-accessory boundary fit. The +default is to use the combined boundary, to use the others add ``--core-only`` or +``--accessory-only``. + +Increasing speed +---------------- +Query assignment is the most efficient mode, typically requiring :math:`Q` sketches and +:math:`RQ` distances. If you are updating the database, this increases to :math:`Q^2 + RQ` +distances. If you are assigning a very large number of queries you can run ``poppunk_assign`` +with ``--update-db`` repeatedly for batches of query input, as the :math:`Q^2` term will +be reduced by clique-pruning at each iteration. + +Straightforward ways to increase speed include: + +- Add ``--gpu-dist``, if you have a GPU available. +- Add ``--gpu-sketch``, if your input is all reads, and you have a GPU available. If + your input is a mix of assemblies and reads, run in two separate batches, with + the batch of reads using this option. +- Increase ``--threads``. + +.. _update-db: + +Updating the database +--------------------- +If you want to add your query genomes into the reference database so that they +can be used to inform future cluster assignment, this is as simple as adding the +``--update-db`` option to the command above. This is particularly useful when novel +query clusters have been found -- they will then be the consistent name for future assignments:: + + poppunk_assign --ref-db database --distances database/database.dists \ + --q-files qfile.txt --output poppunk_clusters --threads 8 --update-db + + Graph-tools OpenMP parallelisation enabled: with 4 threads + PopPUNK (POPulation Partitioning Using Nucleotide Kmers) + (with backend: sketchlib v1.5.1 + sketchlib: /Users/jlees/miniconda3/envs/pp-py38/lib/python3.8/site-packages/pp_sketchlib.cpython-38-darwin.so) + Mode: Assigning clusters of query sequences + + Sketching 28 genomes using 4 thread(s) + Writing sketches to file + Calculating distances using 4 thread(s) + Loading BGMM 2D Gaussian model + Network loaded: 18 samples + Calculating all query-query distances + Could not find random match chances in database, calculating assuming equal base frequencies + Calculating distances using 4 thread(s) + Updating reference database to poppunk_clusters + Removing 27 sequences + + Done + +The new database contains all of the reference sequences, and all of your query sequences. +The ``poppunk_clusters`` folder will now contain all of the files of a reference +database listed above, except for the model. You can use ``--model-dir`` to target +this for future assignment, or copy it over yourself. Alternatively, if you run +with the same ``--output`` folder as ``--ref-db``, adding ``--overwrite``, the original +input folder will contain the updated database containing everything needed. + +.. note:: + This mode can take longer to run with large numbers of input query genomes, + as it will calculate all :math:`Q^2` query-query distances, rather than + just those found in novel query clusters. + +Visualising results +------------------- +If you wish to produce visualisations from query assignment results the best +way to do this is to run with ``--update-db``, and then run ``poppunk_visualise`` +on the output directory, as if visualising a full reference fit. + +However, it is possible to run directly on the outputs by adding a ``--ref-db`` +as used in the assign command, and a ``--query-db`` which points to the ``--output`` +directory used in the assign command. In this mode isolates will be annotated +depending on whether they were a query or reference input. + +.. warning:: + Without ``--update-db``, visualisation is required to recalculate all query-query distances + each time it is called. If your query set is large and you want repeated visualisations, + run ``poppunk_assign`` with ``--update-db``. + +See :doc:`visualisation` for more details. \ No newline at end of file diff --git a/docs/quickstart.rst b/docs/quickstart.rst deleted file mode 100644 index 8a5b071a..00000000 --- a/docs/quickstart.rst +++ /dev/null @@ -1,229 +0,0 @@ -Quickstart -========== - -.. |nbsp| unicode:: 0xA0 - :trim: - -This guide briefly explains how PopPUNK can be run on a set of genomes. -For a more detailed example see the :doc:`tutorial`. - -We will work with 128 *Listeria monocytogenes* genomes from `Kremer -et al `_ which can be downloaded -from `figshare `__. - -.. contents:: - :local: - -Running PopPUNK ---------------- -First download the example set above, then extract the assemblies and create a -file with a list of their names and locations:: - - tar xf listeria_example.tar.bz2 - paste <(ls *.contigs_velvet.fa) <(ls *.contigs_velvet.fa) > reference_list.txt - -The second command here generates two columns, where the names are the same as the -filenames, but you can define whatever name you want in the first column. - -Now run PopPUNK:: - - poppunk --easy-run --r-files reference_list.txt --output lm_example --threads 4 --plot-fit 5 --min-k 13 --full-db - -This will: - -1. Create a database of mash sketches -2. Use these to calculate core and accessory distances between samples (which - are also stored as part of the database). -3. Fit a two-component Gaussian mixture model to these distances to attempt to - find within-strain distances. -4. Use this fit to construct a network, from which clusters are defined - -where the additional options: - -* ``--threads 4`` increase speed by using more CPUs -* ``--min-k 13`` ensures the distances are not biased by random matches at - lower k-mer lengths. -* ``--plot-fit 5`` plots five examples of the linear fit, to ensure ``--min-k`` - was set high enough. -* ``--full-db`` does not remove redundant references at the end, so the model - fit can be re-run. - - .. important:: - The key step for getting good clusters is to get the right model fit to - the distances. The algorithm is robust to most other parameters settings. - See :ref:`model-refit` for details. - -The cluster definitions are output to ``lm_example/lm_example_clusters.csv``. - -Check the distance fits -^^^^^^^^^^^^^^^^^^^^^^^ -The first thing to do is check the relation between mash distances and core and -accessory distances are correct:: - - Creating mash database for k = 13 - Random 13-mer probability: 0.04 - Creating mash database for k = 21 - Random 21-mer probability: 0.00 - Creating mash database for k = 17 - Random 17-mer probability: 0.00 - Creating mash database for k = 25 - Random 25-mer probability: 0.00 - Creating mash database for k = 29 - Random 29-mer probability: 0.00 - -This shows ``--min-k`` was set appropriately, as no random probabilities were -greater than 0.05. Looking at one of the plots ``lm_example/fit_example_1.pdf`` -shows a straight line fit, with the left most point not significantly above the -fitted relationship: - -.. image:: images/lm_fit.png - :alt: Straight line fit between log(Jaccard distance) and k-mer length - :align: center - -Check the distance plot -^^^^^^^^^^^^^^^^^^^^^^^ -A plot of core and accessory distances contains information about population structure, -and about the evolution of core and accessory elements. Open -``lm_example/lm_example_distanceDistribution.png``: - -.. image:: images/lm_distance_dist.png - :alt: Plot of pairwise core and accessory distances - :align: center - -Each point is the distance between a pair of isolates in the collection. The -x-axis shows core distances, the y-axis accessory distances. Lines are contours -of density in regions where points overlap, running from blue (low density) to -yellow (high density). Usually the highest density will be observed in the -top-right most blob, where isolates from different clusters are being compared. - -In this sample collection the top-right blob represents comparisons between lineage I and -lineage II strains. The blob nearest the origin represents comparisons within -the same strain. The other two blobs are comparisons between different strains -within the same lineage. Overall there is a correlation between core and -accessory divergence, and accessory divergence within a cluster is higher than -the core divergence. - -Check the model fit -^^^^^^^^^^^^^^^^^^^ -A summary of the fit and model is output to ``STDERR``:: - - Fit summary: - Avg. entropy of assignment 0.0004 - Number of components used 2 - Warning: trying to create very large network - Network summary: - Components 2 - Density 0.5405 - Transitivity 1.0000 - Score 0.4595 - -This is a bad network score -- a value of at least 0.8 would be expected for -a good fit. A high density suggests the fit was not specific enough, and too -many points in the core-accessory plot have been included as within strain. -Looking at the fit this proves to be true: - -.. image:: images/lm_GMM_K2.png - :alt: Initial fit using two components - :align: center - -As only two components were used, the separate blobs on the plots were not able -to be captured. The blob closest to the origin must be separated from the -others for a good high-specificity fit. Inclusion of even a small number of -points between different clusters rapidly increases cluster size and decreases -number of clusters. In this example the initial fit clusters lineage I and -lineage II separately, but merges sub-lineages (which we refer to as strains). - -PopPUNK offers three ways to achieve this -- two are discussed below. - -.. _model-refit: - -Re-fitting the model --------------------- -To achieve a better model fit which finds the strains within the main lineages -the blob of points near the origin needs to be separated from the other -clusters. One can use the existing database to refit the model with minimal -extra computation. - -The first way to do this is to increase the number of mixture components to the -number of blobs you roughly judge to be in the plot. In this case there are -four:: - - poppunk --fit-model --distances lm_example/lm_example.dists --ref-db lm_example --output lm_example --full-db --K 4 - -This correctly separates the blob at the origin -- the 'within-strain' -distances: - -.. image:: images/lm_GMM_K4.png - :alt: Improved fit using two components - :align: center - -Which gives more clusters (network components) and a lower density, higher -scoring network:: - - Fit summary: - Avg. entropy of assignment 0.0076 - Number of components used 4 - Network summary: - Components 31 - Density 0.0897 - Transitivity 1.0000 - Score 0.9103 - -Alternatively `DBSCAN `__ can be used, which doesn't require the number of -clusters to be specified:: - - poppunk --fit-model --distances lm_example/lm_example.dists --ref-db lm_example --output lm_example --full-db --dbscan - -This gives a very similar result: - -.. image:: images/lm_dbscan.png - :alt: Improved fit using dbscan - :align: center - -with an almost identical network producing identical clusters:: - - Fit summary: - Number of clusters 4 - Number of datapoints 8128 - Number of assignments 8128 - Network summary: - Components 31 - Density 0.0896 - Transitivity 0.9997 - Score 0.9103 - -The slight discrepancy is due to one within-strain point being classified as -noise (small, black point on the plot). For datasets with more noise points from -DBSCAN then model refinement should always be run after this step (see :ref:`refine-model`). - -Creating interactive output -^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Now that a good, high-specificity fit has been obtained you can add some extra -flags to create output files for visualisation: - -* ``--microreact`` -- Files for `Microreact `__ (see - below). -* ``--rapidnj rapidnj`` -- Perform core NJ tree construction using rapidnj, - which is much faster than the default implementation. The argument points to - the rapidnj binary. -* ``--cytoscape`` -- Files to view the network in `Cytoscape `__. -* ``--phandango`` -- Files to view the clustering in `phandango `__. -* ``--grapetree`` -- Files to view the clustering in `GrapeTree `__. - -As a brief example, in the ``lm_example`` folder find the files: - -* ``lm_example_phandango_clusters.csv`` -* ``lm_example_perplexity20.0_accessory_tsne.dot`` -* ``lm_example_core_NJ.nwk`` - -And drag-and-drop these into the browser at https://microreact.org/upload. -This will produce a visualisation with a core genome phylogeny on the left, and -an embedding of the accessory distances on the right. Each sample is coloured -by its cluster: - -.. image:: images/lm_microreact.png - :alt: Microreact of Listeria monoscytogenes - :align: center - -The interactive version can be found at https://microreact.org/project/rJJ-cXOum. - diff --git a/docs/scripts.rst b/docs/scripts.rst index 9e5a6099..77d9efc1 100644 --- a/docs/scripts.rst +++ b/docs/scripts.rst @@ -6,11 +6,31 @@ Scripts Brief documentation on the helper scripts included in the package in the ``/scripts`` directory. To use these scripts you will need to have a clone of the git repository, or they should also be installed with the prefix 'poppunk' (e.g to run ``extract_distances.py``, run the command -``poppunk_extract_distances.py``). +``poppunk_extract_distances.py``). .. contents:: :local: +Easy run mode +------------- +Previous versions of the software had an ``--easy-run`` mode which would run a pipeline of: + +- ``--create-db`` to sketch genomes +- ``--fit-model --dbscan`` to fit a flexible model +- ``--refine-model`` to improve this model + +This is now available as ``poppunk_easy_run.py`` which will chain calls to ``poppunk`` +and ``poppunk_visualise`` to replicate this functionality. + +Adding weights to the network +----------------------------- +Converts binary within-cluster edge weights to the Euclidean core-accessory distance. +This is equivalent to running with ``--graph-weights``:: + + poppunk_add_weights _graph.gt .dists + +Default output is a graph-tool file. Add ``--graphml`` to save as .graphml instead. + Writing the pairwise distances to an output file ------------------------------------------------ By default PopPUNK does not write the calculated :math:`\pi_n` and :math:`a` distances out, as this diff --git a/docs/sketching.rst b/docs/sketching.rst new file mode 100644 index 00000000..ac98beac --- /dev/null +++ b/docs/sketching.rst @@ -0,0 +1,277 @@ +Sketching +========= + +The basis of all analysis is estimation of core and accessory genome distances between samples. +PopPUNK uses genome sketching to make analysis more efficient. In previous versions we used +mash, however the current version now requires `pp-sketchlib `__. + +This page details options related to sketching and distance calculation, and is relevant +to both :doc:`query_assignment` and :doc:`model_fitting`. + +.. contents:: + :local: + +Overview +-------- +Any input given to ``--r-files`` or ``--q-files`` will be sketched using the following +steps: + +1. Run pp-sketchlib to sketch all input genomes. +2. (r-files only) Run :doc:`qc` on the sketches. Remove, ignore or stop, depending on ``--qc-filter``. +3. (r-files only) Calculate random match chances and add to the database. +4. Save sketches in a HDF5 datbase (the .h5 file). +5. (r-files only) Calculate core and accessory distances between every pair of sketches, save in .npy and .pkl. +6. (q-files only) Calculate core and accessory distances between query and reference sketches. +7. Report any core distances greater than ``--max-a-dist`` (and quit, if an r-file). + +To run this before :doc:`model_fitting`, use ``--create-db``:: + + poppunk --create-db --output database --r-files rlist.txt --threads 8 + +As noted elsewhere, the input is file which lists your sample names and paths to their sequence data. This file +has no header, is tab separated, and contains the sample name in the first column. Subsequent +columns may contain paths to either assembled or raw read data (the type will automatically +be inferred by checking for the presence of quality scores). Data may be gzipped or uncompressed:: + + MS1 ms1_assembled.fa + MS2 ms2_assembled.fa + SM14 SM14_1.fq.gz SM14_2.fq.gz + +The rest of this page describes options to further control this process. + +.. note:: + Sketching considers k-mers on both the forward and reverse by default, as typically + reads and assemblies are not aligned to a single strand. For genomes where input is + always on the same (forward) strand, as may be the case with single-stranded + viral genomes, use the ``--strand-preserved`` option to ignore the reverse strand + k-mers. + +Using pp-sketchlib directly +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +You can use pp-sketchlib directly to create sketches, though functionality is identical +to doing this through PopPUNK. You will need to run both sketch and query modes to generate +the sketch database and the distance files as in ``--create-db``:: + + poppunk_sketch --sketch --rfile rfiles.txt --ref-db database --sketch-size 10000 --min-k 15 --k-step 2 --cpus 4 + poppunk_sketch --query --ref-db database --query-db database --cpus 4 + +You may want to do this if you anticipate trying different k-mer sizes, are using the +databases for other purposes, or running a very large analysis where it is useful to split +up the sketching and distance steps. Useful options include: + +- ``--print`` -- to print distances in human-readable format to the terminal. +- ``--jaccard`` -- will output Jaccard distances at each k-mer length, rather than core and accessory distances. +- ``--subset`` -- to only calculate distances for a subset of the genomes in the reference database. + +.. warning:: + Some options have slightly different names. See the pp-sketchlib README for full details. + +.. _kmer-length: + +Choosing the right k-mer lengths +-------------------------------- +To get a sensitive estimate of accessory distance independent from core +distance, a small a k-mer size as possible needs to be included in the fit. +However, for longer genomes too small a k-mer size will result in biased +estimates of distances as small k-mers will match at random. pp-sketchlib now +includes a correction for random matches, but there is still a lower limit at +which this can work. A simple formula for estimating this is: + +.. math:: + + r &= 1 - (1 - 2 \cdot 4^{-k})^{-l} \\ + J_r &= \frac{r^2}{2r - r^2} + +where :math:`k` is the k-mer length, :math:`l` is the length of the genome and :math:`J_r` +is the Jaccard distance expected by chance. When :math:`J_r` approaches 1, estimation will +begin to fail. + +.. note:: + For genomes on a single strand, the factor of two in the first formula above + should be excluded. + +At the other end, choosing a :math:`k` which is too long will result in all k-mers +mismatching. The greater the core distance :math:`\pi`, the lower the allowable maximum. + +Some k-mer ranges for ``--k-min`` and ``--k-max`` we have found to work for various genomes: + +.. table:: k-mer lengths by domain + :widths: auto + :align: center + + ================== ================= =========== ===== ===== + Domain/pathogen Typical :math:`l` :math:`\pi` k-min k-max + ================== ================= =========== ===== ===== + Beta-coronaviruses 20kb 0.1 6 15 + Bacteria 2-5Mb ~0.01-0.04 13 29 + Fungi 16Mb ~0.01 15 31 + Plasmodium 23Mb 0.0005 17 31 + ================== ================= =========== ===== ===== + +A ``--kmer-step`` of four is usually sufficient, but drop this to two or three +to give the best accuracy at the expense of increased execution time. + +A good model will have a straight line fit between :math:`\log(J)` and :math:`k`. Run +with the ``--plot-fit`` option to randomly choose a number +of sample pairs to plot the relation between k-mer distances and core and +accessory fits. This plot does not have to be perfectly straight, but the general trend +should be followed. If you have a point at the end going off the scale, you will want to adjust +your k-mer range. + +.. image:: images/kmer_fit.png + :alt: A fixed fit to k-mer distances + :align: center + +Choosing the sketch size +^^^^^^^^^^^^^^^^^^^^^^^^ +The default sketch size :math:`s` is 10000. Note that this is 10-fold greater than the mash +default of 1000 -- this is required to get sufficient resolution on :math:`\pi`. For closely +related genomes with smaller :math:`\pi`, you may need to increase the sketch size. + +As a rule of thumb, choose :math:`s = \frac{1}{\pi}` based on the minimum resolution +in :math:`\pi` you need to observe. + +.. important:: + Any Jaccard distances :math:`< \frac{5}{s}` will be + ignored in the fit of core and accessory distances. This prevents spurious + matches between very close sketches dominating, when a poor minimum k-mer length + has been chosen. + +Note that a larger sketch size will result in a linear increase in database size +and distance calculation time. + +Sketching from read data +------------------------ +You can also use sequence reads rather than assemblies as input. The main differences are that +this data is typically a lot larger, and may contain false k-mers as the result of sequencing +errors. + +Read data is automatically detected for each input file. It may be interleaved, or given +as forward and reverse reads. Low frequency k-mers, which are assumed to be the result +of sequencing error, will be filtered out automatically. Use the ``--min-kmer-count`` option +to set the minimum number of k-mers needed to be observed to include these. Most error +k-mers will appear only once, but ideally set this somewhere between 1 and the coverage: + +.. image:: images/13mer_hist.png + :alt: Histogram of k-mers from sequence reads + :align: center + +In this example the coverage is around 150x, so most correct k-mers have a frequency +centred around this point (there is a second peak at twice this value, which are +repeats). There is a large peak at a frequency of one, which are the error k-mers. In this +example any filter between 15-75 would be appropriate. + +The default filter is a probabilistic countmin filter, assuming up to 134M unique k-mers. If you expect +significantly more k-mers than this, for example with longer genomes, you should add +the ``--exact-count`` argument to use a hash table instead. This is exact, but may +use more memory. + +Sketching RNA viruses +--------------------- +Firstly, if your viral genomes are single stranded, you probably need to add the +``--strand-preserved`` option. + +For small genomes where strong selection is present, in the example here shown with influenza genomes, the third codon bias may be so +great that 6-mers (or any multiple of three) have fewer matches than 7-mers. +In a mostly coding genome the third codon position across a gene is more free to mutate, as it +can cause non-synonymous changes, whereas the first and second codons always cause coding changes. This +can cause issues with the core-accessory regression pushing some core distances to 0: + +.. image:: images/flu_unphased.png + :alt: RNA virus with dense seeds + :align: center + +A solution to this is to use k-mers with spaced seeds, where only every third base +is added to the k-mer. This prevents multiples of the codon size lining up with heavily mutated +bases. + +.. table:: Codon phased seeds + :widths: auto + :align: center + + ================== ================= ============== + k-mer dense Phased seed + ================== ================= ============== + 3 XXX X--X--X + 4 XXXX X--X--X--X + 5 XXXXX X--X--X--X--X + ================== ================= ============== + +Add the ``--codon-phased`` option to enable this. This fixes the above example: + +.. image:: images/flu_phased.png + :alt: RNA virus with codon phased seeds + :align: center + +.. note:: + When using a database constructed with codon phased seeds for :doc:`query_assignment`, + codon phased seeds will automatically be turned on for the query sequences too. + +GPU acceleration +---------------- +There are two pieces of heavy computation that can be accelerated with the use of a CUDA-enabled +GPU: + +- Sketching read data ``--gpu-sketch``. +- Calculating core and accessory distances ``--gpu-dist``. + +We assume you have a GPU of at least compute capability v7.0 (Tesla) with drivers +correctly installed. You do not need the CUDA toolkit installed, as all libraries are +included with the pp-sketchlib executable. + +.. note:: + You will see 'GPU' in the progress message if a GPU is successfully being used. If you + see the usual CPU version your install may not have been compiled with CUDA. + +Sketching read data with the GPU is a hybrid algorithm which can take advantage of +CPU threads too (which are used to read and process the fastq files). You can add +up to around 16 ``--threads`` to keep a typical consumer GPU busy. The sequence data +must fit in device memory, along with a 2Gb countmin filter. The countmin filter +is 134M entries wide. If you expect your reads to have more unique k-mers than this +you may see an increased error rate. + +Typical output will look like this:: + + Sketching 128 read sets on GPU device 0 + also using 16 CPU cores + Sketching batch: 1 of 9 + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + Sketching batch: 2 of 9 + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + k = 29 (100%) + .... + +Calculating distances with the GPU will give slightly different results to CPU distances, +but typically within 1%, which should not usually affect downstream results. The sketches, +random matches and distances must fit in the device memory. Around 35k bacterial genomes +uses around 10Gb of device memory, typical for a high-end consumer device. If the device memory +is exceeded the calculation will automatically be split into chunks, at only slightly reduced +efficiency. The amount of memory available and needed will be estimated at the start:: + + Calculating distances on GPU device 0 + Estimated device memory required: 565Mb + Total device memory: 11019Mb + Free device memory: 10855Mb + Progress (GPU): 100.0% + +.. important:: + The GPU which is device 0 will be used by default. If you wish to target another + GPU, use the ``--deviceid`` option. This may be important on computing clusters + where you must use your job's allocated GPU. \ No newline at end of file diff --git a/docs/subclustering.rst b/docs/subclustering.rst new file mode 100644 index 00000000..f8f2970c --- /dev/null +++ b/docs/subclustering.rst @@ -0,0 +1,107 @@ +Subclustering with PopPIPE +========================== + +**Contents**: + +.. contents:: + :local: + +.. danger:: + PopPIPE is still a work-in-progress and may not immediately work on your data. We + anticipate finalising the pipeline in the near future. + +Overview +-------- +You can run `PopPIPE `__ on your PopPUNK output, +which will run subclustering and visualisation within your strains. The pipeline +consists of the following steps: + +- Split files into their strains. +- Calculate core and accessory distances within each strain. +- Use the core distances to make a neighbour-joining tree. +- (lineage_clust mode) Generate clusters from core distances with lineage clustering in PopPUNK. +- Use `ska `__ to generate within-strain alignments. +- Use `IQ-TREE `__ to generate an ML phylogeny for each strain using this alignment, + and the NJ tree as a starting point. +- Use `fastbaps `__ to generate subclusters which are partitions of the phylogeny. +- Create an overall visualisation with both core and accessory distances, as in PopPUNK. + The final tree consists of refining the NJ tree by grafting the maximum likelihood trees for subclusters to their matching nodes. + +An example DAG for the steps (excluding ``ska index``, for which there is one per sample): + +.. image:: images/poppipe_dag.png + :alt: Example DAG for a PopPIPE run + :align: center + +Installation +------------ +PopPIPE is a `snakemake `__ pipeline, which depends +upon snakemake and pandas:: + + conda install snakemake pandas + +Other dependencies will be automatically installed by conda the first time +you run the pipeline. You can also install them yourself and omit the `-use-conda` +directive to snakemake:: + + conda create -n poppipe --file=environment.yml + +Then clone the repository:: + + git clone git@github.com:johnlees/PopPIPE.git + +Usage +----- +1. Modify ``config.yml`` as appropriate. +2. Run ``snakemake --cores --use-conda``. + +On a cluster or the cloud, you can use snakemake's built-in ``--cluster`` argument:: + + snakemake --cluster qsub -j 16 --use-conda + +See the `snakemake docs `__ +for more information on your cluster/cloud provider. + +Alternative runs +^^^^^^^^^^^^^^^^ +For quick and dirty clustering and phylogenies using core distances from +`pp-sketchlib `__ alone, run:: + + snakemake --cores --use-conda lineage_clust + +To create a visualisation on `microreact `__: + + snakemake --use-conda make_microreact + +Config file +----------- + +PopPIPE configuration +^^^^^^^^^^^^^^^^^^^^^ + +- ``script_location``: The ``scripts/`` directory, if not running from the root of this repository +- ``poppunk_db``: The PopPUNK HDF5 database file, without the ``.h5`` suffix. +- ``poppunk_clusters``: The PopPUNK cluster CSV file, usually ``poppunk_db/poppunk_db_clusters.csv``. +- ``poppunk_rfile``: The ``--rfile`` used with PopPUNK, which lists sample names and files, one per line, tab separated. +- ``min_cluster_size``: The minimum size of a cluster to run the analysis on (recommended at least 6). + +IQ-TREE configuration +^^^^^^^^^^^^^^^^^^^^^ + +- ``enabled``: Set to ``false`` to turn off ML tree generation, and use the NJ tree throughout. +- ``mode``: Set to ``full`` to run with the specified model, set to ``fast`` to run using ``--fast`` (like fasttree). +- ``model``: A string for the ``-m`` parameter describing the model. Adding ``+ASC`` is recommended. + +fastbaps configuration +^^^^^^^^^^^^^^^^^^^^^^ + +* ``levels``: Number of levels of recursive subclustering. +* ``script``: Location of the ``run_fastbaps`` script. Find by running ``system.file("run_fastbaps", package = "fastbaps")`` in R. + + +Updating a run +----------------- +Running ``snakemake`` from the same directory will keep outputs where possible, +so new additions will automatically be included. + +**TODO**: How to do this when adding new isolates with ``poppunk_assign`` diff --git a/docs/troubleshooting.rst b/docs/troubleshooting.rst index cc9c45fc..35c10650 100644 --- a/docs/troubleshooting.rst +++ b/docs/troubleshooting.rst @@ -21,6 +21,13 @@ with ``pp-sketchlib =v2.2`` and ``pp-sketchlib >=v1.5.1``. Error/warning messages ---------------------- @@ -66,232 +73,18 @@ used to assign new queries. If you want to change cluster names or assign queries to your own cluster definitions you can use the ``--external-clustering`` argument instead. -.. _kmer-length: - -Choosing the right k-mer lengths --------------------------------- -When using in the ``--create-db`` mode a straight line fit is required. Make -sure to run with the ``--plot-fit`` option, which will randomly choose a number -of sample pairs to plot the relation between k-mer distances and core and -accessory fits. - -To get a sensitive estimate of accessory distance independent from core -distance, a small a k-mer size as possible needs to be included in the fit. -However, for longer genomes too small a k-mer size will result in biased -estimates of distances as small k-mers will match at random. - -Here is an example of a fit with ``--k-step 2 --min-k 13``: - -.. image:: images/fit_example_wrong.png - :alt: A bad fit to k-mer distances - :align: center - -The genome being fitted is 4.6Mb long, which at 13-mer matches gives a 6% -chance of random matches (this information is written to ``STDERR``), resulting -in the left-most point being over-estimated. Using exactly the same command, -but changing ``--min-k 15`` fixes the issue: - -.. image:: images/fit_example_fixed.png - :alt: A fixed fit to k-mer distances - :align: center - -A ``--kmer-step`` of four is usually sufficient, but drop this to two or three -to give the best accuracy. - -.. _manual-start: - -Using fit refinement when mixture model totally fails -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -If the mixture model does not give any sort of reasonable fit to the points, -you can manually provide a file with ``--manual-start`` to give the starting parameters to -``--refine-fit`` mode. The format of this file is as follows:: - - mean0 0,0 - mean1 0.5,0.6 - start_point 0.3 - -A key, followed by its value (space separated). - -``mean0`` and ``mean1`` define the points (x,y) to draw the line between, and -``start_point`` is the distance along this line to draw the initial boundary -(which is normal to the line). - -.. _cytoscape-view: - -Viewing the network with cytoscape ----------------------------------- -If you add the ``--cytoscape`` option when running ``--fit-model`` _cytoscape.graphml -and _cytoscape.csv files will be written to the output directory. - -Open `cytoscape `_ and drag and drop the .graphml -file onto the window to import the network. Import -> table -> file to load the -CSV. Click 'Select None' then add the 'id' column as a key, and any required -metadata columns (at least the 'Cluster' column) as attributes. Make sure -'Node Table Columns' is selected as the data type. - -Click on 'Style' and change the node fill colour to be by cluster, the mapping -type as discrete, then right click to autogenerate a colour scheme. You can -also modify the node size here. In the :doc:`tutorial` example, the components -are nicely separated and the network has high transitivity: - -.. image:: images/cytoscape.png - :alt: Cytoscape plot of network - :align: center - -In some cases, edges which are between strain links may have been erroneously included -in the network. This could be due to poor model fit, or a poor quality -sequence. Use Tools -> NetworkAnalyzer -> Analyze Network to compute -information for each node and edge. It may help to analyze connected components separately. -They can be split under Tools -> NetworkAnalyzer -> Subnetwork Creation. - -Here is an example where an errant node is connecting two clusters into one -large cluster, which should be split: - -.. image:: images/cytoscape_component.png - :alt: Cytoscape plot of network - :align: center - -The incorrect node in question has a low CluteringCoefficient and high Stress. -The EdgeBetweeness of its connections are also high. Sorting the node and edge -tables by these columns can find individual problems such as this. - -.. _perplexity: - -Setting the perplexity parameter for t-SNE ------------------------------------------- -In t-SNE an embedding of the accessory genome distances is found which -represents local structure of the data. Isolates with similar accessory content -will visually appear in clusters together. - -The perplexity sets a guess about the number of close neighbours each point -has, and is a trade-off between local and global structure. t-SNE is reasonably -robust to changes in the perplexity parameter (set with ``--perplexity`` when -creating microreact output with ``--microreact`` in the``--fit-model`` mode), -however we would recommend trying a few values to get -a good embedding for the accessory distances. - -There is a good discussion of the effect of perplexity `here `_ -and the sklearn documentation shows some examples of the effect of `changing -perplexity `_. - -In the :doc:`tutorial` example, a perplexity of 30 gives clear clustering of -the accessory genome content, condordant with the core genome structure (`data `__): - -.. image:: images/microreact.png - :alt: Microreact plot of results with perplexity = 30 - :align: center - -With a lower perplexity of 5, the clustering is too loose, and the strain -structure cannot clearly be seen (`data `__): - -.. image:: images/microreact_perplexity5.png - :alt: Microreact plot of results with perplexity = 5 - :align: center - -30 is a good default, but you may wish to try other values, particularly with -larger or smaller datasets. You can re-run the t-SNE using the ``poppunk_tsne`` -command, providing the distances from the previous run:: - - poppunk_tsne --distances strain_db/strain_db.dists --output strain_db \ - --perplexity 20 --verbosity 1 - -.. _qc: - -Dealing with poor quality data ------------------------------- -In this example we analyse 76 *Haemophilus influenzae* isolates. One isolate, 14412_4_15, -is contaminated with 12% of reads being *Haemophilus parainfluenzae* and a total -assembly length of 3.8Mb. It should be removed before input, but its presence -can also be found with ``PopPUNK``. - -With the distances -^^^^^^^^^^^^^^^^^^ -A fit with three mixture components overestimates the number of between strain -links, and gives a network with a poor score (0.6849) and only five components: - -.. image:: images/contam_DPGMM_fit.png - :alt: A bad fit to pairwise distances - :align: center - -Distances in the top left of the plot, with low core distances and high -accessory distances, are due to the contaminated contigs in the isolate. -Finding which isolates contribute to these distances reveals a clear culprit:: - - awk '$3<0.02 && $4 > 0.3 {print $1}' contam_db/contam_db.search.out | cut -f 1 | sort | uniq -c - 1 14412_3_81 - 1 14412_3_82 - 1 14412_3_83 - 1 14412_3_84 - 1 14412_3_88 - 1 14412_3_89 - 1 14412_3_91 - 1 14412_3_92 - 1 14412_4_1 - 1 14412_4_10 - 28 14412_4_15 - -In this case it is sufficient to increase the number of mixture components to four, -which no longer includes these inflated distances. This gives a score of 0.9401 and 28 components: - -.. image:: images/contam_DPGMM_better_fit.png - :alt: A better fit to pairwise distances - :align: center - -The best thing to do is to remove the poor quality isolate, or if possible -remove the contaminated reads/contigs from the assembly. - -With the network -^^^^^^^^^^^^^^^^ -Alternatively, the network itself can be inspected with ``--cytoscape``. Using -the approach detailed in :ref:`cytoscape-view` gives the following view: - -.. image:: images/cytoscape_contaminant.png - :alt: A better fit to pairwise distances - :align: center - -The contaminated node appears when ordering by ClusteringCoefficient, Stress or -TopologicalCoefficient, and its edges appear when ordering by EdgeBetweeness. -It can be seen highlighted in the top right component, connecting two clusters -which otherwise have no links. It can be removed, and components recalculated in -cytoscape directly, though removal from the PopPUNK database is best. - -The second largest cluster is also suspicious, where there are few triangles -(low transitivity) and the nodes involved have high Stress. This is indicative -of a bad fit overall, rather than a single problem sample. - -Removing samples from a database --------------------------------- -You can use the ``prune_poppunk`` command to remove samples from a database, -for example those found to be of poor quality (see :ref:`qc`). Create a file -``remove.txt`` with the names of the samples you wish to remove, one per line, -and run:: - - prune_poppunk --remove remove.txt --distances strain_db/strain_db.dists --output pruned_db - -This will remove the samples from the ``strain_db.dists`` files, from which -``--model-fit`` can be run again. - -If you would like to create the mash sketches again, which is recommended if -you plan to use ``--full-db`` and/or assign future query sequences, add the -``--resketch`` argument:: - - prune_poppunk --remove remove.txt --distances strain_db/strain_db.dists --output pruned_db --resketch --ref-db strain_db --threads 4 - Memory/run-time issues ---------------------- -For larger datasets resource use may be challenging. So far the largest dataset -we've analysed was around 12000 genomes, which used modest computational -resources. Here are some tips based on these experiences: +Here are some tips based on experiences analysing larger datasets: - Add ``--threads`` -- they are used fairly efficiently throughout. -- When running ``--create-db`` with many threads, add the ``--no-stream`` option. - This will trade-off memory for disk usage, as it seems that many threaded - ``mash dist`` output cannot be processed as fast as it is produced. +- Consider the ``--gpu-sketch`` and ``--gpu-dists`` options is applicable, + and a GPU is available. - In ``--refine-model`` set ``--pos-shift 0`` to avoid creating huge networks with close to :math:`N^2` edges. Mixture models normally need to be pruned. - In ``--refine-model`` you may add the ``--no-local`` option to skip that step and decrease run-time, though gains are likely marginal. -- Use ``--rapid-nj``, if producing MicroReact output. +- Use ``--rapid-nj``, if producing microreact output. Another option for scaling is to run ``--create-db`` with a smaller initial set (not using the ``--full-db`` command), then use ``--assign-query`` to add to this. diff --git a/docs/tutorial.rst b/docs/tutorial.rst deleted file mode 100644 index 533a07dd..00000000 --- a/docs/tutorial.rst +++ /dev/null @@ -1,885 +0,0 @@ -Tutorial -======== - -.. |nbsp| unicode:: 0xA0 - :trim: - -This tutorial will guide you through the use of the four modes of PopPUNK, -explaining when to use each one. In places we refer to :doc:`troubleshooting` -which explains how to deal with common problems when the defaults don't quite -work. - -The first two steps can be run together in a single command using ``--easy-run``, -which in many cases will work without need for further modification. - -In this tutorial we will work with the Salmonella genomes reviewed by `Alikhan -et al `_ which can be downloaded -from `EnteroBase `_. - -.. contents:: - :local: - -Creating a database -------------------- -To analyse a population from scratch (where PopPUNK hasn't been used before) -the first step is to create a PopPUNK database, which is essentially a list of -all the core and accessory distances between each pair of isolates in the -collection. - -The basic command to do this is as follows:: - - poppunk --create-db --r-files reference_list.txt --output strain_db --threads 2 --plot-fit 5 - -Where ``references.txt`` is a list of tab separated sample names and sequence files to analyse, -created by, for example:: - - paste <(ls assemblies/names.txt) <(ls assemblies/*.fasta) > reference_list.txt - -The references will first be hashed at different k-mer lengths, then pairwise -distances are calculated, which are finally converted into core and accessory -distances:: - - PopPUNK (POPulation Partitioning Using Nucleotide Kmers) - Mode: Building new database from input sequences - Creating mash database for k = 13 - Random 13-mer probability: 0.06 - Creating mash database for k = 17 - Random 17-mer probability: 0.00 - Creating mash database for k = 15 - Random 15-mer probability: 0.00 - Creating mash database for k = 19 - Random 19-mer probability: 0.00 - Creating mash database for k = 21 - Random 21-mer probability: 0.00 - Creating mash database for k = 25 - Random 25-mer probability: 0.00 - Creating mash database for k = 23 - Random 23-mer probability: 0.00 - Creating mash database for k = 27 - Random 27-mer probability: 0.00 - Creating mash database for k = 29 - Random 29-mer probability: 0.00 - mash dist -p 2 ./strain_db/strain_db.13.msh ./strain_db/strain_db.13.msh 2> strain_db.err.log - mash dist -p 2 ./strain_db/strain_db.15.msh ./strain_db/strain_db.15.msh 2> strain_db.err.log - mash dist -p 2 ./strain_db/strain_db.17.msh ./strain_db/strain_db.17.msh 2> strain_db.err.log - mash dist -p 2 ./strain_db/strain_db.19.msh ./strain_db/strain_db.19.msh 2> strain_db.err.log - mash dist -p 2 ./strain_db/strain_db.21.msh ./strain_db/strain_db.21.msh 2> strain_db.err.log - mash dist -p 2 ./strain_db/strain_db.23.msh ./strain_db/strain_db.23.msh 2> strain_db.err.log - mash dist -p 2 ./strain_db/strain_db.25.msh ./strain_db/strain_db.25.msh 2> strain_db.err.log - mash dist -p 2 ./strain_db/strain_db.27.msh ./strain_db/strain_db.27.msh 2> strain_db.err.log - mash dist -p 2 ./strain_db/strain_db.29.msh ./strain_db/strain_db.29.msh 2> strain_db.err.log - Calculating core and accessory distances - - Done - -We would recommend using as many threads as available for maximum speed (even -if #threads > #k-mers). To convert k-mer distances into core and accessory -distances the following relationship is used: - -.. math:: - - & \mathrm{pr}(a, b) = (1-a)(1-c)^k \\ - & \log (\mathrm{pr}(a, b)) = \log(1-a) + k \cdot \log(1-c) - -Where :math:`\mathrm{pr}(a, b)` is the proportion of k-mers matching at length -:math:`k` between sequences :math:`a` and :math:`b`. In log-linear space this is -linear by k-mer length, and a constrained least squared fit gives the accessory -distance (the intercept) and the core distance (the slope). - -.. warning:: - A straight line fit is required for correct calculation of core and - accessory distances. To inspect this the use of the ``--plot-fit`` options - is generally recommended to inspect some of the regressions. Choice of min-k - depends on this, and is discussed in :ref:`kmer-length`. - -Output files -^^^^^^^^^^^^ -This will create two files `strain_db/strain_db.dists.npy` and `strain_db/strain_db.dists.pkl` which -store the distances and strain names respectively. These are then used in -:ref:`model-fit`. - -There are also databases of sketches at each k-mer length (`*.msh`) which can -be re-used if the same data is fitted with a new range of k-mer lengths. -Otherwise they should be recalculated by specifying ``--overwrite``. - -Relevant command line options -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The following command line options can be used in this mode: - - Mode of operation: - --create-db Create pairwise distances database between reference - sequences - Input files: - --r-files R_FILES File listing reference input assemblies - - Output options: - --output OUTPUT Prefix for output files (required) - --plot-fit PLOT_FIT Create this many plots of some fits relating k-mer to - core/accessory distances [default = 0] - --overwrite Overwrite any existing database files - - Quality control options: - --max-a-dist MAX_A_DIST - Maximum accessory distance to permit [default = 0.5] - --ignore-length Ignore outliers in terms of assembly length [default = - False] - - Kmer comparison options: - --min-k MIN_K Minimum kmer length [default = 9] - --max-k MAX_K Maximum kmer length [default = 29] - --k-step K_STEP K-mer step size [default = 4] - --sketch-size SKETCH_SIZE - Kmer sketch size [default = 10000] - - Other options: - --mash MASH Location of mash executable - --threads THREADS Number of threads to use during database querying - [default = 1] - --no-stream Use temporary files for mash dist interfacing. Reduce - memory use/increase disk use for large datasets - -.. _model-fit: - -Fitting the model ------------------ - -The basic command used to fit the model is as follows:: - - poppunk-runner.py --fit-model --distances strain_db/strain_db.dists --output strain_db --full-db --ref-db strain_db --K 2 - -This will fit a mixture of up to three 2D Gaussians to the distribution of core and -accessory distances:: - - PopPUNK (POPulation Partitioning Using Nucleotide Kmers) - Mode: Fitting model to reference database - - Fit summary: - Avg. entropy of assignment 0.0042 - Number of components used 2 - Network summary: - Components 12 - Density 0.1852 - Transitivity 0.9941 - Score 0.8100 - - Done - -The default is to fit two components, one for between-strain and one for -within-strain distances. There are a number of summary statistics which you can use to assess the fit: - -========================== ============== -Statistic Interpretation -========================== ============== -Avg. entropy of assignment How confidently each distance is assigned to a component. Closer to zero is more confident, and indicates less overlap of components, which may be indicative of less recombination overall. -Number of components used The number of mixture components actually used, which may be less than the maximum allowed. -Components The number of components in the network == the number of population clusters -Density The proportion of edges in the network. 0 is no links, 1 is every link. Lower is better. -Transitivity The transitivity of the network, between 0 and 1. Higher is better -Score Network score based on density and transitivity. Higher is better. -========================== ============== - -.. important:: - This is the most important part of getting a good estimation of population - structure. In many cases choosing a sensible ``--K`` will get a fit with - a good score, but in more complex cases PopPUNK allows alternative - model fitting. See :ref:`refine-model` for a discussion on how to improve - the model fit. - -The most useful plot is `strain_db_DPGMM_fit.png` which shows the clustering: - -.. image:: images/DPGMM_fit_K2.png - :alt: 2D fit to distances (K = 2) - :align: center - -This looks reasonable. The component closest to the origin is used to create a network where isolates -determined to be within the same strain are linked by edges. The connected components of -this network are then the population clusters. - -In this case, allowing more components (``--K 10``) gives a worse -fit as more complexity is introduced arbitrarily:: - - PopPUNK (POPulation Partitioning Using Nucleotide Kmers) - Mode: Fitting model to reference database - - Fit summary: - Avg. entropy of assignment 0.0053 - Number of components used 10 - Network summary: - Components 121 - Density 0.0534 - Transitivity 0.8541 - Score 0.8085 - - Done - -.. image:: images/DPGMM_fit_K10.png - :alt: 2D fit to distances (K = 10) - :align: center - -In this case the fit is too conservative, and the network has a high number of -components. - -Once you have a good fit, run again with the ``--microreact`` option (and -``--rapidnj`` if you have `rapidnj `_ installed). -This will create output files which can dragged and dropped into `Microreact `_ -for visualisation of the results. - -Drag the files `strain_db_microreact_clusters.csv`, `strain_db_perplexity5.0_accessory_tsne`, and -`strain_db_core_NJ_microreact.nwk` onto Microreact. For this example, the output is at https://microreact.org/project/Skg0j9sjz -(this also includes a CSV of additional metadata downloaded from EnteroBase and supplied to -PopPUNK with ``--info-csv``). - -.. image:: images/microreact.png - :alt: Microreact plot of results - :align: center - -The left panel shows the tree from the core distances, and the right panel the -embedding of accessory distances (at perplexity 30). Differences in clustering between the two can -be informative of separate core and accessory evolution, but in this case they -are correlated as expected for strains. Tips are coloured by the PopPUNK inferred cluster. - -.. note:: - t-SNE can be sensitive to the ``--perplexity`` parameter provided. This can - be re-run as necessary by changing the parameter value. Use a value between - 5 and 50, but see :ref:`perplexity` for further discussion. - -Using DBSCAN -^^^^^^^^^^^^ -Clustering can also be performed by using DBSCAN, which uses the -`HDBSCAN* library `__. Run the same -``fit-model`` command as above, but add the ``--dbscan`` option:: - - poppunk-runner.py --fit-model --distances strain_db/strain_db.dists --output strain_db --full-db --ref-db strain_db --dbscan - -The output is as follows:: - - PopPUNK (POPulation Partitioning Using Nucleotide Kmers) - Mode: Fitting model to reference database - - Fit summary: - Number of clusters 5 - Number of datapoints 100000 - Number of assignments 100000 - Network summary: - Components 9 - Density 0.1906 - Transitivity 0.9979 - Score 0.8077 - - Done - -In this case the fit is quite similar to the mixture model: - -.. image:: images/dbscan_fit.png - :alt: Data fitted with HDBSCAN - :align: center - -The small black points are classified as noise, and are not used in the network -construction. - -.. warning:: - If there are a lot of noise points (in black) then fit refinement will be - subsequently required, as these points will not contribute to the network. - See :ref:`refine-model`. - -Use of full-db -^^^^^^^^^^^^^^ -By default the ``--full-db`` option is off. When on this will keep every sample in the -analysis in the database for future querying. - -When off (the default) representative samples will be picked from each cluster -by choosing only one reference sample from each clique (where all samples in -a clqiue have a within-cluster link to all other samples in the clique). This -can significantly reduce the database size for future querying without loss of -accuracy. Representative samples are written out to a .refs file, and a new -database is sketched for future distance comparison. - -In the case of the example above, this reduces from 848 to 14 representatives (one for -each of the twelve clusters, except for 3 and 6 which have two each). - -If the program was run through using ``--full-db``, references can be picked -and a full directory with a PopPUNK model for query assignment created using -the ``poppunk_references`` program:: - - poppunk_references --network strain_db/strain_db_graph.gpickle --ref-db strain_db --distances strain_db/strain_db.dists \ - --model strain_db --output strain_references --threads 4 - -Using the ``--model`` will also copy over the model fit, so that the entire -PopPUNK database is in a single directory. - -Providing previous cluster definitions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -By using the option ``--external-clustering`` one can provide cluster names -or labels that have been previously defined by any other method. This could -include, for example, another clustering methods IDs, serotypes, clonal complexes -and MLST assignments. The input is a CSV file which is formatted as follows:: - - sample,serotype,MLST - sample1,12,34 - sample2,23F,1 - -This can contain any subset of the samples in the input, and additionally defined -samples will be safely ignored. - -PopPUNK will output a file ``_external_clusters.csv`` which has, for each sample in -the input (either reference or query, depending on the mode it was run in), a list of -of these labels which have been assigned to any sample in the PopPUNK cluster. Samples are -expected to have a single label, but may receive multiple labels. Novel query clusters -will not receive labels. An example of output:: - - sample,serotype,MLST - sample1,12,34 - sample2,23F,1 - sample3,15B;15C,21 - sample4,NA,NA - -.. _fit-files: - -Output files -^^^^^^^^^^^^ -* strain_db.search.out -- the core and accessory distances between all - pairs. -* strain_db_graph.gpickle -- the network used to predict clusters. -* strain_db_DPGMM_fit.png -- scatter plot of all distances, and mixture model - fit and assignment. -* strain_db_DPGMM_fit_contours.png -- contours of likelihood function fitted to - data (blue low -> yellow high). The thick red line is the decision boundary between - within- and between-strain components. -* strain_db_distanceDistribution.png -- scatter plot of the distance - distribution fitted by the model, and a kernel-density estimate. -* strain_db.csv -- isolate names and the cluster assigned. -* strain_db.png -- unclustered distribution of - distances used in the fit (subsampled from total). -* strain_db.npz -- save fit parameters. -* strain_db.refs -- representative references in the new database (unless - ``--full-db`` was used). - -If ``--dbscan`` was used: - -* strain_db_dbscan.png -- scatter plot of all distances, and DBSCAN - assignment. - -If ``--external-clustering`` was used: - -* strain_db_external_clusters.csv -- a CSV file relating the samples - to previous clusters provided in the input CSV. - -If ``--microreact`` was used: - -* strain_db_core_dists.csv -- matrix of pairwise core distances. -* strain_db_acc_dists.csv -- matrix of pairwise accessory distances. -* strain_db_core_NJ_microreact.nwk -- neighbour joining tree using core - distances (for microreact). -* strain_db_perplexity5.0_accessory_tsne.dot -- t-SNE embedding of - accessory distances at given perplexity (for microreact). -* strain_db_microreact_clusters.csv -- cluster assignments plus any epi - data added with the ``--info-csv`` option (for microreact). - -If ``--cytoscape`` was used: - -* strain_db_cytoscape.csv -- cluster assignments plus any epi data added - with the ``--info-csv`` option (for cytoscape). -* strain_db_cytoscape.graphml -- XML representation of resulting network - (for cytoscape). - -Relevant command line options -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The following command line options can be used in this mode: - - Mode of operation: - --fit-model Fit a mixture model to a reference database - - Input files: - --ref-db REF_DB Location of built reference database - --distances DISTANCES - Prefix of input pickle of pre-calculated distances - --external-clustering EXTERNAL_CLUSTERING - File with cluster definitions or other labels - generated with any other method. - - Output options: - --output OUTPUT Prefix for output files (required) - --full-db Keep full reference database, not just representatives - --overwrite Overwrite any existing database files - - Quality control options: - --max-a-dist MAX_A_DIST - Maximum accessory distance to permit [default = 0.5] - - Model fit options: - --K K Maximum number of mixture components [default = 2] - --dbscan Use DBSCAN rather than mixture model - --D D Maximum number of clusters in DBSCAN fitting [default - = 100] - --min-cluster-prop MIN_CLUSTER_PROP - Minimum proportion of points in a cluster in DBSCAN - fitting [default = 0.0001] - Further analysis options: - --microreact Generate output files for microreact visualisation - --cytoscape Generate network output files for Cytoscape - --phandango Generate phylogeny and TSV for Phandango visualisation - --grapetree Generate phylogeny and CSV for grapetree visualisation - --rapidnj RAPIDNJ Path to rapidNJ binary to build NJ tree for Microreact - --perplexity PERPLEXITY - Perplexity used to calculate t-SNE projection (with - --microreact) [default=20.0] - --info-csv INFO_CSV Epidemiological information CSV formatted for - microreact (can be used with other outputs) - - Other options: - --mash MASH Location of mash executable - -.. note:: - If using the default mixture model threads will only be used if ``--full-db`` - is *not* specified and sketching of the representatives is performed at the end. - -.. _refine-model: - -Refining a model -------------------- -In species with a relatively high recombination rate the distinction between -the within- and between-strain distributions may be blurred in core and -accessory space. This does not give the mixture model enough information to -draw a good boundary as the likelihood is very flat in this region. - -See this example of 616 *S.*\ |nbsp| \ *pneumoniae* genomes with the DPGMM fit. These genomes were collected from Massachusetts, -first reported `here `__ and can be accessed -`here `__. - -.. image:: images/pneumo_unrefined.png - :alt: A bad DPGMM fit - :align: center - -Although the score of this fit looks ok (0.904), inspection of the network and -microreact reveals that it is too liberal and clusters have been merged. This -is because some of the blur between the origin and the central distribution has -been included, and connected clusters together erroneously. - -The likelihood of the model fit and the decision boundary looks like this: - -.. image:: images/pneumo_likelihood.png - :alt: The likelihood and decision boundary of the above fit - :align: center - -Using the core and accessory distributions alone does not give much information -about exactly where to put the boundary, and the only way to fix this would be -by specifying strong priors on the weights of the distributions. Fortunately -the network properties give information in the region, and we can use -``--refine-fit`` to tweak the existing fit and pick a better boundary. - -Run:: - - poppunk --refine-model --distances strain_db/strain_db.dists --output strain_db --full-db --ref-db strain_db --threads 4 - -Briefly: - -* A line between the within- and between-strain means is constructed -* The point on this line where samples go from being assigned as within-strain to between-strain is used as the starting point -* A line normal to the first line, passing through this point is constructed. The triangle formed by this line and the x- and y-axes is now the decision boundary. Points within this line are within-strain. -* The starting point is shifted by a distance along the first line, and a new decision boundary formed in the same way. The network is reconstructed. -* The shift of the starting point is optimised, as judged by the network score. First globally by a grid search, then locally near the global optimum. - -If the mixture model does not give any sort of reasonable fit to the points, -see :ref:`manual-start` for details about how to set the starting parameters -for this mode manually. - -The score is a function of transitivity (which is expected to be high, as -everything within a cluster should be the same strain as everything else in the -cluster) and density (which should be low, as there are far fewer within- than -between-strain links). - -Here is the refined fit, which has a score of 0.939, and 62 rather than 32 -components: - -.. image:: images/pneumo_refined.png - :alt: The refined fit - :align: center - -Which, looking at the `microreact output `__, is much better: - -.. image:: images/refined_microreact.png - :alt: The refined fit, in microreact - :align: center - -The core and accessory distances can also be used on their own. -Add the ``--indiv-refine`` option to refine the fit to these two distances -independently (see :ref:`indiv-refine` for more information). - - -Output files -^^^^^^^^^^^^ -The files are as for ``--fit-model`` (:ref:`fit-files`), and also include: - -* strain_db_refined_fit.png -- A plot of the new linear boundary, and core and - accessory distances coloured by assignment to either side of this boundary. -* strain_db_refined_fit.npz -- The saved parameters of the refined fit. - -If ``--indiv-refine`` was used, a copy of the *_clusters.csv* and network *.gpickle* -files for core and accessory only will also be produced. - -Relevant command line options -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The following command line options can be used in this mode: - - Mode of operation: - --refine-model Refine the accuracy of a fitted model - - Input files: - --ref-db REF_DB Location of built reference database - --distances DISTANCES - Prefix of input pickle of pre-calculated distances - --external-clustering EXTERNAL_CLUSTERING - File with cluster definitions or other labels - generated with any other method. - - Output options: - --output OUTPUT Prefix for output files (required) - --full-db Keep full reference database, not just representatives - --overwrite Overwrite any existing database files - - Quality control options: - --max-a-dist MAX_A_DIST - Maximum accessory distance to permit [default = 0.5] - - Refine model options: - --pos-shift POS_SHIFT - Maximum amount to move the boundary away from origin - [default = 0.2] - --neg-shift NEG_SHIFT - Maximum amount to move the boundary towards the origin - [default = 0.4] - --manual-start MANUAL_START - A file containing information for a start point. See - documentation for help. - --indiv-refine Also run refinement for core and accessory - individually - --no-local Do not perform the local optimization step (speed up - on very large datasets) - - Further analysis options: - --microreact Generate output files for microreact visualisation - --cytoscape Generate network output files for Cytoscape - --phandango Generate phylogeny and TSV for Phandango visualisation - --grapetree Generate phylogeny and CSV for grapetree visualisation - --rapidnj RAPIDNJ Path to rapidNJ binary to build NJ tree for Microreact - --perplexity PERPLEXITY - Perplexity used to calculate t-SNE projection (with - --microreact) [default=20.0] - --info-csv INFO_CSV Epidemiological information CSV formatted for - microreact (can be used with other outputs) - - Other options: - --mash MASH Location of mash executable - --threads THREADS Number of threads to use during database querying - [default = 1] - -.. note:: - Threads are used for the global optimisation step only. If the local - optimisation step is slow, turn it off with ``--no-local``. - -Applying a single distance threshold ------------------------------------- -If you want to find clusters beneath a genetic distance cutoff, but using -a network which forms clusters by joining samples beneath this threshold, -you can use ``--threshold``. This will connect samples with core distances -below the provided threshold:: - - poppunk --threshold 0.05 --distances strain_db/strain_db.dists --output strain_db --full-db --ref-db strain_db - -.. _assign-query: - -Assigning queries ------------------ -Once a database has been built and a model fitted (either in one step with -``--easy-run``, or having run both steps separately) new sequences can be -assigned to a cluster using ``--assign-queries``. This process is much quicker -than building a database of all sequences from scratch, and will use the same model fit as -before. Cluster names will not change, unless queries cause clusters to be -merged (in which case they will be the previous cluster names, underscore separated). - -Having created a file listing the new sequences to assign ``query_list.txt``, -the command to assign a cluster to new sequences is:: - - poppunk --assign-query --ref-db strain_db --q-files query_list.txt --output strain_query --threads 3 --update-db - -Where *strain_db* is the output of the previous ``PopPUNK`` commands, -containing the model fit and distances. - -.. note:: - It is possible to specify a model fit in a separate directory from the - distance sketches using ``--model-dir``. Similarly a clustering and network - can be specified using ``--previous-clustering``. - -First, distances between queries and -sequences in the reference database will be calculated. The model fit (whether mixture model, -DBSCAN or refined) will be loaded and used to determine matches to existing -clusters:: - - PopPUNK (POPulation Partitioning Using Nucleotide Kmers) - Mode: Assigning clusters of query sequences - - Creating mash database for k = 15 - Random 15-mer probability: 0.00 - Creating mash database for k = 13 - Random 13-mer probability: 0.04 - Creating mash database for k = 17 - Random 17-mer probability: 0.00 - Creating mash database for k = 19 - Random 19-mer probability: 0.00 - Creating mash database for k = 21 - Random 21-mer probability: 0.00 - Creating mash database for k = 23 - Random 23-mer probability: 0.00 - Creating mash database for k = 25 - Random 25-mer probability: 0.00 - Creating mash database for k = 27 - Random 27-mer probability: 0.00 - Creating mash database for k = 29 - Random 29-mer probability: 0.00 - mash dist -p 3 ./strain_db/strain_db.13.msh ./strain_query/strain_query.13.msh 2> strain_db.err.log - mash dist -p 3 ./strain_db/strain_db.15.msh ./strain_query/strain_query.15.msh 2> strain_db.err.log - mash dist -p 3 ./strain_db/strain_db.17.msh ./strain_query/strain_query.17.msh 2> strain_db.err.log - mash dist -p 3 ./strain_db/strain_db.19.msh ./strain_query/strain_query.19.msh 2> strain_db.err.log - mash dist -p 3 ./strain_db/strain_db.21.msh ./strain_query/strain_query.21.msh 2> strain_db.err.log - mash dist -p 3 ./strain_db/strain_db.23.msh ./strain_query/strain_query.23.msh 2> strain_db.err.log - mash dist -p 3 ./strain_db/strain_db.25.msh ./strain_query/strain_query.25.msh 2> strain_db.err.log - mash dist -p 3 ./strain_db/strain_db.27.msh ./strain_query/strain_query.27.msh 2> strain_db.err.log - mash dist -p 3 ./strain_db/strain_db.29.msh ./strain_query/strain_query.29.msh 2> strain_db.err.log - Calculating core and accessory distances - Loading DBSCAN model - -If query sequences were found which didn't match an existing cluster they will -start a new cluster. ``PopPUNK`` will check whether any of these novel clusters -should be merged, based on the model fit:: - - Found novel query clusters. Calculating distances between them: - Creating mash database for k = 13 - Random 13-mer probability: 0.04 - Creating mash database for k = 15 - Random 15-mer probability: 0.00 - Creating mash database for k = 17 - Random 17-mer probability: 0.00 - Creating mash database for k = 19 - Random 19-mer probability: 0.00 - Creating mash database for k = 21 - Random 21-mer probability: 0.00 - Creating mash database for k = 23 - Random 23-mer probability: 0.00 - Creating mash database for k = 25 - Random 25-mer probability: 0.00 - Creating mash database for k = 27 - Random 27-mer probability: 0.00 - Creating mash database for k = 29 - Random 29-mer probability: 0.00 - mash dist -p 3 ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.13.msh ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.13.msh 2> ./strain_dbij_sqnjr_tmp.err.log - mash dist -p 3 ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.15.msh ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.15.msh 2> ./strain_dbij_sqnjr_tmp.err.log - mash dist -p 3 ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.17.msh ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.17.msh 2> ./strain_dbij_sqnjr_tmp.err.log - mash dist -p 3 ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.19.msh ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.19.msh 2> ./strain_dbij_sqnjr_tmp.err.log - mash dist -p 3 ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.21.msh ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.21.msh 2> ./strain_dbij_sqnjr_tmp.err.log - mash dist -p 3 ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.23.msh ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.23.msh 2> ./strain_dbij_sqnjr_tmp.err.log - mash dist -p 3 ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.25.msh ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.25.msh 2> ./strain_dbij_sqnjr_tmp.err.log - mash dist -p 3 ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.27.msh ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.27.msh 2> ./strain_dbij_sqnjr_tmp.err.log - mash dist -p 3 ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.29.msh ././strain_dbij_sqnjr_tmp/./strain_dbij_sqnjr_tmp.29.msh 2> ./strain_dbij_sqnjr_tmp.err.log - Calculating core and accessory distances - -At this point, cluster assignments for the query sequences are written to a CSV -file. Finally, if new clusters were created due to the queries, the database -will be updated to reflect this if ``--update-db`` was used:: - - Creating mash database for k = 13 - Random 13-mer probability: 0.04 - Overwriting db: ./strain_query/strain_query.13.msh - Creating mash database for k = 15 - Random 15-mer probability: 0.00 - Overwriting db: ./strain_query/strain_query.15.msh - Creating mash database for k = 17 - Random 17-mer probability: 0.00 - Overwriting db: ./strain_query/strain_query.17.msh - Creating mash database for k = 19 - Random 19-mer probability: 0.00 - Overwriting db: ./strain_query/strain_query.19.msh - Creating mash database for k = 21 - Random 21-mer probability: 0.00 - Overwriting db: ./strain_query/strain_query.21.msh - Creating mash database for k = 23 - Random 23-mer probability: 0.00 - Overwriting db: ./strain_query/strain_query.23.msh - Creating mash database for k = 25 - Random 25-mer probability: 0.00 - Overwriting db: ./strain_query/strain_query.25.msh - Creating mash database for k = 27 - Random 27-mer probability: 0.00 - Overwriting db: ./strain_query/strain_query.27.msh - Creating mash database for k = 29 - Random 29-mer probability: 0.00 - Overwriting db: ./strain_query/strain_query.29.msh - Writing strain_query/strain_query.13.joined.msh... - Writing strain_query/strain_query.15.joined.msh... - Writing strain_query/strain_query.17.joined.msh... - Writing strain_query/strain_query.19.joined.msh... - Writing strain_query/strain_query.21.joined.msh... - Writing strain_query/strain_query.23.joined.msh... - Writing strain_query/strain_query.25.joined.msh... - Writing strain_query/strain_query.27.joined.msh... - Writing strain_query/strain_query.29.joined.msh... - - Done - -.. note:: - For future uses of ``--assign-query``, the database now stored in - ``strain-query`` should be used as the ``--ref-db`` argument. - -.. _indiv-refine: - -Using core/accessory only -^^^^^^^^^^^^^^^^^^^^^^^^^ -In some cases, such as analysis within a lineage, it may be desirable to use -only core or accessory distances to classify further queries. This can be -achieved by using the ``--core-only`` or ``--accessory-only`` options with -a fit produced by :ref:`refine-model`. The default is to use the x-axis -intercept of the boundary as the core distance cutoff (y-axis for accessory). -However, if planning on using this mode we recommend running the refinement -with the ``--indiv-refine`` options, which will allow these boundaries to be -placed independently, allowing the best fit in each case:: - - poppunk --refine-model --distances strain_db/strain_db.dists --output strain_db --full-db --indiv-refine --ref-db strain_db --threads 4 - PopPUNK (POPulation Partitioning Using Nucleotide Kmers) - Mode: Refining model fit using network properties - - Loading BGMM 2D Gaussian model - Initial boundary based network construction - Decision boundary starts at (0.54,0.36) - Trying to optimise score globally - Trying to optimise score locally - Refining core and accessory separately - Initial boundary based network construction - Decision boundary starts at (0.54,0.36) - Trying to optimise score globally - Trying to optimise score locally - Initial boundary based network construction - Decision boundary starts at (0.54,0.36) - Trying to optimise score globally - Trying to optimise score locally - Network summary: - Components 132 - Density 0.0889 - Transitivity 0.9717 - Score 0.8853 - Network summary: - Components 114 - Density 0.0955 - Transitivity 0.9770 - Score 0.8837 - Network summary: - Components 92 - Density 0.0937 - Transitivity 0.9327 - Score 0.8453 - writing microreact output: - Building phylogeny - Running t-SNE - - Done - -There are three different networks, and the core and accessory boundaries will -also be shown on the *refined_fit.png* plot as dashed gray lines: - -.. image:: images/indiv_refine.png - :alt: Refining fit with core and accessory individuals independently - :align: center - - -Output files -^^^^^^^^^^^^ -The main output is *strain_query/strain_query_clusters.csv*, which contains the -cluster assignments of the query sequences, ordered by frequency. - -If ``--update-db`` was used a full updated database will be written to -``--output``, which consists of sketches at each k-mer length (*\*.msh*), -a *search.out* file of distances, and a *.gpickle* of the network. - -Relevant command line options -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The following command line options can be used in this mode: - - Mode of operation: - --assign-query Assign the cluster of query sequences without re- - running the whole mixture model - - Input files: - --ref-db REF_DB Location of built reference database - --q-files Q_FILES File listing query input assemblies - --external-clustering EXTERNAL_CLUSTERING - File with cluster definitions or other labels - generated with any other method. - - Output options: - --output OUTPUT Prefix for output files (required) - --update-db Update reference database with query sequences - - Quality control options: - --max-a-dist MAX_A_DIST - Maximum accessory distance to permit [default = 0.5] - --ignore-length Ignore outliers in terms of assembly length [default = - False] - - Database querying options: - --model-dir MODEL_DIR - Directory containing model to use for assigning - queries to clusters [default = reference database - directory] - --previous-clustering PREVIOUS_CLUSTERING - Directory containing previous cluster definitions and - network [default = use that in the directory - containing the model] - --core-only Use a core-distance only model for assigning queries - [default = False] - --accessory-only Use an accessory-distance only model for assigning - queries [default = False] - - Other options: - --mash MASH Location of mash executable - --threads THREADS Number of threads to use [default = 1] - --no-stream Use temporary files for mash dist interfacing. Reduce - memory use/increase disk use for large datasets - --version show program's version number and exit - -Creating external visualisations from a fitted model ----------------------------------------------------- -Visualisations for external software (Microreact etc) will be created in a mode -calling ``--fit-model``, ``--refine-model`` or ``--assign-query`` if any of the following options -were added: - -- ``--microreact`` -- ``--cytoscape`` -- ``--phandango`` -- ``--grapetree`` - -Additionally, if ``--refine-model``, ``--indiv-refine`` and ``--cytoscape`` are all -specified, the networks for core and accessory distances only will also be output. - -To create these outputs for visualisation after the initial command has been run use -the ``--generate-viz`` mode, with the same options as the original run (plus any specific -to the visualisation). In this mode you may also specify a file containing a list of samples to -include in the visualisation with ``--subset``. - -.. note:: - Only a single network will be used in this mode if core and accessory distance - restricted models have also been produced. To visualise these instead of the combined - fit use ``--core-only`` or ``--accessory-only``. - -Using a previous model with a new database ------------------------------------------- -If you have a model which has been fitted which you wish to apply this to a new reference -database, you may do this with ``--use-model``. This will take a fitted model, apply it -to distances from ``--create-db`` and produce a network, assignment and reference database -for future use with ``--assign-query``. - -.. note:: - Generally, to use an existing model with new data it is better to - ``--assign-query`` (see :ref:`assign-query`). This mode can be used when - the model, reference database and network are out of sync due to accidentally - overwriting one or losing track of versions. - -Options are the same as ``--fit-model`` for GMM and DBSCAN models or ``--refine-model`` for -refined models. diff --git a/docs/visualisation.rst b/docs/visualisation.rst new file mode 100644 index 00000000..666daa7a --- /dev/null +++ b/docs/visualisation.rst @@ -0,0 +1,280 @@ +Creating visualisations +======================= + +**Contents**: + +.. contents:: + :local: + +We have moved visualisation tools into their own program ``poppunk_visualise``, both +to reinforce our commitment to UK spellings, and so that you can rerun visualisations +with different outputs and settings without rerunning the other parts of the code. + +Starting with either a full database where you have fitted a model (:doc:`model_fitting`), or +output where you have assigned queries using an existing database (:doc:`query_assignment`), you +can create outputs for external interactive visualisation tools: + +- `Microreact `__ -- a genomic epidemiology visualisation tool, displaying clusters, phylogeny and accessory clustering. +- `GrapeTree `__ -- a tool to visualise strains (designed for cgMLST). +- `Phandango `__ -- visualisation linking genomes and phylogenies. +- `Cytoscape `__ -- a network visualisation tool, which can be used to create, view and manipulate a layout of the graph. + +At least one of these output types must be specified as a flag. + +.. important:: + If you run a visualisation on output from query assignment (:doc:`query_assignment`) + this will not contain all the necessary distances, and they will be calculated before + the visualisation files are produced. + You will see a message ``Note: Distance will be extended to full all-vs-all distances``. + If you are running multiple visualisations this calculation will be completed every time. To avoid + this re-run your assignment with ``--update-db``, which will add these distances in permanently. + +Common options +-------------- +Some typical commands for various input settings (with ``--microreact``, but this can +be altered to any output type) with a database ``example``. + +Visualisation of a full database:: + + poppunk_visualise --ref-db example_db --output example_viz --microreact + +Visualisation after query assignment:: + + poppunk_visualise --ref-db example_db --query-db example_query --output example_viz --microreact + +Visualisation when sketches and models are in different folders:: + + poppunk_visualise --ref-db example_db --previous-clustering example_lineages \ + --model-dir example_lineages --output example_viz --microreact + +Visualisation with a lineage model, which has been queried (query-query distances must be provided):: + + poppunk_visualise --distances example_query/example_query.dists --ref-db example_db \ + --model-dir example_lineages --query-db example_lineage_query \ + --output example_viz --microreact + +Notable modifiers include: + +- ``--include-files`` -- give a file with a subset of names to be included in the visualisation. +- ``--external-clustering`` -- other cluster names to map to strains (such as MLST, serotype etc), + as described in model fitting and query assignment. +- ``--info-csv`` -- similar to the above, but a CSV which is simply (inner-)joined to the output on sample name. +- ``--rapidnj`` -- the location of a `rapidnj `__ binary, + used to build the core NJ tree. We highly recommend using this for any tree-building (and is included with + the conda install). This defaults to ``rapidnj`` on the PATH. Set blank to use dendropy instead (slower, especially + for large datasets). +- ``--core-only``/``--accessory-only`` -- use the core or accessory fit from an individually refined model (see :ref:`indiv-refine`). +- ``--threads``, ``--gpu-dist``, ``--deviceid``, ``--strand-preserved`` -- querying options used if extra distance calculations are needed. + To avoid these, rerun your query with ``--update-db``. + +Microreact +---------- +Adding the ``--microreact`` flag will create the following files: + +- _microreact_clusters.csv -- the strain or lineage assignments with headers formatted for microreact, plus anything from ``--info-csv``. +- _core_NJ.nwk -- a neighbour-joining tree from the core distances. +- _perplexity20.0_accessory_tsne.dot -- a 2D embedding of the accessory distances, produced using t-SNE (in this case with + :ref:`perplexity` 20). + +Open https://microreact.org/upload in your browser, and drag and drop these three files +to create your visualisation. Here is the result of running the visualisation on the +*Listeria* BGMM model:: + + poppunk_visualise --ref-db listeria --microreact + + Graph-tools OpenMP parallelisation enabled: with 1 threads + PopPUNK: visualise + Loading BGMM 2D Gaussian model + Writing microreact output + Parsed data, now writing to CSV + Building phylogeny + Running t-SNE + + Done + +This can be viewed at https://microreact.org/project/8PeGg9fCjZADaAGuNJwU9z: + +.. image:: images/listeria_microreact.png + :alt: Microreact page for Listeria monocytogenes + :align: center + +Useful controls include the tree shape, accessed with the control slider in the +top right of the phylogeny page, and the metadata labels, accessed with the 'eye' +on the right of the page. When visualising lineages, changing the 'Colour by' is useful +to compare results from different ranks. + +.. _perplexity: + +Setting the perplexity parameter for t-SNE +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In t-SNE an embedding of the accessory genome distances is found which +represents local structure of the data. Isolates with similar accessory content +will visually appear in clusters together. + +The perplexity sets a guess about the number of close neighbours each point +has, and is a trade-off between local and global structure. t-SNE is reasonably +robust to changes in the perplexity parameter (set with ``--perplexity`` when +creating microreact output with ``--microreact``), +however we would recommend trying a few values to get +a good embedding for the accessory distances. + +There is a good discussion of the effect of perplexity `here `_ +and the sklearn documentation shows some examples of the effect of `changing +perplexity `_. + +In the :doc:`model_fitting` example, a perplexity of 30 gives clear clustering of +the accessory genome content, condordant with the core genome structure (`data `__): + +.. image:: images/microreact.png + :alt: Microreact plot of results with perplexity = 30 + :align: center + +With a lower perplexity of 5, the clustering is too loose, and the strain +structure cannot clearly be seen (`data `__): + +.. image:: images/microreact_perplexity5.png + :alt: Microreact plot of results with perplexity = 5 + :align: center + +30 is a good default, but you may wish to try other values, particularly with +larger or smaller datasets. You can re-run the t-SNE using the ``poppunk_tsne`` +command, providing the distances from the previous run:: + + poppunk_tsne --distances strain_db/strain_db.dists --output strain_db \ + --perplexity 20 --verbosity 1 + +GrapeTree +--------- +Adding the ``--grapetree`` flag will create: + +- _microreact_clusters.csv -- the strain or lineage assignments with headers formatted for grapetree, plus anything from ``--info-csv``. +- _core_NJ.nwk -- a neighbour-joining tree from the core distances. + +Open https://achtman-lab.github.io/GrapeTree/MSTree_holder.html in your browser, and use +the 'Load files' button once for each of the files to add the tree and strain assignments to +GrapeTree. This will display an unrooted tree with your clusters: + +.. image:: images/grapetree.png + :alt: Grapetree visualisation of results + :align: center + +One of GrapeTree's key features is the ability to collapse branches, and condense information +into nodes. By going to Tree Layout -> Branch style -> Collapse branches, and setting the long +branch to be shortened, one can obtain a view which shows strain prevalence and relationships: + +.. image:: images/grapetree_collapse.png + :alt: Grapetree visualisation of results + :align: center + +There is also a handy 'Export to Microreact' button in GrapeTree, though this will +not include the accessory embedding, so you may wish to add the ``--microreact`` flag +and generate the files yourself. + +Phandango +--------- +Adding the ``--phandango`` flag will create: + +- _phandango_clusters.csv -- the strain or lineage assignments with headers formatted for phandango, plus anything from ``--info-csv``. +- _core_NJ.tree -- a neighbour-joining tree from the core distances. + +Open https://www.phandango.net in your browser, and use +the 'Load files' button once for each of the files to add the tree and strain assignments to +GrapeTree. This will display the tree with your clusters: + +.. image:: images/phandango.png + :alt: Phandango visualisation of results + :align: center + +Press 's' to access settings, and 'p' to create an .svg file. Phandango is most useful +with a genome (.gff file), and either a plot of recombination, accessory genome analysis +or GWAS results. See the documentation for more information. + +.. _cytoscape-view: + +Cytoscape +--------- +Cytoscape is different from the above modes as it creates a layout and visualisation of +the graph used to create strains from distances. This can be useful for more detailed +investigation of network scores, particularly in strains which have less than perfect transitivity. + +Add the ``--cytoscape`` option:: + + poppunk_visualise --ref-db listeria --cytoscape + + Graph-tools OpenMP parallelisation enabled: with 1 threads + PopPUNK: visualise + Loading BGMM 2D Gaussian model + Writing cytoscape output + Network loaded: 128 samples + Parsed data, now writing to CSV + + Done + +Which will create: + +- _cytoscape.csv -- the strain or lineage assignments with headers formatted for cytoscape, plus anything from ``--info-csv``. +- _cytoscape.graphml -- the network in graphml format. + +The .graphml file is an XML file which contains definitions of the nodes (samples) +and edges (within-strain distances) connecting them. If you used ``--graph-weights`` +when you fitted your model the edges will be annotated with their Euclidean distances +in the 'weight' attribute (which you will need to tell cytoscape). These can be added +with the ``poppunk_add_weights`` script if this flag was not used. + +Open `cytoscape `_ and drag and drop the .graphml +file onto the window to import the network. Import -> table -> file to load the +CSV. Click 'Select None' then add the 'id' column as a key, and any required +metadata columns (at least the 'Cluster' column) as attributes. Make sure +'Node Table Columns' is selected as the data type. + +The graphml file does not contain a layout for the graph, that is, positions of +nodes and edges are not specified for a visualisation. These will be calculated by cytoscape, +automatically for small graphs, and with the 'Layout' menu for larger graphs. The 'Prefuse force directed layout' +or 'yFiles Organic Layout' work well. Select the 'weight' dropdown to use the edge-lengths +when drawing the network. + +.. warning:: + We have found that making graphs with >10k nodes may exceed the memory on a typical + laptop. To view larger graphs, first splitting into subgraphs of each connected component + is very helpful. Older versions of cytoscape allowed you to split the graph into connected + components, but newer versions have removed this feature. This can be done programmatically + with ``networkx`` or ``graph-tool`` in python, or ``igraph`` in R. + +Click on 'Style' and change the node fill colour to be by cluster, the mapping +type as discrete, then right click to autogenerate a colour scheme ('Random' is usually best). You can +also modify the node size and shape here. Here is the *Listeria* example, using edge weights in the layout: + +.. image:: images/cytoscape.png + :alt: Cytoscape plot of network + :align: center + +If you used assign query mode you will also have a column with 'Query' or 'Reference', which can +be used to map to different shapes or colours: + +.. image:: images/assign_network.png + :alt: Network produced after query assignment + :align: center + +Adding an info CSV, or loading external tables directly into cytoscapes gives further options +for investigating individual strains: + +.. image:: images/cytoscape_gpsc.png + :alt: Network with added annotation + :align: center + +In some cases, edges which are between strain links may have been erroneously included +in the network. This could be due to poor model fit, or a poor quality +sequence. Use Tools -> NetworkAnalyzer -> Analyze Network to compute +information for each node and edge. It may help to analyze connected components separately. +They can be split under Tools -> NetworkAnalyzer -> Subnetwork Creation. + +Here is an example where an errant node is connecting two clusters into one +large cluster, which should be split: + +.. image:: images/cytoscape_component.png + :alt: Cytoscape plot of network + :align: center + +The incorrect node in question has a low CluteringCoefficient and high Stress. +The EdgeBetweeness of its connections are also high. Sorting the node and edge +tables by these columns can find individual problems such as this. diff --git a/test/run_test.py b/test/run_test.py index d332db0f..e2b10aaa 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -12,7 +12,6 @@ sys.stderr.write("Extracting example dataset\n") subprocess.run("tar xf example_set.tar.bz2", shell=True, check=True) -#easy run sys.stderr.write("Running database creation (--create-db)\n") subprocess.run("python ../poppunk-runner.py --create-db --r-files references.txt --min-k 13 --k-step 3 --output example_db --qc-filter prune --overwrite", shell=True, check=True) @@ -31,6 +30,7 @@ #refine model with GMM sys.stderr.write("Running model refinement (--fit-model refine)\n") subprocess.run("python ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite", shell=True, check=True) +subprocess.run("python ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both", shell=True, check=True) # lineage clustering sys.stderr.write("Running lineage clustering test (--fit-model lineage)\n")