From 59aa17a680100a64abc9362b0e5ce9972ac7f496 Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 22 Dec 2020 16:05:04 +0000 Subject: [PATCH] Run the distance plot at create-db time --- PopPUNK/__main__.py | 6 +++++ PopPUNK/models.py | 9 ------- PopPUNK/plot.py | 18 ++++++++++--- docs/best_practises.rst | 3 +++ docs/conf.py | 4 +-- docs/index.rst | 2 +- docs/installation.rst | 2 +- docs/model_fitting.rst | 57 ++++++++++++++++++++++++++++++++++++++++- docs/online.rst | 2 +- docs/qc.rst | 5 +++- docs/scripts.rst | 11 ++++++++ 11 files changed, 99 insertions(+), 20 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 9ac13439..47fbf986 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -175,6 +175,7 @@ def main(): from .network import printClusters from .plot import writeClusterCsv + from .plot import plot_scatter from .prune_db import prune_distance_matrix @@ -287,6 +288,11 @@ def main(): dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" storePickle(refList, queryList, True, distMat, dists_out) + # Plot results + plot_scatter(distMat, + args.output + "/" + os.path.basename(args.output) + "_distanceDistribution", + args.output + " distances") + #******************************# #* *# #* model fit and network *# diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 24b97dbe..8a974d7e 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -21,8 +21,6 @@ import pp_sketchlib -from .plot import plot_scatter - # BGMM from .bgmm import fit2dMultiGaussian from .bgmm import assign_samples @@ -126,7 +124,6 @@ def fit(self, X = None): '''Initial steps for all fit functions. Creates output directory. If preprocess is set then subsamples passed X - and draws a scatter plot from result using :func:`~PopPUNK.plot.plot_scatter`. Args: X (numpy.array) @@ -159,12 +156,6 @@ def fit(self, X = None): self.scale = np.amax(self.subsampled_X, axis = 0) self.subsampled_X /= self.scale - # Show clustering - plot_scatter(self.subsampled_X, - self.scale, - self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_distanceDistribution", - self.outPrefix + " distances") - def plot(self, X=None): '''Initial steps for all plot functions. diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index 0427b378..2ba936bf 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -6,6 +6,7 @@ import sys import os import subprocess +import random import numpy as np import matplotlib as mpl mpl.use('Agg') @@ -18,7 +19,7 @@ import pandas as pd from collections import defaultdict from scipy import spatial -from sklearn import manifold +from sklearn import manifold, utils try: # sklearn >= 0.22 from sklearn.neighbors import KernelDensity except ImportError: @@ -27,7 +28,7 @@ from .utils import isolateNameToLabel -def plot_scatter(X, scale, out_prefix, title, kde = True): +def plot_scatter(X, out_prefix, title, kde = True): """Draws a 2D scatter plot (png) of the core and accessory distances Also draws contours of kernel density estimare @@ -35,8 +36,6 @@ def plot_scatter(X, scale, out_prefix, title, kde = True): Args: X (numpy.array) n x 2 array of core and accessory distances for n samples. - scale (numpy.array) - Scaling factor from :class:`~PopPUNK.models.BGMMFit` out_prefix (str) Prefix for output plot file (.png will be appended) title (str) @@ -46,6 +45,15 @@ def plot_scatter(X, scale, out_prefix, title, kde = True): (default = True) """ + # Plot results - max 1M for speed + max_plot_samples = 1000000 + if X.shape[0] > max_plot_samples: + X = utils.shuffle(X, random_state=random.randint(1,10000))[0:max_plot_samples,] + + # Kernel estimate uses scaled data 0-1 on each axis + scale = np.amax(X, axis = 0) + X /= scale + plt.figure(figsize=(11, 8), dpi= 160, facecolor='w', edgecolor='k') if kde: xx, yy, xy = get_grid(0, 1, 100) @@ -58,11 +66,13 @@ def plot_scatter(X, scale, out_prefix, title, kde = True): z = z.reshape(xx.shape).T levels = np.linspace(z.min(), z.max(), 10) + # Rescale contours plt.contour(xx*scale[0], yy*scale[1], z, levels=levels[1:], cmap='plasma') scatter_alpha = 1 else: scatter_alpha = 0.1 + # Plot on correct scale plt.scatter(X[:,0]*scale[0].flat, X[:,1]*scale[1].flat, s=1, alpha=scatter_alpha) plt.title(title) diff --git a/docs/best_practises.rst b/docs/best_practises.rst index 8aff718c..593bab24 100644 --- a/docs/best_practises.rst +++ b/docs/best_practises.rst @@ -42,6 +42,9 @@ has many advantages: See :doc:`query_assignment` for instructions on how to use this mode. +You can think of this as being similar to using an existing MLST/cgMLST/wgMLST scheme +to define your sample's strains. + Fit your own model ^^^^^^^^^^^^^^^^^^ If a database isn't available for your species, you can fit your own. Details diff --git a/docs/conf.py b/docs/conf.py index c01c8484..66d95983 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -73,9 +73,9 @@ # built documents. # # The short X.Y version. -version = '2.2.0' +version = '2.3.0' # The full version, including alpha/beta/rc tags. -release = '2.2.0' +release = '2.3.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/index.rst b/docs/index.rst index c3455e2e..a032d3dd 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -23,7 +23,7 @@ by reading the :doc:`best_practises`. .. important:: Looking for older versions of the documentation? For previous versions with - the old API (`--assign-query`, `--refine-fit` etc) see `v2.2.0 `__. + the old API (``--assign-query``, ``--refine-fit`` etc) see `v2.2.0 `__. For older versions which used mash, see `v1.2.0 `__. .. toctree:: diff --git a/docs/installation.rst b/docs/installation.rst index 4a644763..e6ca66a8 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -38,7 +38,7 @@ If you are having conflict issues with conda, our advice would be: - Create a new environment for PopPUNK with ``conda create -n pp_env poppunk`` If you have an older version of PopPUNK, you can upgrade using this method -- you -may also wish to specify the version, for example `conda install poppunk==2.3.0` if you +may also wish to specify the version, for example ``conda install poppunk==2.3.0`` if you wish to upgrade. conda-forge also has some helpful tips: https://conda-forge.org/docs/user/tipsandtricks.html diff --git a/docs/model_fitting.rst b/docs/model_fitting.rst index 5550c3df..9e3c7835 100644 --- a/docs/model_fitting.rst +++ b/docs/model_fitting.rst @@ -1,7 +1,46 @@ Fitting new models ================== +If you cannot find an existing model for your species in the +`list `__ you will want to fit your own. +This process is flexible, and there are five different models you can use depending +on the population structure of your dataset. +First, you must use ``poppunk --create-db`` to sketch your input data and calculate distances +between all samples. This is detailed in :doc:`sketching`. + +Then, you wil use ``poppunk --fit-model `` with one of the following: + +- ``bgmm`` -- Bayesian Gaussian Mixture Model. Best for small sample collections + with strain-structure. Works best when distance distribution components are clearly + separated. +- ``dbscan`` -- HDBSCAN. A good general method for larger sample collections with + strain-structure. Some points will always be designated as noise, so a subsequent run + of model refinement may help improve the fit. +- ``refine`` -- Model refinement. Requires a model already fitted with ``bgmm`` or ``dbscan`` + and attempts to improve it by maximising the network score. Particularly useful when + components overlap significantly (often due to recombination), or when the strain boundary + is thought to lie somewhere within a component. +- ``lineage`` -- Lineage clustering. To find lineages within a strain (subclustering), or + find clusters in a population without strain structure. Uses a simple nearest neighbour approach + so is more of a heuristic. Network scores are not meaningful in this mode. +- ``threshold`` -- Apply a given core or accessory distance threshold to define clusters. Useful if + a cutoff threshold is already known/calculated, is estimated from a plot, or to compare a threshold + between datasets or species. + +The most useful guide to deciding which model to use is the ``_distanceDistribution.png`` file +showing the core and accessory distances. + +bgmm +---- + + +dbscan +------ + + +refine +------ .. _manual-start: @@ -19,4 +58,20 @@ A key, followed by its value (space separated). ``mean0`` and ``mean1`` define the points (x,y) to draw the line between, and ``start_point`` is the distance along this line to draw the initial boundary -(which is normal to the line). \ No newline at end of file +(which is normal to the line). + +lineage +------- + + +threshold +--------- + +Use an existing model with new data +----------------------------------- + +There is also one further mode, ``--use-model``, which may be useful in limited circumstances. This +applies any of the above models to a new dataset without refitting it. This may be useful if a reference +dataset has changed (been added to or removed from) and you do not wish to refit the model, for example +because it is already in use. However, typically you would use :doc:`query_assignment` with ``--update-db`` +to add to a model. diff --git a/docs/online.rst b/docs/online.rst index a18362df..72c3c8b0 100644 --- a/docs/online.rst +++ b/docs/online.rst @@ -1,6 +1,6 @@ PopPUNK-web ================= -This is the newest feature in the PopPUNK pipeline, available at http://poppunk.net/web. +This is the newest feature in the PopPUNK pipeline, available at https://web.poppunk.net/. PoPUNK-web has been designed for non-specialists and allows you to easily analyse and assign lineages to your query genomic sequences, using pre-optimised species-specific databases and default parameters. diff --git a/docs/qc.rst b/docs/qc.rst index ea6813e2..9a21c81b 100644 --- a/docs/qc.rst +++ b/docs/qc.rst @@ -20,6 +20,9 @@ In all cases a file will be written at ``qcreport.txt`` which lists the failing reasons why they failed. If running with either prune or continue, you may also add ``--retain-failures`` to write a separate sketch database with the failed samples. +Random match chances in PopPUNK are only calculated and added to the database after the chosen +QC step. If you use ``poppunk_sketch`` directly, they will be added without any automated QC. + You can change the genome length cutoff with ``--length-sigma`` which sets the maximum number of standard deviations from the mean, and ``--length-range`` which sets an absolute range of allowable sizes. @@ -34,7 +37,7 @@ should increase the value of ``--max-a-dist``. Removing samples from an existing database ------------------------------------------ You can use the ``prune_poppunk`` command to remove samples from a database, -for example those found to be of poor quality (see :ref:`qc`). Create a file +for example those found to be of poor quality. Create a file ``remove.txt`` with the names of the samples you wish to remove, one per line, and run:: diff --git a/docs/scripts.rst b/docs/scripts.rst index 03d516ad..ef041d40 100644 --- a/docs/scripts.rst +++ b/docs/scripts.rst @@ -11,6 +11,17 @@ installed with the prefix 'poppunk' (e.g to run ``extract_distances.py``, run th .. contents:: :local: +Easy run mode +------------- +Previous versions of the software had an ``--easy-run`` mode which would run a pipeline of: + +- ``--create-db`` to sketch genomes +- ``--fit-model --dbscan`` to fit a flexible model +- ``--refine-model`` to improve this model + +This is now available as ``poppunk_easy_run.py`` which will chain calls to ``poppunk`` +and ``poppunk_visualise`` to replicate this functionality. + Writing the pairwise distances to an output file ------------------------------------------------ By default PopPUNK does not write the calculated :math:`\pi_n` and :math:`a` distances out, as this