diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
index 9ac13439..47fbf986 100644
--- a/PopPUNK/__main__.py
+++ b/PopPUNK/__main__.py
@@ -175,6 +175,7 @@ def main():
from .network import printClusters
from .plot import writeClusterCsv
+ from .plot import plot_scatter
from .prune_db import prune_distance_matrix
@@ -287,6 +288,11 @@ def main():
dists_out = args.output + "/" + os.path.basename(args.output) + ".dists"
storePickle(refList, queryList, True, distMat, dists_out)
+ # Plot results
+ plot_scatter(distMat,
+ args.output + "/" + os.path.basename(args.output) + "_distanceDistribution",
+ args.output + " distances")
+
#******************************#
#* *#
#* model fit and network *#
diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 24b97dbe..8a974d7e 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -21,8 +21,6 @@
import pp_sketchlib
-from .plot import plot_scatter
-
# BGMM
from .bgmm import fit2dMultiGaussian
from .bgmm import assign_samples
@@ -126,7 +124,6 @@ def fit(self, X = None):
'''Initial steps for all fit functions.
Creates output directory. If preprocess is set then subsamples passed X
- and draws a scatter plot from result using :func:`~PopPUNK.plot.plot_scatter`.
Args:
X (numpy.array)
@@ -159,12 +156,6 @@ def fit(self, X = None):
self.scale = np.amax(self.subsampled_X, axis = 0)
self.subsampled_X /= self.scale
- # Show clustering
- plot_scatter(self.subsampled_X,
- self.scale,
- self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_distanceDistribution",
- self.outPrefix + " distances")
-
def plot(self, X=None):
'''Initial steps for all plot functions.
diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py
index 0427b378..2ba936bf 100644
--- a/PopPUNK/plot.py
+++ b/PopPUNK/plot.py
@@ -6,6 +6,7 @@
import sys
import os
import subprocess
+import random
import numpy as np
import matplotlib as mpl
mpl.use('Agg')
@@ -18,7 +19,7 @@
import pandas as pd
from collections import defaultdict
from scipy import spatial
-from sklearn import manifold
+from sklearn import manifold, utils
try: # sklearn >= 0.22
from sklearn.neighbors import KernelDensity
except ImportError:
@@ -27,7 +28,7 @@
from .utils import isolateNameToLabel
-def plot_scatter(X, scale, out_prefix, title, kde = True):
+def plot_scatter(X, out_prefix, title, kde = True):
"""Draws a 2D scatter plot (png) of the core and accessory distances
Also draws contours of kernel density estimate
@@ -35,8 +36,6 @@ def plot_scatter(X, scale, out_prefix, title, kde = True):
Args:
X (numpy.array)
n x 2 array of core and accessory distances for n samples.
- scale (numpy.array)
- Scaling factor from :class:`~PopPUNK.models.BGMMFit`
out_prefix (str)
Prefix for output plot file (.png will be appended)
title (str)
@@ -46,6 +45,15 @@ def plot_scatter(X, scale, out_prefix, title, kde = True):
(default = True)
"""
+ # Plot results - max 1M for speed
+ max_plot_samples = 1000000
+ if X.shape[0] > max_plot_samples:
+ X = utils.shuffle(X, random_state=random.randint(1,10000))[0:max_plot_samples,]
+
+ # Kernel estimate uses scaled data 0-1 on each axis
+ scale = np.amax(X, axis = 0)
+    X = X / scale  # avoid modifying the caller's distance matrix in place
+
plt.figure(figsize=(11, 8), dpi= 160, facecolor='w', edgecolor='k')
if kde:
xx, yy, xy = get_grid(0, 1, 100)
@@ -58,11 +66,13 @@ def plot_scatter(X, scale, out_prefix, title, kde = True):
z = z.reshape(xx.shape).T
levels = np.linspace(z.min(), z.max(), 10)
+ # Rescale contours
plt.contour(xx*scale[0], yy*scale[1], z, levels=levels[1:], cmap='plasma')
scatter_alpha = 1
else:
scatter_alpha = 0.1
+ # Plot on correct scale
+    plt.scatter(X[:,0]*scale[0], X[:,1]*scale[1], s=1, alpha=scatter_alpha)
plt.title(title)
diff --git a/docs/best_practises.rst b/docs/best_practises.rst
index 8aff718c..593bab24 100644
--- a/docs/best_practises.rst
+++ b/docs/best_practises.rst
@@ -42,6 +42,9 @@ has many advantages:
See :doc:`query_assignment` for instructions on how to use this mode.
+You can think of this as being similar to using an existing MLST/cgMLST/wgMLST scheme
+to define the strains of your samples.
+
Fit your own model
^^^^^^^^^^^^^^^^^^
If a database isn't available for your species, you can fit your own. Details
diff --git a/docs/conf.py b/docs/conf.py
index c01c8484..66d95983 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -73,9 +73,9 @@
# built documents.
#
# The short X.Y version.
-version = '2.2.0'
+version = '2.3.0'
# The full version, including alpha/beta/rc tags.
-release = '2.2.0'
+release = '2.3.0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
diff --git a/docs/index.rst b/docs/index.rst
index c3455e2e..a032d3dd 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -23,7 +23,7 @@ by reading the :doc:`best_practises`.
.. important::
Looking for older versions of the documentation? For previous versions with
- the old API (`--assign-query`, `--refine-fit` etc) see `v2.2.0 `__.
+ the old API (``--assign-query``, ``--refine-fit`` etc) see `v2.2.0 `__.
For older versions which used mash, see `v1.2.0 `__.
.. toctree::
diff --git a/docs/installation.rst b/docs/installation.rst
index 4a644763..e6ca66a8 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -38,7 +38,7 @@ If you are having conflict issues with conda, our advice would be:
- Create a new environment for PopPUNK with ``conda create -n pp_env poppunk``
If you have an older version of PopPUNK, you can upgrade using this method -- you
-may also wish to specify the version, for example `conda install poppunk==2.3.0` if you
+may also wish to specify the version, for example ``conda install poppunk==2.3.0`` if you
wish to upgrade.
conda-forge also has some helpful tips: https://conda-forge.org/docs/user/tipsandtricks.html
diff --git a/docs/model_fitting.rst b/docs/model_fitting.rst
index 5550c3df..9e3c7835 100644
--- a/docs/model_fitting.rst
+++ b/docs/model_fitting.rst
@@ -1,7 +1,46 @@
Fitting new models
==================
+If you cannot find an existing model for your species in the
+`list `__, you will want to fit your own.
+This process is flexible, and there are five different models you can use depending
+on the population structure of your dataset.
+First, you must use ``poppunk --create-db`` to sketch your input data and calculate distances
+between all samples. This is detailed in :doc:`sketching`.
+
+Then, you will use ``poppunk --fit-model `` with one of the following:
+
+- ``bgmm`` -- Bayesian Gaussian Mixture Model. Best for small sample collections
+ with strain-structure. Works best when distance distribution components are clearly
+ separated.
+- ``dbscan`` -- HDBSCAN. A good general method for larger sample collections with
+ strain-structure. Some points will always be designated as noise, so a subsequent run
+ of model refinement may help improve the fit.
+- ``refine`` -- Model refinement. Requires a model already fitted with ``bgmm`` or ``dbscan``
+ and attempts to improve it by maximising the network score. Particularly useful when
+ components overlap significantly (often due to recombination), or when the strain boundary
+ is thought to lie somewhere within a component.
+- ``lineage`` -- Lineage clustering. To find lineages within a strain (subclustering), or
+  find clusters in a population without strain structure. Uses a simple nearest neighbour approach,
+  so it is more of a heuristic. Network scores are not meaningful in this mode.
+- ``threshold`` -- Apply a given core or accessory distance threshold to define clusters. Useful if
+  a cutoff is already known or has been calculated elsewhere, if it can be estimated from a plot, or
+  if you want to compare the same threshold between datasets or species.
+
+The most useful guide for deciding which model to use is the ``_distanceDistribution.png`` file
+showing the core and accessory distances, which is produced when you run ``--create-db``.
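+
+For example, a typical run of these two steps might look like the following (file and database
+names are just placeholders, and exact flags may need adjusting for your data)::
+
+    poppunk --create-db --r-files rfiles.txt --output example_db --threads 4
+    poppunk --fit-model dbscan --ref-db example_db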
+
+bgmm
+----
+
+
+dbscan
+------
+
+
+refine
+------
.. _manual-start:
@@ -19,4 +58,20 @@ A key, followed by its value (space separated).
``mean0`` and ``mean1`` define the points (x,y) to draw the line between, and
``start_point`` is the distance along this line to draw the initial boundary
-(which is normal to the line).
\ No newline at end of file
+(which is normal to the line).
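+
+As a sketch, such a file might look like the following (the numbers, and the comma-separated
+(x,y) format, are purely illustrative)::
+
+    mean0 0.0,0.0
+    mean1 0.5,0.6
+    start_point 0.3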
+
+lineage
+-------
+
+
+threshold
+---------
+
+Use an existing model with new data
+-----------------------------------
+
+There is also one further mode, ``--use-model``, which may be useful in limited circumstances. This
+applies any of the above models to a new dataset without refitting it. This can help if a reference
+dataset has changed (has had samples added or removed) and you do not wish to refit the model, for
+example because it is already in use. However, you would typically use :doc:`query_assignment` with
+``--update-db`` to add to a model.
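+
+A minimal sketch of such a call (directory names are placeholders, and the exact flags may
+differ between versions)::
+
+    poppunk --use-model --ref-db example_db --model-dir old_fit --output example_output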
diff --git a/docs/online.rst b/docs/online.rst
index a18362df..72c3c8b0 100644
--- a/docs/online.rst
+++ b/docs/online.rst
@@ -1,6 +1,6 @@
PopPUNK-web
=================
-This is the newest feature in the PopPUNK pipeline, available at http://poppunk.net/web.
+This is the newest feature in the PopPUNK pipeline, available at https://web.poppunk.net/.
PopPUNK-web has been designed for non-specialists and allows you to easily analyse and assign lineages to your query genomic sequences,
using pre-optimised species-specific databases and default parameters.
diff --git a/docs/qc.rst b/docs/qc.rst
index ea6813e2..9a21c81b 100644
--- a/docs/qc.rst
+++ b/docs/qc.rst
@@ -20,6 +20,9 @@ In all cases a file will be written at ``qcreport.txt`` which lists the failing
reasons why they failed. If running with either prune or continue, you may also add ``--retain-failures``
to write a separate sketch database with the failed samples.
+Random match chances in PopPUNK are only calculated and added to the database after the chosen
+QC step. If you use ``poppunk_sketch`` directly, they will be added without any automated QC.
+
You can change the genome length cutoff with ``--length-sigma`` which sets the maximum number
of standard deviations from the mean, and ``--length-range`` which sets an absolute range of
allowable sizes.
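+
+For example, a run which prunes samples whose genome length is more than two standard deviations
+from the mean might look like this (the ``--qc-filter`` flag name and the values are illustrative)::
+
+    poppunk --create-db --r-files rfiles.txt --output example_db --qc-filter prune --length-sigma 2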
@@ -34,7 +37,7 @@ should increase the value of ``--max-a-dist``.
Removing samples from an existing database
------------------------------------------
You can use the ``prune_poppunk`` command to remove samples from a database,
-for example those found to be of poor quality (see :ref:`qc`). Create a file
+for example those found to be of poor quality. Create a file
``remove.txt`` with the names of the samples you wish to remove, one per line,
and run::
diff --git a/docs/scripts.rst b/docs/scripts.rst
index 03d516ad..ef041d40 100644
--- a/docs/scripts.rst
+++ b/docs/scripts.rst
@@ -11,6 +11,17 @@ installed with the prefix 'poppunk' (e.g to run ``extract_distances.py``, run th
.. contents::
:local:
+Easy run mode
+-------------
+Previous versions of the software had an ``--easy-run`` mode which would run a pipeline of:
+
+- ``--create-db`` to sketch genomes
+- ``--fit-model --dbscan`` to fit a flexible model
+- ``--refine-model`` to improve this model
+
+This is now available as ``poppunk_easy_run.py``, which chains calls to ``poppunk``
+and ``poppunk_visualise`` to replicate this functionality.
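+
+Its interface broadly mirrors ``poppunk --create-db``; a call might look like the following
+(the arguments shown are illustrative and may not match the script exactly)::
+
+    poppunk_easy_run.py --r-files rfiles.txt --output example_db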
+
Writing the pairwise distances to an output file
------------------------------------------------
By default PopPUNK does not write the calculated :math:`\pi_n` and :math:`a` distances out, as this