From 59aa17a680100a64abc9362b0e5ce9972ac7f496 Mon Sep 17 00:00:00 2001
From: John Lees <lees.john6@gmail.com>
Date: Tue, 22 Dec 2020 16:05:04 +0000
Subject: [PATCH] Run the distance plot at create-db time

---
 PopPUNK/__main__.py     |  6 +++++
 PopPUNK/models.py       |  9 -------
 PopPUNK/plot.py         | 18 ++++++++++---
 docs/best_practises.rst |  3 +++
 docs/conf.py            |  4 +--
 docs/index.rst          |  2 +-
 docs/installation.rst   |  2 +-
 docs/model_fitting.rst  | 57 ++++++++++++++++++++++++++++++++++++++++-
 docs/online.rst         |  2 +-
 docs/qc.rst             |  5 +++-
 docs/scripts.rst        | 11 ++++++++
 11 files changed, 99 insertions(+), 20 deletions(-)

diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
index 9ac13439..47fbf986 100644
--- a/PopPUNK/__main__.py
+++ b/PopPUNK/__main__.py
@@ -175,6 +175,7 @@ def main():
     from .network import printClusters
 
     from .plot import writeClusterCsv
+    from .plot import plot_scatter
 
     from .prune_db import prune_distance_matrix
 
@@ -287,6 +288,11 @@ def main():
         dists_out = args.output + "/" + os.path.basename(args.output) + ".dists"
         storePickle(refList, queryList, True, distMat, dists_out)
 
+        # Plot results
+        plot_scatter(distMat,
+                     args.output + "/" + os.path.basename(args.output) + "_distanceDistribution",
+                     args.output + " distances")
+
     #******************************#
     #*                            *#
     #* model fit and network      *#
diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 24b97dbe..8a974d7e 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -21,8 +21,6 @@
 
 import pp_sketchlib
 
-from .plot import plot_scatter
-
 # BGMM
 from .bgmm import fit2dMultiGaussian
 from .bgmm import assign_samples
@@ -126,7 +124,6 @@ def fit(self, X = None):
         '''Initial steps for all fit functions.
 
         Creates output directory. If preprocess is set then subsamples passed X
-        and draws a scatter plot from result using :func:`~PopPUNK.plot.plot_scatter`.
 
         Args:
             X (numpy.array)
@@ -159,12 +156,6 @@ def fit(self, X = None):
             self.scale = np.amax(self.subsampled_X, axis = 0)
             self.subsampled_X /= self.scale
 
-            # Show clustering
-            plot_scatter(self.subsampled_X,
-                         self.scale,
-                         self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_distanceDistribution",
-                         self.outPrefix + " distances")
-
     def plot(self, X=None):
         '''Initial steps for all plot functions.
 
diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py
index 0427b378..2ba936bf 100644
--- a/PopPUNK/plot.py
+++ b/PopPUNK/plot.py
@@ -6,6 +6,7 @@
 import sys
 import os
 import subprocess
+import random
 import numpy as np
 import matplotlib as mpl
 mpl.use('Agg')
@@ -18,7 +19,7 @@
 import pandas as pd
 from collections import defaultdict
 from scipy import spatial
-from sklearn import manifold
+from sklearn import manifold, utils
 try:  # sklearn >= 0.22
     from sklearn.neighbors import KernelDensity
 except ImportError:
@@ -27,7 +28,7 @@
 
 from .utils import isolateNameToLabel
 
-def plot_scatter(X, scale, out_prefix, title, kde = True):
+def plot_scatter(X, out_prefix, title, kde = True):
     """Draws a 2D scatter plot (png) of the core and accessory distances
 
     Also draws contours of kernel density estimare
@@ -35,8 +36,6 @@ def plot_scatter(X, scale, out_prefix, title, kde = True):
     Args:
         X (numpy.array)
             n x 2 array of core and accessory distances for n samples.
-        scale (numpy.array)
-            Scaling factor from :class:`~PopPUNK.models.BGMMFit`
         out_prefix (str)
             Prefix for output plot file (.png will be appended)
         title (str)
@@ -46,6 +45,15 @@ def plot_scatter(X, scale, out_prefix, title, kde = True):
 
             (default = True)
     """
+    # Plot results - max 1M for speed
+    max_plot_samples = 1000000
+    if X.shape[0] > max_plot_samples:
+        X = utils.shuffle(X, random_state=random.randint(1,10000))[0:max_plot_samples,]
+
+    # Kernel estimate uses scaled data 0-1 on each axis
+    scale = np.amax(X, axis = 0)
+    X /= scale
+
     plt.figure(figsize=(11, 8), dpi= 160, facecolor='w', edgecolor='k')
     if kde:
         xx, yy, xy = get_grid(0, 1, 100)
@@ -58,11 +66,13 @@ def plot_scatter(X, scale, out_prefix, title, kde = True):
         z = z.reshape(xx.shape).T
 
         levels = np.linspace(z.min(), z.max(), 10)
+        # Rescale contours
         plt.contour(xx*scale[0], yy*scale[1], z, levels=levels[1:], cmap='plasma')
         scatter_alpha = 1
     else:
         scatter_alpha = 0.1
 
+    # Plot on correct scale
     plt.scatter(X[:,0]*scale[0].flat, X[:,1]*scale[1].flat, s=1, alpha=scatter_alpha)
 
     plt.title(title)
diff --git a/docs/best_practises.rst b/docs/best_practises.rst
index 8aff718c..593bab24 100644
--- a/docs/best_practises.rst
+++ b/docs/best_practises.rst
@@ -42,6 +42,9 @@ has many advantages:
 
 See :doc:`query_assignment` for instructions on how to use this mode.
 
+You can think of this as being similar to using an existing MLST/cgMLST/wgMLST scheme
+to define your sample's strains.
+
 Fit your own model
 ^^^^^^^^^^^^^^^^^^
 If a database isn't available for your species, you can fit your own. Details
diff --git a/docs/conf.py b/docs/conf.py
index c01c8484..66d95983 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -73,9 +73,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '2.2.0'
+version = '2.3.0'
 # The full version, including alpha/beta/rc tags.
-release = '2.2.0'
+release = '2.3.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/index.rst b/docs/index.rst
index c3455e2e..a032d3dd 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -23,7 +23,7 @@ by reading the :doc:`best_practises`.
 
 .. important::
    Looking for older versions of the documentation? For previous versions with
-   the old API (`--assign-query`, `--refine-fit` etc) see `v2.2.0 <https://poppunk.readthedocs.io/en/v2.2.0-docs/>`__.
+   the old API (``--assign-query``, ``--refine-fit`` etc) see `v2.2.0 <https://poppunk.readthedocs.io/en/v2.2.0-docs/>`__.
    For older versions which used mash, see `v1.2.0 <https://poppunk.readthedocs.io/en/v1.2.2-docs/>`__.
 
 .. toctree::
diff --git a/docs/installation.rst b/docs/installation.rst
index 4a644763..e6ca66a8 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -38,7 +38,7 @@ If you are having conflict issues with conda, our advice would be:
 - Create a new environment for PopPUNK with ``conda create -n pp_env poppunk``
 
 If you have an older version of PopPUNK, you can upgrade using this method -- you
-may also wish to specify the version, for example `conda install poppunk==2.3.0` if you
+may also wish to specify the version, for example ``conda install poppunk==2.3.0`` if you
 wish to upgrade.
 
 conda-forge also has some helpful tips: https://conda-forge.org/docs/user/tipsandtricks.html
diff --git a/docs/model_fitting.rst b/docs/model_fitting.rst
index 5550c3df..9e3c7835 100644
--- a/docs/model_fitting.rst
+++ b/docs/model_fitting.rst
@@ -1,7 +1,46 @@
 Fitting new models
 ==================
 
+If you cannot find an existing model for your species in the
+`list <https://poppunk.net/pages/databases.html>`__ you will want to fit your own.
+This process is flexible, and there are five different models you can use depending
+on the population structure of your dataset.
 
+First, you must use ``poppunk --create-db`` to sketch your input data and calculate distances
+between all samples. This is detailed in :doc:`sketching`.
+
+Then, you wil use ``poppunk --fit-model <model_name>`` with one of the following:
+
+- ``bgmm`` -- Bayesian Gaussian Mixture Model. Best for small sample collections
+  with strain-structure. Works best when distance distribution components are clearly
+  separated.
+- ``dbscan`` -- HDBSCAN. A good general method for larger sample collections with
+  strain-structure. Some points will always be designated as noise, so a subsequent run
+  of model refinement may help improve the fit.
+- ``refine`` -- Model refinement. Requires a model already fitted with ``bgmm`` or ``dbscan``
+  and attempts to improve it by maximising the network score. Particularly useful when
+  components overlap significantly (often due to recombination), or when the strain boundary
+  is thought to lie somewhere within a component.
+- ``lineage`` -- Lineage clustering. To find lineages within a strain (subclustering), or
+  find clusters in a population without strain structure. Uses a simple nearest neighbour approach
+  so is more of a heuristic. Network scores are not meaningful in this mode.
+- ``threshold`` -- Apply a given core or accessory distance threshold to define clusters. Useful if
+  a cutoff threshold is already known/calculated, is estimated from a plot, or to compare a threshold
+  between datasets or species.
+
+The most useful guide to deciding which model to use is the ``_distanceDistribution.png`` file
+showing the core and accessory distances.
+
+bgmm
+----
+
+
+dbscan
+------
+
+
+refine
+------
 
 .. _manual-start:
 
@@ -19,4 +58,20 @@ A key, followed by its value (space separated).
 
 ``mean0`` and ``mean1`` define the points (x,y) to draw the line between, and
 ``start_point`` is the distance along this line to draw the initial boundary
-(which is normal to the line).
\ No newline at end of file
+(which is normal to the line).
+
+lineage
+-------
+
+
+threshold
+---------
+
+Use an existing model with new data
+-----------------------------------
+
+There is also one further mode, ``--use-model``, which may be useful in limited circumstances. This
+applies any of the above models to a new dataset without refitting it. This may be useful if a reference
+dataset has changed (been added to or removed from) and you do not wish to refit the model, for example
+because it is already in use. However, typically you would use :doc:`query_assignment` with ``--update-db``
+to add to a model.
diff --git a/docs/online.rst b/docs/online.rst
index a18362df..72c3c8b0 100644
--- a/docs/online.rst
+++ b/docs/online.rst
@@ -1,6 +1,6 @@
 PopPUNK-web
 =================
-This is the newest feature in the PopPUNK pipeline, available at http://poppunk.net/web.
+This is the newest feature in the PopPUNK pipeline, available at https://web.poppunk.net/.
 PoPUNK-web has been designed for non-specialists and allows you to easily analyse and assign lineages to your query genomic sequences,
 using pre-optimised species-specific databases and default parameters.
 
diff --git a/docs/qc.rst b/docs/qc.rst
index ea6813e2..9a21c81b 100644
--- a/docs/qc.rst
+++ b/docs/qc.rst
@@ -20,6 +20,9 @@ In all cases a file will be written at ``qcreport.txt`` which lists the failing
 reasons why they failed. If running with either prune or continue, you may also add ``--retain-failures``
 to write a separate sketch database with the failed samples.
 
+Random match chances in PopPUNK are only calculated and added to the database after the chosen
+QC step. If you use ``poppunk_sketch`` directly, they will be added without any automated QC.
+
 You can change the genome length cutoff with ``--length-sigma`` which sets the maximum number
 of standard deviations from the mean, and ``--length-range`` which sets an absolute range of
 allowable sizes.
@@ -34,7 +37,7 @@ should increase the value of ``--max-a-dist``.
 Removing samples from an existing database
 ------------------------------------------
 You can use the ``prune_poppunk`` command to remove samples from a database,
-for example those found to be of poor quality (see :ref:`qc`). Create a file
+for example those found to be of poor quality. Create a file
 ``remove.txt`` with the names of the samples you wish to remove, one per line,
 and run::
 
diff --git a/docs/scripts.rst b/docs/scripts.rst
index 03d516ad..ef041d40 100644
--- a/docs/scripts.rst
+++ b/docs/scripts.rst
@@ -11,6 +11,17 @@ installed with the prefix 'poppunk' (e.g to run ``extract_distances.py``, run th
 .. contents::
    :local:
 
+Easy run mode
+-------------
+Previous versions of the software had an ``--easy-run`` mode which would run a pipeline of:
+
+- ``--create-db`` to sketch genomes
+- ``--fit-model --dbscan`` to fit a flexible model
+- ``--refine-model`` to improve this model
+
+This is now available as ``poppunk_easy_run.py`` which will chain calls to ``poppunk``
+and ``poppunk_visualise`` to replicate this functionality.
+
 Writing the pairwise distances to an output file
 ------------------------------------------------
 By default PopPUNK does not write the calculated :math:`\pi_n` and :math:`a` distances out, as this