Skip to content

Commit

Permalink
Replace withinBoundary with sketchlib function
Browse files Browse the repository at this point in the history
Closes #78
  • Loading branch information
johnlees committed Jul 3, 2020
1 parent 567eace commit 96b1288
Show file tree
Hide file tree
Showing 9 changed files with 25 additions and 83 deletions.
15 changes: 9 additions & 6 deletions PopPUNK/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from scipy.spatial.distance import euclidean
from scipy import stats

import pp_sketchlib

from .plot import plot_scatter

# BGMM
Expand All @@ -36,7 +38,6 @@
# refine
from .refine import refineFit
from .refine import likelihoodBoundary
from .refine import withinBoundary
from .refine import readManualStart
from .plot import plot_refined_results

Expand Down Expand Up @@ -717,8 +718,8 @@ def plot(self, X, y=None):
self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_refined_fit")


def assign(self, X, slope=None):
'''Assign the clustering of new samples using :func:`~PopPUNK.refine.withinBoundary`
def assign(self, X, slope=None, cpus=1):
'''Assign the clustering of new samples
Args:
X (numpy.array)
Expand All @@ -728,6 +729,8 @@ def assign(self, X, slope=None):
Set to 0 for a vertical line, 1 for a horizontal line, or
2 to use a slope
cpus (int)
Number of threads to use
Returns:
y (numpy.array)
Cluster assignments by samples
Expand All @@ -736,11 +739,11 @@ def assign(self, X, slope=None):
raise RuntimeError("Trying to assign using an unfitted model")
else:
if slope == 2 or (slope == None and self.slope == 2):
y = withinBoundary(X/self.scale, self.optimal_x, self.optimal_y)
y = pp_sketchlib.assignThreshold(X/self.scale, 2, self.optimal_x, self.optimal_y, cpus)
elif slope == 0 or (slope == None and self.slope == 0):
y = withinBoundary(X/self.scale, self.core_boundary, 0, slope=0)
y = pp_sketchlib.assignThreshold(X/self.scale, 0, self.core_boundary, 0, cpus)
elif slope == 1 or (slope == None and self.slope == 1):
y = withinBoundary(X/self.scale, 0, self.accessory_boundary, slope=1)
y = pp_sketchlib.assignThreshold(X/self.scale, 1, 0, self.accessory_boundary, cpus)

return y

Expand Down
4 changes: 2 additions & 2 deletions PopPUNK/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,15 +390,15 @@ def addQueryToNetwork(dbFuncs, rlist, qfile, G, kmers, estimated_length,

constructDatabase(tmpFile, kmers, sketchSize, tmpDirName, estimated_length, True, threads, False)
qlist1, qlist2, distMat = queryDatabase(rNames = list(unassigned),
qNames = list(unassigned),
qNames = list(unassigned),
dbPrefix = tmpDirName,
queryPrefix = tmpDirName,
klist = kmers,
self = True,
number_plot_fits = 0,
threads = threads)
queryAssignation = model.assign(distMat)

# identify any links between queries and store in the same links dict
# links dict now contains lists of links both to original database and new queries
for assignment, (query1, query2) in zip(queryAssignation, iterDistRows(qlist1, qlist2, self=True)):
Expand Down
54 changes: 8 additions & 46 deletions PopPUNK/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
# additional
from functools import partial
import numpy as np
from numba import jit
import scipy.optimize
import collections
try:
Expand All @@ -19,6 +18,7 @@
except ImportError as e:
sys.stderr.write("This version of PopPUNK requires python v3.8 or higher\n")
sys.exit(0)
import pp_sketchlib

from .network import constructNetwork
from .network import networkSummary
Expand Down Expand Up @@ -81,14 +81,14 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1,
sys.stderr.write("Trying to optimise score globally\n")
global_grid_resolution = 40 # Seems to work
s_range = np.linspace(-min_move, max_move, num = global_grid_resolution)

# Move distMat into shared memory
with SharedMemoryManager() as smm:
shm_distMat = smm.SharedMemory(size = distMat.nbytes)
distances_shared_array = np.ndarray(distMat.shape, dtype = distMat.dtype, buffer = shm_distMat.buf)
distances_shared_array[:] = distMat[:]
distances_shared = NumpyShared(name = shm_distMat.name, shape = distMat.shape, dtype = distMat.dtype)

with Pool(processes = num_processes) as pool:
global_s = pool.map(partial(newNetwork,
sample_names = sample_names,
Expand Down Expand Up @@ -124,47 +124,7 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1,
return start_point, optimal_x, optimal_y


def withinBoundary(dists, x_max, y_max, slope=2):
    """Classifies points as within or outside of a refined boundary.

    Vectorised with numpy, so no JIT compilation is required.
    Also used to assign new points in :func:`~PopPUNK.models.RefineFit.assign`

    Args:
        dists (numpy.array)
            Core and accessory distances to classify. Column 0 is compared
            against ``x_max`` and column 1 against ``y_max``.
        x_max (float)
            The x-axis intercept from :func:`~decisionBoundary`
        y_max (float)
            The y-axis intercept from :func:`~decisionBoundary`
        slope (int)
            Set to 0 for a vertical line, 1 for a horizontal line, or
            2 to use a slope

    Returns:
        signs (numpy.array)
            For each sample in dists, -1 if within-strain and 1 if between-strain.
            0 if exactly on boundary.

    Raises:
        ValueError
            If ``slope`` is not 0, 1 or 2.
    """
    # See https://stackoverflow.com/questions/2049582/how-to-determine-if-a-point-is-in-a-2d-triangle
    # x_max and y_max from decisionBoundary
    dists = np.asarray(dists)

    # The boundary type is the same for every row, so pick it once and
    # evaluate the signed test statistic for all rows in a single expression.
    if slope == 2:
        x = dists[:, 0]
        y = dists[:, 1]
        in_tri = x * y - (x_max - x) * (y_max - y)
    elif slope == 0:
        in_tri = dists[:, 0] - x_max
    elif slope == 1:
        in_tri = dists[:, 1] - y_max
    else:
        # Previously an unknown slope left in_tri undefined
        raise ValueError("slope must be 0, 1 or 2")

    # Start from 1 (between-strain) and overwrite with masks rather than
    # using np.sign, so rows where the comparison is neither < 0 nor == 0
    # (e.g. NaN distances) keep the value 1, as in the row-by-row version.
    boundary_test = np.ones(dists.shape[0])
    boundary_test[in_tri < 0] = -1
    boundary_test[in_tri == 0] = 0
    return boundary_test


def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, slope=2):
def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, slope=2, cpus = 1):
"""Wrapper function for :func:`~PopPUNK.network.constructNetwork` which is called
by optimisation functions moving a triangular decision boundary.
Expand All @@ -187,14 +147,16 @@ def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, slope=2):
slope (int)
Set to 0 for a vertical line, 1 for a horizontal line, or
2 to use a slope
cpus (int)
Number of CPUs to use for calculating assignment
Returns:
score (float)
-1 * network score. Where network score is from :func:`~PopPUNK.network.networkSummary`
"""
if isinstance(distMat, NumpyShared):
distMat_shm = shared_memory.SharedMemory(name = distMat.name)
distMat = np.ndarray(distMat.shape, dtype = distMat.dtype, buffer = distMat_shm.buf)

# Set up boundary
new_intercept = transformLine(s, start_point, mean1)
if slope == 2:
Expand All @@ -207,7 +169,7 @@ def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, slope=2):
y_max = new_intercept[1]

# Make network
boundary_assignments = withinBoundary(distMat, x_max, y_max, slope)
boundary_assignments = pp_sketchlib.assignThreshold(distMat, slope, x_max, y_max, cpus)
G = constructNetwork(sample_names, sample_names, boundary_assignments, -1, summarise = False)

# Return score
Expand Down
14 changes: 0 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,6 @@ conda config --add channels bioconda
conda config --add channels conda-forge
```

### Through pip
If you do not have conda you can also install through pip:
```
python3 -m pip install poppunk
```
You will need to be using Python 3.8 or higher.

With either of these methods, the `poppunk` command will then be directly executable.
Alternatively clone this repository:
```
git clone [email protected]:johnlees/PopPUNK.git
```
Then run with `python poppunk-runner.py`.

## Quick usage

See the [quickstart](https://poppunk.readthedocs.io/en/latest/quickstart.html) guide
Expand Down
1 change: 0 additions & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
"scipy",
"sklearn",
"matplotlib",
"numba",
"dendropy",
"pp-sketchlib",
"h5py"]
Expand Down
17 changes: 6 additions & 11 deletions docs/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,14 @@ dependencies::
Then run with ``poppunk``.

.. important::
PopPUNK requires python3.8 to run (which on many default Linux installations is
From v2.1.0 onwards, PopPUNK requires python3.8 to run
(which on many default Linux installations is
run using ``python3`` rather than ``python``).

.. important::
From v2.1.2 onwards, PopPUNK no longer supports ``mash``. If you want to
use older databases created with ``mash``, please downgrade to a version <v2.

Installing with conda (recommended)
-----------------------------------
If you do not have ``conda`` you can install it through
Expand All @@ -32,9 +37,6 @@ If you do not have conda, you can also install through pip::

python3 -m pip install poppunk

You will also need `mash <http://mash.readthedocs.io/en/latest/>`__ (v2 or higher)
installed (see :ref:`dependencies`).

Clone the code
--------------
You can also clone the GitHub repository to run the latest version, which is executed by::
Expand All @@ -59,18 +61,11 @@ We tested PopPUNK with the following packages:
* ``matplotlib`` (2.1.2)
* ``networkx`` (2.1)
* ``numpy`` (1.14.1)
* ``numba`` (0.36.2)
* ``pandas`` (0.22.0)
* ``scikit-learn`` (0.19.1)
* ``scipy`` (1.0.0)
* ``sharedmem`` (0.3.5)

``numba`` may need ``gcc >=v4.8`` to install correctly through pip (if you are
getting ``OSError`` or ``'GLIBCXX_3.4.17' not found``).

You will need a `mash <http://mash.readthedocs.io/en/latest/>`__ installation
which is v2.0 or higher

Optionally, you can use `rapidnj <http://birc.au.dk/software/rapidnj/>`__
if producing output with ``--microreact`` and ``--rapidnj`` options. We used
v2.3.2.
Expand Down
1 change: 0 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ dependencies:
- pandas
- scikit-learn
- dendropy
- numba
- matplotlib
- networkx
- mash
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ hdbscan>=0.8.13
matplotlib>=2.1.2
networkx>=2.1
numpy>=1.14.1
numba>=0.36.2
pandas>=0.22.0
scikit-learn>=0.19.1
scipy>=1.0.0
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ def find_version(*file_paths):
'scipy',
'scikit-learn',
'DendroPy',
'numba',
'pandas',
'networkx>=2.0',
'matplotlib',
Expand Down

0 comments on commit 96b1288

Please sign in to comment.