Merge pull request #146 from johnlees/network-score
Add alternative network scores
johnlees authored Jan 31, 2021
2 parents 549d74f + 1b1d30a commit a43a96c
Showing 22 changed files with 1,028 additions and 151 deletions.
51 changes: 51 additions & 0 deletions CMakeLists.txt
@@ -0,0 +1,51 @@
cmake_minimum_required(VERSION 3.18)
project(poppunk_refine)
set(CMAKE_CXX_STANDARD 14)

# Variable definitions
set(TARGET_NAME poppunk_refine)
add_compile_definitions(PYTHON_EXT)

# Add -O0 to remove optimizations when using gcc
IF(CMAKE_COMPILER_IS_GNUCC)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0")
ENDIF(CMAKE_COMPILER_IS_GNUCC)

if(UNIX AND NOT APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS")
set(CMAKE_LD_FLAGS "${CMAKE_LDFLAGS} -Wl,--as-needed")
endif()

# Set paths for non standard lib/ and include/ locations
if(DEFINED ENV{CONDA_PREFIX})
include_directories($ENV{CONDA_PREFIX}/include)
link_directories($ENV{CONDA_PREFIX}/lib)
link_directories($ENV{CONDA_PREFIX}/lib/intel64)
endif()

# Add libraries
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
find_package(pybind11 REQUIRED)
find_package (Eigen3 3.3 REQUIRED NO_MODULE)
#find_package(OpenMP) # This links system openmp if present - conda sorts out rpath but take care

# Define python library target
add_library("${TARGET_NAME}" MODULE)

# Compile CPU library
target_sources("${TARGET_NAME}" PRIVATE src/python_bindings.cpp
src/boundary.cpp)

set_target_properties("${TARGET_NAME}" PROPERTIES
CXX_VISIBILITY_PRESET "hidden"
INTERPROCEDURAL_OPTIMIZATION TRUE
PREFIX "${PYTHON_MODULE_PREFIX}"
SUFFIX "${PYTHON_MODULE_EXTENSION}"
)

target_link_libraries("${TARGET_NAME}" PRIVATE pybind11::module Eigen3::Eigen
z gomp openblas gfortran m dl)
#if(OpenMP_CXX_FOUND)
# target_link_libraries("${TARGET_NAME}" PRIVATE OpenMP::OpenMP_CXX)
#endif()
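
The target defined here is a pybind11 extension module, poppunk_refine, which the Python package imports directly (see PopPUNK/models.py below). A minimal import smoke test, assuming the extension has been built with this CMakeLists.txt and is on the Python path, might look like:

# Smoke test sketch: confirms the CPU library compiled and exposes the binding
# used later in this diff; the install/build location is an assumption.
import poppunk_refine

print(hasattr(poppunk_refine, "assignThreshold"))  # expect True
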
4 changes: 2 additions & 2 deletions PopPUNK/__init__.py
@@ -3,9 +3,9 @@

'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''

__version__ = '2.3.0'
__version__ = '2.4.0'

# Minimum sketchlib version
SKETCHLIB_MAJOR = 1
SKETCHLIB_MINOR = 6
SKETCHLIB_PATCH = 0
SKETCHLIB_PATCH = 2
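
This bumps PopPUNK to 2.4.0 and raises the minimum supported sketchlib patch release to 2. A hedged sketch of the version comparison these constants imply (the real check lives elsewhere in PopPUNK; the version-string format is an assumption):

# Illustrative only: compare an installed sketchlib version against the minimum above
SKETCHLIB_MAJOR, SKETCHLIB_MINOR, SKETCHLIB_PATCH = 1, 6, 2
installed = "1.6.2"  # e.g. as reported by the sketchlib backend
ok = tuple(int(p) for p in installed.split('.')[:3]) >= (SKETCHLIB_MAJOR, SKETCHLIB_MINOR, SKETCHLIB_PATCH)
print(ok)  # True: 1.6.2 meets the 1.6.2 minimum
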
28 changes: 22 additions & 6 deletions PopPUNK/__main__.py
@@ -10,8 +10,15 @@
import subprocess
from collections import defaultdict

# required from v2.1.1 onwards (no mash support)
import pp_sketchlib
# Try to import sketchlib
try:
import pp_sketchlib
except ImportError as e:
sys.stderr.write("Sketchlib backend not available\n")
sys.exit(1)

import poppunk_refine
import h5py

# import poppunk package
from .__init__ import __version__
@@ -114,13 +121,19 @@ def get_options():
type=float, default = None)
refinementGroup.add_argument('--manual-start', help='A file containing information for a start point. '
'See documentation for help.', default=None)
refinementGroup.add_argument('--indiv-refine', help='Also run refinement for core and accessory individually',
choices=['both', 'core', 'accessory'], default = False)
refinementGroup.add_argument('--no-local', help='Do not perform the local optimization step (speed up on very large datasets)',
default=False, action='store_true')
refinementGroup.add_argument('--model-dir', help='Directory containing model to use for assigning queries '
'to clusters [default = reference database directory]', type = str)
refinementGroup.add_argument('--core-only', help='Save the core distance fit (with ')
refinementGroup.add_argument('--score-idx',
help='Index of score to use [default = 0]',
type=int, default = 0, choices=[0, 1, 2])
refineMode = refinementGroup.add_mutually_exclusive_group()
refineMode.add_argument('--unconstrained',
help='Optimise both boundary gradient and intercept',
default=False, action='store_true')
refineMode.add_argument('--indiv-refine', help='Also run refinement for core and accessory individually',
choices=['both', 'core', 'accessory'], default=False)

# lineage clustering within strains
lineagesGroup = parser.add_argument_group('Lineage analysis options')
@@ -334,7 +347,8 @@ def main():
model_prefix + "/" + os.path.basename(model_prefix) + '_fit.npz',
output)
sys.stderr.write("Loaded previous model of type: " + model.type + "\n")
if args.fit_model == "refine" and (model.type != 'bgmm' and model.type != 'dbscan'):
if args.fit_model == "refine" and args.manual_start == None \
and model.type != 'bgmm' and model.type != 'dbscan':
sys.stderr.write("Model needs to be from BGMM or DBSCAN to refine\n")
sys.exit(1)

@@ -368,6 +382,8 @@ def main():
args.pos_shift, args.neg_shift,
args.manual_start,
args.indiv_refine,
args.unconstrained,
args.score_idx,
args.no_local,
args.threads)
new_model.plot(distMat)
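In summary, the option changes add --score-idx to pick which of the three network scores drives refinement, and place --unconstrained and --indiv-refine in a mutually exclusive group. A self-contained sketch of just that parser fragment (everything outside it is unchanged in get_options()):

# Sketch mirroring the argparse changes above; help strings abbreviated
import argparse

parser = argparse.ArgumentParser()
refinementGroup = parser.add_argument_group('Refine model options')
refinementGroup.add_argument('--score-idx', type=int, default=0, choices=[0, 1, 2],
                             help='Index of score to use [default = 0]')
refineMode = refinementGroup.add_mutually_exclusive_group()
refineMode.add_argument('--unconstrained', default=False, action='store_true',
                        help='Optimise both boundary gradient and intercept')
refineMode.add_argument('--indiv-refine', default=False,
                        choices=['both', 'core', 'accessory'],
                        help='Also run refinement for core and accessory individually')

print(parser.parse_args(['--score-idx', '1', '--unconstrained']))
# Combining the two refine modes exits with an argparse error:
# parser.parse_args(['--unconstrained', '--indiv-refine', 'both'])
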
44 changes: 28 additions & 16 deletions PopPUNK/models.py
@@ -20,6 +20,7 @@
from scipy.sparse import coo_matrix, bmat, find

import pp_sketchlib
import poppunk_refine

# BGMM
from .bgmm import fit2dMultiGaussian
@@ -526,9 +527,10 @@ def __init__(self, outPrefix):
self.within_label = -1
self.slope = 2
self.threshold = False
self.unconstrained = False

def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indiv_refine = False,
no_local = False, threads = 1):
unconstrained = False, score_idx = 0, no_local = False, threads = 1):
'''Extends :func:`~ClusterFit.fit`
Fits the distances by optimising network score, by calling
@@ -557,6 +559,11 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
Run refinement for core and accessory distances separately
(default = False).
unconstrained (bool)
If True, search in 2D and change the slope of the boundary
score_idx (int)
Index of score from :func:`~PopPUNK.network.networkSummary` to use
[default = 0]
no_local (bool)
Turn off the local optimisation step.
Quicker, but may be less well refined.
@@ -572,9 +579,9 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
self.scale = np.copy(model.scale)
self.max_move = max_move
self.min_move = min_move
self.unconstrained = unconstrained

# Get starting point
assignment = model.assign(X)
model.no_scale()
if startFile:
self.mean0, self.mean1, self.start_s = readManualStart(startFile)
@@ -607,9 +614,11 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
raise RuntimeError("Unrecognised model type")

# Main refinement in 2D
self.start_point, self.optimal_x, self.optimal_y, self.min_move, self.max_move = refineFit(X/self.scale,
sample_names, self.start_s, self.mean0, self.mean1, self.max_move, self.min_move,
slope = 2, no_local = no_local, num_processes = threads)
self.start_point, self.optimal_x, self.optimal_y, self.min_move, self.max_move = \
refineFit(X/self.scale,
sample_names, self.start_s, self.mean0, self.mean1, self.max_move, self.min_move,
slope = 2, score_idx = score_idx, unconstrained = unconstrained,
no_local = no_local, num_processes = threads)
self.fitted = True

# Try and do a 1D refinement for both core and accessory
@@ -619,13 +628,15 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
try:
sys.stderr.write("Refining core and accessory separately\n")
# optimise core distance boundary
start_point, self.core_boundary, core_acc, self.min_move, self.max_move = refineFit(X/self.scale,
sample_names, self.start_s, self.mean0, self.mean1, self.max_move, self.min_move,
slope = 0, no_local = no_local,num_processes = threads)
start_point, self.core_boundary, core_acc, self.min_move, self.max_move = \
refineFit(X/self.scale,
sample_names, self.start_s, self.mean0, self.mean1, self.max_move, self.min_move,
slope = 0, score_idx = score_idx, no_local = no_local,num_processes = threads)
# optimise accessory distance boundary
start_point, acc_core, self.accessory_boundary, self.min_move, self.max_move = refineFit(X/self.scale,
sample_names, self.start_s,self.mean0, self.mean1, self.max_move, self.min_move, slope = 1,
no_local = no_local, num_processes = threads)
start_point, acc_core, self.accessory_boundary, self.min_move, self.max_move = \
refineFit(X/self.scale,
sample_names, self.start_s,self.mean0, self.mean1, self.max_move, self.min_move,
slope = 1, score_idx = score_idx, no_local = no_local, num_processes = threads)
self.indiv_fitted = True
except RuntimeError as e:
sys.stderr.write("Could not separately refine core and accessory boundaries. "
@@ -670,6 +681,7 @@ def apply_threshold(self, X, threshold):
self.fitted = True
self.threshold = True
self.indiv_fitted = False
self.unconstrained = False

y = self.assign(X)
return y
@@ -740,8 +752,8 @@ def plot(self, X, y=None):

plot_refined_results(plot_X, self.assign(plot_X), self.optimal_x, self.optimal_y, self.core_boundary,
self.accessory_boundary, self.mean0, self.mean1, self.start_point, self.min_move,
self.max_move, self.scale, self.threshold, self.indiv_fitted, "Refined fit boundary",
self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_refined_fit")
self.max_move, self.scale, self.threshold, self.indiv_fitted, self.unconstrained,
"Refined fit boundary", self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_refined_fit")


def assign(self, X, slope=None, cpus=1):
@@ -765,11 +777,11 @@ def assign(self, X, slope=None, cpus=1):
raise RuntimeError("Trying to assign using an unfitted model")
else:
if slope == 2 or (slope == None and self.slope == 2):
y = pp_sketchlib.assignThreshold(X/self.scale, 2, self.optimal_x, self.optimal_y, cpus)
y = poppunk_refine.assignThreshold(X/self.scale, 2, self.optimal_x, self.optimal_y, cpus)
elif slope == 0 or (slope == None and self.slope == 0):
y = pp_sketchlib.assignThreshold(X/self.scale, 0, self.core_boundary, 0, cpus)
y = poppunk_refine.assignThreshold(X/self.scale, 0, self.core_boundary, 0, cpus)
elif slope == 1 or (slope == None and self.slope == 1):
y = pp_sketchlib.assignThreshold(X/self.scale, 1, 0, self.accessory_boundary, cpus)
y = poppunk_refine.assignThreshold(X/self.scale, 1, 0, self.accessory_boundary, cpus)

return y

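The assign() method now calls the new poppunk_refine extension rather than pp_sketchlib. A hedged usage sketch of that binding, with the argument order taken from the calls above (the -1 within / 1 between return convention is assumed from the surrounding PopPUNK code):

# Arguments: distances, slope, x boundary, y boundary, thread count
import numpy as np
import poppunk_refine

distMat = np.array([[0.01, 0.05],
                    [0.20, 0.30]])   # one row per pair: (core, accessory) distance
# slope = 2: boundary is the line joining x intercept 0.04 and y intercept 0.09
assignments = poppunk_refine.assignThreshold(distMat, 2, 0.04, 0.09, 1)
print(assignments)   # expected: first pair inside the boundary, second pair outside
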
55 changes: 39 additions & 16 deletions PopPUNK/network.py
@@ -127,6 +127,8 @@ def cliquePrune(component, graph, reference_indices, components_list):
"""Wrapper function around :func:`~getCliqueRefs` so it can be
called by a multiprocessing pool
"""
if gt.openmp_enabled():
gt.openmp_set_num_threads(1)
subgraph = gt.GraphView(graph, vfilt=components_list == component)
refs = reference_indices.copy()
if subgraph.num_vertices() <= 2:
@@ -364,39 +366,60 @@ def constructNetwork(rlist, qlist, assignments, within_label,

# print some summaries
if summarise:
(components, density, transitivity, score) = networkSummary(G)
sys.stderr.write("Network summary:\n" + "\n".join(["\tComponents\t" + str(components),
"\tDensity\t" + "{:.4f}".format(density),
"\tTransitivity\t" + "{:.4f}".format(transitivity),
"\tScore\t" + "{:.4f}".format(score)])
(metrics, scores) = networkSummary(G)
sys.stderr.write("Network summary:\n" + "\n".join(["\tComponents\t\t\t\t" + str(metrics[0]),
"\tDensity\t\t\t\t\t" + "{:.4f}".format(metrics[1]),
"\tTransitivity\t\t\t\t" + "{:.4f}".format(metrics[2]),
"\tMean betweenness\t\t\t" + "{:.4f}".format(metrics[3]),
"\tWeighted-mean betweenness\t\t" + "{:.4f}".format(metrics[4]),
"\tScore\t\t\t\t\t" + "{:.4f}".format(scores[0]),
"\tScore (w/ betweenness)\t\t\t" + "{:.4f}".format(scores[1]),
"\tScore (w/ weighted-betweenness)\t\t" + "{:.4f}".format(scores[2])])
+ "\n")

return G

def networkSummary(G):
def networkSummary(G, calc_betweenness=True):
"""Provides summary values about the network
Args:
G (graph)
The network of strains from :func:`~constructNetwork`
calc_betweenness (bool)
Whether to calculate betweenness stats
Returns:
components (int)
The number of connected components (and clusters)
density (float)
The proportion of possible edges used
transitivity (float)
Network transitivity (triads/triangles)
score (float)
A score of network fit, given by :math:`\mathrm{transitivity} * (1-\mathrm{density})`
metrics (list)
List with # components, density, transitivity, mean betweenness
and weighted mean betweenness
scores (list)
List of scores
"""
component_assignments, component_frequencies = gt.label_components(G)
components = len(component_frequencies)
density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1))
transitivity = gt.global_clustering(G)[0]
score = transitivity * (1-density)

return(components, density, transitivity, score)
mean_bt = 0
weighted_mean_bt = 0
if calc_betweenness:
betweenness = []
sizes = []
for component, size in enumerate(component_frequencies):
if size > 3:
vfilt = component_assignments.a == component
subgraph = gt.GraphView(G, vfilt=vfilt)
betweenness.append(max(gt.betweenness(subgraph, norm = True)[0].a))
sizes.append(size)

if len(betweenness) > 1:
mean_bt = np.mean(betweenness)
weighted_mean_bt = np.average(betweenness, weights=sizes)

metrics = [components, density, transitivity, mean_bt, weighted_mean_bt]
base_score = transitivity * (1 - density)
scores = [base_score, base_score * (1 - metrics[3]), base_score * (1 - metrics[4])]
return(metrics, scores)

def addQueryToNetwork(dbFuncs, rList, qList, G, kmers,
assignments, model, queryDB, queryQuery = False,
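The three scores returned by networkSummary correspond to the --score-idx choices 0, 1 and 2. A small worked example of how they derive from the metrics list (illustrative numbers, not real PopPUNK output):

# components, density, transitivity, then per-component max betweenness averaged
# plainly and weighted by component size (only components of more than 3 nodes count)
components, density, transitivity = 12, 0.02, 0.95
mean_bt, weighted_mean_bt = 0.30, 0.25

base_score = transitivity * (1 - density)                   # score_idx = 0
score_bt = base_score * (1 - mean_bt)                       # score_idx = 1
score_weighted_bt = base_score * (1 - weighted_mean_bt)     # score_idx = 2
print(base_score, score_bt, score_weighted_bt)              # roughly 0.931, 0.652, 0.698
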
24 changes: 17 additions & 7 deletions PopPUNK/plot.py
@@ -28,6 +28,7 @@
from .trees import write_tree, mst_to_phylogeny

from .utils import isolateNameToLabel
from .utils import decisionBoundary

def plot_scatter(X, out_prefix, title, kde = True):
"""Draws a 2D scatter plot (png) of the core and accessory distances
@@ -225,7 +226,7 @@ def plot_dbscan_results(X, y, n_clusters, out_prefix):

def plot_refined_results(X, Y, x_boundary, y_boundary, core_boundary, accessory_boundary,
mean0, mean1, start_point, min_move, max_move, scale, threshold, indiv_boundaries,
title, out_prefix):
unconstrained, title, out_prefix):
"""Draw a scatter plot (png) to show the refined model fit
A scatter plot of core and accessory distances, coloured by component
@@ -285,12 +286,21 @@ def plot_refined_results(X, Y, x_boundary, y_boundary, core_boundary, accessory_

# Draw boundary search range
if mean0 is not None and mean1 is not None and min_move is not None and max_move is not None and start_point is not None:
minimum_xy = transformLine(-min_move, start_point, mean1) * scale
maximum_xy = transformLine(max_move, start_point, mean1) * scale
plt.plot([minimum_xy[0], maximum_xy[0]], [minimum_xy[1], maximum_xy[1]],
color='k', linewidth=1, linestyle=':', label='Search range')
start_point *= scale
plt.plot(start_point[0], start_point[1], 'ro', label='Initial boundary')
if unconstrained:
gradient = (mean1[1] - mean0[1]) / (mean1[0] - mean0[0])
opt_start = decisionBoundary(mean0, gradient) * scale
opt_end = decisionBoundary(mean1, gradient) * scale
plt.fill([opt_start[0], opt_end[0], 0, 0],
[0, 0, opt_end[1], opt_start[1]],
fill=True, facecolor='lightcoral', alpha = 0.2,
label='Search range')
else:
minimum_xy = transformLine(-min_move, start_point, mean1) * scale
maximum_xy = transformLine(max_move, start_point, mean1) * scale
plt.plot([minimum_xy[0], maximum_xy[0]], [minimum_xy[1], maximum_xy[1]],
color='k', linewidth=1, linestyle=':', label='Search range')
start_point *= scale
plt.plot(start_point[0], start_point[1], 'ro', label='Initial boundary')

mean0 *= scale
mean1 *= scale
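In the unconstrained case the search range is drawn as the region between two candidate boundaries, one through each component mean. A geometric sketch of that quadrilateral follows; the intercepts helper is a stand-in for utils.decisionBoundary, which is assumed to return the x and y axis intercepts of a boundary line through the given point with slope -1/gradient:

mean0 = (0.02, 0.10)   # within-strain component mean (illustrative values)
mean1 = (0.15, 0.60)   # between-strain component mean
gradient = (mean1[1] - mean0[1]) / (mean1[0] - mean0[0])

def intercepts(point, gradient):
    # axis intercepts of the line through `point` with slope -1/gradient
    return (point[0] + point[1] * gradient, point[1] + point[0] / gradient)

opt_start = intercepts(mean0, gradient)
opt_end = intercepts(mean1, gradient)
# plt.fill x coordinates: [opt_start[0], opt_end[0], 0, 0]
# plt.fill y coordinates: [0, 0, opt_end[1], opt_start[1]]
print(opt_start, opt_end)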