From 7e93b635f86a9719974f8c036bf7a3782599aacf Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 30 Jul 2021 13:41:35 +0100 Subject: [PATCH 01/18] First draft of C++ extend function --- src/boundary.hpp | 5 +++ src/extend.cpp | 101 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 src/extend.cpp diff --git a/src/boundary.hpp b/src/boundary.hpp index d29b2448..5fb0bdb8 100644 --- a/src/boundary.hpp +++ b/src/boundary.hpp @@ -17,8 +17,13 @@ typedef Eigen::Matrix NumpyMatrix; typedef std::tuple, std::vector, std::vector> network_coo; +typedef std::tuple, std::vector, std::vector> + sparse_coo; typedef std::vector> edge_tuple; +template +std::vector sort_indexes(const T &v, const uint32_t n_threads) + Eigen::VectorXf assign_threshold(const NumpyMatrix &distMat, const int slope, const float x_max, const float y_max, unsigned int num_threads); diff --git a/src/extend.cpp b/src/extend.cpp new file mode 100644 index 00000000..b0273a08 --- /dev/null +++ b/src/extend.cpp @@ -0,0 +1,101 @@ +/* + * + * extend.cpp + * Functions to extend a sparse distance matrix + * + */ +#include +#include // assert +#include // floor/sqrt +#include // size_t +#include +#include +#include +#include + +#include "boundary.hpp" + +sparse_coo extend(const sparse_coo &sparse_rr_mat, + const NumpyMatrix &qq_mat_square, + const NumpyMatrix &qr_mat_rect, const size_t kNN) { + const size_t nr_samples = qr_mat_rect.rows(); + const size_t nq_samples = qr_mat_rect.cols(); + + // Get indices where each row starts in the sparse matrix + const std::vector i_vec = std::get<0>(sparse_rr_mat); + std::vector row_start_idx(nr_samples + 1); + size_t i_idx = 0; + row_start_idx[0] = 0; + row_start_idx[nr_samples] = i_vec.size(); + for (long i = 1; i < nr_samples; ++i) { + while (i_vec[i_idx] < i) { + i_idx++; + } + row_start_idx[i] = i_idx; + } + + // ijv vectors + std::vector dists; + std::vector i_vec; + std::vector j_vec; + const std::vector dist_vec = std::get<2>(sparse_rr_mat); + for (long i = 0; i < nr_samples + nq_samples; ++i) { + // Extract the dists for the row from the qr (dense) and rr (sparse) + // matrices + Eigen::VectorXf rr_dists, qr_dists; + if (i < nr_samples) { + qr_dists = qr_mat_rect.row(i); + Eigen::Map rr_map(dist_vec.data() + row_start_idx[i], + row_start_idx[i + 1] - + row_start_idx[i]); + rr_dists = rr_map; + } else { + rr_dists = qr_mat_rect.col(i - nr_samples); + qr_dists = qq_mat_square.row(i - nr_samples); + } + + // Sort these. Then do a merge + std::vector qr_ordered_idx = sort_indexes(qr_dists, 1); + std::vector rr_ordered_idx = sort_indexes(rr_dists, 1); + + long unique_neighbors = 0; + float prev_value = -1; + auto rr_it = rr_ordered_idx.cbegin(); + auto qr_it = qr_ordered_idx.cbegin(); + while (qr_it != qr_ordered_idx.cend() && rr_it != rr_ordered_idx.cend()) { + // Get the next smallest dist, and corresponding j + long j; + float dist; + if (rr_it == rr_ordered_idx.cend() || + (!qr_it == qr_ordered_idx.cend() && + qr_dists[*qr_it] <= rr_dists[*rr_it])) { + j = *qr_it + nr_samples; + dist = qr_dists[*qr_it]; + ++qr_it; + } else { + j = *rr_it; + dist = rr_dists[*rr_it]; + ++rr_it; + } + + if (j == i) { + continue; + } + bool new_val = abs(dist - prev_value) < epsilon; + if (unique_neighbors < kNN || new_val) { + dists.push_back(dist); + i_vec.push_back(i); + j_vec.push_back(j); + if (!new_val) { + unique_neighbors++; + prev_value = dist; + } + } else { + break; // next i + } + } + } + return (std::make_tuple(i_vec, j_vec, dists)); +} + +sparse_coo lower_rank(const sparse_coo &sparse_rr_mat, const size_t kNN) {} \ No newline at end of file From 416024ab5caa061c15b7a40c00297a9a26e35c8e Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 30 Jul 2021 13:58:38 +0100 Subject: [PATCH 02/18] Add python bindings for new extend function --- PopPUNK/models.py | 4 +-- src/boundary.hpp | 4 +-- src/extend.cpp | 70 ++++++++++++++++++++++++++++++++--------- src/extend.hpp | 9 ++++++ src/python_bindings.cpp | 55 +++++++++++++++----------------- 5 files changed, 94 insertions(+), 48 deletions(-) create mode 100644 src/extend.hpp diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 40e32260..ce89676b 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -1016,7 +1016,7 @@ def fit(self, X, accessory): ''' # Check if model requires GPU check_and_set_gpu(self.use_gpu, gpu_lib, quit_on_fail = True) - + ClusterFit.fit(self, X) sample_size = int(round(0.5 * (1 + np.sqrt(1 + 8 * X.shape[0])))) if (max(self.ranks) >= sample_size): @@ -1160,7 +1160,7 @@ def extend(self, qqDists, qrDists): if self.use_gpu: qqDists = cp.array(qqDists) qrDists = cp.array(qrDists) - + # Reshape qq and qr dist matrices qqSquare = pp_sketchlib.longToSquare(qqDists[:, [self.dist_col]], self.threads) qqSquare[qqSquare < epsilon] = epsilon diff --git a/src/boundary.hpp b/src/boundary.hpp index 5fb0bdb8..8486d963 100644 --- a/src/boundary.hpp +++ b/src/boundary.hpp @@ -1,7 +1,7 @@ /* * - * matrix.hpp - * functions in matrix_ops.cpp + * boundary.hpp + * prototypes for boundary fns * */ #pragma once diff --git a/src/extend.cpp b/src/extend.cpp index b0273a08..9c16a1fb 100644 --- a/src/extend.cpp +++ b/src/extend.cpp @@ -4,24 +4,16 @@ * Functions to extend a sparse distance matrix * */ -#include -#include // assert -#include // floor/sqrt #include // size_t #include -#include -#include #include -#include "boundary.hpp" +#include "extend.hpp" -sparse_coo extend(const sparse_coo &sparse_rr_mat, - const NumpyMatrix &qq_mat_square, - const NumpyMatrix &qr_mat_rect, const size_t kNN) { - const size_t nr_samples = qr_mat_rect.rows(); - const size_t nq_samples = qr_mat_rect.cols(); +const float epsilon = 1E-10; - // Get indices where each row starts in the sparse matrix +// Get indices where each row starts in the sparse matrix +std::vector row_start_indices(const sparse_coo &sparse_rr_mat) { const std::vector i_vec = std::get<0>(sparse_rr_mat); std::vector row_start_idx(nr_samples + 1); size_t i_idx = 0; @@ -33,6 +25,16 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, } row_start_idx[i] = i_idx; } + return row_start_idx; +} + +sparse_coo extend(const sparse_coo &sparse_rr_mat, + const NumpyMatrix &qq_mat_square, + const NumpyMatrix &qr_mat_rect, const size_t kNN) { + const size_t nr_samples = qr_mat_rect.rows(); + const size_t nq_samples = qr_mat_rect.cols(); + + std::vector row_start_idx = row_start_indices(sparse_rr_mat); // ijv vectors std::vector dists; @@ -54,10 +56,12 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, qr_dists = qq_mat_square.row(i - nr_samples); } - // Sort these. Then do a merge + // Sort these. Then do a merge below std::vector qr_ordered_idx = sort_indexes(qr_dists, 1); std::vector rr_ordered_idx = sort_indexes(rr_dists, 1); + // See sparsify_dists in pp_sketchlib. + // This is very similar, but merging two lists as input long unique_neighbors = 0; float prev_value = -1; auto rr_it = rr_ordered_idx.cbegin(); @@ -98,4 +102,42 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, return (std::make_tuple(i_vec, j_vec, dists)); } -sparse_coo lower_rank(const sparse_coo &sparse_rr_mat, const size_t kNN) {} \ No newline at end of file +sparse_coo lower_rank(const sparse_coo &sparse_rr_mat, const size_t kNN) { + const size_t nr_samples = qr_mat_rect.rows(); + std::vector row_start_idx = row_start_indices(sparse_rr_mat); + + // ijv vectors + std::vector dists; + std::vector i_vec; + std::vector j_vec; + const std::vector dist_vec = std::get<2>(sparse_rr_mat); + for (long i = 0; i < nr_samples; ++i) { + Eigen::Map rr_dists(dist_vec.data() + row_start_idx[i], + row_start_idx[i + 1] - + row_start_idx[i]); + std::vector rr_ordered_idx = sort_indexes(rr_dists, 1); + + long unique_neighbors = 0; + float prev_value = -1; + for (long rr_it = rr_ordered_idx.cbegin(); rr_it != rr_ordered_idx.cend(); ++rr_it) { + long j = *rr_it; + float dist = rr_dists[*rr_it]; + if (j == i) { + continue; + } + bool new_val = abs(dist - prev_value) < epsilon; + if (unique_neighbors < kNN || new_val) { + dists.push_back(dist); + i_vec.push_back(i); + j_vec.push_back(j); + if (!new_val) { + unique_neighbors++; + prev_value = dist; + } + } else { + break; // next i + } + } + } + return (std::make_tuple(i_vec, j_vec, dists)); +} diff --git a/src/extend.hpp b/src/extend.hpp new file mode 100644 index 00000000..3162debc --- /dev/null +++ b/src/extend.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "boundary.hpp" + +sparse_coo extend(const sparse_coo &sparse_rr_mat, + const NumpyMatrix &qq_mat_square, + const NumpyMatrix &qr_mat_rect, const size_t kNN); + +sparse_coo lower_rank(const sparse_coo &sparse_rr_mat, const size_t kNN); diff --git a/src/python_bindings.cpp b/src/python_bindings.cpp index 1fdf6634..93562f47 100644 --- a/src/python_bindings.cpp +++ b/src/python_bindings.cpp @@ -12,6 +12,7 @@ namespace py = pybind11; #include "boundary.hpp" +#include "extend.hpp" // Wrapper which makes a ref to the python/numpy array Eigen::VectorXf assignThreshold(const Eigen::Ref &distMat, @@ -31,27 +32,18 @@ edge_tuple edgeThreshold(const Eigen::Ref &distMat, } edge_tuple generateTuples(const std::vector &assignments, - const int within_label, - bool self, - const int num_ref, - const int int_offset) { - edge_tuple edges = generate_tuples(assignments, - within_label, - self, - num_ref, - int_offset); - return (edges); + const int within_label, bool self, const int num_ref, + const int int_offset) { + edge_tuple edges = + generate_tuples(assignments, within_label, self, num_ref, int_offset); + return (edges); } -edge_tuple generateAllTuples(const int num_ref, - const int num_queries, - bool self = true, - const int int_offset = 0) { - edge_tuple edges = generate_all_tuples(num_ref, - num_queries, - self, - int_offset); - return (edges); +edge_tuple generateAllTuples(const int num_ref, const int num_queries, + bool self = true, const int int_offset = 0) { + edge_tuple edges = + generate_all_tuples(num_ref, num_queries, self, int_offset); + return (edges); } network_coo thresholdIterate1D(const Eigen::Ref &distMat, @@ -99,21 +91,15 @@ PYBIND11_MODULE(poppunk_refine, m) { m.def("generateTuples", &generateTuples, py::return_value_policy::reference_internal, - "Return edge tuples based on assigned groups", - py::arg("assignments"), - py::arg("within_label"), - py::arg("self") = true, - py::arg("num_ref") = 0, + "Return edge tuples based on assigned groups", py::arg("assignments"), + py::arg("within_label"), py::arg("self") = true, py::arg("num_ref") = 0, py::arg("int_offset") = 0); m.def("generateAllTuples", &generateAllTuples, - py::return_value_policy::reference_internal, - "Return all edge tuples", - py::arg("num_ref"), - py::arg("num_queries") = 0, - py::arg("self") = true, + py::return_value_policy::reference_internal, "Return all edge tuples", + py::arg("num_ref"), py::arg("num_queries") = 0, py::arg("self") = true, py::arg("int_offset") = 0); - + m.def("thresholdIterate1D", &thresholdIterate1D, py::return_value_policy::reference_internal, "Move a 2D boundary to grow a network by adding edges at each offset", @@ -125,4 +111,13 @@ PYBIND11_MODULE(poppunk_refine, m) { py::return_value_policy::reference_internal, "Move a 2D boundary to grow a network by adding edges at each offset", py::arg("distMat").noconvert(), py::arg("x_max"), py::arg("y_max")); + + m.def("extend", &extend, py::return_value_policy::reference_internal, + "Extend a sparse distance matrix keeping k nearest-neighbours", + py::arg("rr_mat"), py::arg("qq_mat").noconvert(), + py::arg("qr_mat").noconvert(), py::arg("kNN")); + + m.def("lowerRank", &lower_rank, py::return_value_policy::reference_internal, + "Extend a sparse distance matrix keeping k nearest-neighbours", + py::arg("rr_mat"), py::arg("kNN")); } From 3184e0f056893f6bfb2c7711ea80cf65c73298ee Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 30 Jul 2021 15:11:28 +0100 Subject: [PATCH 03/18] Replace extend in models.py --- CMakeLists.txt | 3 +- PopPUNK/models.py | 75 ++++++++++------------------------------- src/boundary.hpp | 4 +-- src/extend.cpp | 35 ++++++++++--------- src/extend.hpp | 9 +++-- src/python_bindings.cpp | 2 +- 6 files changed, 46 insertions(+), 82 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 88ddd57e..6f82f992 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,7 +35,8 @@ add_library("${TARGET_NAME}" MODULE) # Compile CPU library target_sources("${TARGET_NAME}" PRIVATE src/python_bindings.cpp - src/boundary.cpp) + src/boundary.cpp + src/extend.cpp) set_target_properties("${TARGET_NAME}" PROPERTIES CXX_VISIBILITY_PRESET "hidden" diff --git a/PopPUNK/models.py b/PopPUNK/models.py index ce89676b..3465ef84 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -1152,7 +1152,6 @@ def extend(self, qqDists, qrDists): y (list of tuples) Edges to include in network ''' - # Check if model requires GPU check_and_set_gpu(self.use_gpu, gpu_lib, quit_on_fail = True) @@ -1163,68 +1162,28 @@ def extend(self, qqDists, qrDists): # Reshape qq and qr dist matrices qqSquare = pp_sketchlib.longToSquare(qqDists[:, [self.dist_col]], self.threads) - qqSquare[qqSquare < epsilon] = epsilon n_ref = self.nn_dists[self.ranks[0]].shape[0] n_query = qqSquare.shape[1] qrRect = qrDists[:, [self.dist_col]].reshape(n_query, n_ref) - qrRect[qrRect < epsilon] = epsilon - - for rank in self.ranks: - # Add the matrices together to make a large square matrix - if self.use_gpu: - full_mat = cupyx.scipy.sparse.bmat([[self.nn_dists[rank], - qrRect.transpose()], - [qrRect,qqSquare]], - format = 'csr', - dtype = self.nn_dists[rank].dtype) - else: - full_mat = scipy.sparse.bmat([[self.nn_dists[rank], - qrRect.transpose()], - [qrRect,qqSquare]], - format = 'csr', - dtype = self.nn_dists[rank].dtype) - - # Reapply the rank to each row, using sparse matrix functions - data = [] - row = [] - col = [] - for row_idx in range(full_mat.shape[0]): - sample_row = full_mat.getrow(row_idx) - if self.use_gpu: - dist_row, dist_col, dist = cupyx.scipy.sparse.find(sample_row) - else: - dist_row, dist_col, dist = scipy.sparse.find(sample_row) - dist[dist < epsilon] = epsilon - dist_idx_sort = np.argsort(dist) - - # Identical to C++ code in matrix_ops.cpp:sparsify_dists - neighbours = 0 - prev_val = -1 - for sort_idx in dist_idx_sort: - if row_idx == dist_col[sort_idx]: - continue - new_val = abs(dist[sort_idx] - prev_val) < epsilon - if (neighbours < rank or new_val): - data.append(dist[sort_idx]) - row.append(row_idx) - col.append(dist_col[sort_idx]) - - if not new_val: - neighbours += 1 - prev_val = data[-1] - else: - break - if self.use_gpu: - self.nn_dists[rank] = cupyx.scipy.sparse.coo_matrix( - (cp.array(data), (cp.array(row), cp.array(col))), - shape=(full_mat.shape[0], full_mat.shape[0]), - dtype = self.nn_dists[rank].dtype) - else: - self.nn_dists[rank] = scipy.sparse.coo_matrix((data, (row, col)), - shape=(full_mat.shape[0], full_mat.shape[0]), - dtype = self.nn_dists[rank].dtype) + max_rank = max(self.ranks) + rrSparse = self.nn_dists[max_rank] + self.nn_dists[max_rank] = \ + poppunk_refine.extend( + (rrSparse.row, rrSparse.col, rrSparse.data), + qqSquare, + qrRect, + max_rank) + + higher_rank = self.nn_dists[max_rank] + for rank in sorted(self.ranks, reverse=True)[1:]: + self.nn_dists[rank] = \ + poppunk_refine.lower_rank( + (higher_rank.row, higher_rank.col, higher_rank.data), + n_ref + n_query, + rank) + higher_rank = self.nn_dists[rank] y = self.assign(min(self.ranks)) return y diff --git a/src/boundary.hpp b/src/boundary.hpp index 8486d963..15a75455 100644 --- a/src/boundary.hpp +++ b/src/boundary.hpp @@ -17,12 +17,10 @@ typedef Eigen::Matrix NumpyMatrix; typedef std::tuple, std::vector, std::vector> network_coo; -typedef std::tuple, std::vector, std::vector> - sparse_coo; typedef std::vector> edge_tuple; template -std::vector sort_indexes(const T &v, const uint32_t n_threads) +std::vector sort_indexes(const T &v, const uint32_t n_threads); Eigen::VectorXf assign_threshold(const NumpyMatrix &distMat, const int slope, const float x_max, const float y_max, diff --git a/src/extend.cpp b/src/extend.cpp index 9c16a1fb..14a44fb5 100644 --- a/src/extend.cpp +++ b/src/extend.cpp @@ -13,7 +13,8 @@ const float epsilon = 1E-10; // Get indices where each row starts in the sparse matrix -std::vector row_start_indices(const sparse_coo &sparse_rr_mat) { +std::vector row_start_indices(const sparse_coo &sparse_rr_mat, + const size_t nr_samples) { const std::vector i_vec = std::get<0>(sparse_rr_mat); std::vector row_start_idx(nr_samples + 1); size_t i_idx = 0; @@ -28,19 +29,19 @@ std::vector row_start_indices(const sparse_coo &sparse_rr_mat) { return row_start_idx; } -sparse_coo extend(const sparse_coo &sparse_rr_mat, - const NumpyMatrix &qq_mat_square, +sparse_coo extend(const sparse_coo &sparse_rr_mat, const NumpyMatrix &qq_mat_square, const NumpyMatrix &qr_mat_rect, const size_t kNN) { const size_t nr_samples = qr_mat_rect.rows(); const size_t nq_samples = qr_mat_rect.cols(); - std::vector row_start_idx = row_start_indices(sparse_rr_mat); + std::vector row_start_idx = + row_start_indices(sparse_rr_mat, nr_samples); // ijv vectors std::vector dists; std::vector i_vec; std::vector j_vec; - const std::vector dist_vec = std::get<2>(sparse_rr_mat); + std::vector dist_vec = std::get<2>(sparse_rr_mat); for (long i = 0; i < nr_samples + nq_samples; ++i) { // Extract the dists for the row from the qr (dense) and rr (sparse) // matrices @@ -48,8 +49,8 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, if (i < nr_samples) { qr_dists = qr_mat_rect.row(i); Eigen::Map rr_map(dist_vec.data() + row_start_idx[i], - row_start_idx[i + 1] - - row_start_idx[i]); + row_start_idx[i + 1] - + row_start_idx[i]); rr_dists = rr_map; } else { rr_dists = qr_mat_rect.col(i - nr_samples); @@ -71,7 +72,7 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, long j; float dist; if (rr_it == rr_ordered_idx.cend() || - (!qr_it == qr_ordered_idx.cend() && + (!(qr_it == qr_ordered_idx.cend()) && qr_dists[*qr_it] <= rr_dists[*rr_it])) { j = *qr_it + nr_samples; dist = qr_dists[*qr_it]; @@ -102,24 +103,26 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, return (std::make_tuple(i_vec, j_vec, dists)); } -sparse_coo lower_rank(const sparse_coo &sparse_rr_mat, const size_t kNN) { - const size_t nr_samples = qr_mat_rect.rows(); - std::vector row_start_idx = row_start_indices(sparse_rr_mat); +sparse_coo lower_rank(const sparse_coo &sparse_rr_mat, const size_t n_samples, + const size_t kNN) { + std::vector row_start_idx = + row_start_indices(sparse_rr_mat, n_samples); // ijv vectors std::vector dists; std::vector i_vec; std::vector j_vec; - const std::vector dist_vec = std::get<2>(sparse_rr_mat); - for (long i = 0; i < nr_samples; ++i) { + std::vector dist_vec = std::get<2>(sparse_rr_mat); + for (long i = 0; i < n_samples; ++i) { Eigen::Map rr_dists(dist_vec.data() + row_start_idx[i], - row_start_idx[i + 1] - - row_start_idx[i]); + row_start_idx[i + 1] - + row_start_idx[i]); std::vector rr_ordered_idx = sort_indexes(rr_dists, 1); long unique_neighbors = 0; float prev_value = -1; - for (long rr_it = rr_ordered_idx.cbegin(); rr_it != rr_ordered_idx.cend(); ++rr_it) { + for (auto rr_it = rr_ordered_idx.cbegin(); rr_it != rr_ordered_idx.cend(); + ++rr_it) { long j = *rr_it; float dist = rr_dists[*rr_it]; if (j == i) { diff --git a/src/extend.hpp b/src/extend.hpp index 3162debc..01ec51bb 100644 --- a/src/extend.hpp +++ b/src/extend.hpp @@ -2,8 +2,11 @@ #include "boundary.hpp" -sparse_coo extend(const sparse_coo &sparse_rr_mat, - const NumpyMatrix &qq_mat_square, +typedef std::tuple, std::vector, std::vector> + sparse_coo; + +sparse_coo extend(const sparse_coo &sparse_rr_mat, const NumpyMatrix &qq_mat_square, const NumpyMatrix &qr_mat_rect, const size_t kNN); -sparse_coo lower_rank(const sparse_coo &sparse_rr_mat, const size_t kNN); +sparse_coo lower_rank(const sparse_coo &sparse_rr_mat, const size_t n_samples, + const size_t kNN); diff --git a/src/python_bindings.cpp b/src/python_bindings.cpp index 93562f47..64d1385b 100644 --- a/src/python_bindings.cpp +++ b/src/python_bindings.cpp @@ -119,5 +119,5 @@ PYBIND11_MODULE(poppunk_refine, m) { m.def("lowerRank", &lower_rank, py::return_value_policy::reference_internal, "Extend a sparse distance matrix keeping k nearest-neighbours", - py::arg("rr_mat"), py::arg("kNN")); + py::arg("rr_mat"), py::arg("n_samples"), py::arg("kNN")); } From 65eaf45288778f81f5cfc36106a684c373dbba9d Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 30 Jul 2021 15:45:31 +0100 Subject: [PATCH 04/18] Fix template prototype --- PopPUNK/__init__.py | 2 +- src/boundary.cpp | 19 ------------------- src/boundary.hpp | 17 ++++++++++++++++- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py index 82bed522..79da77c1 100644 --- a/PopPUNK/__init__.py +++ b/PopPUNK/__init__.py @@ -3,7 +3,7 @@ '''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)''' -__version__ = '2.4.3' +__version__ = '2.4.4' # Minimum sketchlib version SKETCHLIB_MAJOR = 1 diff --git a/src/boundary.cpp b/src/boundary.cpp index d1991fa7..915bb4be 100644 --- a/src/boundary.cpp +++ b/src/boundary.cpp @@ -4,18 +4,13 @@ * Functions to move a network boundary * */ -#include #include // assert #include // floor/sqrt #include // size_t #include -#include #include #include -// Parallel sort -#include - #include "boundary.hpp" const float epsilon = 1E-10; @@ -41,20 +36,6 @@ inline size_t square_to_condensed(const size_t i, const size_t j, return (n * i - ((i * (i + 1)) >> 1) + j - 1 - i); } -// https://stackoverflow.com/a/12399290 -template -std::vector sort_indexes(const T &v, const uint32_t n_threads) { - // initialize original index locations - std::vector idx(v.size()); - std::iota(idx.begin(), idx.end(), 0); - - boost::sort::parallel_stable_sort( - idx.begin(), idx.end(), [&v](long i1, long i2) { return v[i1] < v[i2]; }, - n_threads); - - return idx; -} - // Unnormalised (signed_ distance between a point (x0, y0) and a line defined // by the two points (xmax, 0) and (0, ymax) // Divide by 1/sqrt(xmax^2 + ymax^2) to get distance diff --git a/src/boundary.hpp b/src/boundary.hpp index 15a75455..6aff8162 100644 --- a/src/boundary.hpp +++ b/src/boundary.hpp @@ -10,17 +10,32 @@ #include #include #include +#include +#include #include +// Parallel sort +#include typedef Eigen::Matrix NumpyMatrix; typedef std::tuple, std::vector, std::vector> network_coo; typedef std::vector> edge_tuple; +// https://stackoverflow.com/a/12399290 template -std::vector sort_indexes(const T &v, const uint32_t n_threads); +std::vector sort_indexes(const T &v, const uint32_t n_threads) { + // initialize original index locations + std::vector idx(v.size()); + std::iota(idx.begin(), idx.end(), 0); + + boost::sort::parallel_stable_sort( + idx.begin(), idx.end(), [&v](long i1, long i2) { return v[i1] < v[i2]; }, + n_threads); + + return idx; +} Eigen::VectorXf assign_threshold(const NumpyMatrix &distMat, const int slope, const float x_max, const float y_max, From b9b80073cfa59072ed323a2c679d1aead701d214 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 30 Jul 2021 17:06:46 +0100 Subject: [PATCH 05/18] Save sparse matrices in npz format --- PopPUNK/models.py | 44 ++++++++++++++++++++++++++------------------ test/test-update.py | 5 +++++ 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 3465ef84..61477f89 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -998,6 +998,20 @@ def __init__(self, outPrefix, ranks, use_gpu = False): self.ranks.append(int(rank)) self.use_gpu = use_gpu + def __save_sparse__(self, data, row, col, rank, n_samples, dtype): + if self.use_gpu: + data = cp.array(data) + data[data < epsilon] = epsilon + self.nn_dists[rank] = cupyx.scipy.sparse.coo_matrix((data, (cp.array(row), cp.array(col))), + shape=(n_samples, n_samples), + dtype = dtype) + else: + data = np.array(data) + data[data < epsilon] = epsilon + self.nn_dists[rank] = scipy.sparse.coo_matrix((data, (row, col)), + shape=(n_samples, n_samples), + dtype = dtype) + def fit(self, X, accessory): '''Extends :func:`~ClusterFit.fit` @@ -1036,18 +1050,7 @@ def fit(self, X, accessory): 0, rank ) - if self.use_gpu: - data = cp.array(data) - data[data < epsilon] = epsilon - self.nn_dists[rank] = cupyx.scipy.sparse.coo_matrix((data,(cp.array(row),cp.array(col))), - shape=(sample_size, sample_size), - dtype = X.dtype) - else: - data = np.array(data) - data[data < epsilon] = epsilon - self.nn_dists[rank] = scipy.sparse.coo_matrix((data, (row, col)), - shape=(sample_size, sample_size), - dtype = X.dtype) + self.__save_sparse__(data, row, col, rank, sample_size, X.dtype) self.fitted = True @@ -1162,28 +1165,33 @@ def extend(self, qqDists, qrDists): # Reshape qq and qr dist matrices qqSquare = pp_sketchlib.longToSquare(qqDists[:, [self.dist_col]], self.threads) + qqSquare[qqSquare < epsilon] = epsilon n_ref = self.nn_dists[self.ranks[0]].shape[0] n_query = qqSquare.shape[1] qrRect = qrDists[:, [self.dist_col]].reshape(n_query, n_ref) + qrRect[qrRect < epsilon] = epsilon max_rank = max(self.ranks) rrSparse = self.nn_dists[max_rank] - self.nn_dists[max_rank] = \ + higher_rank = \ poppunk_refine.extend( (rrSparse.row, rrSparse.col, rrSparse.data), qqSquare, qrRect, max_rank) + self.__save_sparse__(higher_rank[2], higher_rank[0], higher_rank[1], + max_rank, n_ref + n_query, rrSparse.dtype) - higher_rank = self.nn_dists[max_rank] for rank in sorted(self.ranks, reverse=True)[1:]: - self.nn_dists[rank] = \ - poppunk_refine.lower_rank( - (higher_rank.row, higher_rank.col, higher_rank.data), + higher_rank = \ + poppunk_refine.lowerRank( + higher_rank, n_ref + n_query, rank) - higher_rank = self.nn_dists[rank] + self.__save_sparse__(higher_rank[2], higher_rank[0], higher_rank[1], + rank, n_ref + n_query, rrSparse.dtype) y = self.assign(min(self.ranks)) return y + diff --git a/test/test-update.py b/test/test-update.py index 22aa630e..4fbd70e4 100755 --- a/test/test-update.py +++ b/test/test-update.py @@ -122,3 +122,8 @@ def old_get_seq_tuples(rows,cols): S4 = scipy.sparse.load_npz("batch3/batch3_rank2_fit.npz") compare_sparse_matrices(S3,S4,rlist3,rlist4) + +# Check rank 1 +S5 = scipy.sparse.load_npz("batch123/batch123_rank1_fit.npz") +S6 = scipy.sparse.load_npz("batch3/batch3_rank1_fit.npz") +compare_sparse_matrices(S5,S6,rlist3,rlist4) From 7cc18a7932594689fc84d42c0fe3e549e19575ae Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 2 Aug 2021 12:25:05 +0100 Subject: [PATCH 06/18] Fix j index in lower_rank --- src/extend.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/extend.cpp b/src/extend.cpp index 14a44fb5..f7561950 100644 --- a/src/extend.cpp +++ b/src/extend.cpp @@ -113,6 +113,7 @@ sparse_coo lower_rank(const sparse_coo &sparse_rr_mat, const size_t n_samples, std::vector i_vec; std::vector j_vec; std::vector dist_vec = std::get<2>(sparse_rr_mat); + const std::vector j_sparse = std::get<1>(sparse_rr_mat); for (long i = 0; i < n_samples; ++i) { Eigen::Map rr_dists(dist_vec.data() + row_start_idx[i], row_start_idx[i + 1] - @@ -123,7 +124,7 @@ sparse_coo lower_rank(const sparse_coo &sparse_rr_mat, const size_t n_samples, float prev_value = -1; for (auto rr_it = rr_ordered_idx.cbegin(); rr_it != rr_ordered_idx.cend(); ++rr_it) { - long j = *rr_it; + long j = std::get<1>(sparse_rr_mat)[row_start_idx[i] + *rr_it]; float dist = rr_dists[*rr_it]; if (j == i) { continue; From 2f819d38ee72609eedbf62cb078a3bc001e4ce5b Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 2 Aug 2021 12:27:15 +0100 Subject: [PATCH 07/18] Add some temporary extra messages into update test --- test/test-update.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/test/test-update.py b/test/test-update.py index 4fbd70e4..3a27a1c3 100755 --- a/test/test-update.py +++ b/test/test-update.py @@ -24,15 +24,20 @@ def run_regression(x, y, threshold = 0.99): res = stats.linregress(x, y) print("R^2: " + str(res.rvalue**2)) - if res.rvalue**2 < threshold: - sys.stderr.write("Distance matrix order failed!\n") - sys.exit(1) + #if res.rvalue**2 < threshold: + # sys.stderr.write("Distance matrix order failed!\n") + # sys.exit(1) def compare_sparse_matrices(d1,d2,r1,r2): + print(d1.todense()) + print(d2.todense()) d1_pairs = get_seq_tuples(d1.row,d1.col,r1) d2_pairs = get_seq_tuples(d2.row,d2.col,r2) d1_dists = [] d2_dists = [] + #if (len(d1_pairs) != len(d2_pairs)): + # sys.stderr.write("Distance matrix number of entries differ!\n") + # sys.exit(1) for (pair1,dist1) in zip(d1_pairs,d1.data): for (pair2,dist2) in zip(d2_pairs,d2.data): @@ -41,6 +46,8 @@ def compare_sparse_matrices(d1,d2,r1,r2): d2_dists.append(dist2) break + print(len(d1_pairs)) + print(len(d2_pairs)) run_regression(np.asarray(d1_dists),np.asarray(d2_dists)) def get_seq_tuples(rows,cols,names): @@ -91,6 +98,11 @@ def old_get_seq_tuples(rows,cols): S2 = scipy.sparse.load_npz("batch2/batch2_rank2_fit.npz") compare_sparse_matrices(S1,S2,rlist1,rlist2) +# Check rank 1 +S3 = scipy.sparse.load_npz("batch12/batch12_rank1_fit.npz") +S4 = scipy.sparse.load_npz("batch2/batch2_rank1_fit.npz") +compare_sparse_matrices(S3,S4,rlist1,rlist2) + # Check distances after second query # Check that order is the same after doing 1 + 2 + 3 with --update-db, as doing all of 1 + 2 + 3 together @@ -118,12 +130,12 @@ def old_get_seq_tuples(rows,cols): # Check sparse distances after second query with open("batch123/batch123.dists.pkl", 'rb') as pickle_file: rlist3, qlist, self = pickle.load(pickle_file) -S3 = scipy.sparse.load_npz("batch123/batch123_rank2_fit.npz") -S4 = scipy.sparse.load_npz("batch3/batch3_rank2_fit.npz") +S5 = scipy.sparse.load_npz("batch123/batch123_rank2_fit.npz") +S6 = scipy.sparse.load_npz("batch3/batch3_rank2_fit.npz") -compare_sparse_matrices(S3,S4,rlist3,rlist4) +compare_sparse_matrices(S5,S6,rlist3,rlist4) # Check rank 1 -S5 = scipy.sparse.load_npz("batch123/batch123_rank1_fit.npz") -S6 = scipy.sparse.load_npz("batch3/batch3_rank1_fit.npz") -compare_sparse_matrices(S5,S6,rlist3,rlist4) +S7 = scipy.sparse.load_npz("batch123/batch123_rank1_fit.npz") +S8 = scipy.sparse.load_npz("batch3/batch3_rank1_fit.npz") +compare_sparse_matrices(S7,S8,rlist3,rlist4) From dc92e40f4b757ed6d325953660caca6e0d9cf661 Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 2 Aug 2021 12:54:43 +0100 Subject: [PATCH 08/18] Add some debug print statements into extend fn --- src/extend.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/extend.cpp b/src/extend.cpp index f7561950..79082df7 100644 --- a/src/extend.cpp +++ b/src/extend.cpp @@ -37,6 +37,9 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, const NumpyMatrix &qq_mat_squ std::vector row_start_idx = row_start_indices(sparse_rr_mat, nr_samples); + std::cout << qq_mat_square << std::endl; + std::cout << qr_mat_rect << std::endl; + // ijv vectors std::vector dists; std::vector i_vec; @@ -60,6 +63,12 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, const NumpyMatrix &qq_mat_squ // Sort these. Then do a merge below std::vector qr_ordered_idx = sort_indexes(qr_dists, 1); std::vector rr_ordered_idx = sort_indexes(rr_dists, 1); + for (int z = 0; z < qr_ordered_idx.size(); ++z) { + std::cout << z << "\t" << qr_ordered_idx[z] << "\t" << qr_dists[qr_ordered_idx[z]] << std::endl; + } + for (int z = 0; z < rr_ordered_idx.size(); ++z) { + std::cout << z << "\t" << rr_ordered_idx[z] << "\t" << rr_dists[rr_ordered_idx[z]] << std::endl; + } // See sparsify_dists in pp_sketchlib. // This is very similar, but merging two lists as input From 6f02fa0dd0a530d55ec946a45b3a838047a1eab1 Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 2 Aug 2021 13:51:01 +0100 Subject: [PATCH 09/18] Change j index in the extend fn --- src/extend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/extend.cpp b/src/extend.cpp index 79082df7..d6537268 100644 --- a/src/extend.cpp +++ b/src/extend.cpp @@ -87,7 +87,7 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, const NumpyMatrix &qq_mat_squ dist = qr_dists[*qr_it]; ++qr_it; } else { - j = *rr_it; + j = std::get<1>(sparse_rr_mat)[row_start_idx[i] + *rr_it]; dist = rr_dists[*rr_it]; ++rr_it; } From 0738b0de480b237b26920ddc3863a96a2d3f312d Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 2 Aug 2021 14:04:10 +0100 Subject: [PATCH 10/18] j idx in extend for qq rows --- src/extend.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/extend.cpp b/src/extend.cpp index d6537268..6c75fbaf 100644 --- a/src/extend.cpp +++ b/src/extend.cpp @@ -87,7 +87,11 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, const NumpyMatrix &qq_mat_squ dist = qr_dists[*qr_it]; ++qr_it; } else { - j = std::get<1>(sparse_rr_mat)[row_start_idx[i] + *rr_it]; + if (i < nr_samples) { + j = std::get<1>(sparse_rr_mat)[row_start_idx[i] + *rr_it]; + } else { + j = *rr_it; + } dist = rr_dists[*rr_it]; ++rr_it; } From 36339979f2f79272abcd051b0f484d78fdee0bc5 Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 2 Aug 2021 15:06:26 +0100 Subject: [PATCH 11/18] take transpose of qr_mat --- PopPUNK/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 61477f89..3ed0a4f2 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -1169,7 +1169,7 @@ def extend(self, qqDists, qrDists): n_ref = self.nn_dists[self.ranks[0]].shape[0] n_query = qqSquare.shape[1] - qrRect = qrDists[:, [self.dist_col]].reshape(n_query, n_ref) + qrRect = qrDists[:, [self.dist_col]].reshape(n_query, n_ref).T qrRect[qrRect < epsilon] = epsilon max_rank = max(self.ranks) From 8c81d7981aa854a01411be684044fee3ad2604fd Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 2 Aug 2021 15:11:23 +0100 Subject: [PATCH 12/18] Remove debug printing --- src/extend.cpp | 9 --------- test/test-update.py | 16 ++++++---------- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/src/extend.cpp b/src/extend.cpp index 6c75fbaf..75368adb 100644 --- a/src/extend.cpp +++ b/src/extend.cpp @@ -37,9 +37,6 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, const NumpyMatrix &qq_mat_squ std::vector row_start_idx = row_start_indices(sparse_rr_mat, nr_samples); - std::cout << qq_mat_square << std::endl; - std::cout << qr_mat_rect << std::endl; - // ijv vectors std::vector dists; std::vector i_vec; @@ -63,12 +60,6 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, const NumpyMatrix &qq_mat_squ // Sort these. Then do a merge below std::vector qr_ordered_idx = sort_indexes(qr_dists, 1); std::vector rr_ordered_idx = sort_indexes(rr_dists, 1); - for (int z = 0; z < qr_ordered_idx.size(); ++z) { - std::cout << z << "\t" << qr_ordered_idx[z] << "\t" << qr_dists[qr_ordered_idx[z]] << std::endl; - } - for (int z = 0; z < rr_ordered_idx.size(); ++z) { - std::cout << z << "\t" << rr_ordered_idx[z] << "\t" << rr_dists[rr_ordered_idx[z]] << std::endl; - } // See sparsify_dists in pp_sketchlib. // This is very similar, but merging two lists as input diff --git a/test/test-update.py b/test/test-update.py index 3a27a1c3..14d48680 100755 --- a/test/test-update.py +++ b/test/test-update.py @@ -24,20 +24,18 @@ def run_regression(x, y, threshold = 0.99): res = stats.linregress(x, y) print("R^2: " + str(res.rvalue**2)) - #if res.rvalue**2 < threshold: - # sys.stderr.write("Distance matrix order failed!\n") - # sys.exit(1) + if res.rvalue**2 < threshold: + sys.stderr.write("Distance matrix order failed!\n") + sys.exit(1) def compare_sparse_matrices(d1,d2,r1,r2): - print(d1.todense()) - print(d2.todense()) d1_pairs = get_seq_tuples(d1.row,d1.col,r1) d2_pairs = get_seq_tuples(d2.row,d2.col,r2) d1_dists = [] d2_dists = [] - #if (len(d1_pairs) != len(d2_pairs)): - # sys.stderr.write("Distance matrix number of entries differ!\n") - # sys.exit(1) + if (len(d1_pairs) != len(d2_pairs)): + sys.stderr.write("Distance matrix number of entries differ!\n") + sys.exit(1) for (pair1,dist1) in zip(d1_pairs,d1.data): for (pair2,dist2) in zip(d2_pairs,d2.data): @@ -46,8 +44,6 @@ def compare_sparse_matrices(d1,d2,r1,r2): d2_dists.append(dist2) break - print(len(d1_pairs)) - print(len(d2_pairs)) run_regression(np.asarray(d1_dists),np.asarray(d2_dists)) def get_seq_tuples(rows,cols,names): From daf0ecbe55f244a8e64e31e514ed2f69bd496ce0 Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 2 Aug 2021 15:40:20 +0100 Subject: [PATCH 13/18] Parallelise extend code --- PopPUNK/models.py | 34 ++++++++++++++++++++++--------- src/extend.cpp | 45 +++++++++++++++++++++++++++++++---------- src/extend.hpp | 6 ++++-- src/python_bindings.cpp | 3 ++- 4 files changed, 65 insertions(+), 23 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 3ed0a4f2..ea61b330 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -999,6 +999,8 @@ def __init__(self, outPrefix, ranks, use_gpu = False): self.use_gpu = use_gpu def __save_sparse__(self, data, row, col, rank, n_samples, dtype): + '''Save a sparse matrix in coo format + ''' if self.use_gpu: data = cp.array(data) data[data < epsilon] = epsilon @@ -1012,6 +1014,21 @@ def __save_sparse__(self, data, row, col, rank, n_samples, dtype): shape=(n_samples, n_samples), dtype = dtype) + def __reduce_rank__(self, higher_rank_sparse_mat, lower_rank, n_samples, dtype): + '''Lowers the rank of a fit and saves it + ''' + lower_rank_sparse_mat = \ + poppunk_refine.lowerRank( + higher_rank_sparse_mat, + n_samples, + lower_rank) + self.__save_sparse__(lower_rank_sparse_mat[2], + lower_rank_sparse_mat[0], + lower_rank_sparse_mat[1], + lower_rank, + n_samples, + dtype) + def fit(self, X, accessory): '''Extends :func:`~ClusterFit.fit` @@ -1179,18 +1196,17 @@ def extend(self, qqDists, qrDists): (rrSparse.row, rrSparse.col, rrSparse.data), qqSquare, qrRect, - max_rank) + max_rank, + self.threads) self.__save_sparse__(higher_rank[2], higher_rank[0], higher_rank[1], max_rank, n_ref + n_query, rrSparse.dtype) - for rank in sorted(self.ranks, reverse=True)[1:]: - higher_rank = \ - poppunk_refine.lowerRank( - higher_rank, - n_ref + n_query, - rank) - self.__save_sparse__(higher_rank[2], higher_rank[0], higher_rank[1], - rank, n_ref + n_query, rrSparse.dtype) + with Pool(processes=self.threads) as pool: + pool.map(partial(self.__reduce_rank__, + higher_rank_sparse_mat = higher_rank, + n_samples = n_ref + n_query, + dtype = rrSparse.dtype), + sorted(self.ranks, reverse=True)[1:]) y = self.assign(min(self.ranks)) return y diff --git a/src/extend.cpp b/src/extend.cpp index 75368adb..a82dd072 100644 --- a/src/extend.cpp +++ b/src/extend.cpp @@ -29,8 +29,22 @@ std::vector row_start_indices(const sparse_coo &sparse_rr_mat, return row_start_idx; } -sparse_coo extend(const sparse_coo &sparse_rr_mat, const NumpyMatrix &qq_mat_square, - const NumpyMatrix &qr_mat_rect, const size_t kNN) { +template +std::vector combine_vectors(const std::vector> &vec, + const size_t len) { + std::vector all(len); + auto all_it = all.begin(); + for (size_t i = 0; i < vec.size(); ++i) { + std::copy(vec[i].cbegin(), vec[i].cend(); all_it); + all_it += vec[i].size(); + } + return all; +} + +sparse_coo extend(const sparse_coo &sparse_rr_mat, + const NumpyMatrix &qq_mat_square, + const NumpyMatrix &qr_mat_rect, const size_t kNN, + const size_t num_threads) { const size_t nr_samples = qr_mat_rect.rows(); const size_t nq_samples = qr_mat_rect.cols(); @@ -38,10 +52,13 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, const NumpyMatrix &qq_mat_squ row_start_indices(sparse_rr_mat, nr_samples); // ijv vectors - std::vector dists; - std::vector i_vec; - std::vector j_vec; + std::vector> dists(nr_samples + nq_samples); + std::vector> i_vec(nr_samples + nq_samples); + std::vector> j_vec(nr_samples + nq_samples); + size_t len = 0; + std::vector dist_vec = std::get<2>(sparse_rr_mat); +#pragma omp parallel for schedule(static) num_threads(num_threads) reduction(+:len) for (long i = 0; i < nr_samples + nq_samples; ++i) { // Extract the dists for the row from the qr (dense) and rr (sparse) // matrices @@ -92,9 +109,9 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, const NumpyMatrix &qq_mat_squ } bool new_val = abs(dist - prev_value) < epsilon; if (unique_neighbors < kNN || new_val) { - dists.push_back(dist); - i_vec.push_back(i); - j_vec.push_back(j); + dists[i].push_back(dist); + i_vec[i].push_back(i); + j_vec[i].push_back(j); if (!new_val) { unique_neighbors++; prev_value = dist; @@ -103,14 +120,20 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, const NumpyMatrix &qq_mat_squ break; // next i } } + len = dists.size(); } - return (std::make_tuple(i_vec, j_vec, dists)); + + // Combine the lists from each thread + std::vector dists_all = combine_vectors(dists, len); + std::vector i_vec_all = combine_vectors(i_vec, len); + std::vector j_vec_all = combine_vectors(j_vec, len); + + return (std::make_tuple(i_vec_all, j_vec_all, dists_all)); } sparse_coo lower_rank(const sparse_coo &sparse_rr_mat, const size_t n_samples, const size_t kNN) { - std::vector row_start_idx = - row_start_indices(sparse_rr_mat, n_samples); + std::vector row_start_idx = row_start_indices(sparse_rr_mat, n_samples); // ijv vectors std::vector dists; diff --git a/src/extend.hpp b/src/extend.hpp index 01ec51bb..9dd4cb49 100644 --- a/src/extend.hpp +++ b/src/extend.hpp @@ -5,8 +5,10 @@ typedef std::tuple, std::vector, std::vector> sparse_coo; -sparse_coo extend(const sparse_coo &sparse_rr_mat, const NumpyMatrix &qq_mat_square, - const NumpyMatrix &qr_mat_rect, const size_t kNN); +sparse_coo extend(const sparse_coo &sparse_rr_mat, + const NumpyMatrix &qq_mat_square, + const NumpyMatrix &qr_mat_rect, const size_t kNN, + const size_t num_threads); sparse_coo lower_rank(const sparse_coo &sparse_rr_mat, const size_t n_samples, const size_t kNN); diff --git a/src/python_bindings.cpp b/src/python_bindings.cpp index 64d1385b..cdf1ddd1 100644 --- a/src/python_bindings.cpp +++ b/src/python_bindings.cpp @@ -115,7 +115,8 @@ PYBIND11_MODULE(poppunk_refine, m) { m.def("extend", &extend, py::return_value_policy::reference_internal, "Extend a sparse distance matrix keeping k nearest-neighbours", py::arg("rr_mat"), py::arg("qq_mat").noconvert(), - py::arg("qr_mat").noconvert(), py::arg("kNN")); + py::arg("qr_mat").noconvert(), py::arg("kNN"), + py::arg("num_threads") = 1); m.def("lowerRank", &lower_rank, py::return_value_policy::reference_internal, "Extend a sparse distance matrix keeping k nearest-neighbours", From 5d9a52275c469d764d86ce3b6feeca1271ec6614 Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 2 Aug 2021 15:41:31 +0100 Subject: [PATCH 14/18] Fix semicolon in std copy --- src/extend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/extend.cpp b/src/extend.cpp index a82dd072..c583cd33 100644 --- a/src/extend.cpp +++ b/src/extend.cpp @@ -35,7 +35,7 @@ std::vector combine_vectors(const std::vector> &vec, std::vector all(len); auto all_it = all.begin(); for (size_t i = 0; i < vec.size(); ++i) { - std::copy(vec[i].cbegin(), vec[i].cend(); all_it); + std::copy(vec[i].cbegin(), vec[i].cend(), all_it); all_it += vec[i].size(); } return all; From 545c9f8a39e3f79dcaa8c64b94d398060dd8042a Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 2 Aug 2021 15:47:08 +0100 Subject: [PATCH 15/18] Set self kwargs in multiprocessing partial --- PopPUNK/models.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index ea61b330..309d1656 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -1203,9 +1203,10 @@ def extend(self, qqDists, qrDists): with Pool(processes=self.threads) as pool: pool.map(partial(self.__reduce_rank__, - higher_rank_sparse_mat = higher_rank, - n_samples = n_ref + n_query, - dtype = rrSparse.dtype), + self=self, + higher_rank_sparse_mat=higher_rank, + n_samples=n_ref + n_query, + dtype=rrSparse.dtype), sorted(self.ranks, reverse=True)[1:]) y = self.assign(min(self.ranks)) From 925a57d1aafa064662067eb56d4ec97be35b2c7f Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 2 Aug 2021 16:18:40 +0100 Subject: [PATCH 16/18] Length reduction fix --- src/extend.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/extend.cpp b/src/extend.cpp index c583cd33..d8b17646 100644 --- a/src/extend.cpp +++ b/src/extend.cpp @@ -120,8 +120,9 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, break; // next i } } - len = dists.size(); + len += dists.size(); } + std::cout << len << std::endl; // Combine the lists from each thread std::vector dists_all = combine_vectors(dists, len); From 55a5a3cbf7658a3789404a878f29abc88304e052 Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 2 Aug 2021 17:37:48 +0100 Subject: [PATCH 17/18] Fix threading of multi-rank --- PopPUNK/models.py | 14 ++++++++++---- src/extend.cpp | 3 +-- test/test-update.py | 2 ++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 309d1656..edb73156 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -971,6 +971,10 @@ def assign(self, X, slope=None): return y +# Wrapper function for LineageFit.__reduce_rank__ to be called by +# multiprocessing threads +def reduce_rank(lower_rank, fit, higher_rank_sparse_mat, n_samples, dtype): + fit.__reduce_rank__(higher_rank_sparse_mat, lower_rank, n_samples, dtype) class LineageFit(ClusterFit): '''Class for fits using the lineage assignment model. Inherits from :class:`ClusterFit`. @@ -1201,13 +1205,15 @@ def extend(self, qqDists, qrDists): self.__save_sparse__(higher_rank[2], higher_rank[0], higher_rank[1], max_rank, n_ref + n_query, rrSparse.dtype) - with Pool(processes=self.threads) as pool: - pool.map(partial(self.__reduce_rank__, - self=self, + # Apply lower ranks + thread_map(partial(reduce_rank, + fit=self, higher_rank_sparse_mat=higher_rank, n_samples=n_ref + n_query, dtype=rrSparse.dtype), - sorted(self.ranks, reverse=True)[1:]) + sorted(self.ranks, reverse=True)[1:], + max_workers=self.threads, + disable=True) y = self.assign(min(self.ranks)) return y diff --git a/src/extend.cpp b/src/extend.cpp index d8b17646..653fb3c3 100644 --- a/src/extend.cpp +++ b/src/extend.cpp @@ -120,9 +120,8 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, break; // next i } } - len += dists.size(); + len += dists[i].size(); } - std::cout << len << std::endl; // Combine the lists from each thread std::vector dists_all = combine_vectors(dists, len); diff --git a/test/test-update.py b/test/test-update.py index 14d48680..12b1b9d0 100755 --- a/test/test-update.py +++ b/test/test-update.py @@ -35,6 +35,8 @@ def compare_sparse_matrices(d1,d2,r1,r2): d2_dists = [] if (len(d1_pairs) != len(d2_pairs)): sys.stderr.write("Distance matrix number of entries differ!\n") + print(d1_pairs) + print(d2_pairs) sys.exit(1) for (pair1,dist1) in zip(d1_pairs,d1.data): From dd00b17ddbaf88a6bc9dd1705b9ba29403c0d49f Mon Sep 17 00:00:00 2001 From: John Lees Date: Wed, 4 Aug 2021 11:01:15 +0100 Subject: [PATCH 18/18] Invert meaning of 'new_val' bool --- src/extend.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/extend.cpp b/src/extend.cpp index 653fb3c3..8f704703 100644 --- a/src/extend.cpp +++ b/src/extend.cpp @@ -107,12 +107,12 @@ sparse_coo extend(const sparse_coo &sparse_rr_mat, if (j == i) { continue; } - bool new_val = abs(dist - prev_value) < epsilon; - if (unique_neighbors < kNN || new_val) { + bool new_val = abs(dist - prev_value) >= epsilon; + if (unique_neighbors < kNN || !new_val) { dists[i].push_back(dist); i_vec[i].push_back(i); j_vec[i].push_back(j); - if (!new_val) { + if (new_val) { unique_neighbors++; prev_value = dist; } @@ -156,12 +156,12 @@ sparse_coo lower_rank(const sparse_coo &sparse_rr_mat, const size_t n_samples, if (j == i) { continue; } - bool new_val = abs(dist - prev_value) < epsilon; - if (unique_neighbors < kNN || new_val) { + bool new_val = abs(dist - prev_value) >= epsilon; + if (unique_neighbors < kNN || !new_val) { dists.push_back(dist); i_vec.push_back(i); j_vec.push_back(j); - if (!new_val) { + if (new_val) { unique_neighbors++; prev_value = dist; }