diff --git a/pp_sketch/__main__.py b/pp_sketch/__main__.py index b7fb779d..fc7c9f8b 100644 --- a/pp_sketch/__main__.py +++ b/pp_sketch/__main__.py @@ -404,4 +404,4 @@ def main(): sys.exit(0) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/pp_sketch/matrix.py b/pp_sketch/matrix.py index 61685871..e6f89068 100644 --- a/pp_sketch/matrix.py +++ b/pp_sketch/matrix.py @@ -8,10 +8,10 @@ import pp_sketchlib -def sparsify(distMat, cutoff, kNN, threads): - sparse_coordinates = pp_sketchlib.sparsifyDists(distMat, - distCutoff=cutoff, - kNN=kNN) +def sparsify(distMat, cutoff, threads): + sparse_coordinates = pp_sketchlib.sparsifyDistsByThreshold(distMat, + distCutoff=cutoff, + num_threads=threads) sparse_scipy = ijv_to_coo(sparse_coordinates, distMat.shape, np.float32) # Mirror to fill in lower triangle diff --git a/src/dist/matrix.hpp b/src/dist/matrix.hpp index e7935334..f957d5b7 100644 --- a/src/dist/matrix.hpp +++ b/src/dist/matrix.hpp @@ -7,24 +7,23 @@ #pragma once #include -#include -#include -#include #include +#include +#include #include +#include #include -#include "matrix_types.hpp" #include "matrix_idx.hpp" +#include "matrix_types.hpp" // This type not used in any nvcc code -using NumpyMatrix = Eigen::Matrix; +using NumpyMatrix = + Eigen::Matrix; -//https://stackoverflow.com/a/12399290 -template -std::vector sort_indexes(const T &v) -{ +// https://stackoverflow.com/a/12399290 +template std::vector sort_indexes(const T &v) { // initialize original index locations std::vector idx(v.size()); @@ -44,6 +43,6 @@ NumpyMatrix long_to_square(const Eigen::VectorXf &rrDists, Eigen::VectorXf square_to_long(const NumpyMatrix &squareDists, const unsigned int num_threads); -sparse_coo sparsify_dists(const NumpyMatrix &denseDists, - const float distCutoff, - const unsigned long int kNN); +sparse_coo sparsify_dists_by_threshold(const NumpyMatrix &denseDists, + const float distCutoff, + const size_t num_threads); diff --git a/src/dist/matrix_ops.cpp b/src/dist/matrix_ops.cpp index de48990e..0a96d67e 100644 --- a/src/dist/matrix_ops.cpp +++ b/src/dist/matrix_ops.cpp @@ -10,11 +10,9 @@ #include #include #include - +#include #include "api.hpp" -const float epsilon = 1E-10; - // Prototypes for cppying functions run in threads void square_block(const Eigen::VectorXf &longDists, NumpyMatrix &squareMatrix, const size_t n_samples, const size_t start, @@ -25,55 +23,49 @@ void rectangle_block(const Eigen::VectorXf &longDists, const size_t nqqSamples, const size_t start, const size_t max_elems); -sparse_coo sparsify_dists(const NumpyMatrix &denseDists, const float distCutoff, - const unsigned long int kNN) { - if (kNN > 0 && distCutoff > 0) { - throw std::runtime_error("Specify only one of kNN or distCutoff"); - } else if (kNN < 1 && distCutoff < 0) { - throw std::runtime_error("kNN must be > 1 or distCutoff > 0"); +template +std::vector combine_vectors(const std::vector> &vec, + const size_t len) { + std::vector all(len); + auto all_it = all.begin(); + for (size_t i = 0; i < vec.size(); ++i) { + std::copy(vec[i].cbegin(), vec[i].cend(), all_it); + all_it += vec[i].size(); + } + return all; +} + +sparse_coo sparsify_dists_by_threshold(const NumpyMatrix &denseDists, + const float distCutoff, + const size_t num_threads) { + + if (distCutoff < 0) { + throw std::runtime_error("Distance threshold must be at least 0"); } + // Parallelisation parameter + size_t len = 0; + // ijv vectors - std::vector dists; - std::vector i_vec; - std::vector j_vec; - if (distCutoff > 0) { - for (long i = 0; i < denseDists.rows(); i++) { - for (long j = i + 1; j < denseDists.cols(); j++) { - if (denseDists(i, j) < distCutoff) { - dists.push_back(denseDists(i, j)); - i_vec.push_back(i); - j_vec.push_back(j); - } - } - } - } else if (kNN >= 1) { - // Only add the k nearest (unique) neighbours - // May be >k if repeats, often zeros - for (long i = 0; i < denseDists.rows(); i++) { - unsigned long unique_neighbors = 0; - float prev_value = -1; - for (auto j : sort_indexes(denseDists.row(i))) { - if (j == i) { - continue; // Ignore diagonal which will always be one of the closest - } - bool new_val = abs(denseDists(i, j) - prev_value) < epsilon; - if (unique_neighbors < kNN || new_val) { - dists.push_back(denseDists(i, j)); - i_vec.push_back(i); - j_vec.push_back(j); - if (!new_val) { - unique_neighbors++; - prev_value = denseDists(i, j); - } - } else { - break; - } + long num_samples = denseDists.rows(); + std::vector> dists(num_samples); + std::vector> i_vec(num_samples); + std::vector> j_vec(num_samples); +#pragma omp parallel for schedule(static) num_threads(num_threads) reduction(+:len) + for (long i = 0; i < num_samples; i++) { + for (long j = i + 1; j < denseDists.cols(); j++) { + if (denseDists(i, j) < distCutoff) { + dists[i].push_back(denseDists(i, j)); + i_vec[i].push_back(i); + j_vec[i].push_back(j); } } + len += i_vec[i].size(); } - - return (std::make_tuple(i_vec, j_vec, dists)); + std::vector dists_all = combine_vectors(dists, len); + std::vector i_vec_all = combine_vectors(i_vec, len); + std::vector j_vec_all = combine_vectors(j_vec, len); + return (std::make_tuple(i_vec_all, j_vec_all, dists_all)); } NumpyMatrix long_to_square(const Eigen::VectorXf &rrDists, diff --git a/src/sketchlib_bindings.cpp b/src/sketchlib_bindings.cpp index d1edd162..5d7ead7e 100644 --- a/src/sketchlib_bindings.cpp +++ b/src/sketchlib_bindings.cpp @@ -137,12 +137,13 @@ NumpyMatrix queryDatabase(const std::string &ref_db_name, } sparse_coo -querySelfSparse(const std::string &ref_db_name, const std::vector &ref_names, - std::vector kmer_lengths, const bool random_correct = true, - const bool jaccard = false, const unsigned long int kNN = 0, - const float dist_cutoff = 0.0f, const size_t dist_col = 0, - const size_t num_threads = 1, const bool use_gpu = false, - const int device_id = 0) { +querySelfSparse(const std::string &ref_db_name, + const std::vector &ref_names, + std::vector kmer_lengths, + const bool random_correct = true, const bool jaccard = false, + const unsigned long int kNN = 0, const float dist_cutoff = 0.0f, + const size_t dist_col = 0, const size_t num_threads = 1, + const bool use_gpu = false, const int device_id = 0) { if (use_gpu && (jaccard || kmer_lengths.size() < 2 || dist_col > 1)) { throw std::runtime_error( "Extracting Jaccard distances not supported on GPU"); @@ -160,26 +161,28 @@ querySelfSparse(const std::string &ref_db_name, const std::vector & sparse_coo sparse_dists; if (dist_cutoff > 0) { NumpyMatrix dists = queryDatabase(ref_db_name, ref_db_name, ref_names, - ref_names, kmer_lengths, random_correct, - false, num_threads, use_gpu, device_id); + ref_names, kmer_lengths, random_correct, + false, num_threads, use_gpu, device_id); Eigen::VectorXf dummy_query_ref; Eigen::VectorXf dummy_query_query; NumpyMatrix long_form = long_to_square(dists.col(dist_col), dummy_query_ref, - dummy_query_query, num_threads); - sparse_dists = sparsify_dists(long_form, dist_cutoff, kNN); + dummy_query_query, num_threads); + sparse_dists = + sparsify_dists_by_threshold(long_form, dist_cutoff, num_threads); } else { #ifdef GPU_AVAILABLE if (use_gpu) { - sparse_dists = query_db_sparse_cuda(ref_sketches, kmer_lengths, random, - kNN, dist_col, device_id, num_threads); + sparse_dists = + query_db_sparse_cuda(ref_sketches, kmer_lengths, random, kNN, + dist_col, device_id, num_threads); } else { sparse_dists = query_db_sparse(ref_sketches, kmer_lengths, random, - jaccard, kNN, dist_col, num_threads); + jaccard, kNN, dist_col, num_threads); } #else - sparse_dists = query_db_sparse(ref_sketches, kmer_lengths, random, - jaccard, kNN, dist_col, num_threads); + sparse_dists = query_db_sparse(ref_sketches, kmer_lengths, random, jaccard, + kNN, dist_col, num_threads); #endif } @@ -216,9 +219,11 @@ double jaccardDist(const std::string &db_name, const std::string &sample1, } // Wrapper which makes a ref to the python/numpy array -sparse_coo sparsifyDists(const Eigen::Ref &denseDists, - const float distCutoff, const unsigned long int kNN) { - sparse_coo coo_idx = sparsify_dists(denseDists, distCutoff, kNN); +sparse_coo sparsifyDistsByThreshold(const Eigen::Ref &denseDists, + const float distCutoff, + const size_t num_threads) { + sparse_coo coo_idx = + sparsify_dists_by_threshold(denseDists, distCutoff, num_threads); return coo_idx; } @@ -245,12 +250,11 @@ PYBIND11_MODULE(pp_sketchlib, m) { m.def("querySelfSparse", &querySelfSparse, py::return_value_policy::reference_internal, "Find self-self distances between sketches; return a sparse matrix", - py::arg("ref_db_name"), py::arg("rList"), py::arg("klist"), py::arg("random_correct") = true, - py::arg("jaccard") = false, + py::arg("ref_db_name"), py::arg("rList"), py::arg("klist"), + py::arg("random_correct") = true, py::arg("jaccard") = false, py::arg("kNN") = 0, py::arg("dist_cutoff") = 0.0f, - py::arg("dist_col") = 0, - py::arg("num_threads") = 1, py::arg("use_gpu") = false, - py::arg("device_id") = 0); + py::arg("dist_col") = 0, py::arg("num_threads") = 1, + py::arg("use_gpu") = false, py::arg("device_id") = 0); m.def("addRandom", &addRandomToDb, "Add random match chances into older databases", py::arg("db_name"), @@ -279,11 +283,11 @@ PYBIND11_MODULE(pp_sketchlib, m) { py::arg("query_ref_distVec").noconvert(), py::arg("query_query_distVec").noconvert(), py::arg("num_threads") = 1); - m.def("sparsifyDists", &sparsifyDists, + m.def("sparsifyDistsByThreshold", &sparsifyDistsByThreshold, py::return_value_policy::reference_internal, "Transform all distances into a sparse matrix", py::arg("distMat").noconvert(), py::arg("distCutoff") = 0, - py::arg("kNN") = 0); + py::arg("num_threads") = 1); m.attr("version") = VERSION_INFO; m.attr("sketchVersion") = SKETCH_VERSION; diff --git a/test/test-dists.py b/test/test-dists.py index 2843a804..828da062 100644 --- a/test/test-dists.py +++ b/test/test-dists.py @@ -29,6 +29,36 @@ def iterDistRows(refSeqs, querySeqs, self=True): for ref in refSeqs: yield(ref, query) +def get_kNN_sparse_tuple(square_core_mat,kNN): + i_vec = [] + j_vec = [] + dist_vec = [] + for i in range(0,square_core_mat.shape[0]): + sorted_indices = np.argsort(square_core_mat[i,:]) + j_index = 0 + neighbour_count = 0 + while neighbour_count < kNN: + if (sorted_indices[j_index] != i): + i_vec.append(i) + j_vec.append(sorted_indices[j_index]) + dist_vec.append(square_core_mat[i,sorted_indices[j_index]]) + neighbour_count = neighbour_count + 1 + j_index = j_index + 1 + sparse_knn = (i_vec,j_vec,dist_vec) + return sparse_knn + +def get_threshold_sparse_tuple(square_core_mat,threshold): + i_vec = [] + j_vec = [] + dist_vec = [] + for i in range(0,square_core_mat.shape[0]): + for j in range(i,square_core_mat.shape[1]): + if square_core_mat[i,j] < threshold and i != j: + i_vec.append(i) + j_vec.append(j) + dist_vec.append(square_core_mat[i,j]) + sparse_threshold = (i_vec,j_vec,dist_vec) + return sparse_threshold description = 'Run poppunk sketching/distances' parser = argparse.ArgumentParser(description=description, @@ -110,7 +140,7 @@ def iterDistRows(refSeqs, querySeqs, self=True): rList=rList, klist=db_kmers, kNN=kNN) -sparse_knn = pp_sketchlib.sparsifyDists(distMat=square_core_mat, distCutoff=0, kNN=kNN) +sparse_knn = get_kNN_sparse_tuple(square_core_mat,kNN) if (sparseDistMat[0] != sparse_knn[0] or sparseDistMat[1] != sparse_knn[1]): sys.stderr.write("Sparse distances (kNN) mismatching\n") @@ -126,7 +156,7 @@ def iterDistRows(refSeqs, querySeqs, self=True): rList=rList, klist=db_kmers, dist_cutoff=cutoff) -sparse_threshold = pp_sketchlib.sparsifyDists(distMat=square_core_mat, distCutoff=cutoff, kNN=0) +sparse_threshold = get_threshold_sparse_tuple(square_core_mat,cutoff) if (sparseDistMat[0] != sparse_threshold[0] or sparseDistMat[1] != sparse_threshold[1]): sys.stderr.write("Sparse distances (cutoff) mismatching\n") diff --git a/test/test-matrix.py b/test/test-matrix.py index 3cf227fe..c82c5db3 100644 --- a/test/test-matrix.py +++ b/test/test-matrix.py @@ -9,10 +9,10 @@ from pp_sketch.matrix import sparsify except ImportError as e: from scipy.sparse import coo_matrix - def sparsify(distMat, cutoff, kNN, threads): - sparse_coordinates = pp_sketchlib.sparsifyDists(distMat=distMat, - distCutoff=cutoff, - kNN=kNN) + def sparsify(distMat, cutoff, threads): + sparse_coordinates = pp_sketchlib.sparsifyDistsByThreshold(distMat=distMat, + distCutoff=cutoff, + num_threads=threads) sparse_scipy = coo_matrix((sparse_coordinates[2], (sparse_coordinates[0], sparse_coordinates[1])), shape=distMat.shape, @@ -60,28 +60,8 @@ def check_res(res, expected): check_res(pp_sketchlib.squareToLong(distMat=square1_res, num_threads=2), rr_mat) # sparsification -sparse1 = sparsify(square2_res, cutoff=5, kNN=0, threads=2) +sparse1 = sparsify(square2_res, cutoff=5, threads=2) sparse1_res = square2_res.copy() sparse1_res[sparse1_res >= 5] = 0 check_res(sparse1.todense(), sparse1_res) - -kNN = 2 -sparse2 = sparsify(square2_res, cutoff=0, kNN=kNN, threads=2) - -sparse2_res = square2_res.copy() -row_sort = np.argsort(sparse2_res, axis=1) -for i, row in enumerate(sparse2_res): - neighbours = 0 - prev_val = 0 - for j in row_sort[i, :]: - if i == j or row[j] == prev_val: - continue - else: - prev_val = row[j] - if neighbours >= kNN: - sparse2_res[i, j] = 0 - else: - neighbours += 1 - -check_res(sparse2.todense(), sparse2_res)