Merge pull request bacpop#82 from bacpop/flexible_lineages
Lineage model fitting - sketchlib changes
johnlees authored Oct 24, 2022
2 parents a7ea4cc + 413f57e commit c61ee1a
Showing 7 changed files with 120 additions and 115 deletions.
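The user-visible API change in this diff is that pp_sketchlib.sparsifyDists, which mixed a distance cutoff with a kNN argument, is replaced by pp_sketchlib.sparsifyDistsByThreshold, which sparsifies by cutoff only and takes a thread count; kNN sparsification now goes through querySelfSparse instead. A minimal before/after sketch (the distance matrix here is made up for illustration):

import numpy as np
import pp_sketchlib

distMat = np.array([[0.0, 0.1, 0.4],
                    [0.1, 0.0, 0.2],
                    [0.4, 0.2, 0.0]], dtype=np.float32)

# Before this commit (removed): pp_sketchlib.sparsifyDists(distMat, distCutoff=0.3, kNN=0)
# After: threshold-only sparsification, parallelised over rows
i_vec, j_vec, dist_vec = pp_sketchlib.sparsifyDistsByThreshold(distMat,
                                                               distCutoff=0.3,
                                                               num_threads=2)
# Returns ijv (COO) vectors for the upper-triangle entries strictly below the cutoff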
2 changes: 1 addition & 1 deletion pp_sketch/__main__.py
@@ -404,4 +404,4 @@ def main():
sys.exit(0)

if __name__ == "__main__":
main()
main()
8 changes: 4 additions & 4 deletions pp_sketch/matrix.py
@@ -8,10 +8,10 @@

import pp_sketchlib

def sparsify(distMat, cutoff, kNN, threads):
sparse_coordinates = pp_sketchlib.sparsifyDists(distMat,
distCutoff=cutoff,
kNN=kNN)
def sparsify(distMat, cutoff, threads):
sparse_coordinates = pp_sketchlib.sparsifyDistsByThreshold(distMat,
distCutoff=cutoff,
num_threads=threads)
sparse_scipy = ijv_to_coo(sparse_coordinates, distMat.shape, np.float32)

# Mirror to fill in lower triangle
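A short usage sketch of the updated wrapper: sparsify() no longer takes a kNN argument, and (as in test-matrix.py further down) is assumed to return a scipy sparse matrix that the truncated remainder of the function mirrors into the lower triangle. The example matrix is invented:

import numpy as np
from pp_sketch.matrix import sparsify

dists = np.array([[0.0, 0.1, 0.4],
                  [0.1, 0.0, 0.2],
                  [0.4, 0.2, 0.0]], dtype=np.float32)

sparse_mat = sparsify(dists, cutoff=0.3, threads=2)  # kNN keyword removed by this commit
print(sparse_mat.todense())  # entries at or above the cutoff are zeroed; result stays symmetric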
23 changes: 11 additions & 12 deletions src/dist/matrix.hpp
@@ -7,24 +7,23 @@
#pragma once

#include <algorithm>
#include <numeric>
#include <vector>
#include <cstdint>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <string>
#include <vector>

#include <Eigen/Dense>

#include "matrix_types.hpp"
#include "matrix_idx.hpp"
#include "matrix_types.hpp"

// This type not used in any nvcc code
using NumpyMatrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
using NumpyMatrix =
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;

//https://stackoverflow.com/a/12399290
template <typename T>
std::vector<long> sort_indexes(const T &v)
{
// https://stackoverflow.com/a/12399290
template <typename T> std::vector<long> sort_indexes(const T &v) {

// initialize original index locations
std::vector<long> idx(v.size());
@@ -44,6 +43,6 @@ NumpyMatrix long_to_square(const Eigen::VectorXf &rrDists,
Eigen::VectorXf square_to_long(const NumpyMatrix &squareDists,
const unsigned int num_threads);

sparse_coo sparsify_dists(const NumpyMatrix &denseDists,
const float distCutoff,
const unsigned long int kNN);
sparse_coo sparsify_dists_by_threshold(const NumpyMatrix &denseDists,
const float distCutoff,
const size_t num_threads);
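The sort_indexes template retained (and reformatted) in this header is an argsort over a row of the distance matrix; the kNN selection code that consumed it is removed from matrix_ops.cpp below. A rough Python equivalent of what it computes (the stable ordering is an assumption, since only the start of the function body is shown here):

import numpy as np

row = np.array([0.4, 0.0, 0.2, 0.1], dtype=np.float32)
idx = np.argsort(row, kind="stable")  # indices that sort the row ascending, like sort_indexes(row)
print(idx)  # [1 3 2 0]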
84 changes: 38 additions & 46 deletions src/dist/matrix_ops.cpp
@@ -10,11 +10,9 @@
#include <omp.h>
#include <string>
#include <vector>

#include <iostream>
#include "api.hpp"

const float epsilon = 1E-10;

// Prototypes for copying functions run in threads
void square_block(const Eigen::VectorXf &longDists, NumpyMatrix &squareMatrix,
const size_t n_samples, const size_t start,
@@ -25,55 +23,49 @@ void rectangle_block(const Eigen::VectorXf &longDists,
const size_t nqqSamples, const size_t start,
const size_t max_elems);

sparse_coo sparsify_dists(const NumpyMatrix &denseDists, const float distCutoff,
const unsigned long int kNN) {
if (kNN > 0 && distCutoff > 0) {
throw std::runtime_error("Specify only one of kNN or distCutoff");
} else if (kNN < 1 && distCutoff < 0) {
throw std::runtime_error("kNN must be > 1 or distCutoff > 0");
template <typename T>
std::vector<T> combine_vectors(const std::vector<std::vector<T>> &vec,
const size_t len) {
std::vector<T> all(len);
auto all_it = all.begin();
for (size_t i = 0; i < vec.size(); ++i) {
std::copy(vec[i].cbegin(), vec[i].cend(), all_it);
all_it += vec[i].size();
}
return all;
}

sparse_coo sparsify_dists_by_threshold(const NumpyMatrix &denseDists,
const float distCutoff,
const size_t num_threads) {

if (distCutoff < 0) {
throw std::runtime_error("Distance threshold must be at least 0");
}

// Parallelisation parameter
size_t len = 0;

// ijv vectors
std::vector<float> dists;
std::vector<long> i_vec;
std::vector<long> j_vec;
if (distCutoff > 0) {
for (long i = 0; i < denseDists.rows(); i++) {
for (long j = i + 1; j < denseDists.cols(); j++) {
if (denseDists(i, j) < distCutoff) {
dists.push_back(denseDists(i, j));
i_vec.push_back(i);
j_vec.push_back(j);
}
}
}
} else if (kNN >= 1) {
// Only add the k nearest (unique) neighbours
// May be >k if repeats, often zeros
for (long i = 0; i < denseDists.rows(); i++) {
unsigned long unique_neighbors = 0;
float prev_value = -1;
for (auto j : sort_indexes(denseDists.row(i))) {
if (j == i) {
continue; // Ignore diagonal which will always be one of the closest
}
bool new_val = abs(denseDists(i, j) - prev_value) < epsilon;
if (unique_neighbors < kNN || new_val) {
dists.push_back(denseDists(i, j));
i_vec.push_back(i);
j_vec.push_back(j);
if (!new_val) {
unique_neighbors++;
prev_value = denseDists(i, j);
}
} else {
break;
}
long num_samples = denseDists.rows();
std::vector<std::vector<float>> dists(num_samples);
std::vector<std::vector<long>> i_vec(num_samples);
std::vector<std::vector<long>> j_vec(num_samples);
#pragma omp parallel for schedule(static) num_threads(num_threads) reduction(+:len)
for (long i = 0; i < num_samples; i++) {
for (long j = i + 1; j < denseDists.cols(); j++) {
if (denseDists(i, j) < distCutoff) {
dists[i].push_back(denseDists(i, j));
i_vec[i].push_back(i);
j_vec[i].push_back(j);
}
}
len += i_vec[i].size();
}

return (std::make_tuple(i_vec, j_vec, dists));
std::vector<float> dists_all = combine_vectors(dists, len);
std::vector<long> i_vec_all = combine_vectors(i_vec, len);
std::vector<long> j_vec_all = combine_vectors(j_vec, len);
return (std::make_tuple(i_vec_all, j_vec_all, dists_all));
}

NumpyMatrix long_to_square(const Eigen::VectorXf &rrDists,
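A pure-Python model of the new sparsify_dists_by_threshold above: each row of the dense matrix is scanned independently (the work that the OpenMP pragma distributes, with the total length accumulated through the reduction on len), per-row ijv buffers keep threads from sharing output vectors, and combine_vectors then concatenates them into flat vectors of known length. This is an illustrative sketch, not the bound C++ API:

import numpy as np

def combine_vectors_model(rows):
    # models combine_vectors(): concatenate per-row buffers in row order
    return [x for row in rows for x in row]

def sparsify_by_threshold_model(dense, cutoff):
    n = dense.shape[0]
    # one output buffer per row, so parallel workers never write to a shared vector
    i_rows = [[] for _ in range(n)]
    j_rows = [[] for _ in range(n)]
    d_rows = [[] for _ in range(n)]
    total = 0  # plays the role of the OpenMP reduction variable `len`
    for i in range(n):  # this outer loop is what the OpenMP pragma parallelises
        for j in range(i + 1, dense.shape[1]):
            if dense[i, j] < cutoff:  # strict threshold, upper triangle only
                i_rows[i].append(i)
                j_rows[i].append(j)
                d_rows[i].append(float(dense[i, j]))
        total += len(i_rows[i])
    i_vec = combine_vectors_model(i_rows)
    j_vec = combine_vectors_model(j_rows)
    d_vec = combine_vectors_model(d_rows)
    assert len(i_vec) == total
    return i_vec, j_vec, d_vec

dense = np.array([[0.0, 0.125, 0.5],
                  [0.125, 0.0, 0.25],
                  [0.5, 0.25, 0.0]], dtype=np.float32)
print(sparsify_by_threshold_model(dense, 0.3))  # ([0, 1], [1, 2], [0.125, 0.25])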
54 changes: 29 additions & 25 deletions src/sketchlib_bindings.cpp
@@ -137,12 +137,13 @@ NumpyMatrix queryDatabase(const std::string &ref_db_name,
}

sparse_coo
querySelfSparse(const std::string &ref_db_name, const std::vector<std::string> &ref_names,
std::vector<size_t> kmer_lengths, const bool random_correct = true,
const bool jaccard = false, const unsigned long int kNN = 0,
const float dist_cutoff = 0.0f, const size_t dist_col = 0,
const size_t num_threads = 1, const bool use_gpu = false,
const int device_id = 0) {
querySelfSparse(const std::string &ref_db_name,
const std::vector<std::string> &ref_names,
std::vector<size_t> kmer_lengths,
const bool random_correct = true, const bool jaccard = false,
const unsigned long int kNN = 0, const float dist_cutoff = 0.0f,
const size_t dist_col = 0, const size_t num_threads = 1,
const bool use_gpu = false, const int device_id = 0) {
if (use_gpu && (jaccard || kmer_lengths.size() < 2 || dist_col > 1)) {
throw std::runtime_error(
"Extracting Jaccard distances not supported on GPU");
@@ -160,26 +161,28 @@ querySelfSparse(const std::string &ref_db_name, const std::vector<std::string> &
sparse_coo sparse_dists;
if (dist_cutoff > 0) {
NumpyMatrix dists = queryDatabase(ref_db_name, ref_db_name, ref_names,
ref_names, kmer_lengths, random_correct,
false, num_threads, use_gpu, device_id);
ref_names, kmer_lengths, random_correct,
false, num_threads, use_gpu, device_id);

Eigen::VectorXf dummy_query_ref;
Eigen::VectorXf dummy_query_query;
NumpyMatrix long_form = long_to_square(dists.col(dist_col), dummy_query_ref,
dummy_query_query, num_threads);
sparse_dists = sparsify_dists(long_form, dist_cutoff, kNN);
dummy_query_query, num_threads);
sparse_dists =
sparsify_dists_by_threshold(long_form, dist_cutoff, num_threads);
} else {
#ifdef GPU_AVAILABLE
if (use_gpu) {
sparse_dists = query_db_sparse_cuda(ref_sketches, kmer_lengths, random,
kNN, dist_col, device_id, num_threads);
sparse_dists =
query_db_sparse_cuda(ref_sketches, kmer_lengths, random, kNN,
dist_col, device_id, num_threads);
} else {
sparse_dists = query_db_sparse(ref_sketches, kmer_lengths, random,
jaccard, kNN, dist_col, num_threads);
jaccard, kNN, dist_col, num_threads);
}
#else
sparse_dists = query_db_sparse(ref_sketches, kmer_lengths, random,
jaccard, kNN, dist_col, num_threads);
sparse_dists = query_db_sparse(ref_sketches, kmer_lengths, random, jaccard,
kNN, dist_col, num_threads);
#endif
}

@@ -216,9 +219,11 @@ double jaccardDist(const std::string &db_name, const std::string &sample1,
}

// Wrapper which makes a ref to the python/numpy array
sparse_coo sparsifyDists(const Eigen::Ref<NumpyMatrix> &denseDists,
const float distCutoff, const unsigned long int kNN) {
sparse_coo coo_idx = sparsify_dists(denseDists, distCutoff, kNN);
sparse_coo sparsifyDistsByThreshold(const Eigen::Ref<NumpyMatrix> &denseDists,
const float distCutoff,
const size_t num_threads) {
sparse_coo coo_idx =
sparsify_dists_by_threshold(denseDists, distCutoff, num_threads);
return coo_idx;
}

@@ -245,12 +250,11 @@ PYBIND11_MODULE(pp_sketchlib, m) {
m.def("querySelfSparse", &querySelfSparse,
py::return_value_policy::reference_internal,
"Find self-self distances between sketches; return a sparse matrix",
py::arg("ref_db_name"), py::arg("rList"), py::arg("klist"), py::arg("random_correct") = true,
py::arg("jaccard") = false,
py::arg("ref_db_name"), py::arg("rList"), py::arg("klist"),
py::arg("random_correct") = true, py::arg("jaccard") = false,
py::arg("kNN") = 0, py::arg("dist_cutoff") = 0.0f,
py::arg("dist_col") = 0,
py::arg("num_threads") = 1, py::arg("use_gpu") = false,
py::arg("device_id") = 0);
py::arg("dist_col") = 0, py::arg("num_threads") = 1,
py::arg("use_gpu") = false, py::arg("device_id") = 0);

m.def("addRandom", &addRandomToDb,
"Add random match chances into older databases", py::arg("db_name"),
@@ -279,11 +283,11 @@ PYBIND11_MODULE(pp_sketchlib, m) {
py::arg("query_ref_distVec").noconvert(),
py::arg("query_query_distVec").noconvert(), py::arg("num_threads") = 1);

m.def("sparsifyDists", &sparsifyDists,
m.def("sparsifyDistsByThreshold", &sparsifyDistsByThreshold,
py::return_value_policy::reference_internal,
"Transform all distances into a sparse matrix",
py::arg("distMat").noconvert(), py::arg("distCutoff") = 0,
py::arg("kNN") = 0);
py::arg("num_threads") = 1);

m.attr("version") = VERSION_INFO;
m.attr("sketchVersion") = SKETCH_VERSION;
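With the bindings above, querySelfSparse now covers both sparsification modes: a positive dist_cutoff takes the dense path (queryDatabase, long_to_square, then sparsify_dists_by_threshold), while the default dist_cutoff of 0 with kNN set uses the sparse kNN query. A hedged usage sketch; the database prefix, sample names and k-mer lengths below are placeholders, not values from this diff:

import pp_sketchlib

db_name = "example_db"                     # hypothetical sketch database prefix
rList = ["sample1", "sample2", "sample3"]  # hypothetical sample names
db_kmers = [15, 19, 23, 27, 31]            # hypothetical k-mer lengths

# Mode 1: sparsify by distance threshold on the default distance column
sparse_cutoff = pp_sketchlib.querySelfSparse(ref_db_name=db_name, rList=rList,
                                             klist=db_kmers, dist_cutoff=0.05,
                                             num_threads=2)

# Mode 2: k nearest neighbours (dist_cutoff left at its default of 0)
sparse_knn = pp_sketchlib.querySelfSparse(ref_db_name=db_name, rList=rList,
                                          klist=db_kmers, kNN=3,
                                          num_threads=2)

# Both calls return (i, j, dist) vectors describing a sparse COO matrix
i_vec, j_vec, dist_vec = sparse_knn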
34 changes: 32 additions & 2 deletions test/test-dists.py
@@ -29,6 +29,36 @@ def iterDistRows(refSeqs, querySeqs, self=True):
for ref in refSeqs:
yield(ref, query)

def get_kNN_sparse_tuple(square_core_mat,kNN):
    i_vec = []
    j_vec = []
    dist_vec = []
    for i in range(0,square_core_mat.shape[0]):
        sorted_indices = np.argsort(square_core_mat[i,:])
        j_index = 0
        neighbour_count = 0
        while neighbour_count < kNN:
            if (sorted_indices[j_index] != i):
                i_vec.append(i)
                j_vec.append(sorted_indices[j_index])
                dist_vec.append(square_core_mat[i,sorted_indices[j_index]])
                neighbour_count = neighbour_count + 1
            j_index = j_index + 1
    sparse_knn = (i_vec,j_vec,dist_vec)
    return sparse_knn

def get_threshold_sparse_tuple(square_core_mat,threshold):
    i_vec = []
    j_vec = []
    dist_vec = []
    for i in range(0,square_core_mat.shape[0]):
        for j in range(i,square_core_mat.shape[1]):
            if square_core_mat[i,j] < threshold and i != j:
                i_vec.append(i)
                j_vec.append(j)
                dist_vec.append(square_core_mat[i,j])
    sparse_threshold = (i_vec,j_vec,dist_vec)
    return sparse_threshold


description = 'Run poppunk sketching/distances'
parser = argparse.ArgumentParser(description=description,
@@ -110,7 +140,7 @@ def iterDistRows(refSeqs, querySeqs, self=True):
rList=rList,
klist=db_kmers,
kNN=kNN)
sparse_knn = pp_sketchlib.sparsifyDists(distMat=square_core_mat, distCutoff=0, kNN=kNN)
sparse_knn = get_kNN_sparse_tuple(square_core_mat,kNN)
if (sparseDistMat[0] != sparse_knn[0] or
sparseDistMat[1] != sparse_knn[1]):
sys.stderr.write("Sparse distances (kNN) mismatching\n")
@@ -126,7 +156,7 @@ def iterDistRows(refSeqs, querySeqs, self=True):
rList=rList,
klist=db_kmers,
dist_cutoff=cutoff)
sparse_threshold = pp_sketchlib.sparsifyDists(distMat=square_core_mat, distCutoff=cutoff, kNN=0)
sparse_threshold = get_threshold_sparse_tuple(square_core_mat,cutoff)
if (sparseDistMat[0] != sparse_threshold[0] or
sparseDistMat[1] != sparse_threshold[1]):
sys.stderr.write("Sparse distances (cutoff) mismatching\n")
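The reference implementations added above, get_kNN_sparse_tuple and get_threshold_sparse_tuple, recompute the expected sparse tuples in pure Python instead of round-tripping through the old pp_sketchlib.sparsifyDists binding. A small illustration of what they return, assuming the two helpers are in scope and using a made-up 3x3 core distance matrix:

import numpy as np

square_core_mat = np.array([[0.0, 0.125, 0.5],
                            [0.125, 0.0, 0.25],
                            [0.5, 0.25, 0.0]])

# Upper-triangle entries strictly below the threshold, as (i, j, dist) lists
i_vec, j_vec, dist_vec = get_threshold_sparse_tuple(square_core_mat, 0.3)
# i_vec = [0, 1], j_vec = [1, 2], dist_vec = [0.125, 0.25]

# For each row, the kNN nearest off-diagonal entries (ordered by np.argsort)
i_vec, j_vec, dist_vec = get_kNN_sparse_tuple(square_core_mat, 1)
# i_vec = [0, 1, 2], j_vec = [1, 0, 1], dist_vec = [0.125, 0.125, 0.25]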
30 changes: 5 additions & 25 deletions test/test-matrix.py
@@ -9,10 +9,10 @@
from pp_sketch.matrix import sparsify
except ImportError as e:
from scipy.sparse import coo_matrix
def sparsify(distMat, cutoff, kNN, threads):
sparse_coordinates = pp_sketchlib.sparsifyDists(distMat=distMat,
distCutoff=cutoff,
kNN=kNN)
def sparsify(distMat, cutoff, threads):
sparse_coordinates = pp_sketchlib.sparsifyDistsByThreshold(distMat=distMat,
distCutoff=cutoff,
num_threads=threads)
sparse_scipy = coo_matrix((sparse_coordinates[2],
(sparse_coordinates[0], sparse_coordinates[1])),
shape=distMat.shape,
@@ -60,28 +60,8 @@ def check_res(res, expected):
check_res(pp_sketchlib.squareToLong(distMat=square1_res, num_threads=2), rr_mat)

# sparsification
sparse1 = sparsify(square2_res, cutoff=5, kNN=0, threads=2)
sparse1 = sparsify(square2_res, cutoff=5, threads=2)

sparse1_res = square2_res.copy()
sparse1_res[sparse1_res >= 5] = 0
check_res(sparse1.todense(), sparse1_res)

kNN = 2
sparse2 = sparsify(square2_res, cutoff=0, kNN=kNN, threads=2)

sparse2_res = square2_res.copy()
row_sort = np.argsort(sparse2_res, axis=1)
for i, row in enumerate(sparse2_res):
neighbours = 0
prev_val = 0
for j in row_sort[i, :]:
if i == j or row[j] == prev_val:
continue
else:
prev_val = row[j]
if neighbours >= kNN:
sparse2_res[i, j] = 0
else:
neighbours += 1

check_res(sparse2.todense(), sparse2_res)
