From 7fada0ed485e05413b5a4cc0211e8862df289c2c Mon Sep 17 00:00:00 2001 From: John Lees <lees.john6@gmail.com> Date: Tue, 26 Jan 2021 17:03:06 +0000 Subject: [PATCH 1/5] Remove the boundary code from the package Moved to PopPUNK See https://github.com/johnlees/PopPUNK/pull/146 --- pp_sketch/__init__.py | 2 +- src/dist/matrix.hpp | 19 ----- src/dist/matrix_ops.cpp | 158 ------------------------------------- src/sketchlib_bindings.cpp | 68 ---------------- test/test-matrix.py | 92 --------------------- 5 files changed, 1 insertion(+), 338 deletions(-) diff --git a/pp_sketch/__init__.py b/pp_sketch/__init__.py index 247b6c89..a7c744ff 100644 --- a/pp_sketch/__init__.py +++ b/pp_sketch/__init__.py @@ -3,4 +3,4 @@ '''PopPUNK sketching functions''' -__version__ = '1.6.2' +__version__ = '1.6.3' diff --git a/src/dist/matrix.hpp b/src/dist/matrix.hpp index 1d21758b..d5ab3bae 100644 --- a/src/dist/matrix.hpp +++ b/src/dist/matrix.hpp @@ -31,22 +31,3 @@ sparse_coo sparsify_dists(const NumpyMatrix &denseDists, const float distCutoff, const unsigned long int kNN, const unsigned int num_threads = 1); - -Eigen::VectorXf assign_threshold(const NumpyMatrix &distMat, - const int slope, - const float x_max, - const float y_max, - unsigned int num_threads); - -network_coo threshold_iterate_1D(const NumpyMatrix &distMat, - const std::vector<double> &offsets, - const int slope, - const float x0, - const float y0, - const float x1, - const float y1, - const int num_threads = 1); - -network_coo threshold_iterate_2D(const NumpyMatrix &distMat, - const std::vector<float> &x_max, - const float y_max); diff --git a/src/dist/matrix_ops.cpp b/src/dist/matrix_ops.cpp index c88d9843..b9567ccb 100644 --- a/src/dist/matrix_ops.cpp +++ b/src/dist/matrix_ops.cpp @@ -209,161 +209,3 @@ Eigen::VectorXf square_to_long(const NumpyMatrix &squareDists, return (longDists); } - -// Unnormalised (signed_ distance between a point (x0, y0) and a line defined -// by the two points (xmax, 0) and (0, ymax) -// Divide by 1/sqrt(xmax^2 + ymax^2) to get distance -inline float line_dist(const float x0, - const float y0, - const float x_max, - const float y_max, - const int slope) -{ - float boundary_side = 0; - if (slope == 2) - { - boundary_side = y0 * x_max + x0 * y_max - x_max * y_max; - } - else if (slope == 0) - { - boundary_side = x0 - x_max; - } - else if (slope == 1) - { - boundary_side = y0 - y_max; - } - - return boundary_side; -} - -Eigen::VectorXf assign_threshold(const NumpyMatrix &distMat, - const int slope, - const float x_max, - const float y_max, - unsigned int num_threads) -{ - Eigen::VectorXf boundary_test(distMat.rows()); - -#pragma omp parallel for schedule(static) num_threads(num_threads) - for (long row_idx = 0; row_idx < distMat.rows(); row_idx++) - { - float in_tri = line_dist(distMat(row_idx, 0), distMat(row_idx, 1), - x_max, y_max, slope); - float boundary_side; - if (in_tri == 0) - { - boundary_side = 0; - } - else if (in_tri > 0) - { - boundary_side = 1; - } - else - { - boundary_side = -1; - } - boundary_test[row_idx] = boundary_side; - } - return (boundary_test); -} - -// Line defined between (x0, y0) and (x1, y1) -// Offset is distance along this line, starting at (x0, y0) -network_coo threshold_iterate_1D(const NumpyMatrix &distMat, - const std::vector<double> &offsets, - const int slope, - const float x0, - const float y0, - const float x1, - const float y1, - const int num_threads) -{ - std::vector<long> i_vec; - std::vector<long> j_vec; - std::vector<long> offset_idx; - const float gradient = (y1 - y0) / (x1 - x0); // == tan(theta) - const size_t n_samples = rows_to_samples(distMat); - - std::vector<float> boundary_dist(distMat.rows()); - std::vector<long> boundary_order; - long sorted_idx = 0; - for (int offset_nr = 0; offset_nr < offsets.size(); ++offset_nr) - { - float x_intercept = x0 + offsets[offset_nr] * (1 / std::sqrt(1 + gradient)); - float y_intercept = y0 + offsets[offset_nr] * (gradient / std::sqrt(1 + gradient)); - float x_max, y_max; - if (slope == 2) - { - x_max = x_intercept + y_intercept * gradient; - y_max = y_intercept + x_intercept / gradient; - } - else if (slope == 0) - { - x_max = x_intercept; - y_max = 0; - } - else - { - x_max = 0; - y_max = y_intercept; - } - - // printf("grad:%f xint:%f yint:%f x_max:%f y_max:%f\n", - // gradient, x_intercept, y_intercept, x_max, y_max); - // Calculate the distances and sort them on the first loop entry - if (offset_nr == 0) - { -#pragma omp parallel for schedule(static) num_threads(num_threads) - for (long row_idx = 0; row_idx < distMat.rows(); row_idx++) - { - boundary_dist[row_idx] = line_dist(distMat(row_idx, 0), distMat(row_idx, 1), x_max, y_max, slope); - } - boundary_order = sort_indexes(boundary_dist); - } - - long row_idx = boundary_order[sorted_idx]; - while (sorted_idx < boundary_order.size() && - line_dist(distMat(row_idx, 0), - distMat(row_idx, 1), - x_max, y_max, slope) <= 0) - { - long i = calc_row_idx(row_idx, n_samples); - long j = calc_col_idx(row_idx, i, n_samples); - i_vec.push_back(i); - j_vec.push_back(j); - offset_idx.push_back(offset_nr); - sorted_idx++; - row_idx = boundary_order[sorted_idx]; - } - } - return (std::make_tuple(i_vec, j_vec, offset_idx)); -} - -network_coo threshold_iterate_2D(const NumpyMatrix &distMat, - const std::vector<float> &x_max, - const float y_max) -{ - std::vector<long> i_vec; - std::vector<long> j_vec; - std::vector<long> offset_idx; - const size_t n_samples = rows_to_samples(distMat); - - for (int offset_nr = 0; offset_nr < x_max.size(); ++offset_nr) - { - for (long row_idx = 0; row_idx < distMat.rows(); row_idx++) - { - if (line_dist(distMat(row_idx, 0), distMat(row_idx, 1), x_max[offset_nr], y_max, 2) <= 0) - { - if (offset_nr == 0 || (line_dist(distMat(row_idx, 0), distMat(row_idx, 1), x_max[offset_nr - 1], y_max, 2) > 0)) - { - long i = calc_row_idx(row_idx, n_samples); - long j = calc_col_idx(row_idx, i, n_samples); - i_vec.push_back(i); - j_vec.push_back(j); - offset_idx.push_back(offset_nr); - } - } - } - } - return (std::make_tuple(i_vec, j_vec, offset_idx)); -} diff --git a/src/sketchlib_bindings.cpp b/src/sketchlib_bindings.cpp index f2379323..4f80c321 100644 --- a/src/sketchlib_bindings.cpp +++ b/src/sketchlib_bindings.cpp @@ -295,52 +295,6 @@ sparse_coo sparsifyDists(const Eigen::Ref<NumpyMatrix> &denseDists, return coo_idx; } -// Wrapper which makes a ref to the python/numpy array -Eigen::VectorXf assignThreshold(const Eigen::Ref<NumpyMatrix> &distMat, - const int slope, - const double x_max, - const double y_max, - const unsigned int num_threads = 1) -{ - Eigen::VectorXf assigned = assign_threshold(distMat, - slope, - x_max, - y_max, - num_threads); - return (assigned); -} - -network_coo thresholdIterate1D(const Eigen::Ref<NumpyMatrix> &distMat, - const std::vector<double> &offsets, - const int slope, - const double x0, - const double y0, - const double x1, - const double y1, - const int num_threads) -{ - if (!std::is_sorted(offsets.begin(), offsets.end())) - { - throw std::runtime_error("Offsets to thresholdIterate1D must be sorted"); - } - std::tuple<std::vector<long>, std::vector<long>, std::vector<long>> add_idx = - threshold_iterate_1D(distMat, offsets, slope, x0, y0, x1, y1, num_threads); - return (add_idx); -} - -network_coo thresholdIterate2D(const Eigen::Ref<NumpyMatrix> &distMat, - const std::vector<float> &x_max, - const float y_max) -{ - if (!std::is_sorted(x_max.begin(), x_max.end())) - { - throw std::runtime_error("x_max range to thresholdIterate2D must be sorted"); - } - std::tuple<std::vector<long>, std::vector<long>, std::vector<long>> add_idx = - threshold_iterate_2D(distMat, x_max, y_max); - return (add_idx); -} - PYBIND11_MODULE(pp_sketchlib, m) { m.doc() = "Sketch implementation for PopPUNK"; @@ -421,28 +375,6 @@ PYBIND11_MODULE(pp_sketchlib, m) py::arg("kNN") = 0, py::arg("num_threads") = 1); - m.def("assignThreshold", &assignThreshold, py::return_value_policy::reference_internal, "Assign samples based on their relation to a 2D boundary", - py::arg("distMat").noconvert(), - py::arg("slope"), - py::arg("x_max"), - py::arg("y_max"), - py::arg("num_threads") = 1); - - m.def("thresholdIterate1D", &thresholdIterate1D, py::return_value_policy::reference_internal, "Move a 2D boundary to grow a network by adding edges at each offset", - py::arg("distMat").noconvert(), - py::arg("offsets"), - py::arg("slope"), - py::arg("x0"), - py::arg("y0"), - py::arg("x1"), - py::arg("y1"), - py::arg("num_threads") = 1); - - m.def("thresholdIterate2D", &thresholdIterate2D, py::return_value_policy::reference_internal, "Move a 2D boundary to grow a network by adding edges at each offset", - py::arg("distMat").noconvert(), - py::arg("x_max"), - py::arg("ymax")); - // Exceptions py::register_exception<HighFive::Exception>(m, "HDF5Exception"); } diff --git a/test/test-matrix.py b/test/test-matrix.py index 59cbf61b..fdc8eae2 100644 --- a/test/test-matrix.py +++ b/test/test-matrix.py @@ -25,22 +25,6 @@ def sparsify(distMat, cutoff, kNN, threads): return(sparse_scipy) -# Original PopPUNK function (with some improvements) -def withinBoundary(dists, x_max, y_max, slope=2): - boundary_test = np.ones((dists.shape[0])) - for row in range(boundary_test.size): - if slope == 2: - in_tri = dists[row, 1] * x_max + dists[row, 0] * y_max - x_max * y_max - elif slope == 0: - in_tri = dists[row, 0] - x_max - elif slope == 1: - in_tri = dists[row, 1] - y_max - if abs(in_tri) < np.finfo(np.float32).eps: - boundary_test[row] = 0 - elif in_tri < 0: - boundary_test[row] = -1 - return(boundary_test) - def check_res(res, expected): if (not np.all(res == expected)): print(res) @@ -73,82 +57,6 @@ def check_res(res, expected): check_res(pp_sketchlib.squareToLong(square1_res, 2), rr_mat) -# assigning -x = np.arange(0, 1, 0.1, dtype=np.float32) -y = np.arange(0, 1, 0.1, dtype=np.float32) -xv, yv = np.meshgrid(x, y) -distMat = np.hstack((xv.reshape(-1,1), yv.reshape(-1,1))) -assign0 = pp_sketchlib.assignThreshold(distMat, 0, 0.5, 0.5, 2) -assign1 = pp_sketchlib.assignThreshold(distMat, 1, 0.5, 0.5, 2) -assign2 = pp_sketchlib.assignThreshold(distMat, 2, 0.5, 0.5, 2) - -assign0_res = withinBoundary(distMat, 0.5, 0.5, 0) -assign1_res = withinBoundary(distMat, 0.5, 0.5, 1) -assign2_res = withinBoundary(distMat, 0.5, 0.5, 2) - -check_res(assign0, assign0_res) -check_res(assign1, assign1_res) -check_res(assign2, assign2_res) - -# move boundary 1D -# example is symmetrical at points (0.1, 0.1); (0.2, 0.2); (0.3, 0.3) -samples = 100 -distMat = np.random.rand(int(0.5 * samples * (samples - 1)), 2) -distMat = np.array(distMat, dtype = np.float32) -offsets = [x * sqrt(2) for x in [-0.1, 0.0, 0.1]] -i_vec, j_vec, idx_vec = pp_sketchlib.thresholdIterate1D(distMat, offsets, 2, 0.2, 0.2, 0.3, 0.3) -sketchlib_i = [] -sketchlib_j = [] -for offset_idx, offset in enumerate(offsets): - for i, j, idx in zip(i_vec, j_vec, idx_vec): - if idx > offset_idx: - break - elif idx == offset_idx: - sketchlib_i.append(i) - sketchlib_j.append(j) - - py_i = [] - py_j = [] - xmax = 0.4 + (2 * (offset/sqrt(2))) - assign = pp_sketchlib.assignThreshold(distMat, 2, xmax, xmax, 1) - dist_idx = 0 - for i in range(samples): - for j in range(i + 1, samples): - if assign[dist_idx] <= 0: - py_i.append(i) - py_j.append(j) - dist_idx += 1 - if set(zip(py_i, py_j)) != set(zip(sketchlib_i, sketchlib_j)): - raise RuntimeError("Threshold 1D iterate mismatch at offset " + str(offset)) - -# move boundary 2D -# example is for boundaries (0.1, 0.2); (0.2, 0.2); (0.3, 0.2) -offsets = [0.1, 0.2, 0.3] -y_max = 0.2 -i_vec, j_vec, idx_vec = pp_sketchlib.thresholdIterate2D(distMat, offsets, y_max) -sketchlib_i = [] -sketchlib_j = [] -for offset_idx, offset in enumerate(offsets): - for i, j, idx in zip(i_vec, j_vec, idx_vec): - if idx > offset_idx: - break - elif idx == offset_idx: - sketchlib_i.append(i) - sketchlib_j.append(j) - - py_i = [] - py_j = [] - assign = pp_sketchlib.assignThreshold(distMat, 2, offset, y_max, 1) - dist_idx = 0 - for i in range(samples): - for j in range(i + 1, samples): - if assign[dist_idx] <= 0: - py_i.append(i) - py_j.append(j) - dist_idx += 1 - if set(zip(py_i, py_j)) != set(zip(sketchlib_i, sketchlib_j)): - raise RuntimeError("Threshold 2D iterate mismatch at offset " + str(offset)) - # sparsification sparse1 = sparsify(square2_res, cutoff=5, kNN=0, threads=2) From 2f32be7090fea6c55bc5f61f36934fdf49086e7b Mon Sep 17 00:00:00 2001 From: John Lees <lees.john6@gmail.com> Date: Wed, 27 Jan 2021 12:05:20 +0000 Subject: [PATCH 2/5] Update library calls in README --- README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.md b/README.md index c6254fdb..bb8c2ae8 100644 --- a/README.md +++ b/README.md @@ -162,10 +162,6 @@ Available functions: into a single square distance matrix (returns a numpy array). - `sparsifyDists()` - Convert a square distance matrix into a sparse matrix, by applying a distance threshold or number of nearest neighbours (returns a sparse COO matrix). -- `assignThreshold()` - Assign 2D core/accessory points either side of a decision boundary (returns a numpy vector). -- `thresholdIterate1D()` - Move an assignment boundary normal to two points by a vector of offsets, getting the network edges which are iteratively added. -- `thresholdIterate2d()` - Move an assignment boundary defined by y_max and a -vector of xmax, getting the network edges which are iteratively added. ### hdf5 database files From 85f278d6d5856166e07b32e2226f175748bb3d87 Mon Sep 17 00:00:00 2001 From: John Lees <lees.john6@gmail.com> Date: Fri, 29 Jan 2021 14:34:42 +0000 Subject: [PATCH 3/5] Attempt to add single consistent version to module and cmd line --- CMakeLists.txt | 1 + setup.py | 4 +++- src/Makefile | 3 ++- src/sketchlib_bindings.cpp | 4 ++++ test/run_test.py | 6 ++++++ 5 files changed, 16 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c56afc03..e9bdb099 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,7 @@ IF(CMAKE_COMPILER_IS_GNUCC) set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0") ENDIF(CMAKE_COMPILER_IS_GNUCC) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVERSION_INFO=\"$ENV{SKETCHLIB_BUILD_VERSION}\"") if(UNIX AND NOT APPLE) if(CMAKE_CXX_COMPILER STREQUAL "icpc") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -fast -xCASCADELAKE -DMKL_ILP64 -m64 -static-intel") diff --git a/setup.py b/setup.py index 4e2c82f5..87d98f93 100644 --- a/setup.py +++ b/setup.py @@ -78,9 +78,11 @@ def build_extension(self, ext): env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''), self.distribution.get_version()) - target = os.getenv('SKETCHLIB_INSTALL', None) if not os.path.exists(self.build_temp): os.makedirs(self.build_temp) + + os.environ['SKETCHLIB_BUILD_VERSION'] = find_version(ext.sourcedir + "/pp_sketch/__init__.py") + target = os.getenv('SKETCHLIB_INSTALL', None) if target == 'conda': subprocess.check_call(['make', 'python', 'LIBLOC="' + os.environ['PREFIX'] + '"'], cwd=ext.sourcedir + '/src', env=env) subprocess.check_call(['make', 'install_python', 'LIBLOC="' + os.environ['PREFIX'] + '"', 'PYTHON_LIB_PATH=' + extdir], cwd=ext.sourcedir + '/src', env=env) diff --git a/src/Makefile b/src/Makefile index e1ec37c0..5ebaec4c 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,3 +1,4 @@ +VERSION = ${SKETCHLIB_BUILD_VERSION} CXXFLAGS+=-Wall -Wextra -std=c++14 -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fPIC ifdef DEBUG @@ -40,7 +41,7 @@ endif PYTHON_LIB = pp_sketchlib$(shell python3-config --extension-suffix) # python specific options -python: CPPFLAGS += -DGPU_AVAILABLE -DPYTHON_EXT -DNDEBUG -Dpp_sketchlib_EXPORTS -DVERSION_INFO=\"1.6.2\" $(shell python3 -m pybind11 --includes) +python: CPPFLAGS += -DGPU_AVAILABLE -DPYTHON_EXT -DNDEBUG -Dpp_sketchlib_EXPORTS -DVERSION_INFO=\"$(VERSION)" $(shell python3 -m pybind11 --includes) PROGRAMS=sketch_test matrix_test read_test gpu_dist_test diff --git a/src/sketchlib_bindings.cpp b/src/sketchlib_bindings.cpp index 4f80c321..93d2b950 100644 --- a/src/sketchlib_bindings.cpp +++ b/src/sketchlib_bindings.cpp @@ -13,6 +13,7 @@ namespace py = pybind11; #include <highfive/H5Exception.hpp> +#include "version.h" #include "api.hpp" NumpyMatrix longToSquare(const Eigen::Ref<Eigen::VectorXf> &distVec, @@ -375,6 +376,9 @@ PYBIND11_MODULE(pp_sketchlib, m) py::arg("kNN") = 0, py::arg("num_threads") = 1); + m.attr("version") = VERSION_INFO; + m.attr("sketchVersion") = SKETCH_VERSION; + // Exceptions py::register_exception<HighFive::Exception>(m, "HDF5Exception"); } diff --git a/test/run_test.py b/test/run_test.py index 7a3d9379..add8063e 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -23,6 +23,12 @@ subprocess.run("tar xf example_set.tar.bz2", shell=True, check=True) # python tests + +# module attributes +import pp_sketchlib +print(pp_sketchlib.version) +print(pp_sketchlib.sketchVersion) + # create sketches sys.stderr.write("Sketch smoke test\n") subprocess.run("poppunk_sketch --sketch --rfile references.txt --ref-db test_db --sketch-size 10000 --min-k 15 --k-step 4 --cpus 2", shell=True, check=True) From 0372c62af6fc835137a3c657b06fccdf7a1a01c6 Mon Sep 17 00:00:00 2001 From: John Lees <lees.john6@gmail.com> Date: Fri, 29 Jan 2021 16:00:59 +0000 Subject: [PATCH 4/5] Remove double definition of version in CMakeLists.txt --- CMakeLists.txt | 1 - setup.py | 1 - 2 files changed, 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e9bdb099..c56afc03 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,6 @@ IF(CMAKE_COMPILER_IS_GNUCC) set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0") ENDIF(CMAKE_COMPILER_IS_GNUCC) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVERSION_INFO=\"$ENV{SKETCHLIB_BUILD_VERSION}\"") if(UNIX AND NOT APPLE) if(CMAKE_CXX_COMPILER STREQUAL "icpc") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -fast -xCASCADELAKE -DMKL_ILP64 -m64 -static-intel") diff --git a/setup.py b/setup.py index 87d98f93..13d21073 100644 --- a/setup.py +++ b/setup.py @@ -81,7 +81,6 @@ def build_extension(self, ext): if not os.path.exists(self.build_temp): os.makedirs(self.build_temp) - os.environ['SKETCHLIB_BUILD_VERSION'] = find_version(ext.sourcedir + "/pp_sketch/__init__.py") target = os.getenv('SKETCHLIB_INSTALL', None) if target == 'conda': subprocess.check_call(['make', 'python', 'LIBLOC="' + os.environ['PREFIX'] + '"'], cwd=ext.sourcedir + '/src', env=env) From 239edea879c54d7671ea8586b67ee282f3193f04 Mon Sep 17 00:00:00 2001 From: John Lees <lees.john6@gmail.com> Date: Fri, 29 Jan 2021 17:13:17 +0000 Subject: [PATCH 5/5] Remove unnecessary version define from Makefile --- src/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Makefile b/src/Makefile index 5ebaec4c..5f7ecb85 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,3 @@ -VERSION = ${SKETCHLIB_BUILD_VERSION} - CXXFLAGS+=-Wall -Wextra -std=c++14 -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fPIC ifdef DEBUG CXXFLAGS+= -O0 -g @@ -41,7 +39,7 @@ endif PYTHON_LIB = pp_sketchlib$(shell python3-config --extension-suffix) # python specific options -python: CPPFLAGS += -DGPU_AVAILABLE -DPYTHON_EXT -DNDEBUG -Dpp_sketchlib_EXPORTS -DVERSION_INFO=\"$(VERSION)" $(shell python3 -m pybind11 --includes) +python: CPPFLAGS += -DGPU_AVAILABLE -DPYTHON_EXT -DNDEBUG -Dpp_sketchlib_EXPORTS $(shell python3 -m pybind11 --includes) PROGRAMS=sketch_test matrix_test read_test gpu_dist_test