Skip to content

Commit

Permalink
Merge pull request bacpop#50 from johnlees/i49-remove-fns
Browse files Browse the repository at this point in the history
Remove the boundary code from the package
  • Loading branch information
johnlees authored Jan 30, 2021
2 parents 125e580 + 239edea commit 3cd498e
Show file tree
Hide file tree
Showing 9 changed files with 13 additions and 344 deletions.
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,6 @@ Available functions:
into a single square distance matrix (returns a numpy array).
- `sparsifyDists()` - Convert a square distance matrix into a sparse matrix, by applying a
distance threshold or number of nearest neighbours (returns a sparse COO matrix).
- `assignThreshold()` - Assign 2D core/accessory points either side of a decision boundary (returns a numpy vector).
- `thresholdIterate1D()` - Move an assignment boundary normal to two points by a vector of offsets, getting the network edges which are iteratively added.
- `thresholdIterate2d()` - Move an assignment boundary defined by y_max and a
vector of xmax, getting the network edges which are iteratively added.

### hdf5 database files

Expand Down
2 changes: 1 addition & 1 deletion pp_sketch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@

'''PopPUNK sketching functions'''

__version__ = '1.6.2'
__version__ = '1.6.3'
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,10 @@ def build_extension(self, ext):
env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
self.distribution.get_version())

target = os.getenv('SKETCHLIB_INSTALL', None)
if not os.path.exists(self.build_temp):
os.makedirs(self.build_temp)

target = os.getenv('SKETCHLIB_INSTALL', None)
if target == 'conda':
subprocess.check_call(['make', 'python', 'LIBLOC="' + os.environ['PREFIX'] + '"'], cwd=ext.sourcedir + '/src', env=env)
subprocess.check_call(['make', 'install_python', 'LIBLOC="' + os.environ['PREFIX'] + '"', 'PYTHON_LIB_PATH=' + extdir], cwd=ext.sourcedir + '/src', env=env)
Expand Down
3 changes: 1 addition & 2 deletions src/Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

CXXFLAGS+=-Wall -Wextra -std=c++14 -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fPIC
ifdef DEBUG
CXXFLAGS+= -O0 -g
Expand Down Expand Up @@ -40,7 +39,7 @@ endif
PYTHON_LIB = pp_sketchlib$(shell python3-config --extension-suffix)

# python specific options
python: CPPFLAGS += -DGPU_AVAILABLE -DPYTHON_EXT -DNDEBUG -Dpp_sketchlib_EXPORTS -DVERSION_INFO=\"1.6.2\" $(shell python3 -m pybind11 --includes)
python: CPPFLAGS += -DGPU_AVAILABLE -DPYTHON_EXT -DNDEBUG -Dpp_sketchlib_EXPORTS $(shell python3 -m pybind11 --includes)

PROGRAMS=sketch_test matrix_test read_test gpu_dist_test

Expand Down
19 changes: 0 additions & 19 deletions src/dist/matrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,3 @@ sparse_coo sparsify_dists(const NumpyMatrix &denseDists,
const float distCutoff,
const unsigned long int kNN,
const unsigned int num_threads = 1);

Eigen::VectorXf assign_threshold(const NumpyMatrix &distMat,
const int slope,
const float x_max,
const float y_max,
unsigned int num_threads);

network_coo threshold_iterate_1D(const NumpyMatrix &distMat,
const std::vector<double> &offsets,
const int slope,
const float x0,
const float y0,
const float x1,
const float y1,
const int num_threads = 1);

network_coo threshold_iterate_2D(const NumpyMatrix &distMat,
const std::vector<float> &x_max,
const float y_max);
158 changes: 0 additions & 158 deletions src/dist/matrix_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,161 +209,3 @@ Eigen::VectorXf square_to_long(const NumpyMatrix &squareDists,

return (longDists);
}

// Unnormalised (signed_ distance between a point (x0, y0) and a line defined
// by the two points (xmax, 0) and (0, ymax)
// Divide by 1/sqrt(xmax^2 + ymax^2) to get distance
inline float line_dist(const float x0,
const float y0,
const float x_max,
const float y_max,
const int slope)
{
float boundary_side = 0;
if (slope == 2)
{
boundary_side = y0 * x_max + x0 * y_max - x_max * y_max;
}
else if (slope == 0)
{
boundary_side = x0 - x_max;
}
else if (slope == 1)
{
boundary_side = y0 - y_max;
}

return boundary_side;
}

Eigen::VectorXf assign_threshold(const NumpyMatrix &distMat,
const int slope,
const float x_max,
const float y_max,
unsigned int num_threads)
{
Eigen::VectorXf boundary_test(distMat.rows());

#pragma omp parallel for schedule(static) num_threads(num_threads)
for (long row_idx = 0; row_idx < distMat.rows(); row_idx++)
{
float in_tri = line_dist(distMat(row_idx, 0), distMat(row_idx, 1),
x_max, y_max, slope);
float boundary_side;
if (in_tri == 0)
{
boundary_side = 0;
}
else if (in_tri > 0)
{
boundary_side = 1;
}
else
{
boundary_side = -1;
}
boundary_test[row_idx] = boundary_side;
}
return (boundary_test);
}

// Line defined between (x0, y0) and (x1, y1)
// Offset is distance along this line, starting at (x0, y0)
network_coo threshold_iterate_1D(const NumpyMatrix &distMat,
const std::vector<double> &offsets,
const int slope,
const float x0,
const float y0,
const float x1,
const float y1,
const int num_threads)
{
std::vector<long> i_vec;
std::vector<long> j_vec;
std::vector<long> offset_idx;
const float gradient = (y1 - y0) / (x1 - x0); // == tan(theta)
const size_t n_samples = rows_to_samples(distMat);

std::vector<float> boundary_dist(distMat.rows());
std::vector<long> boundary_order;
long sorted_idx = 0;
for (int offset_nr = 0; offset_nr < offsets.size(); ++offset_nr)
{
float x_intercept = x0 + offsets[offset_nr] * (1 / std::sqrt(1 + gradient));
float y_intercept = y0 + offsets[offset_nr] * (gradient / std::sqrt(1 + gradient));
float x_max, y_max;
if (slope == 2)
{
x_max = x_intercept + y_intercept * gradient;
y_max = y_intercept + x_intercept / gradient;
}
else if (slope == 0)
{
x_max = x_intercept;
y_max = 0;
}
else
{
x_max = 0;
y_max = y_intercept;
}

// printf("grad:%f xint:%f yint:%f x_max:%f y_max:%f\n",
// gradient, x_intercept, y_intercept, x_max, y_max);
// Calculate the distances and sort them on the first loop entry
if (offset_nr == 0)
{
#pragma omp parallel for schedule(static) num_threads(num_threads)
for (long row_idx = 0; row_idx < distMat.rows(); row_idx++)
{
boundary_dist[row_idx] = line_dist(distMat(row_idx, 0), distMat(row_idx, 1), x_max, y_max, slope);
}
boundary_order = sort_indexes(boundary_dist);
}

long row_idx = boundary_order[sorted_idx];
while (sorted_idx < boundary_order.size() &&
line_dist(distMat(row_idx, 0),
distMat(row_idx, 1),
x_max, y_max, slope) <= 0)
{
long i = calc_row_idx(row_idx, n_samples);
long j = calc_col_idx(row_idx, i, n_samples);
i_vec.push_back(i);
j_vec.push_back(j);
offset_idx.push_back(offset_nr);
sorted_idx++;
row_idx = boundary_order[sorted_idx];
}
}
return (std::make_tuple(i_vec, j_vec, offset_idx));
}

network_coo threshold_iterate_2D(const NumpyMatrix &distMat,
const std::vector<float> &x_max,
const float y_max)
{
std::vector<long> i_vec;
std::vector<long> j_vec;
std::vector<long> offset_idx;
const size_t n_samples = rows_to_samples(distMat);

for (int offset_nr = 0; offset_nr < x_max.size(); ++offset_nr)
{
for (long row_idx = 0; row_idx < distMat.rows(); row_idx++)
{
if (line_dist(distMat(row_idx, 0), distMat(row_idx, 1), x_max[offset_nr], y_max, 2) <= 0)
{
if (offset_nr == 0 || (line_dist(distMat(row_idx, 0), distMat(row_idx, 1), x_max[offset_nr - 1], y_max, 2) > 0))
{
long i = calc_row_idx(row_idx, n_samples);
long j = calc_col_idx(row_idx, i, n_samples);
i_vec.push_back(i);
j_vec.push_back(j);
offset_idx.push_back(offset_nr);
}
}
}
}
return (std::make_tuple(i_vec, j_vec, offset_idx));
}
70 changes: 3 additions & 67 deletions src/sketchlib_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ namespace py = pybind11;

#include <highfive/H5Exception.hpp>

#include "version.h"
#include "api.hpp"

NumpyMatrix longToSquare(const Eigen::Ref<Eigen::VectorXf> &distVec,
Expand Down Expand Up @@ -295,52 +296,6 @@ sparse_coo sparsifyDists(const Eigen::Ref<NumpyMatrix> &denseDists,
return coo_idx;
}

// Wrapper which makes a ref to the python/numpy array
Eigen::VectorXf assignThreshold(const Eigen::Ref<NumpyMatrix> &distMat,
const int slope,
const double x_max,
const double y_max,
const unsigned int num_threads = 1)
{
Eigen::VectorXf assigned = assign_threshold(distMat,
slope,
x_max,
y_max,
num_threads);
return (assigned);
}

network_coo thresholdIterate1D(const Eigen::Ref<NumpyMatrix> &distMat,
const std::vector<double> &offsets,
const int slope,
const double x0,
const double y0,
const double x1,
const double y1,
const int num_threads)
{
if (!std::is_sorted(offsets.begin(), offsets.end()))
{
throw std::runtime_error("Offsets to thresholdIterate1D must be sorted");
}
std::tuple<std::vector<long>, std::vector<long>, std::vector<long>> add_idx =
threshold_iterate_1D(distMat, offsets, slope, x0, y0, x1, y1, num_threads);
return (add_idx);
}

network_coo thresholdIterate2D(const Eigen::Ref<NumpyMatrix> &distMat,
const std::vector<float> &x_max,
const float y_max)
{
if (!std::is_sorted(x_max.begin(), x_max.end()))
{
throw std::runtime_error("x_max range to thresholdIterate2D must be sorted");
}
std::tuple<std::vector<long>, std::vector<long>, std::vector<long>> add_idx =
threshold_iterate_2D(distMat, x_max, y_max);
return (add_idx);
}

PYBIND11_MODULE(pp_sketchlib, m)
{
m.doc() = "Sketch implementation for PopPUNK";
Expand Down Expand Up @@ -421,27 +376,8 @@ PYBIND11_MODULE(pp_sketchlib, m)
py::arg("kNN") = 0,
py::arg("num_threads") = 1);

m.def("assignThreshold", &assignThreshold, py::return_value_policy::reference_internal, "Assign samples based on their relation to a 2D boundary",
py::arg("distMat").noconvert(),
py::arg("slope"),
py::arg("x_max"),
py::arg("y_max"),
py::arg("num_threads") = 1);

m.def("thresholdIterate1D", &thresholdIterate1D, py::return_value_policy::reference_internal, "Move a 2D boundary to grow a network by adding edges at each offset",
py::arg("distMat").noconvert(),
py::arg("offsets"),
py::arg("slope"),
py::arg("x0"),
py::arg("y0"),
py::arg("x1"),
py::arg("y1"),
py::arg("num_threads") = 1);

m.def("thresholdIterate2D", &thresholdIterate2D, py::return_value_policy::reference_internal, "Move a 2D boundary to grow a network by adding edges at each offset",
py::arg("distMat").noconvert(),
py::arg("x_max"),
py::arg("ymax"));
m.attr("version") = VERSION_INFO;
m.attr("sketchVersion") = SKETCH_VERSION;

// Exceptions
py::register_exception<HighFive::Exception>(m, "HDF5Exception");
Expand Down
6 changes: 6 additions & 0 deletions test/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@
subprocess.run("tar xf example_set.tar.bz2", shell=True, check=True)

# python tests

# module attributes
import pp_sketchlib
print(pp_sketchlib.version)
print(pp_sketchlib.sketchVersion)

# create sketches
sys.stderr.write("Sketch smoke test\n")
subprocess.run("poppunk_sketch --sketch --rfile references.txt --ref-db test_db --sketch-size 10000 --min-k 15 --k-step 4 --cpus 2", shell=True, check=True)
Expand Down
Loading

0 comments on commit 3cd498e

Please sign in to comment.