From 0e15a2d6d3bf1110cf69deb64e8e9818c8250a29 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 18 Oct 2022 17:00:06 -0400 Subject: [PATCH 1/7] Adding fused l2 nn argmin primitive for pylibraft --- cpp/CMakeLists.txt | 1 + .../raft_distance/fused_l2_min_arg.hpp | 60 +++++++ cpp/src/distance/fused_l2_min_arg.cu | 117 ++++++++++++++ cpp/test/util/fast_int_div.cu | 114 ++++++++++++++ .../pylibraft/distance/CMakeLists.txt | 3 +- .../pylibraft/pylibraft/distance/__init__.py | 1 + .../pylibraft/distance/fused_l2_nn.pyx | 149 ++++++++++++++++++ .../pylibraft/test/test_fused_l2_argmin.py | 51 ++++++ python/pylibraft/pylibraft/testing/utils.py | 1 + 9 files changed, 496 insertions(+), 1 deletion(-) create mode 100644 cpp/include/raft_distance/fused_l2_min_arg.hpp create mode 100644 cpp/src/distance/fused_l2_min_arg.cu create mode 100644 cpp/test/util/fast_int_div.cu create mode 100644 python/pylibraft/pylibraft/distance/fused_l2_nn.pyx create mode 100644 python/pylibraft/pylibraft/test/test_fused_l2_argmin.py diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 12bebfa2a5..c15020174b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -244,6 +244,7 @@ set_target_properties(raft_distance PROPERTIES EXPORT_NAME distance) if(RAFT_COMPILE_DIST_LIBRARY) add_library(raft_distance_lib src/distance/pairwise_distance.cu + src/distance/fused_l2_min_arg.cu src/distance/specializations/detail/canberra.cu src/distance/specializations/detail/chebyshev.cu src/distance/specializations/detail/correlation.cu diff --git a/cpp/include/raft_distance/fused_l2_min_arg.hpp b/cpp/include/raft_distance/fused_l2_min_arg.hpp new file mode 100644 index 0000000000..f289865aa2 --- /dev/null +++ b/cpp/include/raft_distance/fused_l2_min_arg.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace raft::distance::runtime { + /** + * @brief Wrapper around fusedL2NN with minimum reduction operators. + * + * fusedL2NN cannot be compiled in the distance library due to the lambda + * operators, so this wrapper covers the most common case (minimum). + * This should be preferred to the more generic API when possible, in order to + * reduce compilation times for users of the shared library. + * @param[in] handle raft handle + * @param[out] min will contain the reduced output (Length = `m`) + * (on device) + * @param[in] x first matrix. Row major. Dim = `m x k`. + * (on device). + * @param[in] y second matrix. Row major. Dim = `n x k`. + * (on device). + * @param[in] m gemm m + * @param[in] n gemm n + * @param[in] k gemm k + */ + void fused_l2_nn_min_arg( + raft::handle_t const& handle, + int* min, + const float* x, + const float* y, + int m, + int n, + int k, + bool sqrt); + + void fused_l2_nn_min_arg( + raft::handle_t const& handle, + int* min, + const double* x, + const double* y, + int m, + int n, + int k, + bool sqrt); + + +} // end namespace raft::distance::runtime \ No newline at end of file diff --git a/cpp/src/distance/fused_l2_min_arg.cu b/cpp/src/distance/fused_l2_min_arg.cu new file mode 100644 index 0000000000..7e96bf9555 --- /dev/null +++ b/cpp/src/distance/fused_l2_min_arg.cu @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::distance::runtime { + + template + struct KeyValueIndexOp { + __host__ __device__ __forceinline__ IndexT + operator()(const raft::KeyValuePair& a) const + { + printf("%d, %f\n", a.key, a.value); + return a.key; + } + }; + +template + void compute_fused_l2_nn_min_arg( + raft::handle_t const& handle, + idx_t* min, + const value_t* x, + const value_t* y, + idx_t m, + idx_t n, + idx_t k, + bool sqrt) { + rmm::device_uvector workspace(m, handle.get_stream()); + auto kvp = raft::make_device_vector>(handle, m); + + rmm::device_uvector x_norms(m, handle.get_stream()); + rmm::device_uvector y_norms(n, handle.get_stream()); + raft::linalg::rowNorm(x_norms.data(), x, k, m, raft::linalg::L2Norm, true, handle.get_stream()); + raft::linalg::rowNorm(y_norms.data(), y, k, n, raft::linalg::L2Norm, true, handle.get_stream()); + + fusedL2NNMinReduce(kvp.data_handle(), x, y, x_norms.data(), y_norms.data(), m, n, k, (void*)workspace.data(), sqrt, true, handle.get_stream()); + + raft::print_device_vector("x", x, m*k, std::cout); + raft::print_device_vector("y", y, n*k, std::cout); + + raft::print_device_vector("x_norms", x_norms.data(), m, std::cout); + raft::print_device_vector("y_norms", y_norms.data(), n, std::cout); + + KeyValueIndexOp conversion_op; + thrust::transform(handle.get_thrust_policy(), kvp.data_handle(), kvp.data_handle()+m, min, conversion_op); + handle.sync_stream(); + raft::print_device_vector("min", min, m, std::cout); + } + + /** + * @brief Wrapper around fusedL2NN with minimum reduction operators. + * + * fusedL2NN cannot be compiled in the distance library due to the lambda + * operators, so this wrapper covers the most common case (minimum). + * This should be preferred to the more generic API when possible, in order to + * reduce compilation times for users of the shared library. + * @param[in] handle raft handle + * @param[out] min will contain the reduced output (Length = `m`) + * (on device) + * @param[in] x first matrix. Row major. Dim = `m x k`. + * (on device). + * @param[in] y second matrix. Row major. Dim = `n x k`. + * (on device). + * @param[in] xn L2 squared norm of `x`. Length = `m`. (on device). + * @param[in] yn L2 squared norm of `y`. Length = `n`. (on device) + * @param[in] m gemm m + * @param[in] n gemm n + * @param[in] k gemm k + */ + void fused_l2_nn_min_arg( + raft::handle_t const& handle, + int* min, + const float* x, + const float* y, + int m, + int n, + int k, + bool sqrt) { + + compute_fused_l2_nn_min_arg(handle, min, x, y, m, n, k, sqrt); + } + + void fused_l2_nn_min_arg( + raft::handle_t const& handle, + int* min, + const double* x, + const double* y, + int m, + int n, + int k, + bool sqrt) { + + compute_fused_l2_nn_min_arg(handle, min, x, y, m, n, k, sqrt); +} + + +} // end namespace raft::distance::runtime \ No newline at end of file diff --git a/cpp/test/util/fast_int_div.cu b/cpp/test/util/fast_int_div.cu new file mode 100644 index 0000000000..2649bb7445 --- /dev/null +++ b/cpp/test/util/fast_int_div.cu @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include +#include + +#include + +#include + +namespace raft::util { + + TEST(FastIntDiv, CpuTest) +{ + for (int i = 0; i < 100; ++i) { + // get a positive divisor + int divisor; + do { + divisor = rand(); +} while (divisor <= 0); +FastIntDiv fid(divisor); +// run it against a few random numbers and compare the outputs +for (int i = 0; i < 10000; ++i) { +auto num = rand(); +auto correct = num / divisor; +auto computed = num / fid; +ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; +num = rand(); +correct = num % divisor; +computed = num % fid; +ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; +num = -num; +correct = num / divisor; +computed = num / fid; +ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; +num = rand(); +correct = num % divisor; +computed = num % fid; +ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; +} +} +} + +__global__ void fastIntDivTestKernel( + int* computed, int* correct, const int* in, FastIntDiv fid, int divisor, int len) +{ + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < len) { + computed[tid] = in[tid] % fid; + correct[tid] = in[tid] % divisor; + computed[len + tid] = -in[tid] % fid; + correct[len + tid] = -in[tid] % divisor; + } +} + +TEST(FastIntDiv, GpuTest) +{ +cudaStream_t stream = 0; +RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + +static const int len = 100000; +static const int TPB = 128; +rmm::device_uvector computed(len * 2, stream); +rmm::device_uvector correct(len * 2, stream); +rmm::device_uvector in(len, stream); +for (int i = 0; i < 100; ++i) { +// get a positive divisor +int divisor; +do { +divisor = rand(); +} while (divisor <= 0); +FastIntDiv fid(divisor); +// run it against a few random numbers and compare the outputs +std::vector h_in(len); +for (int i = 0; i < len; ++i) { +h_in[i] = rand(); +} +raft::update_device(in.data(), h_in.data(), len, stream); +int nblks = raft::ceildiv(len, TPB); +fastIntDivTestKernel<<>>( + computed.data(), correct.data(), in.data(), fid, divisor, len); +RAFT_CUDA_TRY(cudaStreamSynchronize(0)); +ASSERT_TRUE(devArrMatch(correct.data(), computed.data(), len * 2, raft::Compare())) +<< " divisor=" << divisor; +} +} + +FastIntDiv dummyFunc(int num) +{ + FastIntDiv fd(num); + return fd; +} + +TEST(FastIntDiv, IncorrectUsage) +{ +ASSERT_THROW(dummyFunc(-1), raft::exception); +ASSERT_THROW(dummyFunc(0), raft::exception); +} + +} // namespace raft::util diff --git a/python/pylibraft/pylibraft/distance/CMakeLists.txt b/python/pylibraft/pylibraft/distance/CMakeLists.txt index 707ea737b3..d074171e58 100644 --- a/python/pylibraft/pylibraft/distance/CMakeLists.txt +++ b/python/pylibraft/pylibraft/distance/CMakeLists.txt @@ -13,7 +13,8 @@ # ============================================================================= # Set the list of Cython files to build -set(cython_sources pairwise_distance.pyx) +set(cython_sources pairwise_distance.pyx + fused_l2_nn.pyx) set(linked_libraries raft::raft raft::distance) # Build all of the Cython targets diff --git a/python/pylibraft/pylibraft/distance/__init__.py b/python/pylibraft/pylibraft/distance/__init__.py index ca3e6c5a2e..a3c4e2229b 100644 --- a/python/pylibraft/pylibraft/distance/__init__.py +++ b/python/pylibraft/pylibraft/distance/__init__.py @@ -13,4 +13,5 @@ # limitations under the License. # +from .fused_l2_nn import fused_l2_nn_argmin from .pairwise_distance import distance as pairwise_distance \ No newline at end of file diff --git a/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx new file mode 100644 index 0000000000..629bc9c323 --- /dev/null +++ b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx @@ -0,0 +1,149 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +import numpy as np + +from libc.stdint cimport uintptr_t +from cython.operator cimport dereference as deref + +from libcpp cimport bool +from .distance_type cimport DistanceType +from pylibraft.common.handle cimport handle_t + + +def is_c_cont(cai, dt): + return "strides" not in cai or \ + cai["strides"] is None or \ + cai["strides"][1] == dt.itemsize + + +cdef extern from "raft_distance/fused_l2_min_arg.hpp" \ + namespace "raft::distance::runtime": + + void fused_l2_nn_min_arg( + const handle_t &handle, + int* min, + const float* x, + const float* y, + int m, + int n, + int k, + bool sqrt); + + void fused_l2_nn_min_arg( + const handle_t &handle, + int* min, + const double* x, + const double* y, + int m, + int n, + int k, + bool sqrt); + +def fused_l2_nn_argmin(X, Y, output, sqrt = True): + """ + Compute the 1-nearest neighbors between X and Y using the L2 distance + + Parameters + ---------- + + X : CUDA array interface compliant matrix shape (m, k) + Y : CUDA array interface compliant matrix shape (n, k) + output : Writable CUDA array interface matrix shape (m, 1) + + Examples + -------- + + .. code-block:: python + + import cupy as cp + + from pylibraft.distance import fused_l2_nn + + n_samples = 5000 + n_clusters = 5 + n_features = 50 + + in1 = cp.random.random_sample((n_samples, n_features), + dtype=cp.float32) + in2 = cp.random.random_sample((n_clusters, n_features), + dtype=cp.float32) + output = cp.empty((n_samples, 1), dtype=cp.int32) + + fused_l2_nn_argmin(in1, in2, output) + """ + + x_cai = X.__cuda_array_interface__ + y_cai = Y.__cuda_array_interface__ + output_cai = output.__cuda_array_interface__ + + m = x_cai["shape"][0] + n = y_cai["shape"][0] + + x_k = x_cai["shape"][1] + y_k = y_cai["shape"][1] + + if x_k != y_k: + raise ValueError("Inputs must have same number of columns. " + "a=%s, b=%s" % (x_k, y_k)) + + x_ptr = x_cai["data"][0] + y_ptr = y_cai["data"][0] + + d_ptr = output_cai["data"][0] + + cdef handle_t *h = new handle_t() + + x_dt = np.dtype(x_cai["typestr"]) + y_dt = np.dtype(y_cai["typestr"]) + d_dt = np.dtype(output_cai["typestr"]) + + x_c_contiguous = is_c_cont(x_cai, x_dt) + y_c_contiguous = is_c_cont(y_cai, y_dt) + + if x_c_contiguous != y_c_contiguous: + raise ValueError("Inputs must have matching strides") + + print(x_dt) + if x_dt != y_dt: + raise ValueError("Inputs must have the same dtypes") + if d_dt != np.int32: + raise ValueError("Output array must be int32") + + if x_dt == np.float32: + fused_l2_nn_min_arg(deref(h), + d_ptr, + x_ptr, + y_ptr, + m, + n, + x_k, + sqrt) + elif x_dt == np.float64: + fused_l2_nn_min_arg(deref(h), + d_ptr, + x_ptr, + y_ptr, + m, + n, + x_k, + sqrt) + else: + raise ValueError("dtype %s not supported" % x_dt) diff --git a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py new file mode 100644 index 0000000000..71026d1f11 --- /dev/null +++ b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from scipy.spatial.distance import cdist +import pytest +import numpy as np + +from pylibraft.distance import fused_l2_nn_argmin +from pylibraft.testing.utils import TestDeviceBuffer + + +@pytest.mark.parametrize("n_rows", [10]) +@pytest.mark.parametrize("n_clusters", [5]) +@pytest.mark.parametrize("n_cols", [3]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype): + input1 = np.random.random_sample((n_rows, n_cols)) + input1 = np.asarray(input1, order="C").astype(dtype) + + input2 = np.random.random_sample((n_clusters, n_cols)) + input2 = np.asarray(input2, order="C").astype(dtype) + + output = np.zeros((n_rows), dtype="int32") + expected = cdist(input1, input2, metric="euclidean") + + expected = expected.argmin(axis=1) + + input1_device = TestDeviceBuffer(input1, "C") + input2_device = TestDeviceBuffer(input2, "C") + output_device = TestDeviceBuffer(output, "C") + + fused_l2_nn_argmin(input1_device, input2_device, output_device, False) + + actual = output_device.copy_to_host() + + print(str(expected)) + + print(str(actual)) + assert np.allclose(expected, actual, rtol=1e-4) diff --git a/python/pylibraft/pylibraft/testing/utils.py b/python/pylibraft/pylibraft/testing/utils.py index 53115e991c..979fbb5672 100644 --- a/python/pylibraft/pylibraft/testing/utils.py +++ b/python/pylibraft/pylibraft/testing/utils.py @@ -21,6 +21,7 @@ class TestDeviceBuffer: def __init__(self, ndarray, order): + self.ndarray_ = ndarray self.device_buffer_ = \ rmm.DeviceBuffer.to_device(ndarray.ravel(order=order).tobytes()) From aaf75bb637b03804233e22972a0659a720fb5c4b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 18 Oct 2022 17:18:53 -0400 Subject: [PATCH 2/7] Adding a few more test caseds. --- python/pylibraft/pylibraft/test/test_fused_l2_argmin.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py index 71026d1f11..9b7fa16fe3 100644 --- a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py +++ b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py @@ -21,9 +21,9 @@ from pylibraft.testing.utils import TestDeviceBuffer -@pytest.mark.parametrize("n_rows", [10]) -@pytest.mark.parametrize("n_clusters", [5]) -@pytest.mark.parametrize("n_cols", [3]) +@pytest.mark.parametrize("n_rows", [10, 100]) +@pytest.mark.parametrize("n_clusters", [5, 10]) +@pytest.mark.parametrize("n_cols", [3, 5]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype): input1 = np.random.random_sample((n_rows, n_cols)) @@ -41,7 +41,7 @@ def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype): input2_device = TestDeviceBuffer(input2, "C") output_device = TestDeviceBuffer(output, "C") - fused_l2_nn_argmin(input1_device, input2_device, output_device, False) + fused_l2_nn_argmin(input1_device, input2_device, output_device, True) actual = output_device.copy_to_host() From b7a58355d8b28f0302f38725fc61cf038ee35a53 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 18 Oct 2022 17:22:00 -0400 Subject: [PATCH 3/7] Cleaning up style and prints --- .../raft_distance/fused_l2_min_arg.hpp | 39 +++-- cpp/src/distance/fused_l2_min_arg.cu | 135 +++++++++--------- .../pylibraft/test/test_fused_l2_argmin.py | 4 - 3 files changed, 86 insertions(+), 92 deletions(-) diff --git a/cpp/include/raft_distance/fused_l2_min_arg.hpp b/cpp/include/raft_distance/fused_l2_min_arg.hpp index f289865aa2..b882a6ef26 100644 --- a/cpp/include/raft_distance/fused_l2_min_arg.hpp +++ b/cpp/include/raft_distance/fused_l2_min_arg.hpp @@ -18,7 +18,7 @@ #include namespace raft::distance::runtime { - /** +/** * @brief Wrapper around fusedL2NN with minimum reduction operators. * * fusedL2NN cannot be compiled in the distance library due to the lambda @@ -36,25 +36,22 @@ namespace raft::distance::runtime { * @param[in] n gemm n * @param[in] k gemm k */ - void fused_l2_nn_min_arg( - raft::handle_t const& handle, - int* min, - const float* x, - const float* y, - int m, - int n, - int k, - bool sqrt); +void fused_l2_nn_min_arg(raft::handle_t const& handle, + int* min, + const float* x, + const float* y, + int m, + int n, + int k, + bool sqrt); - void fused_l2_nn_min_arg( - raft::handle_t const& handle, - int* min, - const double* x, - const double* y, - int m, - int n, - int k, - bool sqrt); +void fused_l2_nn_min_arg(raft::handle_t const& handle, + int* min, + const double* x, + const double* y, + int m, + int n, + int k, + bool sqrt); - -} // end namespace raft::distance::runtime \ No newline at end of file +} // end namespace raft::distance::runtime \ No newline at end of file diff --git a/cpp/src/distance/fused_l2_min_arg.cu b/cpp/src/distance/fused_l2_min_arg.cu index 7e96bf9555..49aedc47b2 100644 --- a/cpp/src/distance/fused_l2_min_arg.cu +++ b/cpp/src/distance/fused_l2_min_arg.cu @@ -14,60 +14,64 @@ * limitations under the License. */ -#include +#include +#include +#include #include +#include #include -#include #include #include -#include -#include namespace raft::distance::runtime { - template - struct KeyValueIndexOp { - __host__ __device__ __forceinline__ IndexT - operator()(const raft::KeyValuePair& a) const - { - printf("%d, %f\n", a.key, a.value); - return a.key; - } - }; +template +struct KeyValueIndexOp { + __host__ __device__ __forceinline__ IndexT + operator()(const raft::KeyValuePair& a) const + { + return a.key; + } +}; -template - void compute_fused_l2_nn_min_arg( - raft::handle_t const& handle, - idx_t* min, - const value_t* x, - const value_t* y, - idx_t m, - idx_t n, - idx_t k, - bool sqrt) { - rmm::device_uvector workspace(m, handle.get_stream()); - auto kvp = raft::make_device_vector>(handle, m); +template +void compute_fused_l2_nn_min_arg(raft::handle_t const& handle, + idx_t* min, + const value_t* x, + const value_t* y, + idx_t m, + idx_t n, + idx_t k, + bool sqrt) +{ + rmm::device_uvector workspace(m, handle.get_stream()); + auto kvp = raft::make_device_vector>(handle, m); - rmm::device_uvector x_norms(m, handle.get_stream()); - rmm::device_uvector y_norms(n, handle.get_stream()); - raft::linalg::rowNorm(x_norms.data(), x, k, m, raft::linalg::L2Norm, true, handle.get_stream()); - raft::linalg::rowNorm(y_norms.data(), y, k, n, raft::linalg::L2Norm, true, handle.get_stream()); + rmm::device_uvector x_norms(m, handle.get_stream()); + rmm::device_uvector y_norms(n, handle.get_stream()); + raft::linalg::rowNorm(x_norms.data(), x, k, m, raft::linalg::L2Norm, true, handle.get_stream()); + raft::linalg::rowNorm(y_norms.data(), y, k, n, raft::linalg::L2Norm, true, handle.get_stream()); - fusedL2NNMinReduce(kvp.data_handle(), x, y, x_norms.data(), y_norms.data(), m, n, k, (void*)workspace.data(), sqrt, true, handle.get_stream()); + fusedL2NNMinReduce(kvp.data_handle(), + x, + y, + x_norms.data(), + y_norms.data(), + m, + n, + k, + (void*)workspace.data(), + sqrt, + true, + handle.get_stream()); - raft::print_device_vector("x", x, m*k, std::cout); - raft::print_device_vector("y", y, n*k, std::cout); - - raft::print_device_vector("x_norms", x_norms.data(), m, std::cout); - raft::print_device_vector("y_norms", y_norms.data(), n, std::cout); - - KeyValueIndexOp conversion_op; - thrust::transform(handle.get_thrust_policy(), kvp.data_handle(), kvp.data_handle()+m, min, conversion_op); - handle.sync_stream(); - raft::print_device_vector("min", min, m, std::cout); - } + KeyValueIndexOp conversion_op; + thrust::transform( + handle.get_thrust_policy(), kvp.data_handle(), kvp.data_handle() + m, min, conversion_op); + handle.sync_stream(); +} - /** +/** * @brief Wrapper around fusedL2NN with minimum reduction operators. * * fusedL2NN cannot be compiled in the distance library due to the lambda @@ -87,31 +91,28 @@ template * @param[in] n gemm n * @param[in] k gemm k */ - void fused_l2_nn_min_arg( - raft::handle_t const& handle, - int* min, - const float* x, - const float* y, - int m, - int n, - int k, - bool sqrt) { - - compute_fused_l2_nn_min_arg(handle, min, x, y, m, n, k, sqrt); - } - - void fused_l2_nn_min_arg( - raft::handle_t const& handle, - int* min, - const double* x, - const double* y, - int m, - int n, - int k, - bool sqrt) { - - compute_fused_l2_nn_min_arg(handle, min, x, y, m, n, k, sqrt); +void fused_l2_nn_min_arg(raft::handle_t const& handle, + int* min, + const float* x, + const float* y, + int m, + int n, + int k, + bool sqrt) +{ + compute_fused_l2_nn_min_arg(handle, min, x, y, m, n, k, sqrt); } +void fused_l2_nn_min_arg(raft::handle_t const& handle, + int* min, + const double* x, + const double* y, + int m, + int n, + int k, + bool sqrt) +{ + compute_fused_l2_nn_min_arg(handle, min, x, y, m, n, k, sqrt); +} -} // end namespace raft::distance::runtime \ No newline at end of file +} // end namespace raft::distance::runtime \ No newline at end of file diff --git a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py index 9b7fa16fe3..b12cc30472 100644 --- a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py +++ b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py @@ -42,10 +42,6 @@ def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype): output_device = TestDeviceBuffer(output, "C") fused_l2_nn_argmin(input1_device, input2_device, output_device, True) - actual = output_device.copy_to_host() - print(str(expected)) - - print(str(actual)) assert np.allclose(expected, actual, rtol=1e-4) From e6731dd83c4700a2400fc2cf275e19edc87be9c0 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 18 Oct 2022 17:27:20 -0400 Subject: [PATCH 4/7] CLeaning up --- cpp/CMakeLists.txt | 2 +- .../raft_distance/fused_l2_min_arg.hpp | 5 +++-- cpp/src/distance/fused_l2_min_arg.cu | 20 ------------------- 3 files changed, 4 insertions(+), 23 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c15020174b..d8525b057d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -244,7 +244,7 @@ set_target_properties(raft_distance PROPERTIES EXPORT_NAME distance) if(RAFT_COMPILE_DIST_LIBRARY) add_library(raft_distance_lib src/distance/pairwise_distance.cu - src/distance/fused_l2_min_arg.cu + src/distance/fused_l2_min_arg.cu src/distance/specializations/detail/canberra.cu src/distance/specializations/detail/chebyshev.cu src/distance/specializations/detail/correlation.cu diff --git a/cpp/include/raft_distance/fused_l2_min_arg.hpp b/cpp/include/raft_distance/fused_l2_min_arg.hpp index b882a6ef26..f7d3748666 100644 --- a/cpp/include/raft_distance/fused_l2_min_arg.hpp +++ b/cpp/include/raft_distance/fused_l2_min_arg.hpp @@ -18,13 +18,13 @@ #include namespace raft::distance::runtime { + /** * @brief Wrapper around fusedL2NN with minimum reduction operators. * * fusedL2NN cannot be compiled in the distance library due to the lambda * operators, so this wrapper covers the most common case (minimum). - * This should be preferred to the more generic API when possible, in order to - * reduce compilation times for users of the shared library. + * * @param[in] handle raft handle * @param[out] min will contain the reduced output (Length = `m`) * (on device) @@ -35,6 +35,7 @@ namespace raft::distance::runtime { * @param[in] m gemm m * @param[in] n gemm n * @param[in] k gemm k + * @param[in] sqrt Whether the output `minDist` should contain L2-sqrt */ void fused_l2_nn_min_arg(raft::handle_t const& handle, int* min, diff --git a/cpp/src/distance/fused_l2_min_arg.cu b/cpp/src/distance/fused_l2_min_arg.cu index 49aedc47b2..c722b5a566 100644 --- a/cpp/src/distance/fused_l2_min_arg.cu +++ b/cpp/src/distance/fused_l2_min_arg.cu @@ -71,26 +71,6 @@ void compute_fused_l2_nn_min_arg(raft::handle_t const& handle, handle.sync_stream(); } -/** - * @brief Wrapper around fusedL2NN with minimum reduction operators. - * - * fusedL2NN cannot be compiled in the distance library due to the lambda - * operators, so this wrapper covers the most common case (minimum). - * This should be preferred to the more generic API when possible, in order to - * reduce compilation times for users of the shared library. - * @param[in] handle raft handle - * @param[out] min will contain the reduced output (Length = `m`) - * (on device) - * @param[in] x first matrix. Row major. Dim = `m x k`. - * (on device). - * @param[in] y second matrix. Row major. Dim = `n x k`. - * (on device). - * @param[in] xn L2 squared norm of `x`. Length = `m`. (on device). - * @param[in] yn L2 squared norm of `y`. Length = `n`. (on device) - * @param[in] m gemm m - * @param[in] n gemm n - * @param[in] k gemm k - */ void fused_l2_nn_min_arg(raft::handle_t const& handle, int* min, const float* x, From e9a86ec8a6a3b5da4707f8774fc83f4128661b22 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 18 Oct 2022 19:23:31 -0400 Subject: [PATCH 5/7] Style --- cpp/test/util/fast_int_div.cu | 130 +++++++++++++++++----------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/cpp/test/util/fast_int_div.cu b/cpp/test/util/fast_int_div.cu index 2649bb7445..1689724f9c 100644 --- a/cpp/test/util/fast_int_div.cu +++ b/cpp/test/util/fast_int_div.cu @@ -15,8 +15,8 @@ */ #include "../test_utils.h" -#include #include +#include #include @@ -24,91 +24,91 @@ namespace raft::util { - TEST(FastIntDiv, CpuTest) +TEST(FastIntDiv, CpuTest) { - for (int i = 0; i < 100; ++i) { + for (int i = 0; i < 100; ++i) { // get a positive divisor int divisor; do { - divisor = rand(); -} while (divisor <= 0); -FastIntDiv fid(divisor); -// run it against a few random numbers and compare the outputs -for (int i = 0; i < 10000; ++i) { -auto num = rand(); -auto correct = num / divisor; -auto computed = num / fid; -ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; -num = rand(); -correct = num % divisor; -computed = num % fid; -ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; -num = -num; -correct = num / divisor; -computed = num / fid; -ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; -num = rand(); -correct = num % divisor; -computed = num % fid; -ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; -} -} + divisor = rand(); + } while (divisor <= 0); + FastIntDiv fid(divisor); + // run it against a few random numbers and compare the outputs + for (int i = 0; i < 10000; ++i) { + auto num = rand(); + auto correct = num / divisor; + auto computed = num / fid; + ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; + num = rand(); + correct = num % divisor; + computed = num % fid; + ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; + num = -num; + correct = num / divisor; + computed = num / fid; + ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; + num = rand(); + correct = num % divisor; + computed = num % fid; + ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; + } + } } __global__ void fastIntDivTestKernel( - int* computed, int* correct, const int* in, FastIntDiv fid, int divisor, int len) + int* computed, int* correct, const int* in, FastIntDiv fid, int divisor, int len) { - auto tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid < len) { - computed[tid] = in[tid] % fid; - correct[tid] = in[tid] % divisor; - computed[len + tid] = -in[tid] % fid; - correct[len + tid] = -in[tid] % divisor; - } + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < len) { + computed[tid] = in[tid] % fid; + correct[tid] = in[tid] % divisor; + computed[len + tid] = -in[tid] % fid; + correct[len + tid] = -in[tid] % divisor; + } } TEST(FastIntDiv, GpuTest) { -cudaStream_t stream = 0; -RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + cudaStream_t stream = 0; + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); -static const int len = 100000; -static const int TPB = 128; -rmm::device_uvector computed(len * 2, stream); -rmm::device_uvector correct(len * 2, stream); -rmm::device_uvector in(len, stream); -for (int i = 0; i < 100; ++i) { -// get a positive divisor -int divisor; -do { -divisor = rand(); -} while (divisor <= 0); -FastIntDiv fid(divisor); -// run it against a few random numbers and compare the outputs -std::vector h_in(len); -for (int i = 0; i < len; ++i) { -h_in[i] = rand(); -} -raft::update_device(in.data(), h_in.data(), len, stream); -int nblks = raft::ceildiv(len, TPB); -fastIntDivTestKernel<<>>( - computed.data(), correct.data(), in.data(), fid, divisor, len); -RAFT_CUDA_TRY(cudaStreamSynchronize(0)); -ASSERT_TRUE(devArrMatch(correct.data(), computed.data(), len * 2, raft::Compare())) -<< " divisor=" << divisor; -} + static const int len = 100000; + static const int TPB = 128; + rmm::device_uvector computed(len * 2, stream); + rmm::device_uvector correct(len * 2, stream); + rmm::device_uvector in(len, stream); + for (int i = 0; i < 100; ++i) { + // get a positive divisor + int divisor; + do { + divisor = rand(); + } while (divisor <= 0); + FastIntDiv fid(divisor); + // run it against a few random numbers and compare the outputs + std::vector h_in(len); + for (int i = 0; i < len; ++i) { + h_in[i] = rand(); + } + raft::update_device(in.data(), h_in.data(), len, stream); + int nblks = raft::ceildiv(len, TPB); + fastIntDivTestKernel<<>>( + computed.data(), correct.data(), in.data(), fid, divisor, len); + RAFT_CUDA_TRY(cudaStreamSynchronize(0)); + ASSERT_TRUE(devArrMatch(correct.data(), computed.data(), len * 2, raft::Compare())) + << " divisor=" << divisor; + } } FastIntDiv dummyFunc(int num) { - FastIntDiv fd(num); - return fd; + FastIntDiv fd(num); + return fd; } TEST(FastIntDiv, IncorrectUsage) { -ASSERT_THROW(dummyFunc(-1), raft::exception); -ASSERT_THROW(dummyFunc(0), raft::exception); + ASSERT_THROW(dummyFunc(-1), raft::exception); + ASSERT_THROW(dummyFunc(0), raft::exception); } } // namespace raft::util From 7c3f50fb4a227f13dc5eec8b62a4812d6c7235a8 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 18 Oct 2022 19:24:26 -0400 Subject: [PATCH 6/7] Removing accidental checkin --- cpp/test/util/fast_int_div.cu | 114 ---------------------------------- 1 file changed, 114 deletions(-) delete mode 100644 cpp/test/util/fast_int_div.cu diff --git a/cpp/test/util/fast_int_div.cu b/cpp/test/util/fast_int_div.cu deleted file mode 100644 index 1689724f9c..0000000000 --- a/cpp/test/util/fast_int_div.cu +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "../test_utils.h" -#include -#include - -#include - -#include - -namespace raft::util { - -TEST(FastIntDiv, CpuTest) -{ - for (int i = 0; i < 100; ++i) { - // get a positive divisor - int divisor; - do { - divisor = rand(); - } while (divisor <= 0); - FastIntDiv fid(divisor); - // run it against a few random numbers and compare the outputs - for (int i = 0; i < 10000; ++i) { - auto num = rand(); - auto correct = num / divisor; - auto computed = num / fid; - ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; - num = rand(); - correct = num % divisor; - computed = num % fid; - ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; - num = -num; - correct = num / divisor; - computed = num / fid; - ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; - num = rand(); - correct = num % divisor; - computed = num % fid; - ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num; - } - } -} - -__global__ void fastIntDivTestKernel( - int* computed, int* correct, const int* in, FastIntDiv fid, int divisor, int len) -{ - auto tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid < len) { - computed[tid] = in[tid] % fid; - correct[tid] = in[tid] % divisor; - computed[len + tid] = -in[tid] % fid; - correct[len + tid] = -in[tid] % divisor; - } -} - -TEST(FastIntDiv, GpuTest) -{ - cudaStream_t stream = 0; - RAFT_CUDA_TRY(cudaStreamCreate(&stream)); - - static const int len = 100000; - static const int TPB = 128; - rmm::device_uvector computed(len * 2, stream); - rmm::device_uvector correct(len * 2, stream); - rmm::device_uvector in(len, stream); - for (int i = 0; i < 100; ++i) { - // get a positive divisor - int divisor; - do { - divisor = rand(); - } while (divisor <= 0); - FastIntDiv fid(divisor); - // run it against a few random numbers and compare the outputs - std::vector h_in(len); - for (int i = 0; i < len; ++i) { - h_in[i] = rand(); - } - raft::update_device(in.data(), h_in.data(), len, stream); - int nblks = raft::ceildiv(len, TPB); - fastIntDivTestKernel<<>>( - computed.data(), correct.data(), in.data(), fid, divisor, len); - RAFT_CUDA_TRY(cudaStreamSynchronize(0)); - ASSERT_TRUE(devArrMatch(correct.data(), computed.data(), len * 2, raft::Compare())) - << " divisor=" << divisor; - } -} - -FastIntDiv dummyFunc(int num) -{ - FastIntDiv fd(num); - return fd; -} - -TEST(FastIntDiv, IncorrectUsage) -{ - ASSERT_THROW(dummyFunc(-1), raft::exception); - ASSERT_THROW(dummyFunc(0), raft::exception); -} - -} // namespace raft::util From 6d2b9d0ca34c2c72a9505d0a4465514e9bb7f260 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 18 Oct 2022 20:10:55 -0400 Subject: [PATCH 7/7] Fixing flake8 style --- .../pylibraft/distance/fused_l2_nn.pyx | 65 ++++++++++--------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx index 629bc9c323..5fb837c114 100644 --- a/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx +++ b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx @@ -38,26 +38,27 @@ cdef extern from "raft_distance/fused_l2_min_arg.hpp" \ namespace "raft::distance::runtime": void fused_l2_nn_min_arg( - const handle_t &handle, - int* min, - const float* x, - const float* y, - int m, - int n, - int k, - bool sqrt); + const handle_t &handle, + int* min, + const float* x, + const float* y, + int m, + int n, + int k, + bool sqrt) void fused_l2_nn_min_arg( - const handle_t &handle, - int* min, - const double* x, - const double* y, - int m, - int n, - int k, - bool sqrt); - -def fused_l2_nn_argmin(X, Y, output, sqrt = True): + const handle_t &handle, + int* min, + const double* x, + const double* y, + int m, + int n, + int k, + bool sqrt) + + +def fused_l2_nn_argmin(X, Y, output, sqrt=True): """ Compute the 1-nearest neighbors between X and Y using the L2 distance @@ -129,21 +130,21 @@ def fused_l2_nn_argmin(X, Y, output, sqrt = True): if x_dt == np.float32: fused_l2_nn_min_arg(deref(h), - d_ptr, - x_ptr, - y_ptr, - m, - n, - x_k, - sqrt) + d_ptr, + x_ptr, + y_ptr, + m, + n, + x_k, + sqrt) elif x_dt == np.float64: fused_l2_nn_min_arg(deref(h), - d_ptr, - x_ptr, - y_ptr, - m, - n, - x_k, - sqrt) + d_ptr, + x_ptr, + y_ptr, + m, + n, + x_k, + sqrt) else: raise ValueError("dtype %s not supported" % x_dt)