From 0e15a2d6d3bf1110cf69deb64e8e9818c8250a29 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Tue, 18 Oct 2022 17:00:06 -0400
Subject: [PATCH 1/7] Adding fused l2 nn argmin primitive for pylibraft

---
 cpp/CMakeLists.txt                            |   1 +
 .../raft_distance/fused_l2_min_arg.hpp        |  60 +++++++
 cpp/src/distance/fused_l2_min_arg.cu          | 117 ++++++++++++++
 cpp/test/util/fast_int_div.cu                 | 114 ++++++++++++++
 .../pylibraft/distance/CMakeLists.txt         |   3 +-
 .../pylibraft/pylibraft/distance/__init__.py  |   1 +
 .../pylibraft/distance/fused_l2_nn.pyx        | 149 ++++++++++++++++++
 .../pylibraft/test/test_fused_l2_argmin.py    |  51 ++++++
 python/pylibraft/pylibraft/testing/utils.py   |   1 +
 9 files changed, 496 insertions(+), 1 deletion(-)
 create mode 100644 cpp/include/raft_distance/fused_l2_min_arg.hpp
 create mode 100644 cpp/src/distance/fused_l2_min_arg.cu
 create mode 100644 cpp/test/util/fast_int_div.cu
 create mode 100644 python/pylibraft/pylibraft/distance/fused_l2_nn.pyx
 create mode 100644 python/pylibraft/pylibraft/test/test_fused_l2_argmin.py

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 12bebfa2a5..c15020174b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -244,6 +244,7 @@ set_target_properties(raft_distance PROPERTIES EXPORT_NAME distance)
 if(RAFT_COMPILE_DIST_LIBRARY)
   add_library(raft_distance_lib
     src/distance/pairwise_distance.cu
+          src/distance/fused_l2_min_arg.cu
     src/distance/specializations/detail/canberra.cu
     src/distance/specializations/detail/chebyshev.cu
     src/distance/specializations/detail/correlation.cu
diff --git a/cpp/include/raft_distance/fused_l2_min_arg.hpp b/cpp/include/raft_distance/fused_l2_min_arg.hpp
new file mode 100644
index 0000000000..f289865aa2
--- /dev/null
+++ b/cpp/include/raft_distance/fused_l2_min_arg.hpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/handle.hpp>
+#include <raft/distance/distance_types.hpp>
+
+namespace raft::distance::runtime {
+    /**
+ * @brief Wrapper around fusedL2NN with minimum reduction operators.
+ *
+ * fusedL2NN cannot be compiled in the distance library due to the lambda
+ * operators, so this wrapper covers the most common case (minimum).
+ * This should be preferred to the more generic API when possible, in order to
+ * reduce compilation times for users of the shared library.
+ * @param[in] handle         raft handle
+ * @param[out] min           will contain the reduced output (Length = `m`)
+ *                           (on device)
+ * @param[in]  x             first matrix. Row major. Dim = `m x k`.
+ *                           (on device).
+ * @param[in]  y             second matrix. Row major. Dim = `n x k`.
+ *                           (on device).
+ * @param[in]  m             gemm m
+ * @param[in]  n             gemm n
+ * @param[in]  k             gemm k
+ */
+    void fused_l2_nn_min_arg(
+            raft::handle_t const& handle,
+            int* min,
+            const float* x,
+            const float* y,
+            int m,
+            int n,
+            int k,
+            bool sqrt);
+
+    void fused_l2_nn_min_arg(
+            raft::handle_t const& handle,
+            int* min,
+            const double* x,
+            const double* y,
+            int m,
+            int n,
+            int k,
+            bool sqrt);
+
+
+} // end namespace raft::distance::runtime
\ No newline at end of file
diff --git a/cpp/src/distance/fused_l2_min_arg.cu b/cpp/src/distance/fused_l2_min_arg.cu
new file mode 100644
index 0000000000..7e96bf9555
--- /dev/null
+++ b/cpp/src/distance/fused_l2_min_arg.cu
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/fused_l2_nn.cuh>
+#include <raft/distance/distance_types.hpp>
+#include <raft/distance/specializations.cuh>
+#include <raft/core/device_mdarray.hpp>
+#include <thrust/for_each.h>
+#include <thrust/tuple.h>
+#include <raft/core/kvp.hpp>
+#include <raft/core/handle.hpp>
+
+namespace raft::distance::runtime {
+
+    template <typename IndexT, typename DataT>
+    struct KeyValueIndexOp {
+        __host__ __device__ __forceinline__ IndexT
+        operator()(const raft::KeyValuePair<IndexT, DataT>& a) const
+        {
+            printf("%d, %f\n", a.key, a.value);
+            return a.key;
+        }
+    };
+
+template<typename value_t, typename idx_t>
+    void compute_fused_l2_nn_min_arg(
+            raft::handle_t const& handle,
+            idx_t* min,
+            const value_t* x,
+            const value_t* y,
+            idx_t m,
+            idx_t n,
+            idx_t k,
+            bool sqrt) {
+        rmm::device_uvector<int> workspace(m, handle.get_stream());
+        auto kvp = raft::make_device_vector<raft::KeyValuePair<idx_t, value_t>>(handle, m);
+
+        rmm::device_uvector<value_t> x_norms(m, handle.get_stream());
+        rmm::device_uvector<value_t> y_norms(n, handle.get_stream());
+        raft::linalg::rowNorm(x_norms.data(), x, k, m, raft::linalg::L2Norm, true, handle.get_stream());
+        raft::linalg::rowNorm(y_norms.data(), y, k, n, raft::linalg::L2Norm, true, handle.get_stream());
+
+        fusedL2NNMinReduce(kvp.data_handle(), x, y, x_norms.data(), y_norms.data(), m, n, k, (void*)workspace.data(), sqrt, true, handle.get_stream());
+
+        raft::print_device_vector("x", x, m*k, std::cout);
+        raft::print_device_vector("y", y, n*k, std::cout);
+
+    raft::print_device_vector("x_norms", x_norms.data(), m, std::cout);
+    raft::print_device_vector("y_norms", y_norms.data(), n, std::cout);
+
+    KeyValueIndexOp<idx_t, value_t> conversion_op;
+        thrust::transform(handle.get_thrust_policy(), kvp.data_handle(), kvp.data_handle()+m, min, conversion_op);
+        handle.sync_stream();
+        raft::print_device_vector("min", min, m, std::cout);
+    }
+
+    /**
+ * @brief Wrapper around fusedL2NN with minimum reduction operators.
+ *
+ * fusedL2NN cannot be compiled in the distance library due to the lambda
+ * operators, so this wrapper covers the most common case (minimum).
+ * This should be preferred to the more generic API when possible, in order to
+ * reduce compilation times for users of the shared library.
+ * @param[in] handle         raft handle
+ * @param[out] min           will contain the reduced output (Length = `m`)
+ *                           (on device)
+ * @param[in]  x             first matrix. Row major. Dim = `m x k`.
+ *                           (on device).
+ * @param[in]  y             second matrix. Row major. Dim = `n x k`.
+ *                           (on device).
+ * @param[in]  xn            L2 squared norm of `x`. Length = `m`. (on device).
+ * @param[in]  yn            L2 squared norm of `y`. Length = `n`. (on device)
+ * @param[in]  m             gemm m
+ * @param[in]  n             gemm n
+ * @param[in]  k             gemm k
+ */
+    void fused_l2_nn_min_arg(
+            raft::handle_t const& handle,
+            int* min,
+            const float* x,
+            const float* y,
+            int m,
+            int n,
+            int k,
+            bool sqrt) {
+
+        compute_fused_l2_nn_min_arg<float, int>(handle, min, x, y, m, n, k, sqrt);
+    }
+
+    void fused_l2_nn_min_arg(
+            raft::handle_t const& handle,
+            int* min,
+            const double* x,
+            const double* y,
+            int m,
+            int n,
+            int k,
+            bool sqrt) {
+
+        compute_fused_l2_nn_min_arg<double, int>(handle, min, x, y, m, n, k, sqrt);
+}
+
+
+} // end namespace raft::distance::runtime
\ No newline at end of file
diff --git a/cpp/test/util/fast_int_div.cu b/cpp/test/util/fast_int_div.cu
new file mode 100644
index 0000000000..2649bb7445
--- /dev/null
+++ b/cpp/test/util/fast_int_div.cu
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <raft/util/fast_int_div.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <gtest/gtest.h>
+
+namespace raft::util {
+
+    TEST(FastIntDiv, CpuTest)
+{
+    for (int i = 0; i < 100; ++i) {
+    // get a positive divisor
+    int divisor;
+    do {
+    divisor = rand();
+} while (divisor <= 0);
+FastIntDiv fid(divisor);
+// run it against a few random numbers and compare the outputs
+for (int i = 0; i < 10000; ++i) {
+auto num      = rand();
+auto correct  = num / divisor;
+auto computed = num / fid;
+ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
+num      = rand();
+correct  = num % divisor;
+computed = num % fid;
+ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
+num      = -num;
+correct  = num / divisor;
+computed = num / fid;
+ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
+num      = rand();
+correct  = num % divisor;
+computed = num % fid;
+ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
+}
+}
+}
+
+__global__ void fastIntDivTestKernel(
+        int* computed, int* correct, const int* in, FastIntDiv fid, int divisor, int len)
+{
+    auto tid = threadIdx.x + blockIdx.x * blockDim.x;
+    if (tid < len) {
+        computed[tid]       = in[tid] % fid;
+        correct[tid]        = in[tid] % divisor;
+        computed[len + tid] = -in[tid] % fid;
+        correct[len + tid]  = -in[tid] % divisor;
+    }
+}
+
+TEST(FastIntDiv, GpuTest)
+{
+cudaStream_t stream = 0;
+RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+
+static const int len = 100000;
+static const int TPB = 128;
+rmm::device_uvector<int> computed(len * 2, stream);
+rmm::device_uvector<int> correct(len * 2, stream);
+rmm::device_uvector<int> in(len, stream);
+for (int i = 0; i < 100; ++i) {
+// get a positive divisor
+int divisor;
+do {
+divisor = rand();
+} while (divisor <= 0);
+FastIntDiv fid(divisor);
+// run it against a few random numbers and compare the outputs
+std::vector<int> h_in(len);
+for (int i = 0; i < len; ++i) {
+h_in[i] = rand();
+}
+raft::update_device(in.data(), h_in.data(), len, stream);
+int nblks = raft::ceildiv(len, TPB);
+fastIntDivTestKernel<<<nblks, TPB, 0, 0>>>(
+        computed.data(), correct.data(), in.data(), fid, divisor, len);
+RAFT_CUDA_TRY(cudaStreamSynchronize(0));
+ASSERT_TRUE(devArrMatch(correct.data(), computed.data(), len * 2, raft::Compare<int>()))
+<< " divisor=" << divisor;
+}
+}
+
+FastIntDiv dummyFunc(int num)
+{
+    FastIntDiv fd(num);
+    return fd;
+}
+
+TEST(FastIntDiv, IncorrectUsage)
+{
+ASSERT_THROW(dummyFunc(-1), raft::exception);
+ASSERT_THROW(dummyFunc(0), raft::exception);
+}
+
+}  // namespace raft::util
diff --git a/python/pylibraft/pylibraft/distance/CMakeLists.txt b/python/pylibraft/pylibraft/distance/CMakeLists.txt
index 707ea737b3..d074171e58 100644
--- a/python/pylibraft/pylibraft/distance/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/distance/CMakeLists.txt
@@ -13,7 +13,8 @@
 # =============================================================================
 
 # Set the list of Cython files to build
-set(cython_sources pairwise_distance.pyx)
+set(cython_sources pairwise_distance.pyx
+                   fused_l2_nn.pyx)
 set(linked_libraries raft::raft raft::distance)
 
 # Build all of the Cython targets
diff --git a/python/pylibraft/pylibraft/distance/__init__.py b/python/pylibraft/pylibraft/distance/__init__.py
index ca3e6c5a2e..a3c4e2229b 100644
--- a/python/pylibraft/pylibraft/distance/__init__.py
+++ b/python/pylibraft/pylibraft/distance/__init__.py
@@ -13,4 +13,5 @@
 # limitations under the License.
 #
 
+from .fused_l2_nn import fused_l2_nn_argmin
 from .pairwise_distance import distance as pairwise_distance
\ No newline at end of file
diff --git a/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx
new file mode 100644
index 0000000000..629bc9c323
--- /dev/null
+++ b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx
@@ -0,0 +1,149 @@
+#
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+import numpy as np
+
+from libc.stdint cimport uintptr_t
+from cython.operator cimport dereference as deref
+
+from libcpp cimport bool
+from .distance_type cimport DistanceType
+from pylibraft.common.handle cimport handle_t
+
+
+def is_c_cont(cai, dt):
+    return "strides" not in cai or \
+        cai["strides"] is None or \
+        cai["strides"][1] == dt.itemsize
+
+
+cdef extern from "raft_distance/fused_l2_min_arg.hpp" \
+        namespace "raft::distance::runtime":
+
+    void fused_l2_nn_min_arg(
+            const handle_t &handle,
+            int* min,
+            const float* x,
+            const float* y,
+            int m,
+            int n,
+            int k,
+            bool sqrt);
+
+    void fused_l2_nn_min_arg(
+            const handle_t &handle,
+            int* min,
+            const double* x,
+            const double* y,
+            int m,
+            int n,
+            int k,
+            bool sqrt);
+
+def fused_l2_nn_argmin(X, Y, output, sqrt = True):
+    """
+    Compute the 1-nearest neighbors between X and Y using the L2 distance
+
+    Parameters
+    ----------
+
+    X : CUDA array interface compliant matrix shape (m, k)
+    Y : CUDA array interface compliant matrix shape (n, k)
+    output : Writable CUDA array interface matrix shape (m, 1)
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        import cupy as cp
+
+        from pylibraft.distance import fused_l2_nn
+
+        n_samples = 5000
+        n_clusters = 5
+        n_features = 50
+
+        in1 = cp.random.random_sample((n_samples, n_features),
+                                      dtype=cp.float32)
+        in2 = cp.random.random_sample((n_clusters, n_features),
+                                      dtype=cp.float32)
+        output = cp.empty((n_samples, 1), dtype=cp.int32)
+
+        fused_l2_nn_argmin(in1, in2, output)
+   """
+
+    x_cai = X.__cuda_array_interface__
+    y_cai = Y.__cuda_array_interface__
+    output_cai = output.__cuda_array_interface__
+
+    m = x_cai["shape"][0]
+    n = y_cai["shape"][0]
+
+    x_k = x_cai["shape"][1]
+    y_k = y_cai["shape"][1]
+
+    if x_k != y_k:
+        raise ValueError("Inputs must have same number of columns. "
+                         "a=%s, b=%s" % (x_k, y_k))
+
+    x_ptr = <uintptr_t>x_cai["data"][0]
+    y_ptr = <uintptr_t>y_cai["data"][0]
+
+    d_ptr = <uintptr_t>output_cai["data"][0]
+
+    cdef handle_t *h = new handle_t()
+
+    x_dt = np.dtype(x_cai["typestr"])
+    y_dt = np.dtype(y_cai["typestr"])
+    d_dt = np.dtype(output_cai["typestr"])
+
+    x_c_contiguous = is_c_cont(x_cai, x_dt)
+    y_c_contiguous = is_c_cont(y_cai, y_dt)
+
+    if x_c_contiguous != y_c_contiguous:
+        raise ValueError("Inputs must have matching strides")
+
+    print(x_dt)
+    if x_dt != y_dt:
+        raise ValueError("Inputs must have the same dtypes")
+    if d_dt != np.int32:
+        raise ValueError("Output array must be int32")
+
+    if x_dt == np.float32:
+        fused_l2_nn_min_arg(deref(h),
+                    <int*> d_ptr,
+                    <float*> x_ptr,
+                    <float*> y_ptr,
+                    <int>m,
+                    <int>n,
+                    <int>x_k,
+                    <bool>sqrt)
+    elif x_dt == np.float64:
+        fused_l2_nn_min_arg(deref(h),
+                    <int*> d_ptr,
+                    <double*> x_ptr,
+                    <double*> y_ptr,
+                    <int>m,
+                    <int>n,
+                    <int>x_k,
+                    <bool>sqrt)
+    else:
+        raise ValueError("dtype %s not supported" % x_dt)
diff --git a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
new file mode 100644
index 0000000000..71026d1f11
--- /dev/null
+++ b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from scipy.spatial.distance import cdist
+import pytest
+import numpy as np
+
+from pylibraft.distance import fused_l2_nn_argmin
+from pylibraft.testing.utils import TestDeviceBuffer
+
+
+@pytest.mark.parametrize("n_rows", [10])
+@pytest.mark.parametrize("n_clusters", [5])
+@pytest.mark.parametrize("n_cols", [3])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype):
+    input1 = np.random.random_sample((n_rows, n_cols))
+    input1 = np.asarray(input1, order="C").astype(dtype)
+
+    input2 = np.random.random_sample((n_clusters, n_cols))
+    input2 = np.asarray(input2, order="C").astype(dtype)
+
+    output = np.zeros((n_rows), dtype="int32")
+    expected = cdist(input1, input2, metric="euclidean")
+
+    expected = expected.argmin(axis=1)
+
+    input1_device = TestDeviceBuffer(input1, "C")
+    input2_device = TestDeviceBuffer(input2, "C")
+    output_device = TestDeviceBuffer(output, "C")
+
+    fused_l2_nn_argmin(input1_device, input2_device, output_device, False)
+
+    actual = output_device.copy_to_host()
+
+    print(str(expected))
+
+    print(str(actual))
+    assert np.allclose(expected, actual, rtol=1e-4)
diff --git a/python/pylibraft/pylibraft/testing/utils.py b/python/pylibraft/pylibraft/testing/utils.py
index 53115e991c..979fbb5672 100644
--- a/python/pylibraft/pylibraft/testing/utils.py
+++ b/python/pylibraft/pylibraft/testing/utils.py
@@ -21,6 +21,7 @@
 class TestDeviceBuffer:
 
     def __init__(self, ndarray, order):
+
         self.ndarray_ = ndarray
         self.device_buffer_ = \
             rmm.DeviceBuffer.to_device(ndarray.ravel(order=order).tobytes())

From aaf75bb637b03804233e22972a0659a720fb5c4b Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Tue, 18 Oct 2022 17:18:53 -0400
Subject: [PATCH 2/7] Adding a few more test caseds.

---
 python/pylibraft/pylibraft/test/test_fused_l2_argmin.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
index 71026d1f11..9b7fa16fe3 100644
--- a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
+++ b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
@@ -21,9 +21,9 @@
 from pylibraft.testing.utils import TestDeviceBuffer
 
 
-@pytest.mark.parametrize("n_rows", [10])
-@pytest.mark.parametrize("n_clusters", [5])
-@pytest.mark.parametrize("n_cols", [3])
+@pytest.mark.parametrize("n_rows", [10, 100])
+@pytest.mark.parametrize("n_clusters", [5, 10])
+@pytest.mark.parametrize("n_cols", [3, 5])
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
 def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype):
     input1 = np.random.random_sample((n_rows, n_cols))
@@ -41,7 +41,7 @@ def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype):
     input2_device = TestDeviceBuffer(input2, "C")
     output_device = TestDeviceBuffer(output, "C")
 
-    fused_l2_nn_argmin(input1_device, input2_device, output_device, False)
+    fused_l2_nn_argmin(input1_device, input2_device, output_device, True)
 
     actual = output_device.copy_to_host()
 

From b7a58355d8b28f0302f38725fc61cf038ee35a53 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Tue, 18 Oct 2022 17:22:00 -0400
Subject: [PATCH 3/7] Cleaning up style and prints

---
 .../raft_distance/fused_l2_min_arg.hpp        |  39 +++--
 cpp/src/distance/fused_l2_min_arg.cu          | 135 +++++++++---------
 .../pylibraft/test/test_fused_l2_argmin.py    |   4 -
 3 files changed, 86 insertions(+), 92 deletions(-)

diff --git a/cpp/include/raft_distance/fused_l2_min_arg.hpp b/cpp/include/raft_distance/fused_l2_min_arg.hpp
index f289865aa2..b882a6ef26 100644
--- a/cpp/include/raft_distance/fused_l2_min_arg.hpp
+++ b/cpp/include/raft_distance/fused_l2_min_arg.hpp
@@ -18,7 +18,7 @@
 #include <raft/distance/distance_types.hpp>
 
 namespace raft::distance::runtime {
-    /**
+/**
  * @brief Wrapper around fusedL2NN with minimum reduction operators.
  *
  * fusedL2NN cannot be compiled in the distance library due to the lambda
@@ -36,25 +36,22 @@ namespace raft::distance::runtime {
  * @param[in]  n             gemm n
  * @param[in]  k             gemm k
  */
-    void fused_l2_nn_min_arg(
-            raft::handle_t const& handle,
-            int* min,
-            const float* x,
-            const float* y,
-            int m,
-            int n,
-            int k,
-            bool sqrt);
+void fused_l2_nn_min_arg(raft::handle_t const& handle,
+                         int* min,
+                         const float* x,
+                         const float* y,
+                         int m,
+                         int n,
+                         int k,
+                         bool sqrt);
 
-    void fused_l2_nn_min_arg(
-            raft::handle_t const& handle,
-            int* min,
-            const double* x,
-            const double* y,
-            int m,
-            int n,
-            int k,
-            bool sqrt);
+void fused_l2_nn_min_arg(raft::handle_t const& handle,
+                         int* min,
+                         const double* x,
+                         const double* y,
+                         int m,
+                         int n,
+                         int k,
+                         bool sqrt);
 
-
-} // end namespace raft::distance::runtime
\ No newline at end of file
+}  // end namespace raft::distance::runtime
\ No newline at end of file
diff --git a/cpp/src/distance/fused_l2_min_arg.cu b/cpp/src/distance/fused_l2_min_arg.cu
index 7e96bf9555..49aedc47b2 100644
--- a/cpp/src/distance/fused_l2_min_arg.cu
+++ b/cpp/src/distance/fused_l2_min_arg.cu
@@ -14,60 +14,64 @@
  * limitations under the License.
  */
 
-#include <raft/distance/fused_l2_nn.cuh>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/handle.hpp>
+#include <raft/core/kvp.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/distance/fused_l2_nn.cuh>
 #include <raft/distance/specializations.cuh>
-#include <raft/core/device_mdarray.hpp>
 #include <thrust/for_each.h>
 #include <thrust/tuple.h>
-#include <raft/core/kvp.hpp>
-#include <raft/core/handle.hpp>
 
 namespace raft::distance::runtime {
 
-    template <typename IndexT, typename DataT>
-    struct KeyValueIndexOp {
-        __host__ __device__ __forceinline__ IndexT
-        operator()(const raft::KeyValuePair<IndexT, DataT>& a) const
-        {
-            printf("%d, %f\n", a.key, a.value);
-            return a.key;
-        }
-    };
+template <typename IndexT, typename DataT>
+struct KeyValueIndexOp {
+  __host__ __device__ __forceinline__ IndexT
+  operator()(const raft::KeyValuePair<IndexT, DataT>& a) const
+  {
+    return a.key;
+  }
+};
 
-template<typename value_t, typename idx_t>
-    void compute_fused_l2_nn_min_arg(
-            raft::handle_t const& handle,
-            idx_t* min,
-            const value_t* x,
-            const value_t* y,
-            idx_t m,
-            idx_t n,
-            idx_t k,
-            bool sqrt) {
-        rmm::device_uvector<int> workspace(m, handle.get_stream());
-        auto kvp = raft::make_device_vector<raft::KeyValuePair<idx_t, value_t>>(handle, m);
+template <typename value_t, typename idx_t>
+void compute_fused_l2_nn_min_arg(raft::handle_t const& handle,
+                                 idx_t* min,
+                                 const value_t* x,
+                                 const value_t* y,
+                                 idx_t m,
+                                 idx_t n,
+                                 idx_t k,
+                                 bool sqrt)
+{
+  rmm::device_uvector<int> workspace(m, handle.get_stream());
+  auto kvp = raft::make_device_vector<raft::KeyValuePair<idx_t, value_t>>(handle, m);
 
-        rmm::device_uvector<value_t> x_norms(m, handle.get_stream());
-        rmm::device_uvector<value_t> y_norms(n, handle.get_stream());
-        raft::linalg::rowNorm(x_norms.data(), x, k, m, raft::linalg::L2Norm, true, handle.get_stream());
-        raft::linalg::rowNorm(y_norms.data(), y, k, n, raft::linalg::L2Norm, true, handle.get_stream());
+  rmm::device_uvector<value_t> x_norms(m, handle.get_stream());
+  rmm::device_uvector<value_t> y_norms(n, handle.get_stream());
+  raft::linalg::rowNorm(x_norms.data(), x, k, m, raft::linalg::L2Norm, true, handle.get_stream());
+  raft::linalg::rowNorm(y_norms.data(), y, k, n, raft::linalg::L2Norm, true, handle.get_stream());
 
-        fusedL2NNMinReduce(kvp.data_handle(), x, y, x_norms.data(), y_norms.data(), m, n, k, (void*)workspace.data(), sqrt, true, handle.get_stream());
+  fusedL2NNMinReduce(kvp.data_handle(),
+                     x,
+                     y,
+                     x_norms.data(),
+                     y_norms.data(),
+                     m,
+                     n,
+                     k,
+                     (void*)workspace.data(),
+                     sqrt,
+                     true,
+                     handle.get_stream());
 
-        raft::print_device_vector("x", x, m*k, std::cout);
-        raft::print_device_vector("y", y, n*k, std::cout);
-
-    raft::print_device_vector("x_norms", x_norms.data(), m, std::cout);
-    raft::print_device_vector("y_norms", y_norms.data(), n, std::cout);
-
-    KeyValueIndexOp<idx_t, value_t> conversion_op;
-        thrust::transform(handle.get_thrust_policy(), kvp.data_handle(), kvp.data_handle()+m, min, conversion_op);
-        handle.sync_stream();
-        raft::print_device_vector("min", min, m, std::cout);
-    }
+  KeyValueIndexOp<idx_t, value_t> conversion_op;
+  thrust::transform(
+    handle.get_thrust_policy(), kvp.data_handle(), kvp.data_handle() + m, min, conversion_op);
+  handle.sync_stream();
+}
 
-    /**
+/**
  * @brief Wrapper around fusedL2NN with minimum reduction operators.
  *
  * fusedL2NN cannot be compiled in the distance library due to the lambda
@@ -87,31 +91,28 @@ template<typename value_t, typename idx_t>
  * @param[in]  n             gemm n
  * @param[in]  k             gemm k
  */
-    void fused_l2_nn_min_arg(
-            raft::handle_t const& handle,
-            int* min,
-            const float* x,
-            const float* y,
-            int m,
-            int n,
-            int k,
-            bool sqrt) {
-
-        compute_fused_l2_nn_min_arg<float, int>(handle, min, x, y, m, n, k, sqrt);
-    }
-
-    void fused_l2_nn_min_arg(
-            raft::handle_t const& handle,
-            int* min,
-            const double* x,
-            const double* y,
-            int m,
-            int n,
-            int k,
-            bool sqrt) {
-
-        compute_fused_l2_nn_min_arg<double, int>(handle, min, x, y, m, n, k, sqrt);
+void fused_l2_nn_min_arg(raft::handle_t const& handle,
+                         int* min,
+                         const float* x,
+                         const float* y,
+                         int m,
+                         int n,
+                         int k,
+                         bool sqrt)
+{
+  compute_fused_l2_nn_min_arg<float, int>(handle, min, x, y, m, n, k, sqrt);
 }
 
+void fused_l2_nn_min_arg(raft::handle_t const& handle,
+                         int* min,
+                         const double* x,
+                         const double* y,
+                         int m,
+                         int n,
+                         int k,
+                         bool sqrt)
+{
+  compute_fused_l2_nn_min_arg<double, int>(handle, min, x, y, m, n, k, sqrt);
+}
 
-} // end namespace raft::distance::runtime
\ No newline at end of file
+}  // end namespace raft::distance::runtime
\ No newline at end of file
diff --git a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
index 9b7fa16fe3..b12cc30472 100644
--- a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
+++ b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
@@ -42,10 +42,6 @@ def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype):
     output_device = TestDeviceBuffer(output, "C")
 
     fused_l2_nn_argmin(input1_device, input2_device, output_device, True)
-
     actual = output_device.copy_to_host()
 
-    print(str(expected))
-
-    print(str(actual))
     assert np.allclose(expected, actual, rtol=1e-4)

From e6731dd83c4700a2400fc2cf275e19edc87be9c0 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Tue, 18 Oct 2022 17:27:20 -0400
Subject: [PATCH 4/7] CLeaning up

---
 cpp/CMakeLists.txt                            |  2 +-
 .../raft_distance/fused_l2_min_arg.hpp        |  5 +++--
 cpp/src/distance/fused_l2_min_arg.cu          | 20 -------------------
 3 files changed, 4 insertions(+), 23 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index c15020174b..d8525b057d 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -244,7 +244,7 @@ set_target_properties(raft_distance PROPERTIES EXPORT_NAME distance)
 if(RAFT_COMPILE_DIST_LIBRARY)
   add_library(raft_distance_lib
     src/distance/pairwise_distance.cu
-          src/distance/fused_l2_min_arg.cu
+    src/distance/fused_l2_min_arg.cu
     src/distance/specializations/detail/canberra.cu
     src/distance/specializations/detail/chebyshev.cu
     src/distance/specializations/detail/correlation.cu
diff --git a/cpp/include/raft_distance/fused_l2_min_arg.hpp b/cpp/include/raft_distance/fused_l2_min_arg.hpp
index b882a6ef26..f7d3748666 100644
--- a/cpp/include/raft_distance/fused_l2_min_arg.hpp
+++ b/cpp/include/raft_distance/fused_l2_min_arg.hpp
@@ -18,13 +18,13 @@
 #include <raft/distance/distance_types.hpp>
 
 namespace raft::distance::runtime {
+
 /**
  * @brief Wrapper around fusedL2NN with minimum reduction operators.
  *
  * fusedL2NN cannot be compiled in the distance library due to the lambda
  * operators, so this wrapper covers the most common case (minimum).
- * This should be preferred to the more generic API when possible, in order to
- * reduce compilation times for users of the shared library.
+ *
  * @param[in] handle         raft handle
  * @param[out] min           will contain the reduced output (Length = `m`)
  *                           (on device)
@@ -35,6 +35,7 @@ namespace raft::distance::runtime {
  * @param[in]  m             gemm m
  * @param[in]  n             gemm n
  * @param[in]  k             gemm k
+ * @param[in]  sqrt          Whether the output `minDist` should contain L2-sqrt
  */
 void fused_l2_nn_min_arg(raft::handle_t const& handle,
                          int* min,
diff --git a/cpp/src/distance/fused_l2_min_arg.cu b/cpp/src/distance/fused_l2_min_arg.cu
index 49aedc47b2..c722b5a566 100644
--- a/cpp/src/distance/fused_l2_min_arg.cu
+++ b/cpp/src/distance/fused_l2_min_arg.cu
@@ -71,26 +71,6 @@ void compute_fused_l2_nn_min_arg(raft::handle_t const& handle,
   handle.sync_stream();
 }
 
-/**
- * @brief Wrapper around fusedL2NN with minimum reduction operators.
- *
- * fusedL2NN cannot be compiled in the distance library due to the lambda
- * operators, so this wrapper covers the most common case (minimum).
- * This should be preferred to the more generic API when possible, in order to
- * reduce compilation times for users of the shared library.
- * @param[in] handle         raft handle
- * @param[out] min           will contain the reduced output (Length = `m`)
- *                           (on device)
- * @param[in]  x             first matrix. Row major. Dim = `m x k`.
- *                           (on device).
- * @param[in]  y             second matrix. Row major. Dim = `n x k`.
- *                           (on device).
- * @param[in]  xn            L2 squared norm of `x`. Length = `m`. (on device).
- * @param[in]  yn            L2 squared norm of `y`. Length = `n`. (on device)
- * @param[in]  m             gemm m
- * @param[in]  n             gemm n
- * @param[in]  k             gemm k
- */
 void fused_l2_nn_min_arg(raft::handle_t const& handle,
                          int* min,
                          const float* x,

From e9a86ec8a6a3b5da4707f8774fc83f4128661b22 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Tue, 18 Oct 2022 19:23:31 -0400
Subject: [PATCH 5/7] Style

---
 cpp/test/util/fast_int_div.cu | 130 +++++++++++++++++-----------------
 1 file changed, 65 insertions(+), 65 deletions(-)

diff --git a/cpp/test/util/fast_int_div.cu b/cpp/test/util/fast_int_div.cu
index 2649bb7445..1689724f9c 100644
--- a/cpp/test/util/fast_int_div.cu
+++ b/cpp/test/util/fast_int_div.cu
@@ -15,8 +15,8 @@
  */
 
 #include "../test_utils.h"
-#include <raft/util/fast_int_div.cuh>
 #include <raft/util/cudart_utils.hpp>
+#include <raft/util/fast_int_div.cuh>
 
 #include <rmm/device_uvector.hpp>
 
@@ -24,91 +24,91 @@
 
 namespace raft::util {
 
-    TEST(FastIntDiv, CpuTest)
+TEST(FastIntDiv, CpuTest)
 {
-    for (int i = 0; i < 100; ++i) {
+  for (int i = 0; i < 100; ++i) {
     // get a positive divisor
     int divisor;
     do {
-    divisor = rand();
-} while (divisor <= 0);
-FastIntDiv fid(divisor);
-// run it against a few random numbers and compare the outputs
-for (int i = 0; i < 10000; ++i) {
-auto num      = rand();
-auto correct  = num / divisor;
-auto computed = num / fid;
-ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
-num      = rand();
-correct  = num % divisor;
-computed = num % fid;
-ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
-num      = -num;
-correct  = num / divisor;
-computed = num / fid;
-ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
-num      = rand();
-correct  = num % divisor;
-computed = num % fid;
-ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
-}
-}
+      divisor = rand();
+    } while (divisor <= 0);
+    FastIntDiv fid(divisor);
+    // run it against a few random numbers and compare the outputs
+    for (int i = 0; i < 10000; ++i) {
+      auto num      = rand();
+      auto correct  = num / divisor;
+      auto computed = num / fid;
+      ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
+      num      = rand();
+      correct  = num % divisor;
+      computed = num % fid;
+      ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
+      num      = -num;
+      correct  = num / divisor;
+      computed = num / fid;
+      ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
+      num      = rand();
+      correct  = num % divisor;
+      computed = num % fid;
+      ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
+    }
+  }
 }
 
 __global__ void fastIntDivTestKernel(
-        int* computed, int* correct, const int* in, FastIntDiv fid, int divisor, int len)
+  int* computed, int* correct, const int* in, FastIntDiv fid, int divisor, int len)
 {
-    auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-    if (tid < len) {
-        computed[tid]       = in[tid] % fid;
-        correct[tid]        = in[tid] % divisor;
-        computed[len + tid] = -in[tid] % fid;
-        correct[len + tid]  = -in[tid] % divisor;
-    }
+  auto tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < len) {
+    computed[tid]       = in[tid] % fid;
+    correct[tid]        = in[tid] % divisor;
+    computed[len + tid] = -in[tid] % fid;
+    correct[len + tid]  = -in[tid] % divisor;
+  }
 }
 
 TEST(FastIntDiv, GpuTest)
 {
-cudaStream_t stream = 0;
-RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+  cudaStream_t stream = 0;
+  RAFT_CUDA_TRY(cudaStreamCreate(&stream));
 
-static const int len = 100000;
-static const int TPB = 128;
-rmm::device_uvector<int> computed(len * 2, stream);
-rmm::device_uvector<int> correct(len * 2, stream);
-rmm::device_uvector<int> in(len, stream);
-for (int i = 0; i < 100; ++i) {
-// get a positive divisor
-int divisor;
-do {
-divisor = rand();
-} while (divisor <= 0);
-FastIntDiv fid(divisor);
-// run it against a few random numbers and compare the outputs
-std::vector<int> h_in(len);
-for (int i = 0; i < len; ++i) {
-h_in[i] = rand();
-}
-raft::update_device(in.data(), h_in.data(), len, stream);
-int nblks = raft::ceildiv(len, TPB);
-fastIntDivTestKernel<<<nblks, TPB, 0, 0>>>(
-        computed.data(), correct.data(), in.data(), fid, divisor, len);
-RAFT_CUDA_TRY(cudaStreamSynchronize(0));
-ASSERT_TRUE(devArrMatch(correct.data(), computed.data(), len * 2, raft::Compare<int>()))
-<< " divisor=" << divisor;
-}
+  static const int len = 100000;
+  static const int TPB = 128;
+  rmm::device_uvector<int> computed(len * 2, stream);
+  rmm::device_uvector<int> correct(len * 2, stream);
+  rmm::device_uvector<int> in(len, stream);
+  for (int i = 0; i < 100; ++i) {
+    // get a positive divisor
+    int divisor;
+    do {
+      divisor = rand();
+    } while (divisor <= 0);
+    FastIntDiv fid(divisor);
+    // run it against a few random numbers and compare the outputs
+    std::vector<int> h_in(len);
+    for (int i = 0; i < len; ++i) {
+      h_in[i] = rand();
+    }
+    raft::update_device(in.data(), h_in.data(), len, stream);
+    int nblks = raft::ceildiv(len, TPB);
+    fastIntDivTestKernel<<<nblks, TPB, 0, 0>>>(
+      computed.data(), correct.data(), in.data(), fid, divisor, len);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(0));
+    ASSERT_TRUE(devArrMatch(correct.data(), computed.data(), len * 2, raft::Compare<int>()))
+      << " divisor=" << divisor;
+  }
 }
 
 FastIntDiv dummyFunc(int num)
 {
-    FastIntDiv fd(num);
-    return fd;
+  FastIntDiv fd(num);
+  return fd;
 }
 
 TEST(FastIntDiv, IncorrectUsage)
 {
-ASSERT_THROW(dummyFunc(-1), raft::exception);
-ASSERT_THROW(dummyFunc(0), raft::exception);
+  ASSERT_THROW(dummyFunc(-1), raft::exception);
+  ASSERT_THROW(dummyFunc(0), raft::exception);
 }
 
 }  // namespace raft::util

From 7c3f50fb4a227f13dc5eec8b62a4812d6c7235a8 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Tue, 18 Oct 2022 19:24:26 -0400
Subject: [PATCH 6/7] Removing accidental checkin

---
 cpp/test/util/fast_int_div.cu | 114 ----------------------------------
 1 file changed, 114 deletions(-)
 delete mode 100644 cpp/test/util/fast_int_div.cu

diff --git a/cpp/test/util/fast_int_div.cu b/cpp/test/util/fast_int_div.cu
deleted file mode 100644
index 1689724f9c..0000000000
--- a/cpp/test/util/fast_int_div.cu
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../test_utils.h"
-#include <raft/util/cudart_utils.hpp>
-#include <raft/util/fast_int_div.cuh>
-
-#include <rmm/device_uvector.hpp>
-
-#include <gtest/gtest.h>
-
-namespace raft::util {
-
-TEST(FastIntDiv, CpuTest)
-{
-  for (int i = 0; i < 100; ++i) {
-    // get a positive divisor
-    int divisor;
-    do {
-      divisor = rand();
-    } while (divisor <= 0);
-    FastIntDiv fid(divisor);
-    // run it against a few random numbers and compare the outputs
-    for (int i = 0; i < 10000; ++i) {
-      auto num      = rand();
-      auto correct  = num / divisor;
-      auto computed = num / fid;
-      ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
-      num      = rand();
-      correct  = num % divisor;
-      computed = num % fid;
-      ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
-      num      = -num;
-      correct  = num / divisor;
-      computed = num / fid;
-      ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
-      num      = rand();
-      correct  = num % divisor;
-      computed = num % fid;
-      ASSERT_EQ(correct, computed) << " divisor=" << divisor << " num=" << num;
-    }
-  }
-}
-
-__global__ void fastIntDivTestKernel(
-  int* computed, int* correct, const int* in, FastIntDiv fid, int divisor, int len)
-{
-  auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  if (tid < len) {
-    computed[tid]       = in[tid] % fid;
-    correct[tid]        = in[tid] % divisor;
-    computed[len + tid] = -in[tid] % fid;
-    correct[len + tid]  = -in[tid] % divisor;
-  }
-}
-
-TEST(FastIntDiv, GpuTest)
-{
-  cudaStream_t stream = 0;
-  RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-
-  static const int len = 100000;
-  static const int TPB = 128;
-  rmm::device_uvector<int> computed(len * 2, stream);
-  rmm::device_uvector<int> correct(len * 2, stream);
-  rmm::device_uvector<int> in(len, stream);
-  for (int i = 0; i < 100; ++i) {
-    // get a positive divisor
-    int divisor;
-    do {
-      divisor = rand();
-    } while (divisor <= 0);
-    FastIntDiv fid(divisor);
-    // run it against a few random numbers and compare the outputs
-    std::vector<int> h_in(len);
-    for (int i = 0; i < len; ++i) {
-      h_in[i] = rand();
-    }
-    raft::update_device(in.data(), h_in.data(), len, stream);
-    int nblks = raft::ceildiv(len, TPB);
-    fastIntDivTestKernel<<<nblks, TPB, 0, 0>>>(
-      computed.data(), correct.data(), in.data(), fid, divisor, len);
-    RAFT_CUDA_TRY(cudaStreamSynchronize(0));
-    ASSERT_TRUE(devArrMatch(correct.data(), computed.data(), len * 2, raft::Compare<int>()))
-      << " divisor=" << divisor;
-  }
-}
-
-FastIntDiv dummyFunc(int num)
-{
-  FastIntDiv fd(num);
-  return fd;
-}
-
-TEST(FastIntDiv, IncorrectUsage)
-{
-  ASSERT_THROW(dummyFunc(-1), raft::exception);
-  ASSERT_THROW(dummyFunc(0), raft::exception);
-}
-
-}  // namespace raft::util

From 6d2b9d0ca34c2c72a9505d0a4465514e9bb7f260 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Tue, 18 Oct 2022 20:10:55 -0400
Subject: [PATCH 7/7] Fixing flake8 style

---
 .../pylibraft/distance/fused_l2_nn.pyx        | 65 ++++++++++---------
 1 file changed, 33 insertions(+), 32 deletions(-)

diff --git a/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx
index 629bc9c323..5fb837c114 100644
--- a/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx
+++ b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx
@@ -38,26 +38,27 @@ cdef extern from "raft_distance/fused_l2_min_arg.hpp" \
         namespace "raft::distance::runtime":
 
     void fused_l2_nn_min_arg(
-            const handle_t &handle,
-            int* min,
-            const float* x,
-            const float* y,
-            int m,
-            int n,
-            int k,
-            bool sqrt);
+        const handle_t &handle,
+        int* min,
+        const float* x,
+        const float* y,
+        int m,
+        int n,
+        int k,
+        bool sqrt)
 
     void fused_l2_nn_min_arg(
-            const handle_t &handle,
-            int* min,
-            const double* x,
-            const double* y,
-            int m,
-            int n,
-            int k,
-            bool sqrt);
-
-def fused_l2_nn_argmin(X, Y, output, sqrt = True):
+        const handle_t &handle,
+        int* min,
+        const double* x,
+        const double* y,
+        int m,
+        int n,
+        int k,
+        bool sqrt)
+
+
+def fused_l2_nn_argmin(X, Y, output, sqrt=True):
     """
     Compute the 1-nearest neighbors between X and Y using the L2 distance
 
@@ -129,21 +130,21 @@ def fused_l2_nn_argmin(X, Y, output, sqrt = True):
 
     if x_dt == np.float32:
         fused_l2_nn_min_arg(deref(h),
-                    <int*> d_ptr,
-                    <float*> x_ptr,
-                    <float*> y_ptr,
-                    <int>m,
-                    <int>n,
-                    <int>x_k,
-                    <bool>sqrt)
+                            <int*> d_ptr,
+                            <float*> x_ptr,
+                            <float*> y_ptr,
+                            <int>m,
+                            <int>n,
+                            <int>x_k,
+                            <bool>sqrt)
     elif x_dt == np.float64:
         fused_l2_nn_min_arg(deref(h),
-                    <int*> d_ptr,
-                    <double*> x_ptr,
-                    <double*> y_ptr,
-                    <int>m,
-                    <int>n,
-                    <int>x_k,
-                    <bool>sqrt)
+                            <int*> d_ptr,
+                            <double*> x_ptr,
+                            <double*> y_ptr,
+                            <int>m,
+                            <int>n,
+                            <int>x_k,
+                            <bool>sqrt)
     else:
         raise ValueError("dtype %s not supported" % x_dt)