From 45ed019d16292623b003505d08dce0d07ee3e058 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 22 Mar 2021 13:30:43 -0400
Subject: [PATCH 01/10] changes for 1) slhc python wrapper and 2) better
 scoring of clusters for gtests

---
 cpp/include/raft/sparse/hierarchy/common.h    |   8 +-
 .../sparse/hierarchy/detail/agglomerative.cuh |   6 +
 .../raft/sparse/hierarchy/detail/mst.cuh      |   2 -
 .../raft/sparse/hierarchy/single_linkage.hpp  |   3 +
 .../sparse/selection/connect_components.cuh   |   2 -
 cpp/test/sparse/linkage.cu                    | 110 +++++++++++++++++-
 6 files changed, 121 insertions(+), 10 deletions(-)
diff --git a/cpp/include/raft/sparse/hierarchy/common.h b/cpp/include/raft/sparse/hierarchy/common.h
index 48326bd347..625d1e043c 100644
--- a/cpp/include/raft/sparse/hierarchy/common.h
+++ b/cpp/include/raft/sparse/hierarchy/common.h
@@ -29,7 +29,9 @@ enum LinkageDistance { PAIRWISE = 0, KNN_GRAPH = 1 };
  * @tparam value_t
  */
 template <typename value_idx, typename value_t>
-struct linkage_output {
+class linkage_output {
+
+ public:
   value_idx m;
   value_idx n_clusters;
 
@@ -41,8 +43,8 @@ struct linkage_output {
   value_idx *children;  // size: (m-1, 2)
 };
 
-struct linkage_output_int_float : public linkage_output<int, float> {};
-struct linkage_output__int64_float : public linkage_output<int64_t, float> {};
+class linkage_output_int_float : public linkage_output<int, float> {};
+class linkage_output__int64_float : public linkage_output<int64_t, float> {};
 
 };  // namespace hierarchy
 };  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
index aa2cfc8fe9..324cfdc4a7 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
@@ -287,6 +287,8 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
 
   size_t n_edges = (n_leaves - 1) * 2;
 
+  raft::print_device_vector("children", children, (n_leaves - 1) * 2, std::cout);
+
   thrust::device_ptr<const value_idx> d_ptr =
     thrust::device_pointer_cast(children);
   value_idx n_vertices =
@@ -319,12 +321,16 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
   raft::copy_async(label_roots.data(), children + children_cpy_start,
                    child_size, stream);
 
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+
   //  thrust::device_ptr<value_idx> t_label_roots =
   //    thrust::device_pointer_cast(label_roots.data());
   //
   thrust::sort(thrust_policy, label_roots.data(),
                label_roots.data() + (child_size), thrust::greater<value_idx>());
 
+  raft::print_device_vector("label_roots", label_roots.data(), label_roots.size(), std::cout);
+
   rmm::device_uvector<value_idx> tmp_labels(n_vertices, stream);
 
   // Init labels to -1
diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh
index 1b890c239b..31f2e83ba1 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh
@@ -166,8 +166,6 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X,
     mst_coo = connect_knn_graph<value_idx, value_t>(handle, X, mst_coo, m, n,
                                                     color.data());
 
-    printf("Edges: %d\n", mst_coo.n_edges);
-
     RAFT_EXPECTS(
       mst_coo.n_edges == m - 1,
       "MST was not able to connect knn graph in a single iteration.");
diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
index a773b31e05..242437e7b4 100644
--- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
+++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
@@ -97,6 +97,9 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m,
   detail::extract_flattened_clusters(handle, out->labels, out->children,
                                      n_clusters, m);
 
+
+  raft::print_device_vector("cuml labels", out->labels, m, std::cout);
+
   out->m = m;
   out->n_clusters = n_clusters;
   out->n_leaves = m;
diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh
index a1944fc0b3..fb70955daf 100644
--- a/cpp/include/raft/sparse/selection/connect_components.cuh
+++ b/cpp/include/raft/sparse/selection/connect_components.cuh
@@ -209,8 +209,6 @@ __global__ void min_components_by_color_kernel(
 
   __syncthreads();
 
-  //  printf("block %d thread %d did final sync\n", blockIdx.x, threadIdx.x);
-
   value_idx out_offset = out_indptr[blockIdx.x];
 
   // TODO: Do this across threads, using an atomic counter for each color
diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu
index fd1aca92ca..8e8e9153af 100644
--- a/cpp/test/sparse/linkage.cu
+++ b/cpp/test/sparse/linkage.cu
@@ -46,6 +46,109 @@ struct LinkageInputs {
   int c;
 };
 
+/**
+ * @brief kernel to calculate the values of a and b
+ * @param firstClusterArray: the array of classes of type T
+ * @param secondClusterArray: the array of classes of type T
+ * @param size: the size of the data points
+ * @param a: number of pairs of points that both the clusters have classified the same
+ * @param b: number of pairs of points that both the clusters have classified differently
+ */
+template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y>
+__global__ void computeTheNumerator(const T* firstClusterArray,
+                                    const T* secondClusterArray, uint64_t size,
+                                    uint64_t* a, uint64_t* b) {
+  //calculating the indices of pairs of datapoints compared by the current thread
+  uint64_t j = threadIdx.x + blockIdx.x * blockDim.x;
+  uint64_t i = threadIdx.y + blockIdx.y * blockDim.y;
+
+  //thread-local variables to count a and b
+  uint64_t myA = 0, myB = 0;
+
+  if (i < size && j < size && j < i) {
+    //checking if the pair have been classified the same by both the clusters
+    if (firstClusterArray[i] == firstClusterArray[j] &&
+        secondClusterArray[i] == secondClusterArray[j]) {
+      ++myA;
+    }
+
+      //checking if the pair have been classified differently by both the clusters
+    else if (firstClusterArray[i] != firstClusterArray[j] &&
+             secondClusterArray[i] != secondClusterArray[j]) {
+      ++myB;
+    }
+  }
+
+  //specialize blockReduce for a 2D block of 1024 threads of type uint64_t
+  typedef cub::BlockReduce<uint64_t, BLOCK_DIM_X,
+  cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
+    BlockReduce;
+
+  //Allocate shared memory for blockReduce
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  //summing up thread-local counts specific to a block
+  myA = BlockReduce(temp_storage).Sum(myA);
+  __syncthreads();
+  myB = BlockReduce(temp_storage).Sum(myB);
+  __syncthreads();
+
+  //executed once per block
+  if (threadIdx.x == 0 && threadIdx.y == 0) {
+    raft::myAtomicAdd<unsigned long long int>((unsigned long long int*)a, myA);
+    raft::myAtomicAdd<unsigned long long int>((unsigned long long int*)b, myB);
+  }
+}
+
+/**
+* @brief Function to calculate RandIndex
+* <a href="https://en.wikipedia.org/wiki/Rand_index">more info on rand index</a>
+* @param firstClusterArray: the array of classes of type T
+* @param secondClusterArray: the array of classes of type T
+* @param size: the size of the data points of type uint64_t
+* @param allocator: object that takes care of temporary device memory allocation of type std::shared_ptr<MLCommon::deviceAllocator>
+* @param stream: the cudaStream object
+*/
+template <typename T>
+double compute_rand_index(T* firstClusterArray, T* secondClusterArray,
+                          uint64_t size,
+                          std::shared_ptr<raft::mr::device::allocator> allocator,
+                          cudaStream_t stream) {
+  //rand index for size less than 2 is not defined
+  ASSERT(size >= 2, "Rand Index for size less than 2 not defined!");
+
+  //allocating and initializing memory for a and b in the GPU
+  raft::mr::device::buffer<uint64_t> arr_buf(allocator, stream, 2);
+  CUDA_CHECK(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream));
+
+  //kernel configuration
+  static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16;
+  dim3 numThreadsPerBlock(BLOCK_DIM_X, BLOCK_DIM_Y);
+  dim3 numBlocks(raft::ceildiv<int>(size, numThreadsPerBlock.x),
+                 raft::ceildiv<int>(size, numThreadsPerBlock.y));
+
+  //calling the kernel
+  computeTheNumerator<T, BLOCK_DIM_X, BLOCK_DIM_Y>
+  <<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+    firstClusterArray, secondClusterArray, size, arr_buf.data(),
+    arr_buf.data() + 1);
+
+  //synchronizing and updating the calculated values of a and b from device to host
+  uint64_t ab_host[2] = {0};
+  raft::update_host(ab_host, arr_buf.data(), 2, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+
+  //error handling
+  CUDA_CHECK(cudaGetLastError());
+
+  //denominator
+  uint64_t nChooseTwo = size * (size - 1) / 2;
+
+  //calculating the rand_index
+  return (double)(((double)(ab_host[0] + ab_host[1])) / (double)nChooseTwo);
+}
+
+
 template <typename T, typename IdxT>
 ::std::ostream &operator<<(::std::ostream &os,
                            const LinkageInputs<T, IdxT> &dims) {
@@ -83,10 +186,12 @@ class LinkageTest : public ::testing::TestWithParam<LinkageInputs<T, IdxT>> {
     raft::hierarchy::single_linkage<
       IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>(
       handle, data.data(), params.n_row, params.n_col,
-      raft::distance::DistanceType::L2Unexpanded, &out_arrs, params.c,
+      raft::distance::DistanceType::L2SqrtExpanded, &out_arrs, params.c,
       params.n_clusters);
 
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
+
+    score = compute_rand_index(labels, labels_ref, params.n_row, handle.get_device_allocator(), handle.get_stream());
   }
 
   void SetUp() override { basicTest(); }
@@ -492,8 +597,7 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
 
 typedef LinkageTest<float, int> LinkageTestF_Int;
 TEST_P(LinkageTestF_Int, Result) {
-  EXPECT_TRUE(
-    raft::devArrMatch(labels, labels_ref, params.n_row, raft::Compare<int>()));
+  EXPECT_TRUE(score == 1.0);
 }
 
 INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int,

From 95f62b8f3840fa3868ecf5d3813f6dae16ccf75b Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 22 Mar 2021 13:34:15 -0400
Subject: [PATCH 02/10] Fixing style

---
 cpp/include/raft/sparse/hierarchy/common.h    |  1 -
 .../sparse/hierarchy/detail/agglomerative.cuh |  6 ++--
 .../raft/sparse/hierarchy/single_linkage.hpp  |  1 -
 cpp/test/sparse/linkage.cu                    | 30 +++++++++----------
 4 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/cpp/include/raft/sparse/hierarchy/common.h b/cpp/include/raft/sparse/hierarchy/common.h
index 625d1e043c..29f541498b 100644
--- a/cpp/include/raft/sparse/hierarchy/common.h
+++ b/cpp/include/raft/sparse/hierarchy/common.h
@@ -30,7 +30,6 @@ enum LinkageDistance { PAIRWISE = 0, KNN_GRAPH = 1 };
  */
 template <typename value_idx, typename value_t>
 class linkage_output {
-
  public:
   value_idx m;
   value_idx n_clusters;
diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
index 324cfdc4a7..96f2122ba5 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
@@ -287,7 +287,8 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
 
   size_t n_edges = (n_leaves - 1) * 2;
 
-  raft::print_device_vector("children", children, (n_leaves - 1) * 2, std::cout);
+  raft::print_device_vector("children", children, (n_leaves - 1) * 2,
+                            std::cout);
 
   thrust::device_ptr<const value_idx> d_ptr =
     thrust::device_pointer_cast(children);
@@ -329,7 +330,8 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
   thrust::sort(thrust_policy, label_roots.data(),
                label_roots.data() + (child_size), thrust::greater<value_idx>());
 
-  raft::print_device_vector("label_roots", label_roots.data(), label_roots.size(), std::cout);
+  raft::print_device_vector("label_roots", label_roots.data(),
+                            label_roots.size(), std::cout);
 
   rmm::device_uvector<value_idx> tmp_labels(n_vertices, stream);
 
diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
index 242437e7b4..b27f4dfce1 100644
--- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
+++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
@@ -97,7 +97,6 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m,
   detail::extract_flattened_clusters(handle, out->labels, out->children,
                                      n_clusters, m);
 
-
   raft::print_device_vector("cuml labels", out->labels, m, std::cout);
 
   out->m = m;
diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu
index 8e8e9153af..ce567e4298 100644
--- a/cpp/test/sparse/linkage.cu
+++ b/cpp/test/sparse/linkage.cu
@@ -72,7 +72,7 @@ __global__ void computeTheNumerator(const T* firstClusterArray,
       ++myA;
     }
 
-      //checking if the pair have been classified differently by both the clusters
+    //checking if the pair have been classified differently by both the clusters
     else if (firstClusterArray[i] != firstClusterArray[j] &&
              secondClusterArray[i] != secondClusterArray[j]) {
       ++myB;
@@ -81,7 +81,7 @@ __global__ void computeTheNumerator(const T* firstClusterArray,
 
   //specialize blockReduce for a 2D block of 1024 threads of type uint64_t
   typedef cub::BlockReduce<uint64_t, BLOCK_DIM_X,
-  cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
+                           cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
     BlockReduce;
 
   //Allocate shared memory for blockReduce
@@ -110,10 +110,9 @@ __global__ void computeTheNumerator(const T* firstClusterArray,
 * @param stream: the cudaStream object
 */
 template <typename T>
-double compute_rand_index(T* firstClusterArray, T* secondClusterArray,
-                          uint64_t size,
-                          std::shared_ptr<raft::mr::device::allocator> allocator,
-                          cudaStream_t stream) {
+double compute_rand_index(
+  T* firstClusterArray, T* secondClusterArray, uint64_t size,
+  std::shared_ptr<raft::mr::device::allocator> allocator, cudaStream_t stream) {
   //rand index for size less than 2 is not defined
   ASSERT(size >= 2, "Rand Index for size less than 2 not defined!");
 
@@ -129,9 +128,9 @@ double compute_rand_index(T* firstClusterArray, T* secondClusterArray,
 
   //calling the kernel
   computeTheNumerator<T, BLOCK_DIM_X, BLOCK_DIM_Y>
-  <<<numBlocks, numThreadsPerBlock, 0, stream>>>(
-    firstClusterArray, secondClusterArray, size, arr_buf.data(),
-    arr_buf.data() + 1);
+    <<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+      firstClusterArray, secondClusterArray, size, arr_buf.data(),
+      arr_buf.data() + 1);
 
   //synchronizing and updating the calculated values of a and b from device to host
   uint64_t ab_host[2] = {0};
@@ -148,10 +147,9 @@ double compute_rand_index(T* firstClusterArray, T* secondClusterArray,
   return (double)(((double)(ab_host[0] + ab_host[1])) / (double)nChooseTwo);
 }
 
-
 template <typename T, typename IdxT>
-::std::ostream &operator<<(::std::ostream &os,
-                           const LinkageInputs<T, IdxT> &dims) {
+::std::ostream& operator<<(::std::ostream& os,
+                           const LinkageInputs<T, IdxT>& dims) {
   return os;
 }
 
@@ -191,7 +189,9 @@ class LinkageTest : public ::testing::TestWithParam<LinkageInputs<T, IdxT>> {
 
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
 
-    score = compute_rand_index(labels, labels_ref, params.n_row, handle.get_device_allocator(), handle.get_stream());
+    score =
+      compute_rand_index(labels, labels_ref, params.n_row,
+                         handle.get_device_allocator(), handle.get_stream());
   }
 
   void SetUp() override { basicTest(); }
@@ -596,9 +596,7 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
    -4}};
 
 typedef LinkageTest<float, int> LinkageTestF_Int;
-TEST_P(LinkageTestF_Int, Result) {
-  EXPECT_TRUE(score == 1.0);
-}
+TEST_P(LinkageTestF_Int, Result) { EXPECT_TRUE(score == 1.0); }
 
 INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int,
                         ::testing::ValuesIn(linkage_inputsf2));

From 446967a72e4f09c0bc4a9287a41ecd0c6d36d9e0 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 22 Mar 2021 14:28:21 -0400
Subject: [PATCH 03/10] Adding debugging logging to mst

---
 cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
index 0235994feb..1f09c02c03 100644
--- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
+++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
@@ -183,6 +183,8 @@ MST_solver<vertex_t, edge_t, weight_t>::solve() {
     prev_mst_edge_count = mst_edge_count;
   }
 
+  std::cout << "Packaging results" << std::endl;
+
   // result packaging
   thrust::host_vector<edge_t> host_mst_edge_count = mst_edge_count;
   mst_result.n_edges = host_mst_edge_count[0];
@@ -191,6 +193,7 @@ MST_solver<vertex_t, edge_t, weight_t>::solve() {
   mst_result.weights.resize(mst_result.n_edges, stream);
 
   // raft::print_device_vector("Colors before sending: ", color_index, 7, std::cout);
+  std::cout << "Returning results" << std::endl;
 
   return mst_result;
 }

From 61c944ec2dcbf8e1d8f232c8d45dd1fbc6286be7 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 22 Mar 2021 15:37:48 -0400
Subject: [PATCH 04/10] Still debugging

---
 cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
index 1f09c02c03..32a3c361ab 100644
--- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
+++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
@@ -192,6 +192,10 @@ MST_solver<vertex_t, edge_t, weight_t>::solve() {
   mst_result.dst.resize(mst_result.n_edges, stream);
   mst_result.weights.resize(mst_result.n_edges, stream);
 
+
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+
+
   // raft::print_device_vector("Colors before sending: ", color_index, 7, std::cout);
   std::cout << "Returning results" << std::endl;
 

From 78c4932a756f869a68f16e4c027356bae3a64a75 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 22 Mar 2021 16:38:21 -0400
Subject: [PATCH 05/10] Style

---
 cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
index 32a3c361ab..f0ed45a9b1 100644
--- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
+++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
@@ -192,10 +192,8 @@ MST_solver<vertex_t, edge_t, weight_t>::solve() {
   mst_result.dst.resize(mst_result.n_edges, stream);
   mst_result.weights.resize(mst_result.n_edges, stream);
 
-
   CUDA_CHECK(cudaStreamSynchronize(stream));
 
-
   // raft::print_device_vector("Colors before sending: ", color_index, 7, std::cout);
   std::cout << "Returning results" << std::endl;
 

From e9427313481dcecdf9bff6cb13957fe33bb8cdae Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 22 Mar 2021 17:44:47 -0400
Subject: [PATCH 06/10] Removed explicit sync from connect components gtest.
 Doesn't seem to havehelped

---
 cpp/test/sparse/connect_components.cu | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu
index 965dc20330..241062f54f 100644
--- a/cpp/test/sparse/connect_components.cu
+++ b/cpp/test/sparse/connect_components.cu
@@ -104,8 +104,6 @@ class ConnectComponentsTest : public ::testing::TestWithParam<
       handle, indptr.data(), knn_graph_coo.cols(), knn_graph_coo.vals(),
       params.n_row, knn_graph_coo.nnz, colors.data(), stream, false);
 
-    CUDA_CHECK(cudaStreamSynchronize(stream));
-
     printf("Got here.\n");
 
     raft::print_device_vector("colors", colors.data(), params.n_row, std::cout);

From 03074a584249b513b1635e3bfea99355c3805175 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 22 Mar 2021 19:20:54 -0400
Subject: [PATCH 07/10] Removing debug prints

---
 .../raft/sparse/hierarchy/detail/agglomerative.cuh    | 11 -----------
 cpp/include/raft/sparse/hierarchy/single_linkage.hpp  |  2 --
 cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh |  7 -------
 3 files changed, 20 deletions(-)

diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
index 96f2122ba5..4ebe5e660f 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
@@ -287,9 +287,6 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
 
   size_t n_edges = (n_leaves - 1) * 2;
 
-  raft::print_device_vector("children", children, (n_leaves - 1) * 2,
-                            std::cout);
-
   thrust::device_ptr<const value_idx> d_ptr =
     thrust::device_pointer_cast(children);
   value_idx n_vertices =
@@ -322,17 +319,9 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
   raft::copy_async(label_roots.data(), children + children_cpy_start,
                    child_size, stream);
 
-  CUDA_CHECK(cudaStreamSynchronize(stream));
-
-  //  thrust::device_ptr<value_idx> t_label_roots =
-  //    thrust::device_pointer_cast(label_roots.data());
-  //
   thrust::sort(thrust_policy, label_roots.data(),
                label_roots.data() + (child_size), thrust::greater<value_idx>());
 
-  raft::print_device_vector("label_roots", label_roots.data(),
-                            label_roots.size(), std::cout);
-
   rmm::device_uvector<value_idx> tmp_labels(n_vertices, stream);
 
   // Init labels to -1
diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
index b27f4dfce1..a773b31e05 100644
--- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
+++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
@@ -97,8 +97,6 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m,
   detail::extract_flattened_clusters(handle, out->labels, out->children,
                                      n_clusters, m);
 
-  raft::print_device_vector("cuml labels", out->labels, m, std::cout);
-
   out->m = m;
   out->n_clusters = n_clusters;
   out->n_leaves = m;
diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
index f0ed45a9b1..4e9876b3ba 100644
--- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
+++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
@@ -183,8 +183,6 @@ MST_solver<vertex_t, edge_t, weight_t>::solve() {
     prev_mst_edge_count = mst_edge_count;
   }
 
-  std::cout << "Packaging results" << std::endl;
-
   // result packaging
   thrust::host_vector<edge_t> host_mst_edge_count = mst_edge_count;
   mst_result.n_edges = host_mst_edge_count[0];
@@ -192,11 +190,6 @@ MST_solver<vertex_t, edge_t, weight_t>::solve() {
   mst_result.dst.resize(mst_result.n_edges, stream);
   mst_result.weights.resize(mst_result.n_edges, stream);
 
-  CUDA_CHECK(cudaStreamSynchronize(stream));
-
-  // raft::print_device_vector("Colors before sending: ", color_index, 7, std::cout);
-  std::cout << "Returning results" << std::endl;
-
   return mst_result;
 }
 

From 204292e3fb592913eaae51f516427d28a8db6af7 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Tue, 23 Mar 2021 11:47:20 -0400
Subject: [PATCH 08/10] Updating cub to see if it helps.

---
 cpp/cmake/Dependencies.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/cmake/Dependencies.cmake b/cpp/cmake/Dependencies.cmake
index 080efb5b1f..e1e21a9cd3 100644
--- a/cpp/cmake/Dependencies.cmake
+++ b/cpp/cmake/Dependencies.cmake
@@ -23,7 +23,7 @@ if(NOT CUB_IS_PART_OF_CTK)
   set(CUB_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub CACHE STRING "Path to cub repo")
   ExternalProject_Add(cub
     GIT_REPOSITORY    https://github.com/thrust/cub.git
-    GIT_TAG           1.8.0
+    GIT_TAG           1.12.0
     PREFIX            ${CUB_DIR}
     CONFIGURE_COMMAND ""
     BUILD_COMMAND     ""

From 505eed6b08e490501a99744ad0a59904b200a8fc Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Tue, 23 Mar 2021 23:48:23 -0400
Subject: [PATCH 09/10] Hoping some updates might fix gtest hngs

---
 .../sparse/hierarchy/detail/agglomerative.cuh |   4 +
 .../hierarchy/detail/connectivities.cuh       |   2 +
 .../raft/sparse/hierarchy/detail/mst.cuh      |  20 +-
 cpp/include/raft/sparse/op/sort.h             |   2 +
 .../sparse/selection/connect_components.cuh   | 191 ++++++++++--------
 5 files changed, 133 insertions(+), 86 deletions(-)

diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
index 4ebe5e660f..e94fdfb970 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
@@ -103,6 +103,8 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
   std::vector<value_idx> mst_dst_h(n_edges);
   std::vector<value_t> mst_weights_h(n_edges);
 
+  printf("n_edges: %d\n", n_edges);
+
   update_host(mst_src_h.data(), rows, n_edges, stream);
   update_host(mst_dst_h.data(), cols, n_edges, stream);
   update_host(mst_weights_h.data(), data, n_edges, stream);
@@ -132,6 +134,8 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
 
   out_size.resize(n_edges, stream);
 
+  printf("Finished dendrogram\n");
+
   raft::update_device(children, children_h.data(), n_edges * 2, stream);
   raft::update_device(out_size.data(), out_size_h.data(), n_edges, stream);
 }
diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
index 229f2034b0..2fe53ad204 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
@@ -68,6 +68,8 @@ struct distance_graph_impl<raft::hierarchy::LinkageDistance::KNN_GRAPH,
     raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo,
                                        c);
 
+    CUDA_CHECK(cudaDeviceSynchronize());
+
     indices.resize(knn_graph_coo.nnz, stream);
     data.resize(knn_graph_coo.nnz, stream);
 
diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh
index 31f2e83ba1..bc3d6a7cc3 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh
@@ -100,6 +100,7 @@ raft::Graph_COO<value_idx, value_idx, value_t> connect_knn_graph(
   raft::copy_async(msf.weights.data() + msf.n_edges, connected_edges.vals(),
                    connected_edges.nnz, stream);
 
+  printf("connected components nnz: %d\n", final_nnz);
   raft::sparse::COO<value_t, value_idx> final_coo(d_alloc, stream);
   raft::sparse::linalg::symmetrize(handle, msf.src.data(), msf.dst.data(),
                                    msf.weights.data(), m, n, final_nnz,
@@ -162,14 +163,25 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X,
     handle, indptr, indices, pw_dists, (value_idx)m, nnz, color.data(), stream,
     false);
 
-  if (linkage::get_n_components(color.data(), m, stream) > 1) {
+  int iters = 1;
+  int n_components = linkage::get_n_components(color.data(), m, stream);
+  while (n_components > 1 && iters < 100) {
+    printf("Found %d components. Going to try connecting graph\n",
+           n_components);
     mst_coo = connect_knn_graph<value_idx, value_t>(handle, X, mst_coo, m, n,
                                                     color.data());
 
-    RAFT_EXPECTS(
-      mst_coo.n_edges == m - 1,
-      "MST was not able to connect knn graph in a single iteration.");
+    iters++;
+
+    n_components = linkage::get_n_components(color.data(), m, stream);
+    //
+    //    printf("Connecting knn graph!\n");
+    //
+    //    RAFT_EXPECTS(
+    //      mst_coo.n_edges == m - 1,
+    //      "MST was not able to connect knn graph in a single iteration.");
   }
+  printf("Found %d components.\n", n_components);
 
   sort_coo_by_data(mst_coo.src.data(), mst_coo.dst.data(),
                    mst_coo.weights.data(), mst_coo.n_edges, stream);
diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h
index b039e52517..642ff2b8be 100644
--- a/cpp/include/raft/sparse/op/sort.h
+++ b/cpp/include/raft/sparse/op/sort.h
@@ -71,6 +71,8 @@ void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals,
 
   CUSPARSE_CHECK(cusparseCreateIdentityPermutation(handle, nnz, d_P.data()));
 
+  printf("nnz: %d\n", nnz);
+
   CUSPARSE_CHECK(cusparseXcoosortByRow(handle, m, n, nnz, rows, cols,
                                        d_P.data(), pBuffer.data()));
 
diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh
index fb70955daf..8ccfd184b4 100644
--- a/cpp/include/raft/sparse/selection/connect_components.cuh
+++ b/cpp/include/raft/sparse/selection/connect_components.cuh
@@ -105,6 +105,22 @@ struct FixConnectivitiesRedOp {
   }
 };
 
+struct TupleComp {
+  template <typename one, typename two>
+  __host__ __device__ bool operator()(const one &t1, const two &t2) {
+    // sort first by each sample's color,
+    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
+    if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false;
+
+    // then by the color of each sample's closest neighbor,
+    if (thrust::get<1>(t1) < thrust::get<1>(t2)) return true;
+    if (thrust::get<1>(t1) > thrust::get<1>(t2)) return false;
+
+    // then sort by value in descending order
+    return thrust::get<2>(t1).value > thrust::get<2>(t2).value;
+  }
+};
+
 /**
  * Count the unique vertices adjacent to each component.
  * This is essentially a count_unique_by_key.
@@ -117,26 +133,75 @@ __global__ void count_components_by_color_kernel(value_idx *out_indptr,
   value_idx tid = threadIdx.x;
   value_idx row = blockIdx.x;
 
-  __shared__ extern value_idx count_smem[];
-
   value_idx start_offset = colors_indptr[row];
   value_idx stop_offset = colors_indptr[row + 1];
 
-  for (value_idx i = tid; i < n_colors; i += blockDim.x) {
-    count_smem[i] = 0;
+  value_idx agg = 0;
+
+  typedef cub::BlockReduce<value_idx, 256> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  value_idx v = 0;
+  for (value_idx i = tid; i < (stop_offset - start_offset) - 1;
+       i += blockDim.x) {
+    // Columns are sorted by row so only columns that are != next column
+    // should get counted
+    v += (colors_nn[start_offset + i + 1] != colors_nn[start_offset + i])
+         // last thread should always write something
+         || i == (stop_offset - start_offset - 2);
   }
 
-  __syncthreads();
+  agg = BlockReduce(temp_storage).Reduce(v, cub::Sum());
+
+  if (threadIdx.x == 0) out_indptr[row] = agg;
+}
+
+template <typename value_idx, typename value_t>
+__global__ void min_components_by_color_kernel(
+  value_idx *out_cols, value_t *out_vals, value_idx *out_rows,
+  const value_idx *out_indptr, const value_idx *colors_indptr,
+  const value_idx *colors_nn, const value_idx *indices,
+  const cub::KeyValuePair<value_idx, value_t> *kvp, value_idx n_colors) {
+  __shared__ value_idx smem[2];
+
+  value_idx *cur_offset = smem;
+  value_idx *mutex = smem + 1;
 
-  for (value_idx i = tid; i < (stop_offset - start_offset); i += blockDim.x) {
-    count_smem[colors_nn[start_offset + i]] = 1;
+  if (threadIdx.x == 0) {
+    mutex[0] = 0;
+    cur_offset[0] = 0;
   }
 
   __syncthreads();
 
-  for (value_idx i = tid; i < n_colors; i += blockDim.x) {
-    // TODO: Warp-level reduction
-    atomicAdd(out_indptr + row, count_smem[i] > 0);
+  value_idx tid = threadIdx.x;
+  value_idx row = blockIdx.x;
+
+  value_idx out_offset = out_indptr[blockIdx.x];
+
+  value_idx start_offset = colors_indptr[row];
+  value_idx stop_offset = colors_indptr[row + 1];
+
+  for (value_idx i = tid; i < (stop_offset - start_offset) - 1;
+       i += blockDim.x) {
+    // Columns are sorted by row so ony columns that are != previous column
+    // should get counted
+    value_idx cur_color = colors_nn[start_offset + i];
+    if (colors_nn[start_offset + i + 1] != cur_color
+        // last thread should always write something
+        || (i == stop_offset - start_offset - 2)) {
+      while (atomicCAS(mutex, 0, 1) == 1)
+        ;
+      __threadfence();
+
+      value_idx local_offset = cur_offset[0];
+      out_rows[out_offset + local_offset] = indices[start_offset + i];
+      out_cols[out_offset + local_offset] = kvp[start_offset + i].key;
+      out_vals[out_offset + local_offset] = kvp[start_offset + i].value;
+      cur_offset[0] += 1;
+      __threadfence();
+      atomicCAS(mutex, 1, 0);
+    }
   }
 }
 
@@ -155,77 +220,23 @@ void count_components_by_color(value_idx *out_indptr,
                                const value_idx *colors_indptr,
                                const value_idx *colors_nn, value_idx n_colors,
                                cudaStream_t stream) {
-  count_components_by_color_kernel<<<n_colors, 256,
-                                     n_colors * sizeof(value_idx), stream>>>(
+  count_components_by_color_kernel<<<n_colors, 256, 0, stream>>>(
     out_indptr, colors_indptr, colors_nn, n_colors);
 }
 
-/**
- * colors_nn is not assumed to be sorted wrt colors_indptr
- * so we need to perform atomic reductions in each thread.
- */
-template <typename value_idx, typename value_t>
-__global__ void min_components_by_color_kernel(
-  value_idx *out_cols, value_t *out_vals, value_idx *out_rows,
-  const value_idx *out_indptr, const value_idx *colors_indptr,
-  const value_idx *colors_nn, const value_idx *indices,
-  const cub::KeyValuePair<value_idx, value_t> *kvp, value_idx n_colors) {
-  __shared__ extern char min_smem[];
-
-  int *mutex = (int *)min_smem;
-
-  cub::KeyValuePair<value_idx, value_t> *min =
-    (cub::KeyValuePair<value_idx, value_t> *)(mutex + n_colors);
-  value_idx *src_inds = (value_idx *)(min + n_colors);
-
-  value_idx start_offset = colors_indptr[blockIdx.x];
-  value_idx stop_offset = colors_indptr[blockIdx.x + 1];
+template <typename LabelT, typename DataT>
+struct CubKVPMinReduce {
+  typedef cub::KeyValuePair<LabelT, DataT> KVP;
 
-  // initialize
-  for (value_idx i = threadIdx.x; i < n_colors; i += blockDim.x) {
-    mutex[i] = 0;
-    auto skvp = min + i;
-    skvp->key = -1;
-    skvp->value = std::numeric_limits<value_t>::max();
+  DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) {
+    return b.value < a.value ? b : a;
   }
 
-  __syncthreads();
-
-  for (value_idx i = threadIdx.x; i < (stop_offset - start_offset);
-       i += blockDim.x) {
-    value_idx new_color = colors_nn[start_offset + i];
-    while (atomicCAS(mutex + new_color, 0, 1) == 1)
-      ;
-    __threadfence();
-    auto cur_kvp = kvp[start_offset + i];
-    if (cur_kvp.value < min[new_color].value) {
-      src_inds[new_color] = indices[start_offset + i];
-      min[new_color].key = cur_kvp.key;
-      min[new_color].value = cur_kvp.value;
-    }
-    __threadfence();
-    atomicCAS(mutex + new_color, 1, 0);
+  DI KVP operator()(const KVP &a, const KVP &b) {
+    return b.value < a.value ? b : a;
   }
 
-  __syncthreads();
-
-  value_idx out_offset = out_indptr[blockIdx.x];
-
-  // TODO: Do this across threads, using an atomic counter for each color
-  if (threadIdx.x == 0) {
-    value_idx cur_offset = 0;
-
-    for (value_idx i = 0; i < n_colors; i++) {
-      auto min_color = min[i];
-      if (min_color.key > -1) {
-        out_rows[out_offset + cur_offset] = src_inds[i];
-        out_cols[out_offset + cur_offset] = min_color.key;
-        out_vals[out_offset + cur_offset] = min_color.value;
-        cur_offset += 1;
-      }
-    }
-  }
-}
+};  // KVPMinReduce
 
 /**
  * Computes the min set of unique components that neighbor the
@@ -249,10 +260,12 @@ void min_components_by_color(raft::sparse::COO<value_t, value_idx> &coo,
                              const value_idx *indices,
                              const cub::KeyValuePair<value_idx, value_t> *kvp,
                              value_idx n_colors, cudaStream_t stream) {
-  int smem_bytes = (n_colors * sizeof(int)) + (n_colors * sizeof(kvp)) +
-                   ((n_colors + 1) * sizeof(value_idx));
-
-  min_components_by_color_kernel<<<n_colors, 256, smem_bytes, stream>>>(
+  /**
+   * Arrays should be ordered by: colors_indptr->colors_n->kvp.value
+   * so the last element of each column in the input CSR should be
+   * the min.
+   */
+  min_components_by_color_kernel<<<n_colors, 256, 0, stream>>>(
     coo.cols(), coo.vals(), coo.rows(), out_indptr, colors_indptr, colors_nn,
     indices, kvp, n_colors);
 }
@@ -383,13 +396,13 @@ void sort_by_color(value_idx *colors, value_idx *nn_colors,
   thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter,
                arg_sort_iter + n_rows, src_indices);
 
-  auto keys = thrust::make_zip_iterator(thrust::make_tuple(colors));
-  auto vals = thrust::make_zip_iterator(
-    thrust::make_tuple((raft::linkage::KeyValuePair<value_idx, value_t> *)kvp,
-                       src_indices, nn_colors));
+  auto keys =
+    thrust::make_zip_iterator(thrust::make_tuple(colors, nn_colors, kvp));
+  auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices));
 
   // get all the colors in contiguous locations so we can map them to warps.
-  thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals);
+  thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals,
+                      TupleComp());
 }
 
 /**
@@ -431,9 +444,15 @@ void connect_components(const raft::handle_t &handle,
   rmm::device_uvector<value_idx> color_neigh_degrees(n_components + 1, stream);
   rmm::device_uvector<value_idx> colors_indptr(n_components + 1, stream);
 
+  printf("Performing 1nn\n");
+
   perform_1nn(temp_inds_dists.data(), nn_colors.data(), colors, X, n_rows,
               n_cols, d_alloc, stream);
 
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+
+  printf("Sorting by color\n");
+
   /**
    * Sort data points by color (neighbors are not sorted)
    */
@@ -446,23 +465,31 @@ void connect_components(const raft::handle_t &handle,
   raft::sparse::convert::sorted_coo_to_csr(colors, n_rows, colors_indptr.data(),
                                            n_components + 1, d_alloc, stream);
 
+  printf("Building output colors indptr\n");
   // create output degree array for closest components per row
   build_output_colors_indptr(color_neigh_degrees.data(), colors_indptr.data(),
                              nn_colors.data(), n_components, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 
   value_idx nnz;
   raft::update_host(&nnz, color_neigh_degrees.data() + n_components, 1, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
 
+  printf("Min components by color\n");
   raft::sparse::COO<value_t, value_idx> min_edges(d_alloc, stream, nnz);
   min_components_by_color(min_edges, color_neigh_degrees.data(),
                           colors_indptr.data(), nn_colors.data(),
                           src_indices.data(), temp_inds_dists.data(),
                           n_components, stream);
 
+  CUDA_CHECK(cudaStreamSynchronize(stream));
   // symmetrize
+
+  printf("Symmetrize\n");
   raft::sparse::linalg::symmetrize(handle, min_edges.rows(), min_edges.cols(),
                                    min_edges.vals(), n_rows, n_rows, nnz, out);
+
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 
 };  // end namespace linkage

From 5bee95f15f2903f4eddcac54ca2ad312586fb312 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 24 Mar 2021 00:31:56 -0400
Subject: [PATCH 10/10] Using workaround

---
 cpp/include/raft/sparse/selection/connect_components.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh
index 8ccfd184b4..8f06a85ddc 100644
--- a/cpp/include/raft/sparse/selection/connect_components.cuh
+++ b/cpp/include/raft/sparse/selection/connect_components.cuh
@@ -396,8 +396,8 @@ void sort_by_color(value_idx *colors, value_idx *nn_colors,
   thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter,
                arg_sort_iter + n_rows, src_indices);
 
-  auto keys =
-    thrust::make_zip_iterator(thrust::make_tuple(colors, nn_colors, kvp));
+  auto keys = thrust::make_zip_iterator(thrust::make_tuple(
+    colors, nn_colors, (raft::linkage::KeyValuePair<value_idx, value_t> *)kvp));
   auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices));
 
   // get all the colors in contiguous locations so we can map them to warps.