From 1030a49b2796fd5939ef6ba151d2d5546db3eed9 Mon Sep 17 00:00:00 2001
From: Chuck Hastings <45364586+ChuckHastings@users.noreply.github.com>
Date: Wed, 16 Jun 2021 12:36:33 -0400
Subject: [PATCH] Fea hungarian expose precision (#1673)

Closes #1645
Closes #1646

Expose the precision parameter (epsilon in the Date/Nagi implementation) of the
Hungarian algorithm so that it can be controlled by the user. Add support for
rectangular matrices.

Will be enabled for CI after https://github.com/rapidsai/raft/pull/275 is merged.

Authors:
  - Chuck Hastings (https://github.com/ChuckHastings)

Approvers:
  - Andrei Schaffer (https://github.com/aschaffer)
  - Brad Rees (https://github.com/BradReesWork)
  - Kumar Aatish (https://github.com/kaatish)

URL: https://github.com/rapidsai/cugraph/pull/1673
---
 cpp/include/cugraph/algorithms.hpp            |  68 +++++-
 cpp/src/linear_assignment/hungarian.cu        | 207 ++++++++++++----
 cpp/tests/linear_assignment/hungarian_test.cu | 227 +++++++++++++-----
 3 files changed, 387 insertions(+), 115 deletions(-)

diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp
index 0b0dd88ce29..64f02d60fb4 100644
--- a/cpp/include/cugraph/algorithms.hpp
+++ b/cpp/include/cugraph/algorithms.hpp
@@ -606,6 +606,40 @@ weight_t hungarian(raft::handle_t const &handle,
                    vertex_t const *workers,
                    vertex_t *assignment);
 
+/**
+ * @brief Compute Hungarian algorithm on a weighted bipartite graph
+ *
+ * The Hungarian algorithm computes an assignment of "jobs" to "workers". This function accepts
+ * a weighted graph and a vertex list identifying the "workers". The weights in the weighted
+ * graph identify the cost of assigning a particular job to a worker. The algorithm computes
+ * a minimum cost assignment and returns the cost as well as a vector identifying the assignment.
+ *
+ * @throws cugraph::logic_error when an error occurs.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Supported value : int (signed, 32-bit)
+ * @tparam edge_t Type of edge identifiers. Supported value : int (signed, 32-bit)
+ * @tparam weight_t Type of edge weights. Supported values : float or double.
+ *
+ * @param[in] handle Library handle (RAFT)
+ * @param[in] graph cuGRAPH COO graph
+ * @param[in] num_workers number of vertices in the worker set
+ * @param[in] workers device pointer to an array of worker vertex ids
+ * @param[out] assignment device pointer to an array to which the assignment will be
+ * written. The array should be num_workers long, and will identify which vertex id (job) is
+ * assigned to that worker
+ * @param[in] precision tolerance used when comparing reduced weights to zero
+ * (epsilon in the Date/Nagi implementation)
+ */
+template <typename vertex_t, typename edge_t, typename weight_t>
+weight_t hungarian(raft::handle_t const &handle,
+                   GraphCOOView<vertex_t, edge_t, weight_t> const &graph,
+                   vertex_t num_workers,
+                   vertex_t const *workers,
+                   vertex_t *assignment,
+                   weight_t precision);
+
 /**
  * @brief Louvain implementation
  *
@@ -1052,6 +1086,38 @@ weight_t hungarian(raft::handle_t const &handle,
                    vertex_t num_columns,
                    vertex_t *assignment);
 
+/**
+ * @brief Compute Hungarian algorithm on a weighted bipartite graph
+ *
+ * The Hungarian algorithm computes an assignment of "jobs" to "workers". This variant accepts
+ * a dense cost matrix, stored in row major order, where each entry identifies the cost of
+ * assigning a particular job to a worker. The algorithm computes
+ * a minimum cost assignment and returns the cost as well as a vector identifying the assignment.
+ * + * @throws cugraph::logic_error when an error occurs. + * + * @tparam vertex_t Type of vertex identifiers. Supported value : int (signed, + * 32-bit) + * @tparam weight_t Type of edge weights. Supported values : float or double. + * + * @param[in] handle Library handle (RAFT). If a communicator is set in the handle, + * @param[in] costs pointer to array of costs, stored in row major order + * @param[in] num_rows number of rows in dense matrix + * @param[in] num_cols number of cols in dense matrix + * @param[out] assignment device pointer to an array to which the assignment will be + * written. The array should be num_cols long, and will identify + * which vertex id (job) is assigned to that worker + * @param[in] precision parameter to define precision of comparisons + * in reducing weights to zero. + */ +template +weight_t hungarian(raft::handle_t const &handle, + weight_t const *costs, + vertex_t num_rows, + vertex_t num_columns, + vertex_t *assignment, + weight_t precision); + } // namespace dense namespace experimental { @@ -1325,4 +1391,4 @@ void weakly_connected_components( bool do_expensive_check = false); } // namespace experimental -} // namespace cugraph \ No newline at end of file +} // namespace cugraph diff --git a/cpp/src/linear_assignment/hungarian.cu b/cpp/src/linear_assignment/hungarian.cu index dfa1e43edad..77709d1e936 100644 --- a/cpp/src/linear_assignment/hungarian.cu +++ b/cpp/src/linear_assignment/hungarian.cu @@ -13,18 +13,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include - -#include #include +#include + +#include + +#include +#include +#include #include #include -#include - -#include +#include +#include //#define TIMING @@ -35,28 +37,82 @@ namespace cugraph { namespace detail { +template +weight_t default_precision() +{ + return 0; +} + +template <> +float default_precision() +{ + return float{1e-6}; +} + +template <> +double default_precision() +{ + return double{1e-6}; +} + template weight_t hungarian(raft::handle_t const &handle, index_t num_rows, index_t num_cols, weight_t const *d_original_cost, index_t *d_assignment, - cudaStream_t stream) + weight_t precision) { - // - // TODO: Can Date/Nagi implementation in raft handle rectangular matrices? - // - CUGRAPH_EXPECTS(num_rows == num_cols, "Current implementation only supports square matrices"); - - rmm::device_vector col_assignments_v(num_rows); - - // Create an instance of LinearAssignmentProblem using problem size, number of subproblems - raft::lap::LinearAssignmentProblem lpx(handle, num_rows, 1); - - // Solve LAP(s) for given cost matrix - lpx.solve(d_original_cost, d_assignment, col_assignments_v.data().get()); - - return lpx.getPrimalObjectiveValue(0); + if (num_rows == num_cols) { + rmm::device_uvector col_assignments_v(num_rows, handle.get_stream_view()); + + // Create an instance of LinearAssignmentProblem using problem size, number of subproblems + raft::lap::LinearAssignmentProblem lpx(handle, num_rows, 1, precision); + + // Solve LAP(s) for given cost matrix + lpx.solve(d_original_cost, d_assignment, col_assignments_v.data()); + + return lpx.getPrimalObjectiveValue(0); + } else { + // + // Create a square matrix, copy d_original_cost into it. 
+ // Fill the extra rows/columns with max(d_original_cost) + // + index_t n = std::max(num_rows, num_cols); + weight_t max_cost = thrust::reduce(rmm::exec_policy(handle.get_stream_view()), + d_original_cost, + d_original_cost + (num_rows * num_cols), + weight_t{0}, + thrust::maximum()); + + rmm::device_uvector tmp_cost(n * n, handle.get_stream_view()); + rmm::device_uvector tmp_row_assignment_v(n, handle.get_stream_view()); + rmm::device_uvector tmp_col_assignment_v(n, handle.get_stream_view()); + + thrust::transform(rmm::exec_policy(handle.get_stream_view()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(n * n), + tmp_cost.begin(), + [max_cost, d_original_cost, n, num_rows, num_cols] __device__(index_t i) { + index_t row = i / n; + index_t col = i % n; + + return ((row < num_rows) && (col < num_cols)) + ? d_original_cost[row * num_cols + col] + : max_cost; + }); + + raft::lap::LinearAssignmentProblem lpx(handle, n, 1, precision); + + // Solve LAP(s) for given cost matrix + lpx.solve(tmp_cost.begin(), tmp_row_assignment_v.begin(), tmp_col_assignment_v.begin()); + + weight_t tmp_objective_value = lpx.getPrimalObjectiveValue(0); + + raft::copy(d_assignment, tmp_row_assignment_v.begin(), num_rows, handle.get_stream()); + + return tmp_objective_value - max_cost * std::abs(num_rows - num_cols); + } } template @@ -65,7 +121,7 @@ weight_t hungarian_sparse(raft::handle_t const &handle, vertex_t num_workers, vertex_t const *workers, vertex_t *assignment, - cudaStream_t stream) + weight_t precision) { CUGRAPH_EXPECTS(assignment != nullptr, "Invalid input argument: assignment pointer is NULL"); CUGRAPH_EXPECTS(graph.edge_data != nullptr, @@ -86,15 +142,16 @@ weight_t hungarian_sparse(raft::handle_t const &handle, vertex_t matrix_dimension = std::max(num_rows, num_cols); - rmm::device_vector cost_v(matrix_dimension * matrix_dimension); - rmm::device_vector tasks_v(num_cols); - rmm::device_vector temp_tasks_v(graph.number_of_vertices); - rmm::device_vector temp_workers_v(graph.number_of_vertices); + rmm::device_uvector cost_v(matrix_dimension * matrix_dimension, + handle.get_stream_view()); + rmm::device_uvector tasks_v(num_cols, handle.get_stream_view()); + rmm::device_uvector temp_tasks_v(graph.number_of_vertices, handle.get_stream_view()); + rmm::device_uvector temp_workers_v(graph.number_of_vertices, handle.get_stream_view()); - weight_t *d_cost = cost_v.data().get(); - vertex_t *d_tasks = tasks_v.data().get(); - vertex_t *d_temp_tasks = temp_tasks_v.data().get(); - vertex_t *d_temp_workers = temp_workers_v.data().get(); + weight_t *d_cost = cost_v.data(); + vertex_t *d_tasks = tasks_v.data(); + vertex_t *d_temp_tasks = temp_tasks_v.data(); + vertex_t *d_temp_workers = temp_workers_v.data(); vertex_t *d_src_indices = graph.src_indices; vertex_t *d_dst_indices = graph.dst_indices; weight_t *d_edge_data = graph.edge_data; @@ -103,46 +160,50 @@ weight_t hungarian_sparse(raft::handle_t const &handle, // Renumber vertices internally. 
Workers will become // rows, tasks will become columns // - thrust::sequence(rmm::exec_policy(stream)->on(stream), temp_tasks_v.begin(), temp_tasks_v.end()); + thrust::sequence( + rmm::exec_policy(handle.get_stream_view()), temp_tasks_v.begin(), temp_tasks_v.end()); - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(handle.get_stream_view()), workers, workers + num_workers, [d_temp_tasks] __device__(vertex_t v) { d_temp_tasks[v] = -1; }); - auto temp_end = thrust::copy_if(rmm::exec_policy(stream)->on(stream), + auto temp_end = thrust::copy_if(rmm::exec_policy(handle.get_stream_view()), temp_tasks_v.begin(), temp_tasks_v.end(), d_tasks, [] __device__(vertex_t v) { return v >= 0; }); vertex_t size = thrust::distance(d_tasks, temp_end); - tasks_v.resize(size); + tasks_v.resize(size, handle.get_stream_view()); // // Now we'll assign costs into the dense array // - thrust::fill(rmm::exec_policy(stream)->on(stream), + thrust::fill(rmm::exec_policy(handle.get_stream_view()), temp_workers_v.begin(), temp_workers_v.end(), vertex_t{-1}); + thrust::fill(rmm::exec_policy(handle.get_stream_view()), + temp_tasks_v.begin(), + temp_tasks_v.end(), + vertex_t{-1}); thrust::fill( - rmm::exec_policy(stream)->on(stream), temp_tasks_v.begin(), temp_tasks_v.end(), vertex_t{-1}); - thrust::fill(rmm::exec_policy(stream)->on(stream), cost_v.begin(), cost_v.end(), weight_t{0}); + rmm::exec_policy(handle.get_stream_view()), cost_v.begin(), cost_v.end(), weight_t{0}); thrust::for_each( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(handle.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), [d_temp_workers, workers] __device__(vertex_t v) { d_temp_workers[workers[v]] = v; }); thrust::for_each( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(handle.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [d_temp_tasks, d_tasks] __device__(vertex_t v) { d_temp_tasks[d_tasks[v]] = v; }); - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(handle.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_edges), [d_temp_workers, @@ -170,11 +231,11 @@ weight_t hungarian_sparse(raft::handle_t const &handle, // temp_assignment_v will hold the assignment in the dense // bipartite matrix numbering // - rmm::device_vector temp_assignment_v(matrix_dimension); - vertex_t *d_temp_assignment = temp_assignment_v.data().get(); + rmm::device_uvector temp_assignment_v(matrix_dimension, handle.get_stream_view()); + vertex_t *d_temp_assignment = temp_assignment_v.data(); weight_t min_cost = detail::hungarian( - handle, matrix_dimension, matrix_dimension, d_cost, d_temp_assignment, stream); + handle, matrix_dimension, matrix_dimension, d_cost, d_temp_assignment, precision); #ifdef TIMING hr_timer.stop(); @@ -185,7 +246,7 @@ weight_t hungarian_sparse(raft::handle_t const &handle, // // Translate the assignment back to the original vertex ids // - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(handle.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), [d_tasks, d_temp_assignment, assignment] __device__(vertex_t id) { @@ -210,17 +271,49 @@ weight_t hungarian(raft::handle_t const &handle, vertex_t const *workers, vertex_t *assignment) { - cudaStream_t stream{0}; + return detail::hungarian_sparse( + handle, graph, num_workers, 
workers, assignment, detail::default_precision()); +} - return detail::hungarian_sparse(handle, graph, num_workers, workers, assignment, stream); +template +weight_t hungarian(raft::handle_t const &handle, + GraphCOOView const &graph, + vertex_t num_workers, + vertex_t const *workers, + vertex_t *assignment, + weight_t precision) +{ + return detail::hungarian_sparse(handle, graph, num_workers, workers, assignment, precision); } +template int32_t hungarian( + raft::handle_t const &, + GraphCOOView const &, + int32_t, + int32_t const *, + int32_t *, + int32_t); + +template float hungarian(raft::handle_t const &, + GraphCOOView const &, + int32_t, + int32_t const *, + int32_t *, + float); +template double hungarian(raft::handle_t const &, + GraphCOOView const &, + int32_t, + int32_t const *, + int32_t *, + double); + template int32_t hungarian( raft::handle_t const &, GraphCOOView const &, int32_t, int32_t const *, int32_t *); + template float hungarian(raft::handle_t const &, GraphCOOView const &, int32_t, @@ -241,9 +334,19 @@ weight_t hungarian(raft::handle_t const &handle, index_t num_cols, index_t *assignment) { - cudaStream_t stream{0}; + return detail::hungarian( + handle, num_rows, num_cols, costs, assignment, detail::default_precision()); +} - return detail::hungarian(handle, num_rows, num_cols, costs, assignment, stream); +template +weight_t hungarian(raft::handle_t const &handle, + weight_t const *costs, + index_t num_rows, + index_t num_cols, + index_t *assignment, + weight_t precision) +{ + return detail::hungarian(handle, num_rows, num_cols, costs, assignment, precision); } template int32_t hungarian( @@ -252,6 +355,12 @@ template float hungarian( raft::handle_t const &, float const *, int32_t, int32_t, int32_t *); template double hungarian( raft::handle_t const &, double const *, int32_t, int32_t, int32_t *); +template int32_t hungarian( + raft::handle_t const &, int32_t const *, int32_t, int32_t, int32_t *, int32_t); +template float hungarian( + raft::handle_t const &, float const *, int32_t, int32_t, int32_t *, float); +template double hungarian( + raft::handle_t const &, double const *, int32_t, int32_t, int32_t *, double); } // namespace dense diff --git a/cpp/tests/linear_assignment/hungarian_test.cu b/cpp/tests/linear_assignment/hungarian_test.cu index 282524ffe0d..9698b5c3335 100644 --- a/cpp/tests/linear_assignment/hungarian_test.cu +++ b/cpp/tests/linear_assignment/hungarian_test.cu @@ -9,21 +9,23 @@ * */ -#include "cuda_profiler_api.h" -#include "gtest/gtest.h" - -#include -#include +#include #include #include #include -#include - #include +#include + +#include "cuda_profiler_api.h" +#include "gtest/gtest.h" + +#include +#include + __global__ void setup_generator(curandState *state) { int id = threadIdx.x + blockIdx.x * blockDim.x; @@ -64,29 +66,35 @@ TEST_F(HungarianTest, Bipartite4x4) int32_t workers[] = {0, 1, 2, 3}; - float min_cost = 18.0; - int32_t expected[] = {6, 7, 5, 4}; + float min_cost = 18.0; + std::vector expected({6, 7, 5, 4}); + std::vector assignment({0, 0, 0, 0}); int32_t length = sizeof(src_data) / sizeof(src_data[0]); int32_t length_workers = sizeof(workers) / sizeof(workers[0]); int32_t num_vertices = 1 + std::max(*std::max_element(src_data, src_data + length), *std::max_element(dst_data, dst_data + length)); - rmm::device_vector src_v(src_data, src_data + length); - rmm::device_vector dst_v(dst_data, dst_data + length); - rmm::device_vector cost_v(cost, cost + length); - rmm::device_vector workers_v(workers, workers + length_workers); - 
rmm::device_vector expected_v(expected, expected + length_workers); - rmm::device_vector assignment_v(length_workers); + rmm::device_uvector src_v(length, handle.get_stream_view()); + rmm::device_uvector dst_v(length, handle.get_stream_view()); + rmm::device_uvector cost_v(length, handle.get_stream_view()); + rmm::device_uvector workers_v(length_workers, handle.get_stream_view()); + rmm::device_uvector assignment_v(length_workers, handle.get_stream_view()); + + raft::update_device(src_v.begin(), src_data, length, handle.get_stream()); + raft::update_device(dst_v.begin(), dst_data, length, handle.get_stream()); + raft::update_device(cost_v.begin(), cost, length, handle.get_stream()); + raft::update_device(workers_v.begin(), workers, length_workers, handle.get_stream()); cugraph::GraphCOOView g( - src_v.data().get(), dst_v.data().get(), cost_v.data().get(), num_vertices, length); + src_v.data(), dst_v.data(), cost_v.data(), num_vertices, length); - float r = cugraph::hungarian( - handle, g, length_workers, workers_v.data().get(), assignment_v.data().get()); + float r = cugraph::hungarian(handle, g, length_workers, workers_v.data(), assignment_v.data()); + + raft::update_host(assignment.data(), assignment_v.begin(), length_workers, handle.get_stream()); EXPECT_EQ(min_cost, r); - EXPECT_EQ(expected_v, assignment_v); + EXPECT_EQ(assignment, expected); } TEST_F(HungarianTest, Bipartite5x5) @@ -100,29 +108,36 @@ TEST_F(HungarianTest, Bipartite5x5) int32_t workers[] = {0, 1, 2, 3, 4}; - float min_cost = 51.0; - int32_t expected[] = {5, 7, 8, 6, 9}; + float min_cost = 51.0; + std::vector expected({5, 7, 8, 6, 9}); + std::vector assignment({0, 0, 0, 0, 0}); int32_t length = sizeof(src_data) / sizeof(src_data[0]); int32_t length_workers = sizeof(workers) / sizeof(workers[0]); int32_t num_vertices = 1 + std::max(*std::max_element(src_data, src_data + length), *std::max_element(dst_data, dst_data + length)); - rmm::device_vector src_v(src_data, src_data + length); - rmm::device_vector dst_v(dst_data, dst_data + length); - rmm::device_vector cost_v(cost, cost + length); - rmm::device_vector workers_v(workers, workers + length_workers); - rmm::device_vector expected_v(expected, expected + length_workers); - rmm::device_vector assignment_v(length_workers); + rmm::device_uvector src_v(length, handle.get_stream_view()); + rmm::device_uvector dst_v(length, handle.get_stream_view()); + rmm::device_uvector cost_v(length, handle.get_stream_view()); + rmm::device_uvector workers_v(length_workers, handle.get_stream_view()); + rmm::device_uvector assignment_v(length_workers, handle.get_stream_view()); + + raft::update_device(src_v.begin(), src_data, length, handle.get_stream()); + raft::update_device(dst_v.begin(), dst_data, length, handle.get_stream()); + raft::update_device(cost_v.begin(), cost, length, handle.get_stream()); + raft::update_device(workers_v.begin(), workers, length_workers, handle.get_stream()); cugraph::GraphCOOView g( - src_v.data().get(), dst_v.data().get(), cost_v.data().get(), num_vertices, length); + src_v.data(), dst_v.data(), cost_v.data(), num_vertices, length); - float r = cugraph::hungarian( - handle, g, length_workers, workers_v.data().get(), assignment_v.data().get()); + float r = cugraph::hungarian(handle, g, length_workers, workers_v.data(), assignment_v.data()); + + raft::update_host( + assignment.data(), assignment_v.begin(), assignment_v.size(), handle.get_stream()); EXPECT_EQ(min_cost, r); - EXPECT_EQ(expected_v, assignment_v); + EXPECT_EQ(assignment, expected); } 
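// Illustrative sketch only -- not part of this patch. The tests above drive the
// default-precision overload; the new overload that takes an explicit precision
// (declared in algorithms.hpp above) could be exercised the same way. The test name,
// the 2x2 fixture data, the epsilon value, and the explicit <int32_t, int32_t, float>
// template arguments below are assumptions made for this example, and the sketch
// relies on the includes and fixture already used in this test file.
TEST_F(HungarianTest, Bipartite2x2ExplicitPrecisionSketch)
{
  raft::handle_t handle{};

  // Workers are vertices {0, 1}, jobs are vertices {2, 3}
  int32_t src_data[] = {0, 0, 1, 1};
  int32_t dst_data[] = {2, 3, 2, 3};
  float cost[]       = {1, 4, 2, 8};
  int32_t workers[]  = {0, 1};

  float min_cost = 6.0;  // worker 0 -> job 3 (4) plus worker 1 -> job 2 (2)
  std::vector<int32_t> expected({3, 2});
  std::vector<int32_t> assignment({0, 0});

  int32_t length         = 4;
  int32_t length_workers = 2;
  int32_t num_vertices   = 4;  // vertices 0..3

  rmm::device_uvector<int32_t> src_v(length, handle.get_stream_view());
  rmm::device_uvector<int32_t> dst_v(length, handle.get_stream_view());
  rmm::device_uvector<float> cost_v(length, handle.get_stream_view());
  rmm::device_uvector<int32_t> workers_v(length_workers, handle.get_stream_view());
  rmm::device_uvector<int32_t> assignment_v(length_workers, handle.get_stream_view());

  raft::update_device(src_v.begin(), src_data, length, handle.get_stream());
  raft::update_device(dst_v.begin(), dst_data, length, handle.get_stream());
  raft::update_device(cost_v.begin(), cost, length, handle.get_stream());
  raft::update_device(workers_v.begin(), workers, length_workers, handle.get_stream());

  cugraph::GraphCOOView<int32_t, int32_t, float> g(
    src_v.data(), dst_v.data(), cost_v.data(), num_vertices, length);

  // Pass an explicit epsilon instead of relying on detail::default_precision<float>()
  float r = cugraph::hungarian(
    handle, g, length_workers, workers_v.data(), assignment_v.data(), float{1e-4});

  raft::update_host(assignment.data(), assignment_v.data(), length_workers, handle.get_stream());

  EXPECT_EQ(min_cost, r);
  EXPECT_EQ(assignment, expected);
}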
TEST_F(HungarianTest, Bipartite4x4_multiple_answers) @@ -135,40 +150,44 @@ TEST_F(HungarianTest, Bipartite4x4_multiple_answers) int32_t workers[] = {0, 1, 2, 3}; - float min_cost = 13.0; - int32_t expected1[] = {7, 6, 5, 4}; - int32_t expected2[] = {6, 7, 5, 4}; - int32_t expected3[] = {7, 6, 4, 5}; - int32_t expected4[] = {6, 7, 4, 5}; + float min_cost = 13.0; + + std::vector expected1({7, 6, 5, 4}); + std::vector expected2({6, 7, 5, 4}); + std::vector expected3({7, 6, 4, 5}); + std::vector expected4({6, 7, 4, 5}); + std::vector assignment({0, 0, 0, 0}); int32_t length = sizeof(src_data) / sizeof(src_data[0]); int32_t length_workers = sizeof(workers) / sizeof(workers[0]); int32_t num_vertices = 1 + std::max(*std::max_element(src_data, src_data + length), *std::max_element(dst_data, dst_data + length)); - rmm::device_vector src_v(src_data, src_data + length); - rmm::device_vector dst_v(dst_data, dst_data + length); - rmm::device_vector cost_v(cost, cost + length); - rmm::device_vector workers_v(workers, workers + length_workers); - rmm::device_vector assignment_v(length_workers); + rmm::device_uvector src_v(length, handle.get_stream_view()); + rmm::device_uvector dst_v(length, handle.get_stream_view()); + rmm::device_uvector cost_v(length, handle.get_stream_view()); + rmm::device_uvector workers_v(length_workers, handle.get_stream_view()); + rmm::device_uvector assignment_v(length_workers, handle.get_stream_view()); - rmm::device_vector expected1_v(expected1, expected1 + length_workers); - rmm::device_vector expected2_v(expected2, expected2 + length_workers); - rmm::device_vector expected3_v(expected3, expected3 + length_workers); - rmm::device_vector expected4_v(expected4, expected4 + length_workers); + raft::update_device(src_v.begin(), src_data, length, handle.get_stream()); + raft::update_device(dst_v.begin(), dst_data, length, handle.get_stream()); + raft::update_device(cost_v.begin(), cost, length, handle.get_stream()); + raft::update_device(workers_v.begin(), workers, length_workers, handle.get_stream()); cugraph::GraphCOOView g( - src_v.data().get(), dst_v.data().get(), cost_v.data().get(), num_vertices, length); + src_v.data(), dst_v.data(), cost_v.data(), num_vertices, length); - float r = cugraph::hungarian( - handle, g, length_workers, workers_v.data().get(), assignment_v.data().get()); + float r = cugraph::hungarian(handle, g, length_workers, workers_v.data(), assignment_v.data()); EXPECT_EQ(min_cost, r); - EXPECT_TRUE(thrust::equal(assignment_v.begin(), assignment_v.end(), expected1_v.begin()) || - thrust::equal(assignment_v.begin(), assignment_v.end(), expected2_v.begin()) || - thrust::equal(assignment_v.begin(), assignment_v.end(), expected3_v.begin()) || - thrust::equal(assignment_v.begin(), assignment_v.end(), expected4_v.begin())); + raft::update_host( + assignment.data(), assignment_v.data(), assignment_v.size(), handle.get_stream()); + + EXPECT_TRUE(std::equal(assignment.begin(), assignment.end(), expected1.begin()) || + std::equal(assignment.begin(), assignment.end(), expected2.begin()) || + std::equal(assignment.begin(), assignment.end(), expected3.begin()) || + std::equal(assignment.begin(), assignment.end(), expected4.begin())); } TEST_F(HungarianTest, May29InfLoop) @@ -181,13 +200,82 @@ TEST_F(HungarianTest, May29InfLoop) float min_cost = 2; - rmm::device_vector cost_v(cost, cost + num_rows * num_cols); - rmm::device_vector assignment_v(num_rows); + std::vector expected({3, 2, 1, 0}); + std::vector assignment({0, 0, 0, 0}); + + rmm::device_uvector cost_v(num_rows 
* num_cols, handle.get_stream_view()); + rmm::device_uvector assignment_v(num_rows, handle.get_stream_view()); + + raft::update_device(cost_v.begin(), cost, num_rows * num_cols, handle.get_stream()); - float r = cugraph::dense::hungarian( - handle, cost_v.data().get(), num_rows, num_cols, assignment_v.data().get()); + float r = + cugraph::dense::hungarian(handle, cost_v.data(), num_rows, num_cols, assignment_v.data()); + + raft::update_host( + assignment.data(), assignment_v.data(), assignment_v.size(), handle.get_stream()); EXPECT_EQ(min_cost, r); + EXPECT_EQ(assignment, expected); +} + +TEST_F(HungarianTest, Dense4x6) +{ + raft::handle_t handle{}; + + int32_t num_rows = 4; + int32_t num_cols = 6; + float cost[] = {0, 16, 1, 0, 90, 100, 33, 45, 0, 4, 90, 100, + 22, 0, 1000, 2000, 90, 100, 2, 0, 3000, 4000, 90, 100}; + + float min_cost = 2; + + std::vector expected({3, 2, 1, 0}); + std::vector assignment({0, 0, 0, 0}); + + rmm::device_uvector cost_v(num_rows * num_cols, handle.get_stream_view()); + rmm::device_uvector assignment_v(num_rows, handle.get_stream_view()); + + raft::update_device(cost_v.begin(), cost, num_rows * num_cols, handle.get_stream()); + + float r = + cugraph::dense::hungarian(handle, cost_v.data(), num_rows, num_cols, assignment_v.data()); + + raft::update_host( + assignment.data(), assignment_v.data(), assignment_v.size(), handle.get_stream()); + + EXPECT_EQ(min_cost, r); + EXPECT_EQ(assignment, expected); +} + +TEST_F(HungarianTest, Dense6x4) +{ + raft::handle_t handle{}; + + int32_t num_rows = 6; + int32_t num_cols = 4; + float cost[] = {0, 16, 1, 0, 33, 45, 0, 4, 90, 100, 110, 120, + 22, 0, 1000, 2000, 90, 100, 110, 120, 2, 0, 3000, 4000}; + + float min_cost = 2; + + std::vector expected1({3, 2, 4, 1, 5, 0}); + std::vector expected2({3, 2, 5, 1, 4, 0}); + std::vector assignment({0, 0, 0, 0, 0, 0}); + + rmm::device_uvector cost_v(num_rows * num_cols, handle.get_stream_view()); + rmm::device_uvector assignment_v(num_rows, handle.get_stream_view()); + + raft::update_device(cost_v.begin(), cost, num_rows * num_cols, handle.get_stream()); + + float r = + cugraph::dense::hungarian(handle, cost_v.data(), num_rows, num_cols, assignment_v.data()); + + raft::update_host( + assignment.data(), assignment_v.data(), assignment_v.size(), handle.get_stream()); + + EXPECT_EQ(min_cost, r); + EXPECT_TRUE(std::equal(assignment.begin(), assignment.end(), expected1.begin()) || + std::equal(assignment.begin(), assignment.end(), expected2.begin())); } TEST_F(HungarianTest, PythonTestFailure) @@ -229,13 +317,22 @@ TEST_F(HungarianTest, PythonTestFailure) float min_cost = 16; - rmm::device_vector cost_v(cost, cost + num_rows * num_cols); - rmm::device_vector assignment_v(num_rows); + std::vector expected({0, 2, 1, 4, 3}); + std::vector assignment({0, 0, 0, 0, 0}); + + rmm::device_uvector cost_v(num_rows * num_cols, handle.get_stream_view()); + rmm::device_uvector assignment_v(num_rows, handle.get_stream_view()); + + raft::update_device(cost_v.begin(), cost, num_rows * num_cols, handle.get_stream()); + + float r = + cugraph::dense::hungarian(handle, cost_v.data(), num_rows, num_cols, assignment_v.data()); - float r = cugraph::dense::hungarian( - handle, cost_v.data().get(), num_rows, num_cols, assignment_v.data().get()); + raft::update_host( + assignment.data(), assignment_v.data(), assignment_v.size(), handle.get_stream()); EXPECT_EQ(min_cost, r); + EXPECT_EQ(assignment, expected); } // FIXME: Need to have tests with nxm (e.g. 
4x5 and 5x4) to test those conditions @@ -249,16 +346,16 @@ void random_test(int32_t num_rows, int32_t num_cols, int32_t upper_bound, int re HighResTimer hr_timer; - rmm::device_vector data_v(num_rows * num_cols); - rmm::device_vector state_vals_v(num_threads); - rmm::device_vector assignment_v(num_rows); + rmm::device_uvector data_v(num_rows * num_cols, handle.get_stream_view()); + rmm::device_uvector state_vals_v(num_threads, handle.get_stream_view()); + rmm::device_uvector assignment_v(num_rows, handle.get_stream_view()); std::vector validate(num_cols); hr_timer.start("initialization"); cudaStream_t stream{0}; - int32_t *d_data = data_v.data().get(); + int32_t *d_data = data_v.data(); //int64_t seed{85}; int64_t seed{time(nullptr)}; @@ -280,7 +377,7 @@ void random_test(int32_t num_rows, int32_t num_cols, int32_t upper_bound, int re for (int i = 0 ; i < repetitions ; ++i) { hr_timer.start("hungarian"); - r = cugraph::hungarian_dense(cost_v.data().get(), num_rows, num_cols, assignment_v.data().get()); + r = cugraph::hungarian_dense(cost_v.data(), num_rows, num_cols, assignment_v.data()); hr_timer.stop(); }
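
For the rectangular case added in hungarian.cu above, detail::hungarian pads the cost matrix out to n x n (n = max(num_rows, num_cols)) using max_cost for the extra cells, solves the square problem, and then subtracts max_cost * |num_rows - num_cols| from the padded objective. The correction works because every perfect matching on the padded square problem must use each dummy row or column exactly once, and each such use costs exactly max_cost regardless of which partner it is paired with, so the padded optimum always exceeds the rectangular optimum by that fixed amount. The host-only brute-force sketch below is illustrative only and not part of the patch; the helper name solve_square, the main() driver, and the 2x3 cost data are invented for the example.

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <limits>
#include <numeric>
#include <vector>

// Brute-force LAP on a square n x n row-major matrix: try every row -> column permutation.
static float solve_square(std::vector<float> const &c, int n)
{
  std::vector<int> perm(n);
  std::iota(perm.begin(), perm.end(), 0);
  float best = std::numeric_limits<float>::max();
  do {
    float total = 0;
    for (int row = 0; row < n; ++row) total += c[row * n + perm[row]];
    best = std::min(best, total);
  } while (std::next_permutation(perm.begin(), perm.end()));
  return best;
}

int main()
{
  // Toy 2x3 rectangular cost matrix, row major; its rectangular optimum is 3.
  int num_rows = 2, num_cols = 3;
  std::vector<float> cost = {4, 1, 3,
                             2, 0, 5};

  // Mirror the padding done in detail::hungarian: n x n, extra cells filled with max_cost.
  int n          = std::max(num_rows, num_cols);
  float max_cost = *std::max_element(cost.begin(), cost.end());

  std::vector<float> padded(n * n, max_cost);
  for (int r = 0; r < num_rows; ++r)
    for (int c = 0; c < num_cols; ++c) padded[r * n + c] = cost[r * num_cols + c];

  float padded_objective = solve_square(padded, n);

  // The one dummy row is forced into the matching at cost max_cost, so subtracting
  // max_cost * |num_rows - num_cols| recovers the rectangular optimum (3 here),
  // matching the correction applied at the end of detail::hungarian in this patch.
  float corrected = padded_objective - max_cost * std::abs(num_rows - num_cols);

  std::printf("padded = %g, corrected = %g\n", padded_objective, corrected);
  return 0;
}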