TSP solver bug fix #1480

Merged · 16 commits · Mar 26, 2021
2 changes: 1 addition & 1 deletion cpp/include/algorithms.hpp
@@ -218,7 +218,7 @@ void force_atlas2(GraphCOOView<vertex_t, edge_t, weight_t> &graph,
* @param[out] route Device array containing the returned route.
*
*/
float traveling_salesperson(raft::handle_t &handle,
float traveling_salesperson(raft::handle_t const &handle,
int const *vtx_ptr,
float const *x_pos,
float const *y_pos,
202 changes: 109 additions & 93 deletions cpp/src/traversal/tsp.cu
@@ -17,13 +17,15 @@
#include <numeric>
#include <raft/spatial/knn/knn.hpp>

#include <utilities/high_res_timer.hpp>

#include "tsp.hpp"
#include "tsp_solver.hpp"

namespace cugraph {
namespace detail {

TSP::TSP(raft::handle_t &handle,
TSP::TSP(raft::handle_t const &handle,
int const *vtx_ptr,
float const *x_pos,
float const *y_pos,
@@ -50,139 +52,153 @@ TSP::TSP(raft::handle_t &handle,
max_threads_(handle_.get_device_properties().maxThreadsPerBlock),
warp_size_(handle_.get_device_properties().warpSize),
sm_count_(handle_.get_device_properties().multiProcessorCount),
restart_batch_(4096)
restart_batch_(8192),
neighbors_vec_((k_ + 1) * nodes_, stream_),
work_vec_(restart_batch_ * ((4 * nodes_ + 3 + warp_size_ - 1) / warp_size_ * warp_size_),
stream_),
best_x_pos_vec_(1, stream_),
best_y_pos_vec_(1, stream_),
best_route_vec_(1, stream_)
{
allocate();
setup();
}

void TSP::allocate()
void TSP::setup()
{
// Scalars
mylock_ = mylock_scalar_.data();
best_tour_ = best_tour_scalar_.data();
climbs_ = climbs_scalar_.data();
mylock_ = mylock_scalar_.data();

// Vectors
neighbors_vec_.resize((k_ + 1) * nodes_);
neighbors_ = neighbors_vec_.data();
// Pre-allocate workspace for the climbs: each block needs a separate permutation space and
// search buffer. We allocate a work buffer that stores the computed distances, px, py and the
// route, aligned on the warp size (see the sketch after this function).
work_vec_.resize(sizeof(float) * restart_batch_ *
((4 * nodes_ + 3 + warp_size_ - 1) / warp_size_ * warp_size_));
work_ = work_vec_.data();

results_.best_x_pos = best_x_pos_vec_.data();
results_.best_y_pos = best_y_pos_vec_.data();
results_.best_route = best_route_vec_.data();
results_.best_cost = best_cost_scalar_.data();
}
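
The comment above describes the `work_vec_` layout: each block needs `4 * nodes + 3` floats (distances, px, py and the route), rounded up to a multiple of the warp size. A minimal host-side sketch of that rounding, with `padded_work_size` as a hypothetical helper name:

```cpp
#include <cstddef>
#include <iostream>

// Hypothetical helper mirroring the work_vec_ sizing above: round the
// per-block requirement (4 * nodes + 3 floats) up to a multiple of the
// warp size, then give each of the restart_batch blocks its own slice.
std::size_t padded_work_size(std::size_t restart_batch, std::size_t nodes, std::size_t warp_size)
{
  std::size_t per_block = (4 * nodes + 3 + warp_size - 1) / warp_size * warp_size;
  return restart_batch * per_block;
}

int main()
{
  // e.g. 52 nodes, warp size 32: 4 * 52 + 3 = 211 floats, padded to 224.
  std::cout << padded_work_size(8192, 52, 32) << " floats\n";  // 8192 * 224
}
```
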

// Pointers
neighbors_ = neighbors_vec_.data().get();
work_ = work_vec_.data().get();
void TSP::reset_batch()
{
mylock_scalar_.set_value_zero(stream_);
auto const max{std::numeric_limits<int>::max()};
best_cost_scalar_.set_value(max, stream_);
}

void TSP::get_initial_solution(int const batch)
{
if (!beam_search_) {
random_init<<<restart_batch_, best_thread_num_>>>(
work_, x_pos_, y_pos_, vtx_ptr_, nstart_, nodes_, batch, restart_batch_);
CHECK_CUDA(stream_);
} else {
knn_init<<<restart_batch_, best_thread_num_>>>(
work_, x_pos_, y_pos_, vtx_ptr_, neighbors_, nstart_, nodes_, k_, batch, restart_batch_);
CHECK_CUDA(stream_);
}
}
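
When beam search is off, each restart starts from a random tour built on the device by the `random_init` kernel. The snippet below is only a host-side analogue of that idea, not the kernel itself; `random_tour` is a hypothetical name:

```cpp
#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

// Host-side analogue of a random initial solution: a random permutation of
// the node indices, one per restart. The actual random_init kernel builds
// these directly on the device; this is only a conceptual sketch.
std::vector<int> random_tour(int nodes, std::mt19937 &rng)
{
  std::vector<int> tour(nodes);
  std::iota(tour.begin(), tour.end(), 0);  // 0, 1, ..., nodes - 1
  std::shuffle(tour.begin(), tour.end(), rng);
  return tour;
}
```
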

float TSP::compute()
{
float valid_coo_dist = 0.f;
float final_cost = 0.f;
int num_restart_batches = (restarts_ + restart_batch_ - 1) / restart_batch_;
int restart_resid = restarts_ - (num_restart_batches - 1) * restart_batch_;
int global_best = INT_MAX;
float *soln = nullptr;
int *route_sol = nullptr;
int global_best = std::numeric_limits<int>::max();
int best = 0;

std::vector<float> h_x_pos;
std::vector<float> h_y_pos;
std::vector<int> h_route;
h_x_pos.reserve(nodes_ + 1);
h_y_pos.reserve(nodes_ + 1);

// Stats
int n_timers = 3;
long total_climbs = 0;
std::vector<float> h_times;
struct timeval starttime, endtime;

// KNN call
knn();
h_route.reserve(nodes_);
std::vector<float *> addr_best_x_pos(1);
std::vector<float *> addr_best_y_pos(1);
std::vector<int *> addr_best_route(1);
HighResTimer hr_timer;
auto create_timer = [&hr_timer, this](char const *name) {
return VerboseTimer(name, hr_timer, verbose_);
};

if (verbose_) {
std::cout << "Doing " << num_restart_batches - 1 << " batches of size " << restart_batch_
std::cout << "Doing " << num_restart_batches << " batches of size " << restart_batch_
<< ", with " << restart_resid << " tail\n";
std::cout << "configuration: " << nodes_ << " nodes, " << restarts_ << " restart\n";
std::cout << "optimizing graph with kswap = " << kswaps << "\n";
}

// Tell the cache how we want it to behave
cudaFuncSetCacheConfig(search_solution, cudaFuncCachePreferEqual);
best_thread_num_ = best_thread_count(nodes_, max_threads_, sm_count_, warp_size_);

int threads = best_thread_count(nodes_, max_threads_, sm_count_, warp_size_);
if (verbose_) std::cout << "Calculated best thread number = " << threads << "\n";
if (verbose_) std::cout << "Calculated best thread number = " << best_thread_num_ << "\n";

rmm::device_vector<float> times(n_timers * threads + n_timers);
h_times.reserve(n_timers * threads + n_timers);
if (beam_search_) {
auto timer = create_timer("knn");
knn();
}

gettimeofday(&starttime, NULL);
for (int b = 0; b < num_restart_batches; ++b) {
reset<<<1, 1, 0, stream_>>>(mylock_, best_tour_, climbs_);
CHECK_CUDA(stream_);
for (auto batch = 0; batch < num_restart_batches; ++batch) {
reset_batch();
if (batch == num_restart_batches - 1) restart_batch_ = restart_resid;

if (b == num_restart_batches - 1) restart_batch_ = restart_resid;

search_solution<<<restart_batch_, threads, sizeof(int) * threads, stream_>>>(mylock_,
best_tour_,
vtx_ptr_,
beam_search_,
k_,
nodes_,
neighbors_,
x_pos_,
y_pos_,
work_,
nstart_,
times.data().get(),
climbs_,
threads);
{
auto timer = create_timer("initial_sol");
get_initial_solution(batch);
}

CHECK_CUDA(stream_);
cudaDeviceSynchronize();
{
auto timer = create_timer("search_sol");
search_solution<<<restart_batch_,
best_thread_num_,
sizeof(int) * best_thread_num_,
stream_>>>(
results_, mylock_, vtx_ptr_, beam_search_, k_, nodes_, x_pos_, y_pos_, work_, nstart_);
CHECK_CUDA(stream_);
}

{
auto timer = create_timer("optimal_tour");
get_optimal_tour<<<restart_batch_,
best_thread_num_,
sizeof(int) * best_thread_num_,
stream_>>>(results_, mylock_, work_, nodes_);
CHECK_CUDA(stream_);
}

CUDA_TRY(cudaMemcpy(&best, best_tour_, sizeof(int), cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
best = best_cost_scalar_.value(stream_);

if (verbose_) std::cout << "Best reported by kernel = " << best << "\n";

if (best < global_best) {
global_best = best;
CUDA_TRY(cudaMemcpyFromSymbol(&soln, best_soln, sizeof(void *)));
cudaDeviceSynchronize();
CUDA_TRY(cudaMemcpyFromSymbol(&route_sol, best_route, sizeof(void *)));
cudaDeviceSynchronize();

raft::update_host(addr_best_x_pos.data(), results_.best_x_pos, 1, stream_);
raft::update_host(addr_best_y_pos.data(), results_.best_y_pos, 1, stream_);
raft::update_host(addr_best_route.data(), results_.best_route, 1, stream_);
CUDA_TRY(cudaStreamSynchronize(stream_));

raft::copy(h_x_pos.data(), addr_best_x_pos[0], nodes_ + 1, stream_);
raft::copy(h_y_pos.data(), addr_best_y_pos[0], nodes_ + 1, stream_);
raft::copy(h_route.data(), addr_best_route[0], nodes_, stream_);
raft::copy(route_, addr_best_route[0], nodes_, stream_);
CHECK_CUDA(stream_);
}
total_climbs += climbs_scalar_.value(stream_);
}
gettimeofday(&endtime, NULL);
double runtime =
endtime.tv_sec + endtime.tv_usec / 1e6 - starttime.tv_sec - starttime.tv_usec / 1e6;
long long moves = 1LL * total_climbs * (nodes_ - 2) * (nodes_ - 1) / 2;

raft::copy(route_, route_sol, nodes_, stream_);

CUDA_TRY(cudaMemcpy(h_x_pos.data(), soln, sizeof(float) * (nodes_ + 1), cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
CUDA_TRY(cudaMemcpy(
h_y_pos.data(), soln + nodes_ + 1, sizeof(float) * (nodes_ + 1), cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();

for (int i = 0; i < nodes_; ++i) {
if (verbose_) { std::cout << h_x_pos[i] << " " << h_y_pos[i] << "\n"; }
valid_coo_dist += euclidean_dist(h_x_pos.data(), h_y_pos.data(), i, i + 1);
}

CUDA_TRY(cudaMemcpy(h_times.data(),
times.data().get(),
sizeof(float) * n_timers * threads + n_timers,
cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
for (auto i = 0; i < nodes_; ++i) {
if (verbose_) { std::cout << h_route[i] << ": " << h_x_pos[i] << " " << h_y_pos[i] << "\n"; }
final_cost += euclidean_dist(h_x_pos.data(), h_y_pos.data(), i, i + 1);
}

if (verbose_) {
std::cout << "Search runtime = " << runtime << ", " << moves * 1e-9 / runtime << " Gmoves/s\n";
hr_timer.display(std::cout);
std::cout << "Optimized tour length = " << global_best << "\n";
print_times(h_times, n_timers, handle_.get_device(), threads);
}

return valid_coo_dist;
return final_cost;
}
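
The batch bookkeeping at the top of `compute()` is plain ceiling division: the restarts are processed in full batches of `restart_batch_`, and the last batch is shrunk to the residual. A worked example with hypothetical numbers:

```cpp
#include <iostream>

int main()
{
  // Hypothetical request: 10000 restarts processed in batches of 8192.
  int restarts      = 10000;
  int restart_batch = 8192;
  // Ceiling division gives the batch count; the residual is the size of
  // the last (possibly partial) batch, exactly as in compute().
  int num_restart_batches = (restarts + restart_batch - 1) / restart_batch;        // 2
  int restart_resid       = restarts - (num_restart_batches - 1) * restart_batch;  // 1808
  std::cout << num_restart_batches << " batches, tail of " << restart_resid << "\n";
}
```
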

void TSP::knn()
@@ -192,17 +208,17 @@
int dim = 2;
bool row_major_order = false;

rmm::device_vector<float> input(nodes_ * dim);
float *input_ptr = input.data().get();
rmm::device_uvector<float> input(nodes_ * dim, stream_);
float *input_ptr = input.data();
raft::copy(input_ptr, x_pos_, nodes_, stream_);
raft::copy(input_ptr + nodes_, y_pos_, nodes_, stream_);

rmm::device_vector<float> search_data(nodes_ * dim);
float *search_data_ptr = search_data.data().get();
rmm::device_uvector<float> search_data(nodes_ * dim, stream_);
float *search_data_ptr = search_data.data();
raft::copy(search_data_ptr, input_ptr, nodes_ * dim, stream_);

rmm::device_vector<float> distances(nodes_ * (k_ + 1));
float *distances_ptr = distances.data().get();
rmm::device_uvector<float> distances(nodes_ * (k_ + 1), stream_);
float *distances_ptr = distances.data();

std::vector<float *> input_vec;
std::vector<int> sizes_vec;
@@ -226,7 +242,7 @@
}
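
The two `raft::copy` calls in `knn()` pack the 2-D points column-major (`row_major_order = false`): all x coordinates first, then all y coordinates. A host-side sketch of the same layout, with `pack_columns` as a hypothetical helper:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Hypothetical host-side illustration of the column-major layout fed to the
// kNN call (row_major_order = false): column 0 holds all x coordinates,
// column 1 all y coordinates, matching the two raft::copy calls in knn().
std::vector<float> pack_columns(std::vector<float> const &x, std::vector<float> const &y)
{
  assert(x.size() == y.size());
  std::vector<float> input(2 * x.size());
  std::copy(x.begin(), x.end(), input.begin());             // column 0: x positions
  std::copy(y.begin(), y.end(), input.begin() + x.size());  // column 1: y positions
  return input;
}
```
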
} // namespace detail

float traveling_salesperson(raft::handle_t &handle,
float traveling_salesperson(raft::handle_t const &handle,
int const *vtx_ptr,
float const *x_pos,
float const *y_pos,