Retrieve results in struct

rapidsai · rapids-bot · Mar 26, 2021 · Mar 17, 2021 · Mar 18, 2021 · Mar 18, 2021
commit 8ffe628d1725792131237a4b11e99a6a773b3084
@@ -52,14 +52,13 @@ TSP::TSP(raft::handle_t const &handle,
     sm_count_(handle_.get_device_properties().multiProcessorCount),
     restart_batch_(4096)
 {
-  allocate();
+  setup();
 }
 
-void TSP::allocate()
+void TSP::setup()
 {
   // Scalars
   mylock_    = mylock_scalar_.data();
-  best_tour_ = best_tour_scalar_.data();
   climbs_    = climbs_scalar_.data();
 
   // Vectors
@@ -69,30 +68,49 @@ void TSP::allocate()
   // We align it on the warp size.
   work_vec_.resize(restart_batch_ *
                    ((4 * nodes_ + 3 + warp_size_ - 1) / warp_size_ * warp_size_));
+  best_x_pos_vec_.resize(1);
+  best_y_pos_vec_.resize(1);
+  best_route_vec_.resize(1);
 
   // Pointers
   neighbors_ = neighbors_vec_.data().get();
   work_      = work_vec_.data().get();
+
+  // Setup results
+  results_.best_x_pos = best_x_pos_vec_.data().get();
+  results_.best_y_pos = best_y_pos_vec_.data().get();
+  results_.best_route = best_route_vec_.data().get();
+  results_.best_cost = best_cost_scalar_.data();
+}
+
+void TSP::reset_batch() {
+  mylock_scalar_.set_value(0, stream_);
+  best_cost_scalar_.set_value(std::numeric_limits<int>::max(), stream_);
+  climbs_scalar_.set_value(0, stream_);
 }
 
 float TSP::compute()
 {
-  float valid_coo_dist    = 0.f;
+  // Setup
+  float final_cost    = 0.f;
   int num_restart_batches = (restarts_ + restart_batch_ - 1) / restart_batch_;
   int restart_resid       = restarts_ - (num_restart_batches - 1) * restart_batch_;
-  int global_best         = INT_MAX;
-  float *soln             = nullptr;
-  int *route_sol          = nullptr;
+  int global_best         = std::numeric_limits<int>::max();
   int best                = 0;
+
   std::vector<float> h_x_pos;
   std::vector<float> h_y_pos;
+  std::vector<int> h_route;
   h_x_pos.reserve(nodes_ + 1);
   h_y_pos.reserve(nodes_ + 1);
+  h_route.reserve(nodes_);
+
+	std::vector<float*> addr_best_x_pos(1);
+	std::vector<float*> addr_best_y_pos(1);
+	std::vector<int*> addr_best_route(1);
 
   // Stats
-  int n_timers      = 3;
   long total_climbs = 0;
-  std::vector<float> h_times;
   struct timeval starttime, endtime;
 
   // KNN call
@@ -111,18 +129,15 @@ float TSP::compute()
   int threads = best_thread_count(nodes_, max_threads_, sm_count_, warp_size_);
   if (verbose_) std::cout << "Calculated best thread number = " << threads << "\n";
 
-  rmm::device_vector<float> times(n_timers * threads + n_timers);
-  h_times.reserve(n_timers * threads + n_timers);
-
   gettimeofday(&starttime, NULL);
+
   for (int batch = 1; batch <= num_restart_batches; ++batch) {
-    reset<<<1, 1, 0, stream_>>>(mylock_, best_tour_, climbs_);
-    CHECK_CUDA(stream_);
 
+    reset_batch();
     if (batch == num_restart_batches) restart_batch_ = restart_resid;
 
-    search_solution<<<restart_batch_, threads, sizeof(int) * threads, stream_>>>(mylock_,
-                                                                                 best_tour_,
+    search_solution<<<restart_batch_, threads, sizeof(int) * threads, stream_>>>(results_,
+                                                                                 mylock_,
                                                                                  vtx_ptr_,
                                                                                  beam_search_,
                                                                                  k_,
@@ -132,24 +147,29 @@ float TSP::compute()
                                                                                  y_pos_,
                                                                                  work_,
                                                                                  nstart_,
-                                                                                 times.data().get(),
                                                                                  climbs_,
                                                                                  threads,
                                                                                  batch);
 
     CHECK_CUDA(stream_);
     cudaDeviceSynchronize();
 
-    CUDA_TRY(cudaMemcpy(&best, best_tour_, sizeof(int), cudaMemcpyDeviceToHost));
-    cudaDeviceSynchronize();
+    best = best_cost_scalar_.value(stream_);
+
     if (verbose_) std::cout << "Best reported by kernel = " << best << "\n";
 
     if (best < global_best) {
-      global_best = best;
-      CUDA_TRY(cudaMemcpyFromSymbol(&soln, best_soln, sizeof(void *)));
-      cudaDeviceSynchronize();
-      CUDA_TRY(cudaMemcpyFromSymbol(&route_sol, best_route, sizeof(void *)));
+			global_best = best;
+			cudaMemcpy(addr_best_x_pos.data(), results_.best_x_pos, sizeof(float*), cudaMemcpyDeviceToHost);
+			cudaMemcpy(addr_best_y_pos.data(), results_.best_y_pos, sizeof(float*), cudaMemcpyDeviceToHost);
+			cudaMemcpy(addr_best_route.data(), results_.best_route, sizeof(int*), cudaMemcpyDeviceToHost);
+      CHECK_CUDA(stream_);
       cudaDeviceSynchronize();
+
+      raft::copy(h_x_pos.data(), addr_best_x_pos[0], nodes_ + 1, stream_);
+      raft::copy(h_y_pos.data(), addr_best_y_pos[0], nodes_ + 1, stream_);
+      raft::copy(h_route.data(), addr_best_route[0], nodes_, stream_);
+      CHECK_CUDA(stream_);
     }
     total_climbs += climbs_scalar_.value(stream_);
   }
@@ -158,32 +178,17 @@ float TSP::compute()
     endtime.tv_sec + endtime.tv_usec / 1e6 - starttime.tv_sec - starttime.tv_usec / 1e6;
   long long moves = 1LL * total_climbs * (nodes_ - 2) * (nodes_ - 1) / 2;
 
-  raft::copy(route_, route_sol, nodes_, stream_);
-
-  CUDA_TRY(cudaMemcpy(h_x_pos.data(), soln, sizeof(float) * (nodes_ + 1), cudaMemcpyDeviceToHost));
-  cudaDeviceSynchronize();
-  CUDA_TRY(cudaMemcpy(
-    h_y_pos.data(), soln + nodes_ + 1, sizeof(float) * (nodes_ + 1), cudaMemcpyDeviceToHost));
-  cudaDeviceSynchronize();
-
   for (int i = 0; i < nodes_; ++i) {
     if (verbose_) { std::cout << h_x_pos[i] << " " << h_y_pos[i] << "\n"; }
-    valid_coo_dist += euclidean_dist(h_x_pos.data(), h_y_pos.data(), i, i + 1);
+    final_cost += euclidean_dist(h_x_pos.data(), h_y_pos.data(), i, i + 1);
   }
 
-  CUDA_TRY(cudaMemcpy(h_times.data(),
-                      times.data().get(),
-                      sizeof(float) * n_timers * threads + n_timers,
-                      cudaMemcpyDeviceToHost));
-  cudaDeviceSynchronize();
-
   if (verbose_) {
     std::cout << "Search runtime = " << runtime << ", " << moves * 1e-9 / runtime << " Gmoves/s\n";
     std::cout << "Optimized tour length = " << global_best << "\n";
-    print_times(h_times, n_timers, handle_.get_device(), threads);
   }
 
-  return valid_coo_dist;
+  return final_cost;
 }
 
 void TSP::knn()

@@ -25,6 +25,14 @@
 
 namespace cugraph {
 namespace detail {
+
+struct TSPResults{
+  float **best_x_pos;
+  float **best_y_pos;
+  int **best_route;
+  int *best_cost;
+};
+
 class TSP {
  public:
   TSP(raft::handle_t const &handle,
@@ -39,7 +47,8 @@ class TSP {
       bool verbose,
       int *route);
 
-  void allocate();
+  void setup();
+  void reset_batch();
   float compute();
   void knn();
   ~TSP(){};
@@ -69,20 +78,24 @@ class TSP {
 
   // Scalars
   rmm::device_scalar<int> mylock_scalar_;
-  rmm::device_scalar<int> best_tour_scalar_;
+  rmm::device_scalar<int> best_cost_scalar_;
   rmm::device_scalar<int> climbs_scalar_;
 
   int *mylock_;
-  int *best_tour_;
+  int *best_cost_;
   int *climbs_;
 
   // Vectors
   rmm::device_vector<int64_t> neighbors_vec_;
   rmm::device_vector<int> work_vec_;
+  rmm::device_vector<float*> best_x_pos_vec_;
+  rmm::device_vector<float*> best_y_pos_vec_;
+  rmm::device_vector<int*> best_route_vec_;
 
   int64_t *neighbors_;
   int *work_;
   int *work_route_;
+  TSPResults results_;
 };
 }  // namespace detail
 }  // namespace cugraph
@@ -29,18 +29,6 @@
 namespace cugraph {
 namespace detail {
 
-__device__ float *best_soln;
-__device__ int *best_route;
-
-__global__ void reset(int *mylock, int *best_tour, int *climbs)
-{
-  *mylock    = 0;
-  *best_tour = INT_MAX;
-  *climbs    = 0;
-  best_soln  = nullptr;
-  best_route = nullptr;
-}
-
 // random permutation kernel
 __device__ void random_init(float const *posx,
                             float const *posy,
@@ -212,7 +200,6 @@ __device__ void two_opt_search(
   }
 }
 
-// This function being runned for each block
 __device__ void hill_climbing(
   float *px, float *py, int *buf, int *path, int *shbuf, int const nodes, int *climbs)
 {
@@ -340,7 +327,8 @@ __device__ void hill_climbing(
 }
 
 __device__ void get_optimal_tour(
-  int *mylock, int *best_tour, float *px, float *py, int *path, int *shbuf, int const nodes)
+    TSPResults results, int *mylock, float *px, float *py, int *path,
+    int *shbuf, int const nodes)
 {
   // Now find actual length of the last tour, result of the climb
   int term = 0;
@@ -360,20 +348,21 @@ __device__ void get_optimal_tour(
   term = shbuf[0];
 
   if (threadIdx.x == 0) {
-    atomicMin(best_tour, term);
+    atomicMin(results.best_cost, term);
     while (atomicExch(mylock, 1) != 0)
       ;  // acquire
-    if (best_tour[0] == term) {
-      best_soln  = px;
-      best_route = path;
+    if (results.best_cost[0] == term) {
+      results.best_x_pos[0]  = px;
+      results.best_y_pos[0]   = py;
+      results.best_route[0] = path;
     }
     *mylock = 0;  // release
     __threadfence();
   }
 }
 
-__global__ __launch_bounds__(2048, 2) void search_solution(int *mylock,
-                                                           int *best_tour,
+__global__ __launch_bounds__(2048, 2) void search_solution(TSPResults results,
+                                                           int *mylock,
                                                            int const *vtx_ptr,
                                                            bool beam_search,
                                                            int const K,
@@ -383,7 +372,6 @@ __global__ __launch_bounds__(2048, 2) void search_solution(int *mylock,
                                                            float const *posy,
                                                            int *work,
                                                            int const nstart,
-                                                           float *times,
                                                            int *climbs,
                                                            int const threads,
                                                            int const batch)
@@ -393,24 +381,17 @@ __global__ __launch_bounds__(2048, 2) void search_solution(int *mylock,
   float *py = &px[nodes + 1];
   int *path = (int *)(&py[nodes + 1]);
   __shared__ int shbuf[tilesize];
-  clock_t start;
 
-  start = clock64();
   if (!beam_search)
     random_init(posx, posy, vtx_ptr, path, px, py, nstart, nodes, batch);
   else
     knn_init(posx, posy, vtx_ptr, neighbors, buf, path, px, py, nstart, nodes, K, batch);
   __syncthreads();
-  times[threadIdx.x] = clock64() - start;
 
-  start = clock64();
   hill_climbing(px, py, buf, path, shbuf, nodes, climbs);
   __syncthreads();
-  times[threads + threadIdx.x + 1] = clock64() - start;
 
-  start = clock64();
-  get_optimal_tour(mylock, best_tour, px, py, path, shbuf, nodes);
-  times[2 * threads + threadIdx.x + 1] = clock64() - start;
+  get_optimal_tour(results, mylock, px, py, path, shbuf, nodes);
 }
 }  // namespace detail
 }  // namespace cugraph
@@ -31,29 +31,6 @@ __host__ __device__ inline float euclidean_dist(float *px, float *py, int a, int
   return sqrtf((px[a] - px[b]) * (px[a] - px[b]) + (py[a] - py[b]) * (py[a] - py[b]));
 }
 
-static std::vector<std::string> device_func = {"Find First", "Hill Climbing", "Retrieve Path"};
-
-void print_times(std::vector<float> &h_times, int const n_timers, int device, int threads)
-{
-  int clock_rate;
-  cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, device);
-
-  double total = 0;
-  h_times[0] /= (float)clock_rate;
-  total += h_times[0];
-  for (int i = 1; i < n_timers; ++i) {
-    h_times[i * threads + 1] /= (float)clock_rate;
-    total += h_times[i * threads + 1];
-  }
-  std::cout << "Stats: \n";
-  std::cout << device_func[0] << " time: " << h_times[0] * 1e-3 << " "
-            << (h_times[0] / total) * 100.0 << "%\n";
-  for (int i = 1; i < n_timers; ++i) {
-    std::cout << device_func[i] << " time: " << h_times[i * threads + 1] * 1e-3 << " "
-              << (h_times[i * threads + 1] / total) * 100.0 << "%\n";
-  }
-}
-
 // Get maximum number of threads we can run on based on number of nodes,
 // shared memory usage, max threads per block and SM, max blocks for SM and registers per SM.
 int best_thread_count(int nodes, int max_threads, int sm_count, int warp_size)

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
@@ -282,7 +282,7 @@ ConfigureTest(FA2_TEST "${FA2_TEST_SRC}")
 # - TSP  tests --------------------------------------------------------------------------
 
 set(TSP_TEST_SRC
-    "${CMAKE_CURRENT_SOURCE_DIR}/traversal/tsp_test.cu")
+    "${CMAKE_CURRENT_SOURCE_DIR}/traversal/tsp_main.cu")
 
   ConfigureTest(TSP_TEST "${TSP_TEST_SRC}" "")
 

@@ -67,6 +67,8 @@ typedef struct Tsp_Usecase_t {
 } Tsp_Usecase;
 
 static std::vector<Tsp_Usecase_t> euc_2d{
+{"tsplib/datasets/tsp225.tsp", 3916},
+  /*
   {"tsplib/datasets/a280.tsp", 2579},      {"tsplib/datasets/berlin52.tsp", 7542},
   {"tsplib/datasets/bier127.tsp", 118282}, {"tsplib/datasets/ch130.tsp", 6110},
   {"tsplib/datasets/ch150.tsp", 6528},     {"tsplib/datasets/d1291.tsp", 50801},
@@ -96,7 +98,7 @@ static std::vector<Tsp_Usecase_t> euc_2d{
   {"tsplib/datasets/tsp225.tsp", 3916},    {"tsplib/datasets/u1060.tsp", 224094},
   {"tsplib/datasets/u1432.tsp", 152970},   {"tsplib/datasets/u159.tsp", 42080},
   {"tsplib/datasets/u574.tsp", 36905},     {"tsplib/datasets/u724.tsp", 41910},
-  {"tsplib/datasets/vm1084.tsp", 239297},
+  {"tsplib/datasets/vm1084.tsp", 239297},*/
 };
 
 struct Route {
@@ -130,9 +132,10 @@ class Tests_Tsp : public ::testing::TestWithParam<Tsp_Usecase> {
 
     std::cout << "File: " << param.tsp_file.c_str() << "\n";
     int nodes = load_tsp(param.tsp_file.c_str(), &input);
+    std::cout << "Nodes: " << nodes << std::endl;
 
     // Device alloc
-    raft::handle_t handle;
+    raft::handle_t const handle;
     rmm::device_uvector<int> vertices(static_cast<size_t>(nodes), nullptr);
     rmm::device_uvector<int> route(static_cast<size_t>(nodes), nullptr);
     rmm::device_uvector<float> x_pos(static_cast<size_t>(nodes), nullptr);
@@ -154,7 +157,7 @@ class Tests_Tsp : public ::testing::TestWithParam<Tsp_Usecase> {
     bool beam_search = true;
     int k            = 4;
     int nstart       = 0;
-    bool verbose     = false;
+    bool verbose     = true;
 
     hr_clock.start();
     cudaDeviceSynchronize();