Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TSP solver bug fix #1480

Merged
merged 16 commits into from
Mar 26, 2021
Prev Previous commit
Next Next commit
Retrieve results in struct
hlinsen committed Mar 25, 2021
commit 8ffe628d1725792131237a4b11e99a6a773b3084
85 changes: 45 additions & 40 deletions cpp/src/traversal/tsp.cu
Original file line number Diff line number Diff line change
@@ -52,14 +52,13 @@ TSP::TSP(raft::handle_t const &handle,
sm_count_(handle_.get_device_properties().multiProcessorCount),
restart_batch_(4096)
{
allocate();
setup();
}

void TSP::allocate()
void TSP::setup()
{
// Scalars
mylock_ = mylock_scalar_.data();
best_tour_ = best_tour_scalar_.data();
climbs_ = climbs_scalar_.data();

// Vectors
@@ -69,30 +68,49 @@ void TSP::allocate()
// We align it on the warp size.
work_vec_.resize(restart_batch_ *
((4 * nodes_ + 3 + warp_size_ - 1) / warp_size_ * warp_size_));
best_x_pos_vec_.resize(1);
best_y_pos_vec_.resize(1);
best_route_vec_.resize(1);

// Pointers
neighbors_ = neighbors_vec_.data().get();
work_ = work_vec_.data().get();

// Setup results
results_.best_x_pos = best_x_pos_vec_.data().get();
results_.best_y_pos = best_y_pos_vec_.data().get();
results_.best_route = best_route_vec_.data().get();
results_.best_cost = best_cost_scalar_.data();
}

void TSP::reset_batch() {
mylock_scalar_.set_value(0, stream_);
best_cost_scalar_.set_value(std::numeric_limits<int>::max(), stream_);
climbs_scalar_.set_value(0, stream_);
}

float TSP::compute()
{
float valid_coo_dist = 0.f;
// Setup
float final_cost = 0.f;
int num_restart_batches = (restarts_ + restart_batch_ - 1) / restart_batch_;
int restart_resid = restarts_ - (num_restart_batches - 1) * restart_batch_;
int global_best = INT_MAX;
float *soln = nullptr;
int *route_sol = nullptr;
int global_best = std::numeric_limits<int>::max();
int best = 0;

std::vector<float> h_x_pos;
std::vector<float> h_y_pos;
std::vector<int> h_route;
h_x_pos.reserve(nodes_ + 1);
h_y_pos.reserve(nodes_ + 1);
h_route.reserve(nodes_);

std::vector<float*> addr_best_x_pos(1);
std::vector<float*> addr_best_y_pos(1);
std::vector<int*> addr_best_route(1);

// Stats
int n_timers = 3;
long total_climbs = 0;
std::vector<float> h_times;
struct timeval starttime, endtime;

// KNN call
@@ -111,18 +129,15 @@ float TSP::compute()
int threads = best_thread_count(nodes_, max_threads_, sm_count_, warp_size_);
if (verbose_) std::cout << "Calculated best thread number = " << threads << "\n";

rmm::device_vector<float> times(n_timers * threads + n_timers);
h_times.reserve(n_timers * threads + n_timers);

gettimeofday(&starttime, NULL);

for (int batch = 1; batch <= num_restart_batches; ++batch) {
reset<<<1, 1, 0, stream_>>>(mylock_, best_tour_, climbs_);
CHECK_CUDA(stream_);

reset_batch();
if (batch == num_restart_batches) restart_batch_ = restart_resid;

search_solution<<<restart_batch_, threads, sizeof(int) * threads, stream_>>>(mylock_,
best_tour_,
search_solution<<<restart_batch_, threads, sizeof(int) * threads, stream_>>>(results_,
mylock_,
vtx_ptr_,
beam_search_,
k_,
@@ -132,24 +147,29 @@ float TSP::compute()
y_pos_,
work_,
nstart_,
times.data().get(),
climbs_,
threads,
batch);

CHECK_CUDA(stream_);
cudaDeviceSynchronize();

CUDA_TRY(cudaMemcpy(&best, best_tour_, sizeof(int), cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
best = best_cost_scalar_.value(stream_);

if (verbose_) std::cout << "Best reported by kernel = " << best << "\n";

if (best < global_best) {
global_best = best;
CUDA_TRY(cudaMemcpyFromSymbol(&soln, best_soln, sizeof(void *)));
cudaDeviceSynchronize();
CUDA_TRY(cudaMemcpyFromSymbol(&route_sol, best_route, sizeof(void *)));
global_best = best;
cudaMemcpy(addr_best_x_pos.data(), results_.best_x_pos, sizeof(float*), cudaMemcpyDeviceToHost);
cudaMemcpy(addr_best_y_pos.data(), results_.best_y_pos, sizeof(float*), cudaMemcpyDeviceToHost);
cudaMemcpy(addr_best_route.data(), results_.best_route, sizeof(int*), cudaMemcpyDeviceToHost);
CHECK_CUDA(stream_);
cudaDeviceSynchronize();

raft::copy(h_x_pos.data(), addr_best_x_pos[0], nodes_ + 1, stream_);
raft::copy(h_y_pos.data(), addr_best_y_pos[0], nodes_ + 1, stream_);
raft::copy(h_route.data(), addr_best_route[0], nodes_, stream_);
CHECK_CUDA(stream_);
}
total_climbs += climbs_scalar_.value(stream_);
}
@@ -158,32 +178,17 @@ float TSP::compute()
endtime.tv_sec + endtime.tv_usec / 1e6 - starttime.tv_sec - starttime.tv_usec / 1e6;
long long moves = 1LL * total_climbs * (nodes_ - 2) * (nodes_ - 1) / 2;

raft::copy(route_, route_sol, nodes_, stream_);

CUDA_TRY(cudaMemcpy(h_x_pos.data(), soln, sizeof(float) * (nodes_ + 1), cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
CUDA_TRY(cudaMemcpy(
h_y_pos.data(), soln + nodes_ + 1, sizeof(float) * (nodes_ + 1), cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();

for (int i = 0; i < nodes_; ++i) {
if (verbose_) { std::cout << h_x_pos[i] << " " << h_y_pos[i] << "\n"; }
valid_coo_dist += euclidean_dist(h_x_pos.data(), h_y_pos.data(), i, i + 1);
final_cost += euclidean_dist(h_x_pos.data(), h_y_pos.data(), i, i + 1);
}

CUDA_TRY(cudaMemcpy(h_times.data(),
times.data().get(),
sizeof(float) * n_timers * threads + n_timers,
cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();

if (verbose_) {
std::cout << "Search runtime = " << runtime << ", " << moves * 1e-9 / runtime << " Gmoves/s\n";
std::cout << "Optimized tour length = " << global_best << "\n";
print_times(h_times, n_timers, handle_.get_device(), threads);
}

return valid_coo_dist;
return final_cost;
}

void TSP::knn()
19 changes: 16 additions & 3 deletions cpp/src/traversal/tsp.hpp
Original file line number Diff line number Diff line change
@@ -25,6 +25,14 @@

namespace cugraph {
namespace detail {

struct TSPResults{
float **best_x_pos;
float **best_y_pos;
int **best_route;
int *best_cost;
};

class TSP {
public:
TSP(raft::handle_t const &handle,
@@ -39,7 +47,8 @@ class TSP {
bool verbose,
int *route);

void allocate();
void setup();
void reset_batch();
float compute();
void knn();
~TSP(){};
@@ -69,20 +78,24 @@ class TSP {

// Scalars
rmm::device_scalar<int> mylock_scalar_;
rmm::device_scalar<int> best_tour_scalar_;
rmm::device_scalar<int> best_cost_scalar_;
rmm::device_scalar<int> climbs_scalar_;

int *mylock_;
int *best_tour_;
int *best_cost_;
int *climbs_;

// Vectors
rmm::device_vector<int64_t> neighbors_vec_;
rmm::device_vector<int> work_vec_;
rmm::device_vector<float*> best_x_pos_vec_;
rmm::device_vector<float*> best_y_pos_vec_;
rmm::device_vector<int*> best_route_vec_;

int64_t *neighbors_;
int *work_;
int *work_route_;
TSPResults results_;
};
} // namespace detail
} // namespace cugraph
39 changes: 10 additions & 29 deletions cpp/src/traversal/tsp_solver.hpp
Original file line number Diff line number Diff line change
@@ -29,18 +29,6 @@
namespace cugraph {
namespace detail {

__device__ float *best_soln;
__device__ int *best_route;

__global__ void reset(int *mylock, int *best_tour, int *climbs)
{
*mylock = 0;
*best_tour = INT_MAX;
*climbs = 0;
best_soln = nullptr;
best_route = nullptr;
}

// random permutation kernel
__device__ void random_init(float const *posx,
float const *posy,
@@ -212,7 +200,6 @@ __device__ void two_opt_search(
}
}

// This function being runned for each block
__device__ void hill_climbing(
float *px, float *py, int *buf, int *path, int *shbuf, int const nodes, int *climbs)
{
@@ -340,7 +327,8 @@ __device__ void hill_climbing(
}

__device__ void get_optimal_tour(
int *mylock, int *best_tour, float *px, float *py, int *path, int *shbuf, int const nodes)
TSPResults results, int *mylock, float *px, float *py, int *path,
int *shbuf, int const nodes)
{
// Now find actual length of the last tour, result of the climb
int term = 0;
@@ -360,20 +348,21 @@ __device__ void get_optimal_tour(
term = shbuf[0];

if (threadIdx.x == 0) {
atomicMin(best_tour, term);
atomicMin(results.best_cost, term);
while (atomicExch(mylock, 1) != 0)
; // acquire
if (best_tour[0] == term) {
best_soln = px;
best_route = path;
if (results.best_cost[0] == term) {
results.best_x_pos[0] = px;
results.best_y_pos[0] = py;
results.best_route[0] = path;
}
*mylock = 0; // release
__threadfence();
}
}

__global__ __launch_bounds__(2048, 2) void search_solution(int *mylock,
int *best_tour,
__global__ __launch_bounds__(2048, 2) void search_solution(TSPResults results,
int *mylock,
int const *vtx_ptr,
bool beam_search,
int const K,
@@ -383,7 +372,6 @@ __global__ __launch_bounds__(2048, 2) void search_solution(int *mylock,
float const *posy,
int *work,
int const nstart,
float *times,
int *climbs,
int const threads,
int const batch)
@@ -393,24 +381,17 @@ __global__ __launch_bounds__(2048, 2) void search_solution(int *mylock,
float *py = &px[nodes + 1];
int *path = (int *)(&py[nodes + 1]);
__shared__ int shbuf[tilesize];
clock_t start;

start = clock64();
if (!beam_search)
random_init(posx, posy, vtx_ptr, path, px, py, nstart, nodes, batch);
else
knn_init(posx, posy, vtx_ptr, neighbors, buf, path, px, py, nstart, nodes, K, batch);
__syncthreads();
times[threadIdx.x] = clock64() - start;

start = clock64();
hill_climbing(px, py, buf, path, shbuf, nodes, climbs);
__syncthreads();
times[threads + threadIdx.x + 1] = clock64() - start;

start = clock64();
get_optimal_tour(mylock, best_tour, px, py, path, shbuf, nodes);
times[2 * threads + threadIdx.x + 1] = clock64() - start;
get_optimal_tour(results, mylock, px, py, path, shbuf, nodes);
}
} // namespace detail
} // namespace cugraph
23 changes: 0 additions & 23 deletions cpp/src/traversal/tsp_utils.hpp
Original file line number Diff line number Diff line change
@@ -31,29 +31,6 @@ __host__ __device__ inline float euclidean_dist(float *px, float *py, int a, int
return sqrtf((px[a] - px[b]) * (px[a] - px[b]) + (py[a] - py[b]) * (py[a] - py[b]));
}

static std::vector<std::string> device_func = {"Find First", "Hill Climbing", "Retrieve Path"};

void print_times(std::vector<float> &h_times, int const n_timers, int device, int threads)
{
int clock_rate;
cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, device);

double total = 0;
h_times[0] /= (float)clock_rate;
total += h_times[0];
for (int i = 1; i < n_timers; ++i) {
h_times[i * threads + 1] /= (float)clock_rate;
total += h_times[i * threads + 1];
}
std::cout << "Stats: \n";
std::cout << device_func[0] << " time: " << h_times[0] * 1e-3 << " "
<< (h_times[0] / total) * 100.0 << "%\n";
for (int i = 1; i < n_timers; ++i) {
std::cout << device_func[i] << " time: " << h_times[i * threads + 1] * 1e-3 << " "
<< (h_times[i * threads + 1] / total) * 100.0 << "%\n";
}
}

// Get maximum number of threads we can run on based on number of nodes,
// shared memory usage, max threads per block and SM, max blocks for SM and registers per SM.
int best_thread_count(int nodes, int max_threads, int sm_count, int warp_size)
2 changes: 1 addition & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -282,7 +282,7 @@ ConfigureTest(FA2_TEST "${FA2_TEST_SRC}")
# - TSP tests --------------------------------------------------------------------------

set(TSP_TEST_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/traversal/tsp_test.cu")
"${CMAKE_CURRENT_SOURCE_DIR}/traversal/tsp_main.cu")

ConfigureTest(TSP_TEST "${TSP_TEST_SRC}" "")

9 changes: 6 additions & 3 deletions cpp/tests/traversal/tsp_test.cu
Original file line number Diff line number Diff line change
@@ -67,6 +67,8 @@ typedef struct Tsp_Usecase_t {
} Tsp_Usecase;

static std::vector<Tsp_Usecase_t> euc_2d{
{"tsplib/datasets/tsp225.tsp", 3916},
/*
{"tsplib/datasets/a280.tsp", 2579}, {"tsplib/datasets/berlin52.tsp", 7542},
{"tsplib/datasets/bier127.tsp", 118282}, {"tsplib/datasets/ch130.tsp", 6110},
{"tsplib/datasets/ch150.tsp", 6528}, {"tsplib/datasets/d1291.tsp", 50801},
@@ -96,7 +98,7 @@ static std::vector<Tsp_Usecase_t> euc_2d{
{"tsplib/datasets/tsp225.tsp", 3916}, {"tsplib/datasets/u1060.tsp", 224094},
{"tsplib/datasets/u1432.tsp", 152970}, {"tsplib/datasets/u159.tsp", 42080},
{"tsplib/datasets/u574.tsp", 36905}, {"tsplib/datasets/u724.tsp", 41910},
{"tsplib/datasets/vm1084.tsp", 239297},
{"tsplib/datasets/vm1084.tsp", 239297},*/
};

struct Route {
@@ -130,9 +132,10 @@ class Tests_Tsp : public ::testing::TestWithParam<Tsp_Usecase> {

std::cout << "File: " << param.tsp_file.c_str() << "\n";
int nodes = load_tsp(param.tsp_file.c_str(), &input);
std::cout << "Nodes: " << nodes << std::endl;

// Device alloc
raft::handle_t handle;
raft::handle_t const handle;
rmm::device_uvector<int> vertices(static_cast<size_t>(nodes), nullptr);
rmm::device_uvector<int> route(static_cast<size_t>(nodes), nullptr);
rmm::device_uvector<float> x_pos(static_cast<size_t>(nodes), nullptr);
@@ -154,7 +157,7 @@ class Tests_Tsp : public ::testing::TestWithParam<Tsp_Usecase> {
bool beam_search = true;
int k = 4;
int nstart = 0;
bool verbose = false;
bool verbose = true;

hr_clock.start();
cudaDeviceSynchronize();