TSP solver bug fix #1480

Merged · 16 commits · Mar 26, 2021
2 changes: 1 addition & 1 deletion cpp/include/algorithms.hpp
@@ -218,7 +218,7 @@ void force_atlas2(GraphCOOView<vertex_t, edge_t, weight_t> &graph,
* @param[out] route Device array containing the returned route.
*
*/
float traveling_salesperson(raft::handle_t &handle,
float traveling_salesperson(raft::handle_t const &handle,
int const *vtx_ptr,
float const *x_pos,
float const *y_pos,
202 changes: 109 additions & 93 deletions cpp/src/traversal/tsp.cu
@@ -17,13 +17,15 @@
#include <numeric>
#include <raft/spatial/knn/knn.hpp>

#include <utilities/high_res_timer.hpp>

#include "tsp.hpp"
#include "tsp_solver.hpp"

namespace cugraph {
namespace detail {

TSP::TSP(raft::handle_t &handle,
TSP::TSP(raft::handle_t const &handle,
int const *vtx_ptr,
float const *x_pos,
float const *y_pos,
@@ -50,139 +52,153 @@ TSP::TSP(raft::handle_t &handle,
max_threads_(handle_.get_device_properties().maxThreadsPerBlock),
warp_size_(handle_.get_device_properties().warpSize),
sm_count_(handle_.get_device_properties().multiProcessorCount),
restart_batch_(4096)
restart_batch_(8192),
neighbors_vec_((k_ + 1) * nodes_, stream_),
work_vec_(restart_batch_ * ((4 * nodes_ + 3 + warp_size_ - 1) / warp_size_ * warp_size_),
stream_),
best_x_pos_vec_(1, stream_),
best_y_pos_vec_(1, stream_),
best_route_vec_(1, stream_)
{
allocate();
setup();
}

void TSP::allocate()
void TSP::setup()
{
// Scalars
mylock_ = mylock_scalar_.data();
best_tour_ = best_tour_scalar_.data();
climbs_ = climbs_scalar_.data();
mylock_ = mylock_scalar_.data();

// Vectors
neighbors_vec_.resize((k_ + 1) * nodes_);
neighbors_ = neighbors_vec_.data();
// Pre-allocate workspace for the climbs: each block needs a separate permutation space and
// search buffer. We allocate a work buffer that stores the computed distances, px, py and the
// route, aligned on the warp size (see the sketch after this function).
work_vec_.resize(sizeof(float) * restart_batch_ *
((4 * nodes_ + 3 + warp_size_ - 1) / warp_size_ * warp_size_));
work_ = work_vec_.data();

results_.best_x_pos = best_x_pos_vec_.data();
results_.best_y_pos = best_y_pos_vec_.data();
results_.best_route = best_route_vec_.data();
results_.best_cost = best_cost_scalar_.data();
}
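
The comment above describes the `work_vec_` layout: each block needs `4 * nodes + 3` floats (distances, px, py and the route), rounded up to a multiple of the warp size. A minimal host-side sketch of that rounding, with `padded_work_size` as a hypothetical helper name:

```cpp
#include <cstddef>
#include <iostream>

// Hypothetical helper mirroring the work_vec_ sizing above: round the
// per-block requirement (4 * nodes + 3 floats) up to a multiple of the
// warp size, then give each of the restart_batch blocks its own slice.
std::size_t padded_work_size(std::size_t restart_batch, std::size_t nodes, std::size_t warp_size)
{
  std::size_t per_block = (4 * nodes + 3 + warp_size - 1) / warp_size * warp_size;
  return restart_batch * per_block;
}

int main()
{
  // e.g. 52 nodes, warp size 32: 4 * 52 + 3 = 211 floats, padded to 224.
  std::cout << padded_work_size(8192, 52, 32) << " floats\n";  // 8192 * 224
}
```
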

// Pointers
neighbors_ = neighbors_vec_.data().get();
work_ = work_vec_.data().get();
void TSP::reset_batch()
{
mylock_scalar_.set_value_zero(stream_);
auto const max{std::numeric_limits<int>::max()};
best_cost_scalar_.set_value(max, stream_);
}

void TSP::get_initial_solution(int const batch)
{
if (!beam_search_) {
random_init<<<restart_batch_, best_thread_num_>>>(
work_, x_pos_, y_pos_, vtx_ptr_, nstart_, nodes_, batch, restart_batch_);
CHECK_CUDA(stream_);
} else {
knn_init<<<restart_batch_, best_thread_num_>>>(
work_, x_pos_, y_pos_, vtx_ptr_, neighbors_, nstart_, nodes_, k_, batch, restart_batch_);
CHECK_CUDA(stream_);
}
}
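
When beam search is off, each restart starts from a random tour built on the device by the `random_init` kernel. The snippet below is only a host-side analogue of that idea, not the kernel itself; `random_tour` is a hypothetical name:

```cpp
#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

// Host-side analogue of a random initial solution: a random permutation of
// the node indices, one per restart. The actual random_init kernel builds
// these directly on the device; this is only a conceptual sketch.
std::vector<int> random_tour(int nodes, std::mt19937 &rng)
{
  std::vector<int> tour(nodes);
  std::iota(tour.begin(), tour.end(), 0);  // 0, 1, ..., nodes - 1
  std::shuffle(tour.begin(), tour.end(), rng);
  return tour;
}
```
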

float TSP::compute()
{
float valid_coo_dist = 0.f;
float final_cost = 0.f;
int num_restart_batches = (restarts_ + restart_batch_ - 1) / restart_batch_;
int restart_resid = restarts_ - (num_restart_batches - 1) * restart_batch_;
int global_best = INT_MAX;
float *soln = nullptr;
int *route_sol = nullptr;
int global_best = std::numeric_limits<int>::max();
int best = 0;

std::vector<float> h_x_pos;
std::vector<float> h_y_pos;
std::vector<int> h_route;
h_x_pos.reserve(nodes_ + 1);
h_y_pos.reserve(nodes_ + 1);

// Stats
int n_timers = 3;
long total_climbs = 0;
std::vector<float> h_times;
struct timeval starttime, endtime;

// KNN call
knn();
h_route.reserve(nodes_);
std::vector<float *> addr_best_x_pos(1);
std::vector<float *> addr_best_y_pos(1);
std::vector<int *> addr_best_route(1);
HighResTimer hr_timer;
auto create_timer = [&hr_timer, this](char const *name) {
return VerboseTimer(name, hr_timer, verbose_);
};

if (verbose_) {
std::cout << "Doing " << num_restart_batches - 1 << " batches of size " << restart_batch_
std::cout << "Doing " << num_restart_batches << " batches of size " << restart_batch_
<< ", with " << restart_resid << " tail\n";
std::cout << "configuration: " << nodes_ << " nodes, " << restarts_ << " restart\n";
std::cout << "optimizing graph with kswap = " << kswaps << "\n";
}

// Tell the cache how we want it to behave
cudaFuncSetCacheConfig(search_solution, cudaFuncCachePreferEqual);
best_thread_num_ = best_thread_count(nodes_, max_threads_, sm_count_, warp_size_);

int threads = best_thread_count(nodes_, max_threads_, sm_count_, warp_size_);
if (verbose_) std::cout << "Calculated best thread number = " << threads << "\n";
if (verbose_) std::cout << "Calculated best thread number = " << best_thread_num_ << "\n";

rmm::device_vector<float> times(n_timers * threads + n_timers);
h_times.reserve(n_timers * threads + n_timers);
if (beam_search_) {
auto timer = create_timer("knn");
knn();
}

gettimeofday(&starttime, NULL);
for (int b = 0; b < num_restart_batches; ++b) {
reset<<<1, 1, 0, stream_>>>(mylock_, best_tour_, climbs_);
CHECK_CUDA(stream_);
for (auto batch = 0; batch < num_restart_batches; ++batch) {
reset_batch();
if (batch == num_restart_batches - 1) restart_batch_ = restart_resid;

if (b == num_restart_batches - 1) restart_batch_ = restart_resid;

search_solution<<<restart_batch_, threads, sizeof(int) * threads, stream_>>>(mylock_,
best_tour_,
vtx_ptr_,
beam_search_,
k_,
nodes_,
neighbors_,
x_pos_,
y_pos_,
work_,
nstart_,
times.data().get(),
climbs_,
threads);
{
auto timer = create_timer("initial_sol");
get_initial_solution(batch);
}

CHECK_CUDA(stream_);
cudaDeviceSynchronize();
{
auto timer = create_timer("search_sol");
search_solution<<<restart_batch_,
best_thread_num_,
sizeof(int) * best_thread_num_,
stream_>>>(
results_, mylock_, vtx_ptr_, beam_search_, k_, nodes_, x_pos_, y_pos_, work_, nstart_);
CHECK_CUDA(stream_);
}

{
auto timer = create_timer("optimal_tour");
get_optimal_tour<<<restart_batch_,
best_thread_num_,
sizeof(int) * best_thread_num_,
stream_>>>(results_, mylock_, work_, nodes_);
CHECK_CUDA(stream_);
}

CUDA_TRY(cudaMemcpy(&best, best_tour_, sizeof(int), cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
best = best_cost_scalar_.value(stream_);

if (verbose_) std::cout << "Best reported by kernel = " << best << "\n";

if (best < global_best) {
global_best = best;
CUDA_TRY(cudaMemcpyFromSymbol(&soln, best_soln, sizeof(void *)));
cudaDeviceSynchronize();
CUDA_TRY(cudaMemcpyFromSymbol(&route_sol, best_route, sizeof(void *)));
cudaDeviceSynchronize();

raft::update_host(addr_best_x_pos.data(), results_.best_x_pos, 1, stream_);
raft::update_host(addr_best_y_pos.data(), results_.best_y_pos, 1, stream_);
raft::update_host(addr_best_route.data(), results_.best_route, 1, stream_);
CUDA_TRY(cudaStreamSynchronize(stream_));

raft::copy(h_x_pos.data(), addr_best_x_pos[0], nodes_ + 1, stream_);
raft::copy(h_y_pos.data(), addr_best_y_pos[0], nodes_ + 1, stream_);
raft::copy(h_route.data(), addr_best_route[0], nodes_, stream_);
raft::copy(route_, addr_best_route[0], nodes_, stream_);
CHECK_CUDA(stream_);
}
total_climbs += climbs_scalar_.value(stream_);
}
gettimeofday(&endtime, NULL);
double runtime =
endtime.tv_sec + endtime.tv_usec / 1e6 - starttime.tv_sec - starttime.tv_usec / 1e6;
long long moves = 1LL * total_climbs * (nodes_ - 2) * (nodes_ - 1) / 2;

raft::copy(route_, route_sol, nodes_, stream_);

CUDA_TRY(cudaMemcpy(h_x_pos.data(), soln, sizeof(float) * (nodes_ + 1), cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
CUDA_TRY(cudaMemcpy(
h_y_pos.data(), soln + nodes_ + 1, sizeof(float) * (nodes_ + 1), cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();

for (int i = 0; i < nodes_; ++i) {
if (verbose_) { std::cout << h_x_pos[i] << " " << h_y_pos[i] << "\n"; }
valid_coo_dist += euclidean_dist(h_x_pos.data(), h_y_pos.data(), i, i + 1);
}

CUDA_TRY(cudaMemcpy(h_times.data(),
times.data().get(),
sizeof(float) * n_timers * threads + n_timers,
cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
for (auto i = 0; i < nodes_; ++i) {
if (verbose_) { std::cout << h_route[i] << ": " << h_x_pos[i] << " " << h_y_pos[i] << "\n"; }
final_cost += euclidean_dist(h_x_pos.data(), h_y_pos.data(), i, i + 1);
}

if (verbose_) {
std::cout << "Search runtime = " << runtime << ", " << moves * 1e-9 / runtime << " Gmoves/s\n";
hr_timer.display(std::cout);
std::cout << "Optimized tour length = " << global_best << "\n";
print_times(h_times, n_timers, handle_.get_device(), threads);
}

return valid_coo_dist;
return final_cost;
}
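
The batch bookkeeping at the top of `compute()` is plain ceiling division: the restarts are processed in full batches of `restart_batch_`, and the last batch is shrunk to the residual. A worked example with hypothetical numbers:

```cpp
#include <iostream>

int main()
{
  // Hypothetical request: 10000 restarts processed in batches of 8192.
  int restarts      = 10000;
  int restart_batch = 8192;
  // Ceiling division gives the batch count; the residual is the size of
  // the last (possibly partial) batch, exactly as in compute().
  int num_restart_batches = (restarts + restart_batch - 1) / restart_batch;        // 2
  int restart_resid       = restarts - (num_restart_batches - 1) * restart_batch;  // 1808
  std::cout << num_restart_batches << " batches, tail of " << restart_resid << "\n";
}
```
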

void TSP::knn()
@@ -192,17 +208,17 @@
int dim = 2;
bool row_major_order = false;

rmm::device_vector<float> input(nodes_ * dim);
float *input_ptr = input.data().get();
rmm::device_uvector<float> input(nodes_ * dim, stream_);
float *input_ptr = input.data();
raft::copy(input_ptr, x_pos_, nodes_, stream_);
raft::copy(input_ptr + nodes_, y_pos_, nodes_, stream_);

rmm::device_vector<float> search_data(nodes_ * dim);
float *search_data_ptr = search_data.data().get();
rmm::device_uvector<float> search_data(nodes_ * dim, stream_);
float *search_data_ptr = search_data.data();
raft::copy(search_data_ptr, input_ptr, nodes_ * dim, stream_);

rmm::device_vector<float> distances(nodes_ * (k_ + 1));
float *distances_ptr = distances.data().get();
rmm::device_uvector<float> distances(nodes_ * (k_ + 1), stream_);
float *distances_ptr = distances.data();

std::vector<float *> input_vec;
std::vector<int> sizes_vec;
@@ -226,7 +242,7 @@
}
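
The two `raft::copy` calls in `knn()` pack the 2-D points column-major (`row_major_order = false`): all x coordinates first, then all y coordinates. A host-side sketch of the same layout, with `pack_columns` as a hypothetical helper:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Hypothetical host-side illustration of the column-major layout fed to the
// kNN call (row_major_order = false): column 0 holds all x coordinates,
// column 1 all y coordinates, matching the two raft::copy calls in knn().
std::vector<float> pack_columns(std::vector<float> const &x, std::vector<float> const &y)
{
  assert(x.size() == y.size());
  std::vector<float> input(2 * x.size());
  std::copy(x.begin(), x.end(), input.begin());             // column 0: x positions
  std::copy(y.begin(), y.end(), input.begin() + x.size());  // column 1: y positions
  return input;
}
```
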
} // namespace detail

float traveling_salesperson(raft::handle_t &handle,
float traveling_salesperson(raft::handle_t const &handle,
int const *vtx_ptr,
float const *x_pos,
float const *y_pos,