diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index e5bba7bd5ce..e41aa9a9050 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -142,6 +142,8 @@ void overlap_list(GraphCSRView const &graph, * @tparam weight_t Type of edge weights. Supported values : float * or double. * + * @param[in] handle Library handle (RAFT). If a communicator is set in the + * handle, the multi GPU version will be selected. * @param[in] graph cuGraph graph descriptor, should contain the * connectivity information as a COO. Graph is considered undirected. Edge weights are used for this * algorithm and set to 1 by default. @@ -178,7 +180,8 @@ void overlap_list(GraphCSRView const &graph, * */ template -void force_atlas2(GraphCOOView &graph, +void force_atlas2(raft::handle_t const &handle, + GraphCOOView &graph, float *pos, const int max_iter = 500, float *x_start = nullptr, diff --git a/cpp/src/layout/barnes_hut.hpp b/cpp/src/layout/barnes_hut.hpp index ebef93e74fb..a3d413225e4 100644 --- a/cpp/src/layout/barnes_hut.hpp +++ b/cpp/src/layout/barnes_hut.hpp @@ -17,14 +17,15 @@ #pragma once #include -#include +#include -#include #include +#include + #include #include +#include -#include #include "bh_kernels.hpp" #include "fa2_kernels.hpp" #include "utils.hpp" @@ -33,7 +34,8 @@ namespace cugraph { namespace detail { template -void barnes_hut(GraphCOOView &graph, +void barnes_hut(raft::handle_t const &handle, + GraphCOOView &graph, float *pos, const int max_iter = 500, float *x_start = nullptr, @@ -50,7 +52,7 @@ void barnes_hut(GraphCOOView &graph, bool verbose = false, internals::GraphBasedDimRedCallback *callback = nullptr) { - cudaStream_t stream = {nullptr}; + cudaStream_t stream = handle.get_stream(); const edge_t e = graph.number_of_edges; const vertex_t n = graph.number_of_vertices; @@ -65,15 +67,15 @@ void barnes_hut(GraphCOOView &graph, // Allocate more space //--------------------------------------------------- - rmm::device_vector d_limiter(1); - rmm::device_vector d_maxdepthd(1); - rmm::device_vector d_bottomd(1); - rmm::device_vector d_radiusd(1); + rmm::device_uvector d_limiter(1, stream); + rmm::device_uvector d_maxdepthd(1, stream); + rmm::device_uvector d_bottomd(1, stream); + rmm::device_uvector d_radiusd(1, stream); - unsigned *limiter = d_limiter.data().get(); - int *maxdepthd = d_maxdepthd.data().get(); - int *bottomd = d_bottomd.data().get(); - float *radiusd = d_radiusd.data().get(); + unsigned *limiter = d_limiter.data(); + int *maxdepthd = d_maxdepthd.data(); + int *bottomd = d_bottomd.data(); + float *radiusd = d_radiusd.data(); InitializationKernel<<<1, 1, 0, stream>>>(limiter, maxdepthd, radiusd); CHECK_CUDA(stream); @@ -83,51 +85,52 @@ void barnes_hut(GraphCOOView &graph, const float theta_squared = theta * theta; const int NNODES = nnodes; - rmm::device_vector d_startl(nnodes + 1, 0); - rmm::device_vector d_childl((nnodes + 1) * 4, 0); + rmm::device_uvector d_startl(nnodes + 1, stream); + rmm::device_uvector d_childl((nnodes + 1) * 4, stream); // FA2 requires degree + 1 - rmm::device_vector d_massl(nnodes + 1, 1.f); + rmm::device_uvector d_massl(nnodes + 1, stream); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_massl.begin(), d_massl.end(), 1.f); - rmm::device_vector d_maxxl(blocks * FACTOR1, 0); - rmm::device_vector d_maxyl(blocks * FACTOR1, 0); - rmm::device_vector d_minxl(blocks * FACTOR1, 0); - rmm::device_vector d_minyl(blocks * FACTOR1, 0); + rmm::device_uvector d_maxxl(blocks * FACTOR1, stream); + rmm::device_uvector d_maxyl(blocks * FACTOR1, stream); + rmm::device_uvector d_minxl(blocks * FACTOR1, stream); + rmm::device_uvector d_minyl(blocks * FACTOR1, stream); // Actual mallocs - int *startl = d_startl.data().get(); - int *childl = d_childl.data().get(); - int *massl = d_massl.data().get(); + int *startl = d_startl.data(); + int *childl = d_childl.data(); + int *massl = d_massl.data(); - float *maxxl = d_maxxl.data().get(); - float *maxyl = d_maxyl.data().get(); - float *minxl = d_minxl.data().get(); - float *minyl = d_minyl.data().get(); + float *maxxl = d_maxxl.data(); + float *maxyl = d_maxyl.data(); + float *minxl = d_minxl.data(); + float *minyl = d_minyl.data(); // SummarizationKernel - rmm::device_vector d_countl(nnodes + 1, 0); - int *countl = d_countl.data().get(); + rmm::device_uvector d_countl(nnodes + 1, stream); + int *countl = d_countl.data(); // SortKernel - rmm::device_vector d_sortl(nnodes + 1, 0); - int *sortl = d_sortl.data().get(); + rmm::device_uvector d_sortl(nnodes + 1, stream); + int *sortl = d_sortl.data(); // RepulsionKernel - rmm::device_vector d_rep_forces((nnodes + 1) * 2, 0); - float *rep_forces = d_rep_forces.data().get(); + rmm::device_uvector d_rep_forces((nnodes + 1) * 2, stream); + float *rep_forces = d_rep_forces.data(); - rmm::device_vector d_radius_squared(1, 0); - float *radiusd_squared = d_radius_squared.data().get(); + rmm::device_uvector d_radius_squared(1, stream); + float *radiusd_squared = d_radius_squared.data(); - rmm::device_vector d_nodes_pos((nnodes + 1) * 2, 0); - float *nodes_pos = d_nodes_pos.data().get(); + rmm::device_uvector d_nodes_pos((nnodes + 1) * 2, stream); + float *nodes_pos = d_nodes_pos.data(); // Initialize positions with random values int random_state = 0; // Copy start x and y positions. if (x_start && y_start) { - copy(n, x_start, nodes_pos); - copy(n, y_start, nodes_pos + nnodes + 1); + raft::copy(nodes_pos, x_start, n, stream); + raft::copy(nodes_pos + nnodes + 1, y_start, n, stream); } else { random_vector(nodes_pos, (nnodes + 1) * 2, random_state, stream); } @@ -138,15 +141,15 @@ void barnes_hut(GraphCOOView &graph, float *swinging{nullptr}; float *traction{nullptr}; - rmm::device_vector d_attract(n * 2, 0); - rmm::device_vector d_old_forces(n * 2, 0); - rmm::device_vector d_swinging(n, 0); - rmm::device_vector d_traction(n, 0); + rmm::device_uvector d_attract(n * 2, stream); + rmm::device_uvector d_old_forces(n * 2, stream); + rmm::device_uvector d_swinging(n, stream); + rmm::device_uvector d_traction(n, stream); - attract = d_attract.data().get(); - old_forces = d_old_forces.data().get(); - swinging = d_swinging.data().get(); - traction = d_traction.data().get(); + attract = d_attract.data(); + old_forces = d_old_forces.data(); + swinging = d_swinging.data(); + traction = d_traction.data(); // Sort COO for coalesced memory access. sort(graph, stream); @@ -191,10 +194,11 @@ void barnes_hut(GraphCOOView &graph, for (int iter = 0; iter < max_iter; ++iter) { // Reset force values - fill((nnodes + 1) * 2, rep_forces, 0.f); - fill(n * 2, attract, 0.f); - fill(n, swinging, 0.f); - fill(n, traction, 0.f); + thrust::fill( + rmm::exec_policy(stream)->on(stream), d_rep_forces.begin(), d_rep_forces.end(), 0.f); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_attract.begin(), d_attract.end(), 0.f); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_swinging.begin(), d_swinging.end(), 0.f); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_traction.begin(), d_traction.end(), 0.f); ResetKernel<<<1, 1, 0, stream>>>(radiusd_squared, bottomd, NNODES, radiusd); CHECK_CUDA(stream); @@ -320,15 +324,15 @@ void barnes_hut(GraphCOOView &graph, if (callback) callback->on_epoch_end(nodes_pos); if (verbose) { - printf("iteration %i, speed: %f, speed_efficiency: %f, ", iter + 1, speed, speed_efficiency); - printf("jt: %f, ", jt); - printf("swinging: %f, traction: %f\n", s, t); + std::cout << "iteration: " << iter + 1 << ", speed: " << speed + << ", speed_efficiency: " << speed_efficiency << ", jt: " << jt + << ", swinging: " << s << ", traction: " << t << "\n"; } } // Copy nodes positions into final output pos - copy(n, nodes_pos, pos); - copy(n, nodes_pos + nnodes + 1, pos + n); + raft::copy(pos, nodes_pos, n, stream); + raft::copy(pos + n, nodes_pos + nnodes + 1, n, stream); if (callback) callback->on_train_end(nodes_pos); } diff --git a/cpp/src/layout/exact_fa2.hpp b/cpp/src/layout/exact_fa2.hpp index abad5a5630f..d34f8843e9b 100644 --- a/cpp/src/layout/exact_fa2.hpp +++ b/cpp/src/layout/exact_fa2.hpp @@ -17,12 +17,13 @@ #pragma once #include -#include +#include -#include #include + #include #include +#include #include "exact_repulsion.hpp" #include "fa2_kernels.hpp" @@ -32,7 +33,8 @@ namespace cugraph { namespace detail { template -void exact_fa2(GraphCOOView &graph, +void exact_fa2(raft::handle_t const &handle, + GraphCOOView &graph, float *pos, const int max_iter = 500, float *x_start = nullptr, @@ -48,7 +50,7 @@ void exact_fa2(GraphCOOView &graph, bool verbose = false, internals::GraphBasedDimRedCallback *callback = nullptr) { - cudaStream_t stream = {nullptr}; + cudaStream_t stream = handle.get_stream(); const edge_t e = graph.number_of_edges; const vertex_t n = graph.number_of_vertices; @@ -59,27 +61,28 @@ void exact_fa2(GraphCOOView &graph, float *d_swinging{nullptr}; float *d_traction{nullptr}; - rmm::device_vector repel(n * 2, 0); - rmm::device_vector attract(n * 2, 0); - rmm::device_vector old_forces(n * 2, 0); + rmm::device_uvector repel(n * 2, stream); + rmm::device_uvector attract(n * 2, stream); + rmm::device_uvector old_forces(n * 2, stream); // FA2 requires degree + 1. - rmm::device_vector mass(n, 1); - rmm::device_vector swinging(n, 0); - rmm::device_vector traction(n, 0); - - d_repel = repel.data().get(); - d_attract = attract.data().get(); - d_old_forces = old_forces.data().get(); - d_mass = mass.data().get(); - d_swinging = swinging.data().get(); - d_traction = traction.data().get(); + rmm::device_uvector mass(n, stream); + thrust::fill(rmm::exec_policy(stream)->on(stream), mass.begin(), mass.end(), 1.f); + rmm::device_uvector swinging(n, stream); + rmm::device_uvector traction(n, stream); + + d_repel = repel.data(); + d_attract = attract.data(); + d_old_forces = old_forces.data(); + d_mass = mass.data(); + d_swinging = swinging.data(); + d_traction = traction.data(); int random_state = 0; random_vector(pos, n * 2, random_state, stream); if (x_start && y_start) { - copy(n, x_start, pos); - copy(n, y_start, pos + n); + raft::copy(pos, x_start, n, stream); + raft::copy(pos + n, y_start, n, stream); } // Sort COO for coalesced memory access. @@ -110,10 +113,10 @@ void exact_fa2(GraphCOOView &graph, for (int iter = 0; iter < max_iter; ++iter) { // Reset force arrays - fill(n * 2, d_repel, 0.f); - fill(n * 2, d_attract, 0.f); - fill(n, d_swinging, 0.f); - fill(n, d_traction, 0.f); + thrust::fill(rmm::exec_policy(stream)->on(stream), repel.begin(), repel.end(), 0.f); + thrust::fill(rmm::exec_policy(stream)->on(stream), attract.begin(), attract.end(), 0.f); + thrust::fill(rmm::exec_policy(stream)->on(stream), swinging.begin(), swinging.end(), 0.f); + thrust::fill(rmm::exec_policy(stream)->on(stream), traction.begin(), traction.end(), 0.f); // Exact repulsion apply_repulsion(pos, pos + n, d_repel, d_repel + n, d_mass, scaling_ratio, n, stream); @@ -180,9 +183,9 @@ void exact_fa2(GraphCOOView &graph, if (callback) callback->on_epoch_end(pos); if (verbose) { - printf("iteration %i, speed: %f, speed_efficiency: %f, ", iter + 1, speed, speed_efficiency); - printf("jt: %f, ", jt); - printf("swinging: %f, traction: %f\n", s, t); + std::cout << "iteration: " << iter + 1 << ", speed: " << speed + << ", speed_efficiency: " << speed_efficiency << ", jt: " << jt + << ", swinging: " << s << ", traction: " << t << "\n"; } } diff --git a/cpp/src/layout/force_atlas2.cu b/cpp/src/layout/force_atlas2.cu index 6da9b77b45d..86c95cc883e 100644 --- a/cpp/src/layout/force_atlas2.cu +++ b/cpp/src/layout/force_atlas2.cu @@ -20,7 +20,8 @@ namespace cugraph { template -void force_atlas2(GraphCOOView &graph, +void force_atlas2(raft::handle_t const &handle, + GraphCOOView &graph, float *pos, const int max_iter, float *x_start, @@ -42,7 +43,8 @@ void force_atlas2(GraphCOOView &graph, CUGRAPH_EXPECTS(graph.number_of_vertices != 0, "Invalid input: Graph is empty"); if (!barnes_hut_optimize) { - cugraph::detail::exact_fa2(graph, + cugraph::detail::exact_fa2(handle, + graph, pos, max_iter, x_start, @@ -58,7 +60,8 @@ void force_atlas2(GraphCOOView &graph, verbose, callback); } else { - cugraph::detail::barnes_hut(graph, + cugraph::detail::barnes_hut(handle, + graph, pos, max_iter, x_start, @@ -77,7 +80,8 @@ void force_atlas2(GraphCOOView &graph, } } -template void force_atlas2(GraphCOOView &graph, +template void force_atlas2(raft::handle_t const &handle, + GraphCOOView &graph, float *pos, const int max_iter, float *x_start, @@ -95,7 +99,8 @@ template void force_atlas2(GraphCOOView &graph bool verbose, internals::GraphBasedDimRedCallback *callback); -template void force_atlas2(GraphCOOView &graph, +template void force_atlas2(raft::handle_t const &handle, + GraphCOOView &graph, float *pos, const int max_iter, float *x_start, diff --git a/cpp/tests/layout/force_atlas2_test.cu b/cpp/tests/layout/force_atlas2_test.cu index c22c256ae02..f2f5561a7d8 100644 --- a/cpp/tests/layout/force_atlas2_test.cu +++ b/cpp/tests/layout/force_atlas2_test.cu @@ -111,9 +111,10 @@ class Tests_Force_Atlas2 : public ::testing::TestWithParam std::vector> adj_matrix(m, std::vector(m)); std::vector force_atlas2(m * 2); + raft::handle_t const handle; + auto stream = handle.get_stream(); // device alloc - rmm::device_vector force_atlas2_vector(m * 2); - float* d_force_atlas2 = force_atlas2_vector.data().get(); + rmm::device_uvector pos(m * 2, stream); // Read ASSERT_EQ((cugraph::test::mm_to_coo( @@ -131,13 +132,13 @@ class Tests_Force_Atlas2 : public ::testing::TestWithParam } // Allocate COO on device - rmm::device_vector srcs_v(nnz); - rmm::device_vector dests_v(nnz); - rmm::device_vector weights_v(nnz); + rmm::device_uvector srcs_v(nnz, stream); + rmm::device_uvector dests_v(nnz, stream); + rmm::device_uvector weights_v(nnz, stream); - int* srcs = srcs_v.data().get(); - int* dests = dests_v.data().get(); - T* weights = weights_v.data().get(); + int* srcs = srcs_v.data(); + int* dests = dests_v.data(); + T* weights = weights_v.data(); // FIXME: RAFT error handling mechanism should be used instead CUDA_TRY(cudaMemcpy(srcs, &cooRowInd[0], sizeof(int) * nnz, cudaMemcpyDefault)); @@ -163,8 +164,9 @@ class Tests_Force_Atlas2 : public ::testing::TestWithParam if (PERF) { hr_clock.start(); for (int i = 0; i < PERF_MULTIPLIER; ++i) { - cugraph::force_atlas2(G, - d_force_atlas2, + cugraph::force_atlas2(handle, + G, + pos.data(), max_iter, x_start, y_start, @@ -185,8 +187,9 @@ class Tests_Force_Atlas2 : public ::testing::TestWithParam force_atlas2_time.push_back(time_tmp); } else { cudaProfilerStart(); - cugraph::force_atlas2(G, - d_force_atlas2, + cugraph::force_atlas2(handle, + G, + pos.data(), max_iter, x_start, y_start, @@ -207,7 +210,7 @@ class Tests_Force_Atlas2 : public ::testing::TestWithParam // Copy pos to host std::vector h_pos(m * 2); - CUDA_TRY(cudaMemcpy(&h_pos[0], d_force_atlas2, sizeof(float) * m * 2, cudaMemcpyDeviceToHost)); + CUDA_TRY(cudaMemcpy(&h_pos[0], pos.data(), sizeof(float) * m * 2, cudaMemcpyDeviceToHost)); // Transpose the data std::vector> C_contiguous_embedding(m, std::vector(2)); diff --git a/python/cugraph/layout/force_atlas2.pxd b/python/cugraph/layout/force_atlas2.pxd index bf5186c91f9..5496d1b655e 100644 --- a/python/cugraph/layout/force_atlas2.pxd +++ b/python/cugraph/layout/force_atlas2.pxd @@ -25,6 +25,7 @@ cdef extern from "cugraph/internals.hpp" namespace "cugraph::internals": cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": cdef void force_atlas2[vertex_t, edge_t, weight_t]( + const handle_t &handle, GraphCOOView[vertex_t, edge_t, weight_t] &graph, float *pos, const int max_iter, diff --git a/python/cugraph/layout/force_atlas2_wrapper.pyx b/python/cugraph/layout/force_atlas2_wrapper.pyx index 7b801d19f1c..1644875f034 100644 --- a/python/cugraph/layout/force_atlas2_wrapper.pyx +++ b/python/cugraph/layout/force_atlas2_wrapper.pyx @@ -49,6 +49,10 @@ def force_atlas2(input_graph, Call force_atlas2 """ + cdef unique_ptr[handle_t] handle_ptr + handle_ptr.reset(new handle_t()) + handle_ = handle_ptr.get(); + if not input_graph.edgelist: input_graph.view_edge_list() @@ -61,12 +65,19 @@ def force_atlas2(input_graph, df = cudf.DataFrame() df['vertex'] = cudf.Series(np.arange(num_verts, dtype=np.int32)) - cdef uintptr_t c_src_indices = input_graph.edgelist.edgelist_df['src'].__cuda_array_interface__['data'][0] - cdef uintptr_t c_dst_indices = input_graph.edgelist.edgelist_df['dst'].__cuda_array_interface__['data'][0] + src = input_graph.edgelist.edgelist_df['src'] + dst = input_graph.edgelist.edgelist_df['dst'] + + [src, dst] = graph_primtypes_wrapper.datatype_cast([src, dst], [np.int32]) + + cdef uintptr_t c_src_indices = src.__cuda_array_interface__['data'][0] + cdef uintptr_t c_dst_indices = dst.__cuda_array_interface__['data'][0] cdef uintptr_t c_weights = NULL if input_graph.edgelist.weights: - c_weights = input_graph.edgelist.edgelist_df['weights'].__cuda_array_interface__['data'][0] + weights = input_graph.edgelist.edgelist_df["weights"] + [weights] = graph_primtypes_wrapper.datatype_cast([weights], [np.float32, np.float64]) + c_weights = weights.__cuda_array_interface__['data'][0] cdef uintptr_t x_start = NULL cdef uintptr_t y_start = NULL @@ -100,7 +111,8 @@ def force_atlas2(input_graph, graph_double = GraphCOOView[int,int, double](c_src_indices, c_dst_indices, c_weights, num_verts, num_edges) - c_force_atlas2[int, int, double](graph_double, + c_force_atlas2[int, int, double](handle_[0], + graph_double, pos_ptr, max_iter, x_start, @@ -121,7 +133,8 @@ def force_atlas2(input_graph, graph_float = GraphCOOView[int,int,float](c_src_indices, c_dst_indices, c_weights, num_verts, num_edges) - c_force_atlas2[int, int, float](graph_float, + c_force_atlas2[int, int, float](handle_[0], + graph_float, pos_ptr, max_iter, x_start,