Update references to CHECK_CUDA, CUDA_CHECK and CUDA_TRY to use new RAFT_ names (rapidsai#2000)

Replaces rapidsai#1975

Update all references to `CHECK_CUDA`, `CUDA_CHECK` and `CUDA_TRY` to use the new `RAFT_`-prefixed names.

Also update calls to `cudaStreamSynchronize` to use the `handle.sync_stream()` method.
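As a rough sketch of the migration pattern (not an excerpt from this commit — the helper function, its arguments, and the exact RAFT include paths are assumptions and may differ between RAFT versions):

```cpp
#include <raft/cudart_utils.h>  // RAFT_CUDA_TRY / RAFT_CHECK_CUDA (assumed include path)
#include <raft/handle.hpp>      // raft::handle_t (assumed include path)

#include <cuda_runtime_api.h>

// Hypothetical helper (not part of this diff) showing the renamed macros and sync_stream().
void copy_counts_to_host(raft::handle_t const& handle, int* h_dst, int const* d_src, size_t n)
{
  // Before: CUDA_TRY(cudaMemcpyAsync(...));
  RAFT_CUDA_TRY(
    cudaMemcpyAsync(h_dst, d_src, n * sizeof(int), cudaMemcpyDeviceToHost, handle.get_stream()));

  // Before: CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
  handle.sync_stream();

  // Before: CHECK_CUDA(handle.get_stream());
  RAFT_CHECK_CUDA(handle.get_stream());
}
```

The hunks below apply this kind of substitution file by file.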

Authors:
  - Chuck Hastings (https://github.com/ChuckHastings)

Approvers:
  - Seunghwa Kang (https://github.com/seunghwak)
  - Rick Ratzel (https://github.com/rlratzel)

URL: rapidsai#2000
ChuckHastings authored Jan 6, 2022
1 parent c49f049 commit ba0e2d2
Showing 76 changed files with 519 additions and 535 deletions.
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -956,7 +956,7 @@ void update_frontier_v_push_if_out_nbr(
d_offsets.begin());
std::vector<vertex_t> h_offsets(d_offsets.size());
raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), handle.get_stream());
CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
h_offsets.push_back(matrix_partition_frontier_size);
// FIXME: we may further improve performance by 1) concurrently running kernels on different
// segments; 2) individually tuning block sizes for different segments; and 3) adding one more
7 changes: 4 additions & 3 deletions cpp/src/centrality/betweenness_centrality.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -95,7 +95,7 @@ vertex_t get_total_number_of_sources(raft::handle_t const& handle, vertex_t loca
raft::comms::op_t::SUM,
handle.get_stream());
total_number_of_sources_used = d_number_of_sources.value(handle.get_stream());
// CUDA_TRY(
// RAFT_CUDA_TRY(
// cudaMemcpy(&total_number_of_sources_used, data, sizeof(vertex_t), cudaMemcpyDeviceToHost));
}
return total_number_of_sources_used;
@@ -237,7 +237,8 @@ void BC<vertex_t, edge_t, weight_t, result_t>::compute_single_source(vertex_t so
auto current_max_depth =
thrust::max_element(handle_.get_thrust_policy(), distances_, distances_ + number_of_vertices_);
vertex_t max_depth = 0;
CUDA_TRY(cudaMemcpy(&max_depth, current_max_depth, sizeof(vertex_t), cudaMemcpyDeviceToHost));
RAFT_CUDA_TRY(
cudaMemcpy(&max_depth, current_max_depth, sizeof(vertex_t), cudaMemcpyDeviceToHost));
// Step 2) Dependency accumulation
accumulate(source_vertex, max_depth);
}
32 changes: 16 additions & 16 deletions cpp/src/community/legacy/triangles_counting.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -96,13 +96,13 @@ static inline void cubSum(InputIteratorT d_in,

cub::DeviceReduce::Sum(
nullptr, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
CHECK_CUDA(stream);
RAFT_CHECK_CUDA(stream);

rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);

cub::DeviceReduce::Sum(
d_temp_storage.data(), temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
CHECK_CUDA(stream);
RAFT_CHECK_CUDA(stream);

return;
}
@@ -130,7 +130,7 @@ static inline void cubIf(InputIteratorT d_in,
select_op,
stream,
debug_synchronous);
CHECK_CUDA(stream);
RAFT_CHECK_CUDA(stream);

rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);

@@ -143,7 +143,7 @@ static inline void cubIf(InputIteratorT d_in,
select_op,
stream,
debug_synchronous);
CHECK_CUDA(stream);
RAFT_CHECK_CUDA(stream);

return;
}
@@ -287,7 +287,7 @@ void tricnt_b2b(T nblock,
// still best overall (with no psum)
tricnt_b2b_k<THREADS, 32, BLK_BWL0><<<nblock, THREADS, 0, stream>>>(
m->nrows, m->rows_d, m->roff_d, m->cols_d, ocnt_d, bmapL0_d, bmldL0, bmapL1_d, bmldL1);
CHECK_CUDA(stream);
RAFT_CHECK_CUDA(stream);
return;
}

@@ -395,7 +395,7 @@ void tricnt_bsh(T nblock, spmat_t<T>* m, uint64_t* ocnt_d, size_t bmld, cudaStre
{
tricnt_bsh_k<THREADS, 32><<<nblock, THREADS, sizeof(unsigned int) * bmld, stream>>>(
m->nrows, m->rows_d, m->roff_d, m->cols_d, ocnt_d, bmld);
CHECK_CUDA(stream);
RAFT_CHECK_CUDA(stream);
return;
}

@@ -543,7 +543,7 @@ void tricnt_wrp(
dim3 block(32, THREADS / 32);
tricnt_wrp_ps_k<32, THREADS / 32, WP_LEN_TH1, WP_LEN_TH2>
<<<nblock, block, 0, stream>>>(m->nrows, m->rows_d, m->roff_d, m->cols_d, ocnt_d, bmap_d, bmld);
CHECK_CUDA(stream);
RAFT_CHECK_CUDA(stream);
return;
}

@@ -631,7 +631,7 @@ void tricnt_thr(T nblock, spmat_t<T>* m, uint64_t* ocnt_d, cudaStream_t stream)

tricnt_thr_k<THREADS, TH_CENT_K_LOCLEN>
<<<nblock, THREADS, 0, stream>>>(m->nrows, m->rows_d, m->roff_d, m->cols_d, ocnt_d);
CHECK_CUDA(stream);
RAFT_CHECK_CUDA(stream);
return;
}

@@ -657,7 +657,7 @@ void create_nondangling_vector(

cubIf(it, p_nonempty, out_num.data().get(), n, temp_func, stream);
cudaMemcpy(n_nonempty, out_num.data().get(), sizeof(*n_nonempty), cudaMemcpyDeviceToHost);
CHECK_CUDA(stream);
RAFT_CHECK_CUDA(stream);
}

template <typename T>
@@ -666,7 +666,7 @@ uint64_t reduce(uint64_t* v_d, T n, cudaStream_t stream)
rmm::device_vector<uint64_t> tmp(1);

cubSum(v_d, tmp.data().get(), n, stream);
CHECK_CUDA(stream);
RAFT_CHECK_CUDA(stream);

return tmp[0];
}
@@ -717,12 +717,12 @@ TrianglesCount<IndexType>::TrianglesCount(IndexType num_vertices,
cudaGetDevice(&device_id);

cudaDeviceGetAttribute(&m_shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id);
CHECK_CUDA(m_stream);
RAFT_CHECK_CUDA(m_stream);
cudaDeviceGetAttribute(&m_multi_processor_count, cudaDevAttrMultiProcessorCount, device_id);
CHECK_CUDA(m_stream);
RAFT_CHECK_CUDA(m_stream);
cudaDeviceGetAttribute(
&m_max_threads_per_multi_processor, cudaDevAttrMaxThreadsPerMultiProcessor, device_id);
CHECK_CUDA(m_stream);
RAFT_CHECK_CUDA(m_stream);

m_seq.resize(m_mat.N, IndexType{0});
create_nondangling_vector(m_mat.roff_d, m_seq.data().get(), &(m_mat.nrows), m_mat.N, m_stream);
@@ -758,7 +758,7 @@ void TrianglesCount<IndexType>::tcount_b2b()

size_t free_bytes, total_bytes;
cudaMemGetInfo(&free_bytes, &total_bytes);
CHECK_CUDA(m_stream);
RAFT_CHECK_CUDA(m_stream);

size_t nblock_available = (free_bytes * 95 / 100) / (sizeof(uint32_t) * bmldL1);

@@ -793,7 +793,7 @@ void TrianglesCount<IndexType>::tcount_wrp()
// number of blocks limited by birmap size
size_t free_bytes, total_bytes;
cudaMemGetInfo(&free_bytes, &total_bytes);
CHECK_CUDA(m_stream);
RAFT_CHECK_CUDA(m_stream);

size_t nblock_available = (free_bytes * 95 / 100) / (sizeof(uint32_t) * bmld * (THREADS / 32));

10 changes: 6 additions & 4 deletions cpp/src/components/utils.h
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -92,7 +92,7 @@ class Exception : public std::exception {
template <typename Type>
void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream)
{
CUDA_TRY(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream));
RAFT_CUDA_TRY(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream));
}

/**
@@ -118,7 +118,8 @@ void updateHost(Type* hPtr, const Type* dPtr, size_t len, cudaStream_t stream)
template <typename Type>
void copyAsync(Type* dPtr1, const Type* dPtr2, size_t len, cudaStream_t stream)
{
CUDA_TRY(cudaMemcpyAsync(dPtr1, dPtr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream));
RAFT_CUDA_TRY(
cudaMemcpyAsync(dPtr1, dPtr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream));
}
/** @} */

@@ -189,7 +190,8 @@ void myPrintDevVector(const char* variableName,
OutStream& out)
{
std::vector<T> hostMem(componentsCount);
CUDA_TRY(cudaMemcpy(hostMem.data(), devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost));
RAFT_CUDA_TRY(
cudaMemcpy(hostMem.data(), devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost));
myPrintHostVector(variableName, hostMem.data(), componentsCount, out);
}

14 changes: 7 additions & 7 deletions cpp/src/components/weak_cc.cuh
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -165,22 +165,22 @@ void weak_cc_label_batched(vertex_t* labels,
weak_cc_init_label_kernel<vertex_t, TPB_X>
<<<blocks, threads, 0, stream>>>(labels, startVertexId, batchSize, MAX_LABEL, filter_op);

CUDA_TRY(cudaPeekAtLastError());
RAFT_CUDA_TRY(cudaPeekAtLastError());

int n_iters = 0;
do {
CUDA_TRY(cudaMemsetAsync(state.m, false, sizeof(bool), stream));
RAFT_CUDA_TRY(cudaMemsetAsync(state.m, false, sizeof(bool), stream));

weak_cc_label_device<vertex_t, edge_t, TPB_X><<<blocks, threads, 0, stream>>>(
labels, offsets, indices, nnz, state.fa, state.xa, state.m, startVertexId, batchSize);
CUDA_TRY(cudaPeekAtLastError());
CUDA_TRY(cudaStreamSynchronize(stream));
RAFT_CUDA_TRY(cudaPeekAtLastError());
RAFT_CUDA_TRY(cudaStreamSynchronize(stream));

thrust::swap(state.fa, state.xa);

//** Updating m *
MLCommon::updateHost(&host_m, state.m, 1, stream);
CUDA_TRY(cudaStreamSynchronize(stream));
RAFT_CUDA_TRY(cudaStreamSynchronize(stream));

n_iters++;
} while (host_m);
@@ -235,7 +235,7 @@ void weak_cc_batched(vertex_t* labels,
if (startVertexId == 0) {
weak_cc_init_all_kernel<vertex_t, TPB_X>
<<<blocks, threads, 0, stream>>>(labels, state.fa, state.xa, N, MAX_LABEL);
CUDA_TRY(cudaPeekAtLastError());
RAFT_CUDA_TRY(cudaPeekAtLastError());
}

weak_cc_label_batched<vertex_t, edge_t, TPB_X>(
14 changes: 7 additions & 7 deletions cpp/src/converters/COOtoCSR.cuh
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -69,27 +69,27 @@ VT sort(legacy::GraphCOOView<VT, ET, WT>& graph, rmm::cuda_stream_view stream_vi
graph.dst_indices,
graph.dst_indices + graph.number_of_edges,
thrust::make_zip_iterator(thrust::make_tuple(graph.src_indices, graph.edge_data)));
CUDA_TRY(cudaMemcpy(
RAFT_CUDA_TRY(cudaMemcpy(
&max_dst_id, &(graph.dst_indices[graph.number_of_edges - 1]), sizeof(VT), cudaMemcpyDefault));
thrust::stable_sort_by_key(
rmm::exec_policy(stream_view),
graph.src_indices,
graph.src_indices + graph.number_of_edges,
thrust::make_zip_iterator(thrust::make_tuple(graph.dst_indices, graph.edge_data)));
CUDA_TRY(cudaMemcpy(
RAFT_CUDA_TRY(cudaMemcpy(
&max_src_id, &(graph.src_indices[graph.number_of_edges - 1]), sizeof(VT), cudaMemcpyDefault));
} else {
thrust::stable_sort_by_key(rmm::exec_policy(stream_view),
graph.dst_indices,
graph.dst_indices + graph.number_of_edges,
graph.src_indices);
CUDA_TRY(cudaMemcpy(
RAFT_CUDA_TRY(cudaMemcpy(
&max_dst_id, &(graph.dst_indices[graph.number_of_edges - 1]), sizeof(VT), cudaMemcpyDefault));
thrust::stable_sort_by_key(rmm::exec_policy(stream_view),
graph.src_indices,
graph.src_indices + graph.number_of_edges,
graph.dst_indices);
CUDA_TRY(cudaMemcpy(
RAFT_CUDA_TRY(cudaMemcpy(
&max_src_id, &(graph.src_indices[graph.number_of_edges - 1]), sizeof(VT), cudaMemcpyDefault));
}
return std::max(max_src_id, max_dst_id) + 1;
@@ -177,10 +177,10 @@ void coo_to_csr_inplace(legacy::GraphCOOView<VT, ET, WT>& graph,
graph.number_of_edges,
stream_view);

CUDA_TRY(cudaMemcpy(
RAFT_CUDA_TRY(cudaMemcpy(
result.indices, graph.dst_indices, sizeof(VT) * graph.number_of_edges, cudaMemcpyDefault));
if (graph.has_data())
CUDA_TRY(cudaMemcpy(
RAFT_CUDA_TRY(cudaMemcpy(
result.edge_data, graph.edge_data, sizeof(WT) * graph.number_of_edges, cudaMemcpyDefault));
}

24 changes: 12 additions & 12 deletions cpp/src/layout/barnes_hut.cuh
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -80,7 +80,7 @@ void barnes_hut(raft::handle_t const& handle,
float* radiusd = d_radiusd.data();

InitializationKernel<<<1, 1, 0, stream_view.value()>>>(limiter, maxdepthd, radiusd);
CHECK_CUDA(stream_view.value());
RAFT_CHECK_CUDA(stream_view.value());

const int FOUR_NNODES = 4 * nnodes;
const int FOUR_N = 4 * n;
@@ -158,10 +158,10 @@ void barnes_hut(raft::handle_t const& handle,

// Sort COO for coalesced memory access.
sort(graph, stream_view.value());
CHECK_CUDA(stream_view.value());
RAFT_CHECK_CUDA(stream_view.value());

graph.degree(massl, cugraph::legacy::DegreeDirection::OUT);
CHECK_CUDA(stream_view.value());
RAFT_CHECK_CUDA(stream_view.value());

const vertex_t* row = graph.src_indices;
const vertex_t* col = graph.dst_indices;
@@ -204,7 +204,7 @@ void barnes_hut(raft::handle_t const& handle,
thrust::fill(handle.get_thrust_policy(), d_traction.begin(), d_traction.end(), 0.f);

ResetKernel<<<1, 1, 0, stream_view.value()>>>(radiusd_squared, bottomd, NNODES, radiusd);
CHECK_CUDA(stream_view.value());
RAFT_CHECK_CUDA(stream_view.value());

// Compute bounding box arround all bodies
BoundingBoxKernel<<<blocks * FACTOR1, THREADS1, 0, stream_view.value()>>>(
@@ -222,28 +222,28 @@ void barnes_hut(raft::handle_t const& handle,
n,
limiter,
radiusd);
CHECK_CUDA(stream_view.value());
RAFT_CHECK_CUDA(stream_view.value());

ClearKernel1<<<blocks, 1024, 0, stream_view.value()>>>(childl, FOUR_NNODES, FOUR_N);
CHECK_CUDA(stream_view.value());
RAFT_CHECK_CUDA(stream_view.value());

// Build quadtree
TreeBuildingKernel<<<blocks * FACTOR2, THREADS2, 0, stream_view.value()>>>(
childl, nodes_pos, nodes_pos + nnodes + 1, NNODES, n, maxdepthd, bottomd, radiusd);
CHECK_CUDA(stream_view.value());
RAFT_CHECK_CUDA(stream_view.value());

ClearKernel2<<<blocks, 1024, 0, stream_view.value()>>>(startl, massl, NNODES, bottomd);
CHECK_CUDA(stream_view.value());
RAFT_CHECK_CUDA(stream_view.value());

// Summarizes mass and position for each cell, bottom up approach
SummarizationKernel<<<blocks * FACTOR3, THREADS3, 0, stream_view.value()>>>(
countl, childl, massl, nodes_pos, nodes_pos + nnodes + 1, NNODES, n, bottomd);
CHECK_CUDA(stream_view.value());
RAFT_CHECK_CUDA(stream_view.value());

// Group closed bodies together, used to speed up Repulsion kernel
SortKernel<<<blocks * FACTOR4, THREADS4, 0, stream_view.value()>>>(
sortl, countl, startl, childl, NNODES, n, bottomd);
CHECK_CUDA(stream_view.value());
RAFT_CHECK_CUDA(stream_view.value());

// Force computation O(n . log(n))
RepulsionKernel<<<blocks * FACTOR5, THREADS5, 0, stream_view.value()>>>(scaling_ratio,
@@ -262,7 +262,7 @@ void barnes_hut(raft::handle_t const& handle,
n,
radiusd_squared,
maxdepthd);
CHECK_CUDA(stream_view.value());
RAFT_CHECK_CUDA(stream_view.value());

apply_gravity<vertex_t>(nodes_pos,
nodes_pos + nnodes + 1,