From a04398559292f6b5aff249f1b2fb045940aec6cc Mon Sep 17 00:00:00 2001 From: James Wyles Date: Wed, 22 May 2019 09:36:42 -0600 Subject: [PATCH] Changed C-style casts to static_cast, changed max number of blocks to be based on the sm count, added cudaCheckError() calls after kernel launches --- cpp/src/snmg/degree/degree.cuh | 57 ++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/cpp/src/snmg/degree/degree.cuh b/cpp/src/snmg/degree/degree.cuh index 5e153fd69e3..135496fd37f 100644 --- a/cpp/src/snmg/degree/degree.cuh +++ b/cpp/src/snmg/degree/degree.cuh @@ -55,31 +55,35 @@ namespace cugraph { // In-degree if (x == 1 || x == 0) { dim3 nthreads, nblocks; - nthreads.x = min((idx_t) loc_e, (idx_t) CUDA_MAX_KERNEL_THREADS); + nthreads.x = min(static_cast(loc_e), static_cast(CUDA_MAX_KERNEL_THREADS)); nthreads.y = 1; nthreads.z = 1; - nblocks.x = min((idx_t) ((loc_e + nthreads.x - 1) / nthreads.x), (idx_t) CUDA_MAX_BLOCKS); + nblocks.x = min(static_cast((loc_e + nthreads.x - 1) / nthreads.x), + static_cast(env.get_num_sm() * 32)); nblocks.y = 1; nblocks.z = 1; - degree_coo <<>>((idx_t) loc_e, - (idx_t) loc_e, + degree_coo <<>>(static_cast(loc_e), + static_cast(loc_e), ind, local_result); + cudaCheckError(); } // Out-degree if (x == 2 || x == 0) { dim3 nthreads, nblocks; - nthreads.x = min((idx_t) loc_v, (idx_t) CUDA_MAX_KERNEL_THREADS); + nthreads.x = min(static_cast(loc_v), static_cast(CUDA_MAX_KERNEL_THREADS)); nthreads.y = 1; nthreads.z = 1; - nblocks.x = min((idx_t) ((loc_v + nthreads.x - 1) / nthreads.x), (idx_t) CUDA_MAX_BLOCKS); + nblocks.x = min(static_cast((loc_v + nthreads.x - 1) / nthreads.x), + static_cast(env.get_num_sm() * 32)); nblocks.y = 1; nblocks.z = 1; - degree_offsets <<>>((idx_t) loc_v, - (idx_t) loc_e, + degree_offsets <<>>(static_cast(loc_v), + static_cast(loc_e), off, local_result + part_off[i]); + cudaCheckError(); } // Combining the local results into global results @@ -118,42 +122,49 @@ namespace cugraph { // In-degree if (x == 1 || x == 0) { dim3 nthreads, nblocks; - nthreads.x = min((int64_t) loc_e, (int64_t) CUDA_MAX_KERNEL_THREADS); + nthreads.x = min(static_cast(loc_e), static_cast(CUDA_MAX_KERNEL_THREADS)); nthreads.y = 1; nthreads.z = 1; - nblocks.x = min((int64_t)((loc_e + nthreads.x - 1) / nthreads.x), (int64_t) CUDA_MAX_BLOCKS); + nblocks.x = min(static_cast((loc_e + nthreads.x - 1) / nthreads.x), + static_cast(env.get_num_sm() * 32)); nblocks.y = 1; nblocks.z = 1; - degree_coo <<>>((int64_t) loc_e, - (int64_t) loc_e, - ind, - (double*) local_result); + degree_coo <<>>(static_cast(loc_e), + static_cast(loc_e), + ind, + static_cast(local_result)); + cudaCheckError(); } // Out-degree if (x == 2 || x == 0) { dim3 nthreads, nblocks; - nthreads.x = min((int64_t) loc_v, (int64_t) CUDA_MAX_KERNEL_THREADS); + nthreads.x = min(static_cast(loc_v), static_cast(CUDA_MAX_KERNEL_THREADS)); nthreads.y = 1; nthreads.z = 1; - nblocks.x = min((int64_t)((loc_v + nthreads.x - 1) / nthreads.x), (int64_t) CUDA_MAX_BLOCKS); + nblocks.x = min(static_cast((loc_v + nthreads.x - 1) / nthreads.x), + static_cast(env.get_num_sm() * 32)); nblocks.y = 1; nblocks.z = 1; - degree_offsets <<>>((int64_t) loc_v, - (int64_t) loc_e, - off, - (double*) (local_result + part_off[i])); + degree_offsets <<>>(static_cast(loc_v), + static_cast(loc_e), + off, + static_cast(local_result + + part_off[i])); + cudaCheckError(); } // Convert the values written as doubles back to int64: dim3 nthreads, nblocks; - nthreads.x = min((int64_t) glob_v, (int64_t) CUDA_MAX_KERNEL_THREADS); + nthreads.x = min(static_cast(glob_v), static_cast(CUDA_MAX_KERNEL_THREADS)); nthreads.y = 1; nthreads.z = 1; - nblocks.x = min((int64_t)((glob_v + nthreads.x - 1) / nthreads.x), (int64_t) CUDA_MAX_BLOCKS); + nblocks.x = min(static_cast((glob_v + nthreads.x - 1) / nthreads.x), + static_cast(env.get_num_sm() * 32)); nblocks.y = 1; nblocks.z = 1; - type_convert <<>>((double*) local_result, glob_v); + type_convert <<>>(static_cast(local_result), glob_v); + cudaCheckError(); // Combining the local results into global results treeReduce >(env, glob_v, local_result, degree);