From 0420df248ff578af3ee0d75245ec789305350f4b Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Tue, 2 Jan 2018 16:08:58 -0800 Subject: [PATCH] `reduce`: Combine allocation of the result value and temporary scratch space in the new implementation to resolve performance regressions. First commit through git-p4 from mirrored Git/Perforce history. bug 200355591 bug 1997368 bug 1844781 GH #888 git-commit 140a31d206168a4dde611a8825009832b96e01f3 git-author Bryce Adelstein Lelbach aka wash Jobs: 1844781-2006 1997368-2006 200355591-2006 [git-p4: depot-paths = "//sw/gpgpu/thrust/": change = 23352743] --- thrust/system/cuda/detail/reduce.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/thrust/system/cuda/detail/reduce.h b/thrust/system/cuda/detail/reduce.h index 793f0624d8..db84bf439e 100644 --- a/thrust/system/cuda/detail/reduce.h +++ b/thrust/system/cuda/detail/reduce.h @@ -939,20 +939,19 @@ reduce_n(execution_policy &policy, if (__THRUST_HAS_CUDART__) { - detail::temporary_array ret(policy, 1); - // Determine temporary device storage requirements. + T* ret_ptr = NULL; size_t tmp_size = 0; cuda_cub::throw_on_error( cub::DeviceReduce::Reduce(NULL, tmp_size, - first, ret.begin(), num_items, binary_op, init, + first, ret_ptr, num_items, binary_op, init, stream, THRUST_DEBUG_SYNC_FLAG), "after reduction step 1"); // Allocate temporary storage. - detail::temporary_array tmp(policy, tmp_size); + detail::temporary_array tmp(policy, sizeof(T) + tmp_size); // Run reduction. @@ -960,21 +959,24 @@ reduce_n(execution_policy &policy, // `reference`, which has an `operator&` that returns a `pointer`, which // has a `.get` method that returns a raw pointer, which we can (finally) // `static_cast` to `void*`. - void* tmp_ptr = static_cast((&*tmp.begin()).get()); + ret_ptr = reinterpret_cast((&*tmp.begin()).get()); + void* tmp_ptr = static_cast((&*(tmp.begin() + sizeof(T))).get()); cuda_cub::throw_on_error( cub::DeviceReduce::Reduce(tmp_ptr, tmp_size, - first, ret.begin(), num_items, binary_op, init, + first, ret_ptr, num_items, binary_op, init, stream, THRUST_DEBUG_SYNC_FLAG), "after reduction step 2"); + // Synchronize the stream and get the value. + cuda_cub::throw_on_error(cuda_cub::synchronize(policy), "reduce failed to synchronize"); - // `ret.begin()` yields a `normal_iterator`, which dereferences to a + // `tmp.begin()` yields a `normal_iterator`, which dereferences to a // `reference`, which has an `operator&` that returns a `pointer`, which // has a `.get` method that returns a raw pointer, which we can (finally) // `static_cast` to `void*`. - return cuda_cub::get_value(policy, (&*ret.begin()).get()); + return cuda_cub::get_value(policy, reinterpret_cast((&*tmp.begin()).get())); } #if !__THRUST_HAS_CUDART__