Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
reduce: Combine allocation of the result value and temporary scratc…
Browse files Browse the repository at this point in the history
…h space in the new implementation to resolve performance regressions.

First commit through git-p4 from mirrored Git/Perforce history.
bug 200355591
bug 1997368
bug 1844781
GH #888
git-commit 140a31d206168a4dde611a8825009832b96e01f3
git-author Bryce Adelstein Lelbach aka wash <[email protected]>

Jobs: 1844781-2006 1997368-2006 200355591-2006
[git-p4: depot-paths = "//sw/gpgpu/thrust/": change = 23352743]
  • Loading branch information
brycelelbach committed May 29, 2018
1 parent 527e492 commit 0420df2
Showing 1 changed file with 10 additions and 8 deletions.
18 changes: 10 additions & 8 deletions thrust/system/cuda/detail/reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -939,42 +939,44 @@ reduce_n(execution_policy<Derived> &policy,

if (__THRUST_HAS_CUDART__)
{
detail::temporary_array<T, Derived> ret(policy, 1);

// Determine temporary device storage requirements.

T* ret_ptr = NULL;
size_t tmp_size = 0;
cuda_cub::throw_on_error(
cub::DeviceReduce::Reduce(NULL, tmp_size,
first, ret.begin(), num_items, binary_op, init,
first, ret_ptr, num_items, binary_op, init,
stream, THRUST_DEBUG_SYNC_FLAG),
"after reduction step 1");

// Allocate temporary storage.

detail::temporary_array<detail::uint8_t, Derived> tmp(policy, tmp_size);
detail::temporary_array<detail::uint8_t, Derived> tmp(policy, sizeof(T) + tmp_size);

// Run reduction.

// `tmp.begin()` yields a `normal_iterator`, which dereferences to a
// `reference`, which has an `operator&` that returns a `pointer`, which
// has a `.get` method that returns a raw pointer, which we can (finally)
// `static_cast` to `void*`.
void* tmp_ptr = static_cast<void*>((&*tmp.begin()).get());
ret_ptr = reinterpret_cast<T*>((&*tmp.begin()).get());
void* tmp_ptr = static_cast<void*>((&*(tmp.begin() + sizeof(T))).get());
cuda_cub::throw_on_error(
cub::DeviceReduce::Reduce(tmp_ptr, tmp_size,
first, ret.begin(), num_items, binary_op, init,
first, ret_ptr, num_items, binary_op, init,
stream, THRUST_DEBUG_SYNC_FLAG),
"after reduction step 2");

// Synchronize the stream and get the value.

cuda_cub::throw_on_error(cuda_cub::synchronize(policy),
"reduce failed to synchronize");

// `ret.begin()` yields a `normal_iterator`, which dereferences to a
// `tmp.begin()` yields a `normal_iterator`, which dereferences to a
// `reference`, which has an `operator&` that returns a `pointer`, which
// has a `.get` method that returns a raw pointer, which we can (finally)
// `static_cast` to `void*`.
return cuda_cub::get_value(policy, (&*ret.begin()).get());
return cuda_cub::get_value(policy, reinterpret_cast<T*>((&*tmp.begin()).get()));
}

#if !__THRUST_HAS_CUDART__
Expand Down

0 comments on commit 0420df2

Please sign in to comment.