Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
The great Thrust index type fix, part 1: adjacent_difference, reduce.
Browse files Browse the repository at this point in the history
  • Loading branch information
griwes committed Feb 11, 2020
1 parent 42e4491 commit 500c4e0
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 51 deletions.
64 changes: 57 additions & 7 deletions testing/adjacent_difference.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#include <thrust/adjacent_difference.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/retag.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>

template <class Vector>
void TestAdjacentDifferenceSimple(void)
Expand All @@ -13,21 +15,21 @@ void TestAdjacentDifferenceSimple(void)
input[0] = 1; input[1] = 4; input[2] = 6;

typename Vector::iterator result;

result = thrust::adjacent_difference(input.begin(), input.end(), output.begin());

ASSERT_EQUAL(result - output.begin(), 3);
ASSERT_EQUAL(output[0], T(1));
ASSERT_EQUAL(output[1], T(3));
ASSERT_EQUAL(output[2], T(2));

result = thrust::adjacent_difference(input.begin(), input.end(), output.begin(), thrust::plus<T>());

ASSERT_EQUAL(result - output.begin(), 3);
ASSERT_EQUAL(output[0], T( 1));
ASSERT_EQUAL(output[1], T( 5));
ASSERT_EQUAL(output[2], T(10));

// test in-place operation, result and first are permitted to be the same
result = thrust::adjacent_difference(input.begin(), input.end(), input.begin());

Expand Down Expand Up @@ -57,14 +59,14 @@ void TestAdjacentDifference(const size_t n)
ASSERT_EQUAL(std::size_t(h_result - h_output.begin()), n);
ASSERT_EQUAL(std::size_t(d_result - d_output.begin()), n);
ASSERT_EQUAL(h_output, d_output);

h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus<T>());
d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin(), thrust::plus<T>());

ASSERT_EQUAL(std::size_t(h_result - h_output.begin()), n);
ASSERT_EQUAL(std::size_t(d_result - d_output.begin()), n);
ASSERT_EQUAL(h_output, d_output);

// in-place operation
h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_input.begin(), thrust::plus<T>());
d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_input.begin(), thrust::plus<T>());
Expand All @@ -90,7 +92,7 @@ void TestAdjacentDifferenceInPlaceWithRelatedIteratorTypes(const size_t n)

h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus<T>());
d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin(), thrust::plus<T>());

// in-place operation with different iterator types
h_result = thrust::adjacent_difference(h_input.cbegin(), h_input.cend(), h_input.begin(), thrust::plus<T>());
d_result = thrust::adjacent_difference(d_input.cbegin(), d_input.cend(), d_input.begin(), thrust::plus<T>());
Expand Down Expand Up @@ -160,3 +162,51 @@ void TestAdjacentDifferenceDispatchImplicit()
}
DECLARE_UNITTEST(TestAdjacentDifferenceDispatchImplicit);

struct detect_wrong_difference
{
bool * flag;

__host__ __device__ detect_wrong_difference operator++() const { return *this; }
__host__ __device__ detect_wrong_difference operator*() const { return *this; }
template<typename Difference>
__host__ __device__ detect_wrong_difference operator+(Difference) const { return *this; }
template<typename Index>
__host__ __device__ detect_wrong_difference operator[](Index) const { return *this; }

__device__
void operator=(long long difference) const
{
if (difference != 1)
{
*flag = false;
}
}
};

void TestAdjacentDifferenceWithBigIndexesHelper(int magnitude)
{
thrust::counting_iterator<long long> begin(1);
thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);

thrust::device_ptr<bool> all_differences_correct = thrust::device_malloc<bool>(1);
*all_differences_correct = true;

detect_wrong_difference out = { thrust::raw_pointer_cast(all_differences_correct) };

thrust::adjacent_difference(thrust::device, begin, end, out);

bool all_differences_correct_h = *all_differences_correct;
thrust::device_free(all_differences_correct);

ASSERT_EQUAL(all_differences_correct_h, true);
}

void TestAdjacentDifferenceWithBigIndexes()
{
TestAdjacentDifferenceWithBigIndexesHelper(30);
TestAdjacentDifferenceWithBigIndexesHelper(31);
TestAdjacentDifferenceWithBigIndexesHelper(32);
TestAdjacentDifferenceWithBigIndexesHelper(33);
}
DECLARE_UNITTEST(TestAdjacentDifferenceWithBigIndexes);
35 changes: 13 additions & 22 deletions thrust/system/cuda/detail/adjacent_difference.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include <cub/block/block_adjacent_difference.cuh>
#include <thrust/system/cuda/detail/core/agent_launcher.h>
#include <thrust/system/cuda/detail/par_to_seq.h>
#include <thrust/system/cuda/detail/dispatch.h>
#include <thrust/functional.h>
#include <thrust/distance.h>
#include <thrust/detail/mpl/math.h>
Expand Down Expand Up @@ -257,8 +258,8 @@ namespace __adjacent_difference {

template <bool IS_LAST_TILE>
void THRUST_DEVICE_FUNCTION
consume_tile(Size num_remaining,
Size tile_idx,
consume_tile(int num_remaining,
int tile_idx,
Size tile_base)
{
if (tile_idx == 0)
Expand All @@ -279,7 +280,7 @@ namespace __adjacent_difference {
consume_range(Size num_items)
{
int tile_idx = blockIdx.x;
Size tile_base = tile_idx * ITEMS_PER_TILE;
Size tile_base = static_cast<Size>(tile_idx) * ITEMS_PER_TILE;
Size num_remaining = num_items - tile_base;

if (num_remaining > ITEMS_PER_TILE) // not a last tile
Expand Down Expand Up @@ -349,7 +350,7 @@ namespace __adjacent_difference {
char * /*shmem*/)
{
int tile_idx = blockIdx.x * blockDim.x + threadIdx.x;
int tile_base = tile_idx * items_per_tile;
Size tile_base = static_cast<Size>(tile_idx) * items_per_tile;
if (tile_base > 0 && tile_idx < num_tiles)
result[tile_idx] = first[tile_base - 1];
}
Expand Down Expand Up @@ -391,8 +392,8 @@ namespace __adjacent_difference {
AgentPlan init_plan = init_agent::get_plan();


size_t tile_size = difference_plan.items_per_tile;
size_t num_tiles = (num_items + tile_size - 1) / tile_size;
Size tile_size = difference_plan.items_per_tile;
Size num_tiles = (num_items + tile_size - 1) / tile_size;

size_t tmp1 = num_tiles * sizeof(input_type);
size_t vshmem_size = core::vshmem_size(difference_plan.shared_memory_size,
Expand Down Expand Up @@ -448,29 +449,19 @@ namespace __adjacent_difference {
bool debug_sync = THRUST_DEBUG_SYNC_FLAG;

cudaError_t status;
status = doit_step(NULL,
storage_size,
first,
result,
binary_op,
num_items,
stream,
debug_sync);
THRUST_INDEX_TYPE_DISPATCH(status, doit_step, num_items,
(NULL, storage_size, first, result, binary_op,
num_items_fixed, stream, debug_sync));
cuda_cub::throw_on_error(status, "adjacent_difference failed on 1st step");

// Allocate temporary storage.
thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
tmp(policy, storage_size);
void *ptr = static_cast<void*>(tmp.data().get());

status = doit_step(ptr,
storage_size,
first,
result,
binary_op,
num_items,
stream,
debug_sync);
THRUST_INDEX_TYPE_DISPATCH(status, doit_step, num_items,
(ptr, storage_size, first, result, binary_op,
num_items_fixed, stream, debug_sync));
cuda_cub::throw_on_error(status, "adjacent_difference failed on 2nd step");

status = cuda_cub::synchronize(policy);
Expand Down
57 changes: 57 additions & 0 deletions thrust/system/cuda/detail/dispatch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
* Copyright 2018 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <thrust/detail/preprocessor.h>

/**
* Dispatch between 32-bit and 64-bit index based versions of the same algorithm
* implementation. This version assumes that callables for both branches consist
* of the same tokens, and is intended to be used with Thrust-style dispatch
* interfaces, that always deduce the size type from the arguments.
*/
#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \
if (count <= std::numeric_limits<thrust::detail::int32_t>::max()) { \
thrust::detail::int32_t THRUST_PP_CAT2(count, _fixed) = count; \
status = call arguments; \
} \
else { \
thrust::detail::int64_t THRUST_PP_CAT2(count, _fixed) = count; \
status = call arguments; \
}

/**
* Dispatch between 32-bit and 64-bit index based versions of the same algorithm
* implementation. This version allows using different token sequences for callables
* in both branches, and is intended to be used with CUB-style dispatch interfaces,
* where the "simple" interface always forces the size to be `int` (making it harder
* for us to use), but the complex interface that we end up using doesn't actually
* provide a way to fully deduce the type from just the call, making the size type
* appear in the token sequence of the callable.
*
* See reduce_n_impl to see an example of how this is meant to be used.
*/
#define THRUST_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \
if (count <= std::numeric_limits<thrust::detail::int32_t>::max()) { \
thrust::detail::int32_t THRUST_PP_CAT2(count, _fixed) = count; \
status = call_32 arguments; \
} \
else { \
thrust::detail::int64_t THRUST_PP_CAT2(count, _fixed) = count; \
status = call_64 arguments; \
}

45 changes: 23 additions & 22 deletions thrust/system/cuda/detail/reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include <cub/device/device_reduce.cuh>
#include <thrust/system/cuda/detail/par_to_seq.h>
#include <thrust/system/cuda/detail/get_value.h>
#include <thrust/system/cuda/detail/dispatch.h>
#include <thrust/functional.h>
#include <thrust/system/cuda/detail/core/agent_launcher.h>
#include <thrust/detail/minmax.h>
Expand Down Expand Up @@ -930,21 +931,22 @@ T reduce_n_impl(execution_policy<Derived>& policy,
BinaryOp binary_op)
{
cudaStream_t stream = cuda_cub::stream(policy);
cudaError_t status;

// Determine temporary device storage requirements.

size_t tmp_size = 0;
cuda_cub::throw_on_error(
cub::DeviceReduce::Reduce(NULL,
tmp_size,
first,
reinterpret_cast<T*>(NULL),
num_items,
binary_op,
init,
stream,
THRUST_DEBUG_SYNC_FLAG),
"after reduction step 1");

THRUST_INDEX_TYPE_DISPATCH2(status,
cub::DeviceReduce::Reduce,
(cub::DispatchReduce<
InputIt, T*, Size, BinaryOp
>::Dispatch),
num_items,
(NULL, tmp_size, first, reinterpret_cast<T*>(NULL),
num_items_fixed, binary_op, init, stream,
THRUST_DEBUG_SYNC_FLAG));
cuda_cub::throw_on_error(status, "after reduction step 1");

// Allocate temporary storage.

Expand All @@ -963,17 +965,16 @@ T reduce_n_impl(execution_policy<Derived>& policy,
// make this guarantee.
T* ret_ptr = thrust::detail::aligned_reinterpret_cast<T*>(tmp.data().get());
void* tmp_ptr = static_cast<void*>((tmp.data() + sizeof(T)).get());
cuda_cub::throw_on_error(
cub::DeviceReduce::Reduce(tmp_ptr,
tmp_size,
first,
ret_ptr,
num_items,
binary_op,
init,
stream,
THRUST_DEBUG_SYNC_FLAG),
"after reduction step 2");
THRUST_INDEX_TYPE_DISPATCH2(status,
cub::DeviceReduce::Reduce,
(cub::DispatchReduce<
InputIt, T*, Size, BinaryOp
>::Dispatch),
num_items,
(tmp_ptr, tmp_size, first, ret_ptr,
num_items_fixed, binary_op, init, stream,
THRUST_DEBUG_SYNC_FLAG));
cuda_cub::throw_on_error(status, "after reduction step 2");

// Synchronize the stream and get the value.

Expand Down

0 comments on commit 500c4e0

Please sign in to comment.