The great Thrust index type fix, part 1: adjacent_difference, reduce.

NVIDIA · Feb 11, 2020 · 500c4e0 · 500c4e0
1 parent 42e4491
commit 500c4e0
Show file tree

Hide file tree

Showing 4 changed files with 150 additions and 51 deletions.
diff --git a/testing/adjacent_difference.cu b/testing/adjacent_difference.cu
@@ -2,6 +2,8 @@
 #include <thrust/adjacent_difference.h>
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
 
 template <class Vector>
 void TestAdjacentDifferenceSimple(void)
@@ -13,21 +15,21 @@ void TestAdjacentDifferenceSimple(void)
     input[0] = 1; input[1] = 4; input[2] = 6;
 
     typename Vector::iterator result;
-    
+
     result = thrust::adjacent_difference(input.begin(), input.end(), output.begin());
 
     ASSERT_EQUAL(result - output.begin(), 3);
     ASSERT_EQUAL(output[0], T(1));
     ASSERT_EQUAL(output[1], T(3));
     ASSERT_EQUAL(output[2], T(2));
-    
+
     result = thrust::adjacent_difference(input.begin(), input.end(), output.begin(), thrust::plus<T>());
-    
+
     ASSERT_EQUAL(result - output.begin(), 3);
     ASSERT_EQUAL(output[0], T( 1));
     ASSERT_EQUAL(output[1], T( 5));
     ASSERT_EQUAL(output[2], T(10));
-    
+
     // test in-place operation, result and first are permitted to be the same
     result = thrust::adjacent_difference(input.begin(), input.end(), input.begin());
 
@@ -57,14 +59,14 @@ void TestAdjacentDifference(const size_t n)
     ASSERT_EQUAL(std::size_t(h_result - h_output.begin()), n);
     ASSERT_EQUAL(std::size_t(d_result - d_output.begin()), n);
     ASSERT_EQUAL(h_output, d_output);
-    
+
     h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus<T>());
     d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin(), thrust::plus<T>());
 
     ASSERT_EQUAL(std::size_t(h_result - h_output.begin()), n);
     ASSERT_EQUAL(std::size_t(d_result - d_output.begin()), n);
     ASSERT_EQUAL(h_output, d_output);
-    
+
     // in-place operation
     h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_input.begin(), thrust::plus<T>());
     d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_input.begin(), thrust::plus<T>());
@@ -90,7 +92,7 @@ void TestAdjacentDifferenceInPlaceWithRelatedIteratorTypes(const size_t n)
 
     h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus<T>());
     d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin(), thrust::plus<T>());
-    
+
     // in-place operation with different iterator types
     h_result = thrust::adjacent_difference(h_input.cbegin(), h_input.cend(), h_input.begin(), thrust::plus<T>());
     d_result = thrust::adjacent_difference(d_input.cbegin(), d_input.cend(), d_input.begin(), thrust::plus<T>());
@@ -160,3 +162,51 @@ void TestAdjacentDifferenceDispatchImplicit()
 }
 DECLARE_UNITTEST(TestAdjacentDifferenceDispatchImplicit);
 
+struct detect_wrong_difference
+{
+    bool * flag;
+
+    __host__ __device__ detect_wrong_difference operator++() const { return *this; }
+    __host__ __device__ detect_wrong_difference operator*() const { return *this; }
+    template<typename Difference>
+    __host__ __device__ detect_wrong_difference operator+(Difference) const { return *this; }
+    template<typename Index>
+    __host__ __device__ detect_wrong_difference operator[](Index) const { return *this; }
+
+    __device__
+    void operator=(long long difference) const
+    {
+        if (difference != 1)
+        {
+            *flag = false;
+        }
+    }
+};
+
+void TestAdjacentDifferenceWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> all_differences_correct = thrust::device_malloc<bool>(1);
+    *all_differences_correct = true;
+
+    detect_wrong_difference out = { thrust::raw_pointer_cast(all_differences_correct) };
+
+    thrust::adjacent_difference(thrust::device, begin, end, out);
+
+    bool all_differences_correct_h = *all_differences_correct;
+    thrust::device_free(all_differences_correct);
+
+    ASSERT_EQUAL(all_differences_correct_h, true);
+}
+
+void TestAdjacentDifferenceWithBigIndexes()
+{
+    TestAdjacentDifferenceWithBigIndexesHelper(30);
+    TestAdjacentDifferenceWithBigIndexesHelper(31);
+    TestAdjacentDifferenceWithBigIndexesHelper(32);
+    TestAdjacentDifferenceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestAdjacentDifferenceWithBigIndexes);
diff --git a/thrust/system/cuda/detail/adjacent_difference.h b/thrust/system/cuda/detail/adjacent_difference.h
@@ -37,6 +37,7 @@
 #include <cub/block/block_adjacent_difference.cuh>
 #include <thrust/system/cuda/detail/core/agent_launcher.h>
 #include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/dispatch.h>
 #include <thrust/functional.h>
 #include <thrust/distance.h>
 #include <thrust/detail/mpl/math.h>
@@ -257,8 +258,8 @@ namespace __adjacent_difference {
 
       template <bool IS_LAST_TILE>
       void THRUST_DEVICE_FUNCTION
-      consume_tile(Size num_remaining,
-                   Size  tile_idx,
+      consume_tile(int  num_remaining,
+                   int  tile_idx,
                    Size tile_base)
       {
         if (tile_idx == 0)
@@ -279,7 +280,7 @@ namespace __adjacent_difference {
       consume_range(Size num_items)
       {
         int  tile_idx      = blockIdx.x;
-        Size tile_base     = tile_idx * ITEMS_PER_TILE;
+        Size tile_base     = static_cast<Size>(tile_idx) * ITEMS_PER_TILE;
         Size num_remaining = num_items - tile_base;
 
         if (num_remaining > ITEMS_PER_TILE)    // not a last tile
@@ -349,7 +350,7 @@ namespace __adjacent_difference {
                        char *   /*shmem*/)
     {
       int tile_idx  = blockIdx.x * blockDim.x + threadIdx.x;
-      int tile_base = tile_idx * items_per_tile;
+      Size tile_base = static_cast<Size>(tile_idx) * items_per_tile;
       if (tile_base > 0 && tile_idx < num_tiles)
         result[tile_idx] = first[tile_base - 1];
     }
@@ -391,8 +392,8 @@ namespace __adjacent_difference {
     AgentPlan init_plan       = init_agent::get_plan();
 
 
-    size_t tile_size = difference_plan.items_per_tile;
-    size_t num_tiles = (num_items + tile_size - 1) / tile_size;
+    Size tile_size = difference_plan.items_per_tile;
+    Size num_tiles = (num_items + tile_size - 1) / tile_size;
 
     size_t tmp1        = num_tiles * sizeof(input_type);
     size_t vshmem_size = core::vshmem_size(difference_plan.shared_memory_size,
@@ -448,29 +449,19 @@ namespace __adjacent_difference {
     bool         debug_sync   = THRUST_DEBUG_SYNC_FLAG;
 
     cudaError_t status;
-    status = doit_step(NULL,
-                       storage_size,
-                       first,
-                       result,
-                       binary_op,
-                       num_items,
-                       stream,
-                       debug_sync);
+    THRUST_INDEX_TYPE_DISPATCH(status, doit_step, num_items,
+        (NULL, storage_size, first, result, binary_op,
+           num_items_fixed, stream, debug_sync));
     cuda_cub::throw_on_error(status, "adjacent_difference failed on 1st step");
 
     // Allocate temporary storage.
     thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
       tmp(policy, storage_size);
     void *ptr = static_cast<void*>(tmp.data().get());
 
-    status = doit_step(ptr,
-                       storage_size,
-                       first,
-                       result,
-                       binary_op,
-                       num_items,
-                       stream,
-                       debug_sync);
+    THRUST_INDEX_TYPE_DISPATCH(status, doit_step, num_items,
+        (ptr, storage_size, first, result, binary_op,
+           num_items_fixed, stream, debug_sync));
     cuda_cub::throw_on_error(status, "adjacent_difference failed on 2nd step");
 
     status = cuda_cub::synchronize(policy);

diff --git a/thrust/system/cuda/detail/dispatch.h b/thrust/system/cuda/detail/dispatch.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/preprocessor.h>
+
+/**
+ * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
+ * implementation. This version assumes that callables for both branches consist
+ * of the same tokens, and is intended to be used with Thrust-style dispatch
+ * interfaces, that always deduce the size type from the arguments.
+ */
+#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \
+    if (count <= std::numeric_limits<thrust::detail::int32_t>::max()) { \
+        thrust::detail::int32_t THRUST_PP_CAT2(count, _fixed) = count; \
+        status = call arguments; \
+    } \
+    else { \
+        thrust::detail::int64_t THRUST_PP_CAT2(count, _fixed) = count; \
+        status = call arguments; \
+    }
+
+/**
+ * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
+ * implementation. This version allows using different token sequences for callables
+ * in both branches, and is intended to be used with CUB-style dispatch interfaces,
+ * where the "simple" interface always forces the size to be `int` (making it harder
+ * for us to use), but the complex interface that we end up using doesn't actually
+ * provide a way to fully deduce the type from just the call, making the size type
+ * appear in the token sequence of the callable.
+ *
+ * See reduce_n_impl to see an example of how this is meant to be used.
+ */
+#define THRUST_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \
+    if (count <= std::numeric_limits<thrust::detail::int32_t>::max()) { \
+        thrust::detail::int32_t THRUST_PP_CAT2(count, _fixed) = count; \
+        status = call_32 arguments; \
+    } \
+    else { \
+        thrust::detail::int64_t THRUST_PP_CAT2(count, _fixed) = count; \
+        status = call_64 arguments; \
+    }
+
diff --git a/thrust/system/cuda/detail/reduce.h b/thrust/system/cuda/detail/reduce.h
@@ -38,6 +38,7 @@
 #include <cub/device/device_reduce.cuh>
 #include <thrust/system/cuda/detail/par_to_seq.h>
 #include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/system/cuda/detail/dispatch.h>
 #include <thrust/functional.h>
 #include <thrust/system/cuda/detail/core/agent_launcher.h>
 #include <thrust/detail/minmax.h>
@@ -930,21 +931,22 @@ T reduce_n_impl(execution_policy<Derived>& policy,
                 BinaryOp                   binary_op)
 {
   cudaStream_t stream = cuda_cub::stream(policy);
+  cudaError_t status;
 
   // Determine temporary device storage requirements.
 
   size_t tmp_size = 0;
-  cuda_cub::throw_on_error(
-    cub::DeviceReduce::Reduce(NULL,
-                              tmp_size,
-                              first,
-                              reinterpret_cast<T*>(NULL),
-                              num_items,
-                              binary_op,
-                              init,
-                              stream,
-                              THRUST_DEBUG_SYNC_FLAG),
-    "after reduction step 1");
+
+  THRUST_INDEX_TYPE_DISPATCH2(status,
+    cub::DeviceReduce::Reduce,
+    (cub::DispatchReduce<
+        InputIt, T*, Size, BinaryOp
+    >::Dispatch),
+    num_items,
+    (NULL, tmp_size, first, reinterpret_cast<T*>(NULL),
+        num_items_fixed, binary_op, init, stream,
+        THRUST_DEBUG_SYNC_FLAG));
+  cuda_cub::throw_on_error(status, "after reduction step 1");
 
   // Allocate temporary storage.
 
@@ -963,17 +965,16 @@ T reduce_n_impl(execution_policy<Derived>& policy,
   // make this guarantee.
   T* ret_ptr = thrust::detail::aligned_reinterpret_cast<T*>(tmp.data().get());
   void* tmp_ptr = static_cast<void*>((tmp.data() + sizeof(T)).get());
-  cuda_cub::throw_on_error(
-    cub::DeviceReduce::Reduce(tmp_ptr,
-                              tmp_size,
-                              first,
-                              ret_ptr,
-                              num_items,
-                              binary_op,
-                              init,
-                              stream,
-                              THRUST_DEBUG_SYNC_FLAG),
-    "after reduction step 2");
+  THRUST_INDEX_TYPE_DISPATCH2(status,
+    cub::DeviceReduce::Reduce,
+    (cub::DispatchReduce<
+        InputIt, T*, Size, BinaryOp
+    >::Dispatch),
+    num_items,
+    (tmp_ptr, tmp_size, first, ret_ptr,
+        num_items_fixed, binary_op, init, stream,
+        THRUST_DEBUG_SYNC_FLAG));
+  cuda_cub::throw_on_error(status, "after reduction step 2");
 
   // Synchronize the stream and get the value.