From 14690093a079cc816969c070b5ba22f9ab1442b2 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Tue, 4 Aug 2020 16:09:53 -0400 Subject: [PATCH 1/2] Add iterator_*_t helpers for iterator traits. - iterator_value_t - iterator_pointer_t - iterator_reference_t - iterator_difference_t - iterator_system_t --- thrust/iterator/detail/iterator_traits.inl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/thrust/iterator/detail/iterator_traits.inl b/thrust/iterator/detail/iterator_traits.inl index 8a9cc4ffb..2d3cd5773 100644 --- a/thrust/iterator/detail/iterator_traits.inl +++ b/thrust/iterator/detail/iterator_traits.inl @@ -33,6 +33,8 @@ template typedef typename thrust::iterator_traits::value_type type; }; // end iterator_value +template +using iterator_value_t = typename iterator_value::type; template struct iterator_pointer @@ -40,6 +42,8 @@ template typedef typename thrust::iterator_traits::pointer type; }; // end iterator_pointer +template +using iterator_pointer_t = typename iterator_pointer::type; template struct iterator_reference @@ -47,6 +51,8 @@ template typedef typename iterator_traits::reference type; }; // end iterator_reference +template +using iterator_reference_t = typename iterator_reference::type; template struct iterator_difference @@ -54,6 +60,9 @@ template typedef typename thrust::iterator_traits::difference_type type; }; // end iterator_difference +template +using iterator_difference_t = typename iterator_difference::type; + namespace detail { @@ -90,6 +99,8 @@ template<> typedef thrust::iterator_system::type type; }; // end iterator_system +template +using iterator_system_t = typename iterator_system::type; template struct iterator_traversal From e1b3caadeaef517179b39e5c7426880cf1ae1c4a Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Fri, 31 Jul 2020 17:22:32 -0400 Subject: [PATCH 2/2] Add async scan algorithms, new async test framework. --- cmake/ThrustHeaderTesting.cmake | 1 + testing/CMakeLists.txt | 1 + testing/async/CMakeLists.txt | 80 +++ .../async/exclusive_scan/counting_iterator.cu | 46 ++ .../async/exclusive_scan/discard_output.cu | 38 + testing/async/exclusive_scan/large_indices.cu | 244 +++++++ testing/async/exclusive_scan/large_types.cu | 58 ++ testing/async/exclusive_scan/mixed_types.cu | 120 ++++ testing/async/exclusive_scan/mixin.h | 119 ++++ testing/async/exclusive_scan/simple.cu | 72 ++ .../async/exclusive_scan/stateful_operator.cu | 62 ++ testing/async/exclusive_scan/using_vs_adl.cu | 171 +++++ .../async/inclusive_scan/counting_iterator.cu | 45 ++ .../async/inclusive_scan/discard_output.cu | 37 + testing/async/inclusive_scan/large_indices.cu | 239 +++++++ testing/async/inclusive_scan/large_types.cu | 58 ++ testing/async/inclusive_scan/mixed_types.cu | 109 +++ testing/async/inclusive_scan/mixin.h | 115 +++ testing/async/inclusive_scan/simple.cu | 70 ++ .../async/inclusive_scan/stateful_operator.cu | 61 ++ testing/async/inclusive_scan/using_vs_adl.cu | 169 +++++ testing/async/mixin.h | 663 ++++++++++++++++++ testing/async/test_policy_overloads.h | 410 +++++++++++ testing/event.cu | 2 +- testing/future.cu | 2 +- testing/unittest/testframework.h | 16 + testing/unittest/util_async.h | 7 +- thrust/async/scan.h | 345 +++++++++ thrust/detail/event_error.h | 7 +- thrust/future.h | 5 +- .../system/cuda/detail/async/exclusive_scan.h | 199 ++++++ .../system/cuda/detail/async/inclusive_scan.h | 194 +++++ thrust/system/cuda/detail/async/scan.h | 33 + thrust/system/cuda/detail/future.inl | 7 +- thrust/system/cuda/future.h | 7 +- thrust/system/detail/adl/async/scan.h | 34 + thrust/system/detail/generic/scan.inl | 4 +- 37 files changed, 3826 insertions(+), 24 deletions(-) create mode 100644 testing/async/CMakeLists.txt create mode 100644 testing/async/exclusive_scan/counting_iterator.cu create mode 100644 testing/async/exclusive_scan/discard_output.cu create mode 100644 testing/async/exclusive_scan/large_indices.cu create mode 100644 testing/async/exclusive_scan/large_types.cu create mode 100644 testing/async/exclusive_scan/mixed_types.cu create mode 100644 testing/async/exclusive_scan/mixin.h create mode 100644 testing/async/exclusive_scan/simple.cu create mode 100644 testing/async/exclusive_scan/stateful_operator.cu create mode 100644 testing/async/exclusive_scan/using_vs_adl.cu create mode 100644 testing/async/inclusive_scan/counting_iterator.cu create mode 100644 testing/async/inclusive_scan/discard_output.cu create mode 100644 testing/async/inclusive_scan/large_indices.cu create mode 100644 testing/async/inclusive_scan/large_types.cu create mode 100644 testing/async/inclusive_scan/mixed_types.cu create mode 100644 testing/async/inclusive_scan/mixin.h create mode 100644 testing/async/inclusive_scan/simple.cu create mode 100644 testing/async/inclusive_scan/stateful_operator.cu create mode 100644 testing/async/inclusive_scan/using_vs_adl.cu create mode 100644 testing/async/mixin.h create mode 100644 testing/async/test_policy_overloads.h create mode 100644 thrust/async/scan.h create mode 100644 thrust/system/cuda/detail/async/exclusive_scan.h create mode 100644 thrust/system/cuda/detail/async/inclusive_scan.h create mode 100644 thrust/system/cuda/detail/async/scan.h create mode 100644 thrust/system/detail/adl/async/scan.h diff --git a/cmake/ThrustHeaderTesting.cmake b/cmake/ThrustHeaderTesting.cmake index 96ea2bd2d..560c0a95a 100644 --- a/cmake/ThrustHeaderTesting.cmake +++ b/cmake/ThrustHeaderTesting.cmake @@ -65,6 +65,7 @@ foreach(thrust_target IN LISTS THRUST_TARGETS) async/copy.h async/for_each.h async/reduce.h + async/scan.h async/sort.h async/transform.h event.h diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index 354b0b2ff..80aab18b0 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -162,6 +162,7 @@ foreach(thrust_target IN LISTS THRUST_TARGETS) endforeach() # Add specialized tests: +add_subdirectory(async) add_subdirectory(cmake) add_subdirectory(cpp) add_subdirectory(cuda) diff --git a/testing/async/CMakeLists.txt b/testing/async/CMakeLists.txt new file mode 100644 index 000000000..00d50f097 --- /dev/null +++ b/testing/async/CMakeLists.txt @@ -0,0 +1,80 @@ +# The async tests perform a large amount of codegen, making them expensive to +# build and test. To keep compilation and runtimes manageable, the tests are +# broken up into many files per algorithm to enable parallelism during +# compilation and testing. The structure of these test directories are: +# +# thrust/testing/async//.cu +# +# These generate executables and CTest tests named +# ${config_prefix}.test.async... + +# The async tests only support CUDA enabled configs. Create a list of valid +# thrust targets: +set(cuda_configs) +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + if (config_device STREQUAL CUDA) + list(APPEND cuda_configs ${thrust_target}) + endif() +endforeach() + +list(LENGTH cuda_configs num_cuda_configs) +if (num_cuda_configs EQUAL 0) + return() # No valid configs found, nothing to do. +endif() + +# Process a single algorithm directory, adding all .cu/cpp files as tests for +# each valid backend. algo_name is the name of the subdir ( +# above) and is used for naming the executable/targets. +function(thrust_add_async_test_dir algo_name) + file(GLOB test_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + "${algo_name}/*.cu" + "${algo_name}/*.cpp" + ) + + # Per-algorithm, all-config metatarget: thrust.all.test.async.[algo].all + set(algo_meta_target thrust.all.test.async.${algo_name}.all) + add_custom_target(${algo_meta_target}) + + foreach(thrust_target IN LISTS cuda_configs) + thrust_get_target_property(config_prefix ${thrust_target} PREFIX) + + # Per-algorithm, per-config metatarget: thrust.[config].test.async.[algo].all + set(algo_config_meta_target ${config_prefix}.test.async.${algo_name}.all) + add_custom_target(${algo_config_meta_target}) + add_dependencies(${algo_meta_target} ${algo_config_meta_target}) + + foreach(test_src IN LISTS test_srcs) + get_filename_component(test_name "${test_src}" NAME_WLE) + string(PREPEND test_name async.${algo_name}.) + + thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target}) + if(THRUST_ENABLE_TESTS_WITH_RDC) + thrust_enable_rdc_for_cuda_target(${test_target}) + endif() + + add_dependencies(${algo_config_meta_target} ${test_target}) + endforeach() + endforeach() +endfunction() + +# Grab all algorithm subdirectories: +set(test_dirs) +file(GLOB contents + CONFIGURE_DEPENDS + "${CMAKE_CURRENT_LIST_DIR}/*" +) + +foreach(test_dir IN LISTS contents) + if(IS_DIRECTORY "${test_dir}") + list(APPEND test_dirs "${test_dir}") + endif() +endforeach() + +# Process all test dirs: +foreach(test_dir IN LISTS test_dirs) + get_filename_component(algo_name "${test_dir}" NAME_WLE) + thrust_add_async_test_dir(${algo_name}) +endforeach() diff --git a/testing/async/exclusive_scan/counting_iterator.cu b/testing/async/exclusive_scan/counting_iterator.cu new file mode 100644 index 000000000..7771299dd --- /dev/null +++ b/testing/async/exclusive_scan/counting_iterator.cu @@ -0,0 +1,46 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include +#include + +template > +struct invoker + : testing::async::mixin::input::counting_iterator_from_0 + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::exclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "fancy input iterator (counting_iterator)"; + } +}; + +template +struct test_counting_iterator +{ + void operator()(std::size_t num_values) const + { + num_values = unittest::truncate_to_max_representable(num_values); + testing::async::test_policy_overloads>::run(num_values); + } +}; +// Use built-in types only, counting_iterator doesn't seem to be compatible with +// the custom_numeric. +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_counting_iterator, + BuiltinNumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/discard_output.cu b/testing/async/exclusive_scan/discard_output.cu new file mode 100644 index 000000000..ec7ca5f47 --- /dev/null +++ b/testing/async/exclusive_scan/discard_output.cu @@ -0,0 +1,38 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Compilation test with discard iterators. No runtime validation is actually +// performed, other than testing whether the algorithm completes without +// exception. + +template > +struct discard_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::discard_iterator + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::exclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::noop +{ + static std::string description() { return "discard output"; } +}; + +template +struct test_discard +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_discard, NumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/large_indices.cu b/testing/async/exclusive_scan/large_indices.cu new file mode 100644 index 000000000..4d1c51df0 --- /dev/null +++ b/testing/async/exclusive_scan/large_indices.cu @@ -0,0 +1,244 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include +#include +#include +#include + +#include +#include + +#include + +// This test is an adaptation of TestInclusiveScanWithBigIndices from scan.cu. + +namespace +{ + +// Fake iterator that asserts +// (a) it is written with a sequence and +// (b) a defined maximum value is written at some point +// +// This allows us to test very large problem sizes without actually allocating +// large amounts of memory that would exceed most devices' capacity. +struct assert_sequence_iterator +{ + using value_type = std::int64_t; + using difference_type = std::int64_t; + + // Defined for thrust::iterator_traits: + using pointer = value_type*; + using reference = assert_sequence_iterator; // weird but convenient + using iterator_category = + typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference>::type; + + std::int64_t expected{0}; + std::int64_t max{0}; + mutable thrust::device_ptr found_max{nullptr}; + mutable thrust::device_ptr unexpected_value{nullptr}; + + // Should be called on the first iterator generated. This needs to be + // done explicitly from the host. + void initialize_shared_state() + { + found_max = thrust::device_malloc(1); + unexpected_value = thrust::device_malloc(1); + *found_max = false; + *unexpected_value = false; + } + + // Should be called only once on the initialized iterator. This needs to be + // done explicitly from the host. + void free_shared_state() const + { + thrust::device_free(found_max); + thrust::device_free(unexpected_value); + found_max = nullptr; + unexpected_value = nullptr; + } + + __host__ __device__ assert_sequence_iterator operator+(difference_type i) const + { + return clone(expected + i); + } + + __host__ __device__ reference operator[](difference_type i) const + { + return clone(expected + i); + } + + // Some weirdness, this iterator acts like its own reference + __device__ assert_sequence_iterator operator=(value_type val) + { + if (val != expected) + { + printf("Error: expected %lld, got %lld\n", expected, val); + *unexpected_value = true; + } + else if (val == max) + { + *found_max = true; + } + + return *this; + } + +private: + __host__ __device__ + assert_sequence_iterator clone(value_type new_expected) const + { + return {new_expected, max, found_max, unexpected_value}; + } +}; + +// output mixin that generates assert_sequence_iterators. +// Must be paired with validate_assert_sequence_iterators mixin to free +// shared state. +struct assert_sequence_output +{ + struct output_type + { + using iterator = assert_sequence_iterator; + + iterator iter; + + explicit output_type(iterator&& it) + : iter{std::move(it)} + { + iter.initialize_shared_state(); + } + + ~output_type() + { + iter.free_shared_state(); + } + + iterator begin() { return iter; } + }; + + template + static output_type generate_output(std::size_t num_values, InputType&) + { + using value_type = typename assert_sequence_iterator::value_type; + assert_sequence_iterator it{0, + // minus one bc exclusive scan: + static_cast(num_values - 1), + nullptr, + nullptr}; + return output_type{std::move(it)}; + } +}; + +struct validate_assert_sequence_iterators +{ + using output_t = assert_sequence_output::output_type; + + template + static void compare_outputs(EventType& e, + output_t const&, + output_t const& test) + { + testing::async::mixin::compare_outputs::detail::basic_event_validation(e); + + ASSERT_EQUAL(*test.iter.unexpected_value, false); + ASSERT_EQUAL(*test.iter.found_max, true); + } +}; + +//------------------------------------------------------------------------------ +// Overloads without custom binary operators use thrust::plus<>, so use +// constant input iterator to generate the output sequence: +struct default_bin_op_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<>, // - no extra args + std::tuple // - initial_value + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{std::tuple<>{}, std::tuple{0}}; + } +}; + +struct default_bin_op_invoker + : testing::async::mixin::input::constant_iterator_1 + , assert_sequence_output + , default_bin_op_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::exclusive_scan::mixin::invoke_async::simple + , validate_assert_sequence_iterators +{ + static std::string description() + { + return "test large array indices with default binary operator"; + } +}; + +} // anon namespace + +void test_large_indices_default_scan_op() +{ + // Test problem sizes around signed/unsigned int max: + testing::async::test_policy_overloads::run(1ll << 30); + testing::async::test_policy_overloads::run(1ll << 31); + testing::async::test_policy_overloads::run(1ll << 32); + testing::async::test_policy_overloads::run(1ll << 33); +} +DECLARE_UNITTEST(test_large_indices_default_scan_op); + +namespace +{ + +//------------------------------------------------------------------------------ +// Generate the output sequence using counting iterators and thrust::max<> for +// custom operator overloads. +struct custom_bin_op_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple> // - initial_value, binop + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{std::make_tuple(0, thrust::maximum<>{})}; + } +}; + +struct custom_bin_op_invoker + : testing::async::mixin::input::counting_iterator_from_1 + , assert_sequence_output + , custom_bin_op_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::exclusive_scan::mixin::invoke_async::simple + , validate_assert_sequence_iterators +{ + static std::string description() + { + return "test large array indices with custom binary operator"; + } +}; + +} // namespace + +void test_large_indices_custom_scan_op() +{ + // Test problem sizes around signed/unsigned int max: + testing::async::test_policy_overloads::run(1ll << 30); + testing::async::test_policy_overloads::run(1ll << 31); + testing::async::test_policy_overloads::run(1ll << 32); + testing::async::test_policy_overloads::run(1ll << 33); +} +DECLARE_UNITTEST(test_large_indices_custom_scan_op); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/large_types.cu b/testing/async/exclusive_scan/large_types.cu new file mode 100644 index 000000000..571d39262 --- /dev/null +++ b/testing/async/exclusive_scan/large_types.cu @@ -0,0 +1,58 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include + +// This test is an adaptation of TestScanWithLargeTypes from scan.cu. + +// Need special initialization for the FixedVector type: +template +struct device_vector_fill +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::fill(input.begin(), input.end(), value_type{2}); + return input; + } +}; + +template > +struct invoker + : device_vector_fill + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous< + value_type> + , testing::async::exclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "scan with large value types."; + } +}; + +struct test_large_types +{ + void operator()(std::size_t num_values) const + { + using testing::async::test_policy_overloads; + + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); + } +}; +DECLARE_UNITTEST(test_large_types); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/mixed_types.cu b/testing/async/exclusive_scan/mixed_types.cu new file mode 100644 index 000000000..f69af1794 --- /dev/null +++ b/testing/async/exclusive_scan/mixed_types.cu @@ -0,0 +1,120 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Test using mixed int/float types for: +// - input_value_type | (int, float) +// - output_value_type | (int, float) +// - initial_value_type | (int, float, ) +// - thrust::plus T-type | (int, float, void>) +// +// The initial_value_type and thrust::plus types are covered by the +// mixin::postfix_args::scan_mixed_types_overloads component. +// +// The testing/scan.cu TestMixedTypes test spells out the expected behavior, +// which is defined by https://wg21.link/P0571. + +namespace +{ + +template +struct mixed_type_input_generator +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::sequence(input.begin(), + input.end(), + // fractional values are chosen deliberately to test + // casting orders and accumulator types: + static_cast(1.5), + static_cast(1)); + return input; + } +}; + +// A fractional value is used to ensure that a different result is obtained when +// using float vs. int. +template +struct mixed_types_postfix_args +{ + using postfix_args_type = std::tuple< // Overloads to test: + std::tuple<>, // - no extra args + std::tuple, // - initial_value + std::tuple>, // - initial_value, plus<> + std::tuple>, // - initial_value, plus + std::tuple> // - initial_value, plus + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{ + std::tuple<>{}, + std::make_tuple(static_cast(5.5)), + std::make_tuple(static_cast(5.5), thrust::plus<>{}), + std::make_tuple(static_cast(5.5), thrust::plus{}), + std::make_tuple(static_cast(5.5), thrust::plus{})}; + } +}; + +template +struct invoker + : mixed_type_input_generator + , testing::async::mixin::output::device_vector + , mixed_types_postfix_args + , testing::async::exclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::exclusive_scan::mixin::invoke_async::simple + // Use almost_equal instead of almost_equal_if_fp because floating point + // addition may be hidden in the scan_op (thrust::plus is always + // tested). + , testing::async::mixin::compare_outputs::assert_almost_equal +{ + static std::string description() + { + return "mixed input/output/initial type tests"; + } +}; + +} // namespace + +void test_scan_mixed_types(size_t num_values) +{ + // Since fp addition is non-associative, the results may be slightly off + // from the reference. + // This is primarily handled by using `compare_almost_equal` to do a fuzzy + // comparison. But for large enough test sizes, eventually the scan results + // will wrap for integral value_types. If a float accumulator is used, the + // small errors from non-associative addition may cause the wrap to happen in + // a different location, resulting in an error too large for almost_equal to + // ignore. + // This wrap seems to happen around 2^16 values, so skip when num_values is + // close to that. + if (num_values > ((1ll << 16) - 10)) + { + return; + } + + // invoker template params are input_value_type, output_vt, initial_vt: + using testing::async::test_policy_overloads; + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + // We all float down here + test_policy_overloads>::run(num_values); +} +DECLARE_SIZED_UNITTEST(test_scan_mixed_types); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/mixin.h b/testing/async/exclusive_scan/mixin.h new file mode 100644 index 000000000..02ac9908f --- /dev/null +++ b/testing/async/exclusive_scan/mixin.h @@ -0,0 +1,119 @@ +#pragma once + +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include + +namespace testing +{ +namespace async +{ +namespace exclusive_scan +{ + +namespace mixin +{ + +//------------------------------------------------------------------------------ +namespace postfix_args +{ + +template > +struct all_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<>, // - no extra args + std::tuple, // - initial_value + std::tuple // - initial_value, binary_op + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{std::tuple<>{}, + std::make_tuple(value_type{42}), + std::make_tuple(value_type{42}, + alternate_binary_op{})}; + } +}; + +} // namespace postfix_args + +//------------------------------------------------------------------------------ +namespace invoke_reference +{ + +template +struct host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + input.cend()); + thrust::host_vector host_output(host_input.size()); + + // Run host synchronous algorithm to generate reference. + thrust::exclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get( + THRUST_FWD(postfix_tuple))...); + + // Copy back to device. + output = host_output; + } +}; + +} // namespace invoke_reference + +//------------------------------------------------------------------------------ +namespace invoke_async +{ + +struct simple +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + auto e = thrust::async::exclusive_scan( + std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +} // namespace mixin +} // namespace exclusive_scan +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/async/exclusive_scan/simple.cu b/testing/async/exclusive_scan/simple.cu new file mode 100644 index 000000000..8c55052d7 --- /dev/null +++ b/testing/async/exclusive_scan/simple.cu @@ -0,0 +1,72 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +template > +struct simple_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::exclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "simple invocation with device vectors"; + } +}; + +template +struct test_simple +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple, NumericTypes); + +// Testing the in-place algorithm uses the exact same instantiations of the +// underlying scan implementation as above. Test them here to avoid compiling +// them twice. +template > +struct simple_inplace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector_reuse_input + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous< + input_value_type> + , testing::async::exclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "simple in-place invocation with device vectors"; + } +}; + +template +struct test_simple_in_place +{ + void operator()(std::size_t num_values) const + { + using invoker = simple_inplace_invoker; + testing::async::test_policy_overloads::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple_in_place, NumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/stateful_operator.cu b/testing/async/exclusive_scan/stateful_operator.cu new file mode 100644 index 000000000..411ffbd99 --- /dev/null +++ b/testing/async/exclusive_scan/stateful_operator.cu @@ -0,0 +1,62 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +namespace +{ + +// Custom binary operator for scan: +template +struct stateful_operator +{ + T offset; + + __host__ __device__ T operator()(T v1, T v2) { return v1 + v2 + offset; } +}; + +// Postfix args overload definition that uses a stateful custom binary operator +template +struct use_stateful_operator +{ + using postfix_args_type = std::tuple< // Single overload: + std::tuple> // init_val, bin_op + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{ + std::make_tuple(value_type{42}, + stateful_operator{value_type{2}})}; + } +}; + +template +struct invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , use_stateful_operator + , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous< + value_type> + , testing::async::exclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() { return "scan with stateful operator"; } +}; + +} // namespace + +template +struct test_stateful_operator +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_stateful_operator, NumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/using_vs_adl.cu b/testing/async/exclusive_scan/using_vs_adl.cu new file mode 100644 index 000000000..34a80bd79 --- /dev/null +++ b/testing/async/exclusive_scan/using_vs_adl.cu @@ -0,0 +1,171 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Verify what happens when calling the algorithm without any namespace +// qualifiers: +// - If the async entry point is available in the global namespace due to a +// using statement, the async algorithm should be called. +// - Otherwise, ADL should resolve the call to the synchronous algo in the +// thrust:: namespace. + +namespace invoke_reference +{ + +template +struct adl_host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + input.cend()); + thrust::host_vector host_output(host_input.size()); + + using OutIter = thrust::remove_cvref_t; + + // ADL should resolve this to the synchronous `thrust::` algorithm. + // This is checked by ensuring that the call returns an output iterator. + OutIter result = + exclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + (void)result; + + // Copy back to device. + output = host_output; + } +}; + +} // namespace invoke_reference + +namespace invoke_async +{ + +struct using_namespace +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. + using namespace thrust::async; + thrust::device_event e = + exclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +struct using_cpo +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. + using thrust::async::exclusive_scan; + thrust::device_event e = + exclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +template > +struct using_namespace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_namespace + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "importing async CPO with `using namespace thrust::async`"; + } +}; + +void test_using_namespace() +{ + using invoker = using_namespace_invoker; + testing::async::test_policy_overloads::run(128); +} +DECLARE_UNITTEST(test_using_namespace); + +template > +struct using_cpo_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_cpo + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "importing async CPO with " + "`using namespace thrust::async::exclusive_scan`"; + } +}; + +void test_using_cpo() +{ + using invoker = using_cpo_invoker; + testing::async::test_policy_overloads::run(128); +} +DECLARE_UNITTEST(test_using_cpo); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/counting_iterator.cu b/testing/async/inclusive_scan/counting_iterator.cu new file mode 100644 index 000000000..fe9fdeb80 --- /dev/null +++ b/testing/async/inclusive_scan/counting_iterator.cu @@ -0,0 +1,45 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include +#include + +template > +struct invoker + : testing::async::mixin::input::counting_iterator_from_0 + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::inclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::inclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "fancy input iterator (counting_iterator)"; + } +}; + +template +struct test_counting_iterator +{ + void operator()(std::size_t num_values) const + { + num_values = unittest::truncate_to_max_representable(num_values); + testing::async::test_policy_overloads>::run(num_values); + } +}; +// Use built-in types only, counting_iterator doesn't seem to be compatible with +// the custom_numeric. +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_counting_iterator, + BuiltinNumericTypes); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/discard_output.cu b/testing/async/inclusive_scan/discard_output.cu new file mode 100644 index 000000000..c202de7f0 --- /dev/null +++ b/testing/async/inclusive_scan/discard_output.cu @@ -0,0 +1,37 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Compilation test with discard iterators. No runtime validation is actually +// performed, other than testing whether the algorithm completes without +// exception. + +template > +struct discard_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::discard_iterator + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::inclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::noop +{ + static std::string description() { return "discard output"; } +}; + +template +struct test_discard +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_discard, NumericTypes); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/large_indices.cu b/testing/async/inclusive_scan/large_indices.cu new file mode 100644 index 000000000..4124cf96d --- /dev/null +++ b/testing/async/inclusive_scan/large_indices.cu @@ -0,0 +1,239 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include +#include +#include +#include + +#include +#include + +#include + +// This test is an adaptation of TestInclusiveScanWithBigIndices from scan.cu. + +namespace +{ + +// Fake iterator that asserts +// (a) it is written with a sequence and +// (b) a defined maximum value is written at some point +// +// This allows us to test very large problem sizes without actually allocating +// large amounts of memory that would exceed most devices' capacity. +struct assert_sequence_iterator +{ + using value_type = std::int64_t; + using difference_type = std::int64_t; + + // Defined for thrust::iterator_traits: + using pointer = value_type *; + using reference = assert_sequence_iterator; // weird but convenient + using iterator_category = typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference>::type; + + std::int64_t expected{0}; + std::int64_t max{0}; + mutable thrust::device_ptr found_max{nullptr}; + mutable thrust::device_ptr unexpected_value{nullptr}; + + // Should be called on the first iterator generated. This needs to be done + // explicitly from the host. + void initialize_shared_state() + { + found_max = thrust::device_malloc(1); + unexpected_value = thrust::device_malloc(1); + *found_max = false; + *unexpected_value = false; + } + + // Should be called only once on the initialized iterator. This needs to be + // done explicitly from the host. + void free_shared_state() const + { + thrust::device_free(found_max); + thrust::device_free(unexpected_value); + found_max = nullptr; + unexpected_value = nullptr; + } + + __host__ __device__ assert_sequence_iterator operator+(difference_type i) const + { + return clone(expected + i); + } + + __host__ __device__ reference operator[](difference_type i) const + { + return clone(expected + i); + } + + // Some weirdness, this iterator acts like its own reference + __device__ assert_sequence_iterator operator=(value_type val) + { + if (val != expected) + { + printf("Error: expected %lld, got %lld\n", expected, val); + + *unexpected_value = true; + } + else if (val == max) + { + *found_max = true; + } + + return *this; + } + +private: + __host__ __device__ assert_sequence_iterator + clone(value_type new_expected) const + { + return {new_expected, max, found_max, unexpected_value}; + } +}; + +// output mixin that generates assert_sequence_iterators. +// Must be paired with validate_assert_sequence_iterators mixin to free +// shared state. +struct assert_sequence_output +{ + struct output_type + { + using iterator = assert_sequence_iterator; + + iterator iter; + + explicit output_type(iterator &&it) + : iter{std::move(it)} + { + iter.initialize_shared_state(); + } + + ~output_type() { iter.free_shared_state(); } + + iterator begin() { return iter; } + }; + + template + static output_type generate_output(std::size_t num_values, InputType &) + { + using value_type = typename assert_sequence_iterator::value_type; + assert_sequence_iterator it{1, + static_cast(num_values), + nullptr, + nullptr}; + return output_type{std::move(it)}; + } +}; + +struct validate_assert_sequence_iterators +{ + using output_t = assert_sequence_output::output_type; + + template + static void compare_outputs(EventType &e, + output_t const &, + output_t const &test) + { + testing::async::mixin::compare_outputs::detail::basic_event_validation(e); + + ASSERT_EQUAL(*test.iter.unexpected_value, false); + ASSERT_EQUAL(*test.iter.found_max, true); + } +}; + +//------------------------------------------------------------------------------ +// Overloads without custom binary operators use thrust::plus<>, so use +// constant input iterator to generate the output sequence: +struct default_bin_op_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<> // - no extra args + >; + + static postfix_args_type generate_postfix_args() + { + return std::tuple>{}; + } +}; + +struct default_bin_op_invoker + : testing::async::mixin::input::constant_iterator_1 + , assert_sequence_output + , default_bin_op_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::inclusive_scan::mixin::invoke_async::simple + , validate_assert_sequence_iterators +{ + static std::string description() + { + return "test large array indices with default binary operator"; + } +}; + +} // end anon namespace + +void test_large_indices_default_scan_op() +{ + // Test problem sizes around signed/unsigned int max: + testing::async::test_policy_overloads::run(1ll << 30); + testing::async::test_policy_overloads::run(1ll << 31); + testing::async::test_policy_overloads::run(1ll << 32); + testing::async::test_policy_overloads::run(1ll << 33); +} +DECLARE_UNITTEST(test_large_indices_default_scan_op); + +namespace +{ + +//------------------------------------------------------------------------------ +// Generate the output sequence using counting iterators and thrust::max<> for +// custom operator overloads. +struct custom_bin_op_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple> // - custom binary op + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{std::make_tuple(thrust::maximum<>{})}; + } +}; + +struct custom_bin_op_invoker + : testing::async::mixin::input::counting_iterator_from_1 + , assert_sequence_output + , custom_bin_op_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::inclusive_scan::mixin::invoke_async::simple + , validate_assert_sequence_iterators +{ + static std::string description() + { + return "test large array indices with custom binary operator"; + } +}; + +} // end anon namespace + +void test_large_indices_custom_scan_op() +{ + // Test problem sizes around signed/unsigned int max: + testing::async::test_policy_overloads::run(1ll << 30); + testing::async::test_policy_overloads::run(1ll << 31); + testing::async::test_policy_overloads::run(1ll << 32); + testing::async::test_policy_overloads::run(1ll << 33); +} +DECLARE_UNITTEST(test_large_indices_custom_scan_op); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/large_types.cu b/testing/async/inclusive_scan/large_types.cu new file mode 100644 index 000000000..00bb8b461 --- /dev/null +++ b/testing/async/inclusive_scan/large_types.cu @@ -0,0 +1,58 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include + +// This test is an adaptation of TestScanWithLargeTypes from scan.cu. + +// Need special initialization for the FixedVector type: +template +struct device_vector_fill +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::fill(input.begin(), input.end(), value_type{2}); + return input; + } +}; + +template > +struct invoker + : device_vector_fill + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous< + value_type> + , testing::async::inclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "scan with large value types."; + } +}; + +struct test_large_types +{ + void operator()(std::size_t num_values) const + { + using testing::async::test_policy_overloads; + + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); + } +}; +DECLARE_UNITTEST(test_large_types); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/mixed_types.cu b/testing/async/inclusive_scan/mixed_types.cu new file mode 100644 index 000000000..57931c8d0 --- /dev/null +++ b/testing/async/inclusive_scan/mixed_types.cu @@ -0,0 +1,109 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Test using mixed int/float types for: +// - input_value_type | (int, float) +// - output_value_type | (int, float) +// - thrust::plus T-type | (int, float, void>) +// +// The thrust::plus types are covered by the +// scan_mixed_types_overloads component. +// +// The testing/scan.cu TestMixedTypes test spells out the expected behavior, +// which is defined by https://wg21.link/P0571. + +namespace +{ + +template +struct mixed_type_input_generator +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::sequence(input.begin(), + input.end(), + // fractional values are chosen deliberately to test + // casting orders and accumulator types: + static_cast(1.5), + static_cast(1)); + return input; + } +}; + +// A fractional value is used to ensure that a different result is obtained when +// using float vs. int. +struct mixed_types_postfix_args +{ + using postfix_args_type = std::tuple< // Overloads to test: + std::tuple<>, // - no extra args + std::tuple>, // - plus<> + std::tuple>, // - plus + std::tuple> // - plus + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{std::tuple<>{}, + std::make_tuple(thrust::plus<>{}), + std::make_tuple(thrust::plus{}), + std::make_tuple(thrust::plus{})}; + } +}; + +template +struct invoker + : mixed_type_input_generator + , testing::async::mixin::output::device_vector + , mixed_types_postfix_args + , testing::async::inclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::inclusive_scan::mixin::invoke_async::simple + // Use almost_equal instead of almost_equal_if_fp because floating point + // addition may be hidden in the scan_op (thrust::plus is always + // tested). + , testing::async::mixin::compare_outputs::assert_almost_equal +{ + static std::string description() + { + return "mixed input/output/functor value_type tests"; + } +}; + +} // namespace + +void test_scan_mixed_types(size_t num_values) +{ + // Since fp addition is non-associative, the results may be slightly off + // from the reference. + // This is primarily handled by using `compare_almost_equal` to do a fuzzy + // comparison. But for large enough test sizes, eventually the scan results + // will wrap for integral value_types. If a float accumulator is used, the + // small errors from non-associative addition may cause the wrap to happen in + // a different location, resulting in an error too large for almost_equal to + // ignore. + // This wrap seems to happen around 2^16 values, so skip when num_values is + // close to that. + if (num_values > ((1ll << 16) - 10)) + { + return; + } + + // invoker template params are input_value_type, output_vt: + using testing::async::test_policy_overloads; + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); +} +DECLARE_SIZED_UNITTEST(test_scan_mixed_types); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/mixin.h b/testing/async/inclusive_scan/mixin.h new file mode 100644 index 000000000..82ecd59b8 --- /dev/null +++ b/testing/async/inclusive_scan/mixin.h @@ -0,0 +1,115 @@ +#pragma once + +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include + +namespace testing +{ +namespace async +{ +namespace inclusive_scan +{ + +namespace mixin +{ + +//------------------------------------------------------------------------------ +namespace postfix_args +{ + +template > +struct all_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<>, // - no extra args + std::tuple // - binary_op + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{std::tuple<>{}, std::make_tuple(alternate_binary_op{})}; + } +}; + +} // namespace postfix_args + +//------------------------------------------------------------------------------ +namespace invoke_reference +{ + +template +struct host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + input.cend()); + thrust::host_vector host_output(host_input.size()); + + // Run host synchronous algorithm to generate reference. + thrust::inclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get( + THRUST_FWD(postfix_tuple))...); + + // Copy back to device. + output = host_output; + } +}; + +} // namespace invoke_reference + +//------------------------------------------------------------------------------ +namespace invoke_async +{ + +struct simple +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + auto e = thrust::async::inclusive_scan( + std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +} // namespace mixin +} // namespace inclusive_scan +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/async/inclusive_scan/simple.cu b/testing/async/inclusive_scan/simple.cu new file mode 100644 index 000000000..1256f009b --- /dev/null +++ b/testing/async/inclusive_scan/simple.cu @@ -0,0 +1,70 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +template > +struct simple_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::inclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::inclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "simple invocation with device vectors"; + } +}; + +template +struct test_simple +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple, NumericTypes); + +// Testing the in-place algorithm uses the exact same instantiations of the +// underlying scan implementation as above. Test them here to avoid compiling +// them twice. +template > +struct simple_inplace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector_reuse_input + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous< + input_value_type> + , testing::async::inclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "simple in-place invocation with device vectors"; + } +}; + +template +struct test_simple_in_place +{ + void operator()(std::size_t num_values) const + { + using invoker = simple_inplace_invoker; + testing::async::test_policy_overloads::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple_in_place, NumericTypes); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/stateful_operator.cu b/testing/async/inclusive_scan/stateful_operator.cu new file mode 100644 index 000000000..224c29303 --- /dev/null +++ b/testing/async/inclusive_scan/stateful_operator.cu @@ -0,0 +1,61 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +namespace +{ + +// Custom binary operator for scan: +template +struct stateful_operator +{ + T offset; + + __host__ __device__ T operator()(T v1, T v2) { return v1 + v2 + offset; } +}; + +// Postfix args overload definition that uses a stateful custom binary operator +template +struct use_stateful_operator +{ + using postfix_args_type = std::tuple< // Single overload: + std::tuple> // bin_op + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{ + std::make_tuple(stateful_operator{value_type{2}})}; + } +}; + +template +struct invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , use_stateful_operator + , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous< + value_type> + , testing::async::inclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() { return "scan with stateful operator"; } +}; + +} // namespace + +template +struct test_stateful_operator +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_stateful_operator, NumericTypes); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/using_vs_adl.cu b/testing/async/inclusive_scan/using_vs_adl.cu new file mode 100644 index 000000000..9789ce5c9 --- /dev/null +++ b/testing/async/inclusive_scan/using_vs_adl.cu @@ -0,0 +1,169 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Verify what happens when calling the algorithm without any namespace +// qualifiers: +// - If the async entry point is available in the global namespace due to a +// using statement, the async algorithm should be called. +// - Otherwise, ADL should resolve the call to the synchronous algo in the +// thrust:: namespace. + +namespace invoke_reference +{ + +template +struct adl_host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + input.cend()); + thrust::host_vector host_output(host_input.size()); + + using OutIter = thrust::remove_cvref_t; + + // ADL should resolve this to the synchronous `thrust::` algorithm. + // This is checked by ensuring that the call returns an output iterator. + OutIter result = + inclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + (void)result; + + // Copy back to device. + output = host_output; + } +}; + +} // namespace invoke_reference + +namespace invoke_async +{ + +struct using_namespace +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. + using namespace thrust::async; + thrust::device_event e = + inclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +struct using_cpo +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. + using thrust::async::inclusive_scan; + thrust::device_event e = + inclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +template > +struct using_namespace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_namespace + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "importing async CPO with `using namespace thrust::async`"; + } +}; + +void test_using_namespace() +{ + using invoker = using_namespace_invoker; + testing::async::test_policy_overloads::run(128); +} +DECLARE_UNITTEST(test_using_namespace); + +template > +struct using_cpo_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_cpo + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "importing async CPO with " + "`using namespace thrust::async::inclusive_scan`"; + } +}; + +void test_using_cpo() +{ + using invoker = using_cpo_invoker; + testing::async::test_policy_overloads::run(128); +} +DECLARE_UNITTEST(test_using_cpo); + +#endif // C++14 diff --git a/testing/async/mixin.h b/testing/async/mixin.h new file mode 100644 index 000000000..6d1c06ed7 --- /dev/null +++ b/testing/async/mixin.h @@ -0,0 +1,663 @@ +#pragma once + +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +#include +#include + +// clang-format off + +// This file contains a set of mix-in classes that define an algorithm +// definition for use with test_policy_overloads. The algorithm +// definition describes the details of a thrust::async algorithm invocation: +// +// - Input type and initialization +// - Output type and initialization (supports in-place, too) +// - Postfix arguments that define the algorithm's overload set +// - Abstracted invocation of the async algorithm +// - Abstracted invocation of a reference algorithm +// - Validation of async vs. reference output +// - A description string. +// +// This definition is used by test_policy_overloads to test each overload +// against a reference while injecting a variety of execution policies. This +// validates that each overload behaves correctly according to some reference. +// +// Since much of the algorithm definition is generic and may be reused in +// multiple tests with slight changes, a mix-in system is used to simplify +// the creation of algorithm definitions. The following namespace hierarchy is +// used to organize these generic components: +// +// * testing::async::mixin:: +// ** ::input - Input types/values (device vectors, counting iterators, etc) +// ** ::output - Output types/values (device vectors, inplace device vectors, +// discard iterators, etc) +// ** ::postfix_args - Algorithm specific overload sets +// ** ::invoke_reference - Algorithm specific reference invocation +// ** ::invoke_async - Algorithm specific async algo invocation +// ** ::compare_outputs - Compare output values. +// +// Each algorithm should define its own `mixins.h` header to declare algorithm +// specific mixins (e.g. postfix_args, invoke_reference, and invoke_async) +// in a testing::async::::mixins namespace structure. +// +// For example, the test.async.exclusive_scan.basic test uses the following +// algorithm definition from mix-ins: +// +// ``` +// #include +// #include +// #include +// template > +// struct basic_invoker +// : testing::async::mixin::input::device_vector +// , testing::async::mixin::output::device_vector +// , testing::async::exclusive_scan::mixin::postfix_args:: +// all_overloads +// , testing::async::exclusive_scan::mixin::invoke_reference:: +// host_synchronous +// , testing::async::exclusive_scan::mixin::invoke_async::basic +// , testing::async::mixin::compare_outputs::assert_equal_quiet +// { +// static std::string description() +// { +// return "basic invocation with device vectors"; +// } +// }; +// +// ... +// +// testing::async::test_policy_overloads>::run(num_values); +// ``` +// +// The basic_invoker class expands to something similar to the following: +// +// ``` +// template > +// struct basic_invoker +// { +// public: +// +// static std::string description() +// { +// return "basic invocation with device vectors"; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::input::device_vector +// // +// // input_type must provide idiomatic definitions of: +// // - `using iterator = ...;` +// // - `iterator begin() const { ... }` +// // - `iterator end() const { ... }` +// // - `size_t size() const { ... }` +// using input_type = thrust::device_vector; +// +// // Generate an instance of the input: +// static input_type generate_input(std::size_t num_values) +// { +// input_type input(num_values); +// thrust::sequence(input.begin(), input.end(), 25, 3); +// return input; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::output::device_vector +// // +// // output_type must provide idiomatic definitions of: +// // - `using iterator = ...;` +// // - `iterator begin() { ... }` +// using output_type = thrust::device_vector; +// +// // Generate an instance of the output: +// // Might be more complicated, eg. fancy iterators, etc +// static output_type generate_output(std::size_t num_values) +// { +// return output_type(num_values); +// } +// +// //------------------------------------------------------------------------- +// // testing::async::exclusive_scan::mixin::postfix_args::all_overloads +// using postfix_args_type = std::tuple< // List any extra arg overloads: +// std::tuple<>, // - no extra args +// std::tuple, // - initial_value +// std::tuple // - initial_value, binary_op +// >; +// +// // Create instances of the extra arguments to use when invoking the +// // algorithm: +// static postfix_args_type generate_postfix_args() +// { +// return postfix_args_type{ +// std::tuple<>{}, // no extra args +// std::make_tuple(initial_value_type{42}), // initial_value +// // initial_value, binary_op: +// std::make_tuple(initial_value_Type{57}, alternate_binary_op{}) +// }; +// } +// +// //------------------------------------------------------------------------- +// // +// testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous +// // +// // Invoke a reference implementation for a single overload as described by +// // postfix_tuple. This tuple contains instances of any trailing arguments +// // to pass to the algorithm. The tuple/index_sequence pattern is used to +// // support a "no extra args" overload, since the parameter pack expansion +// // will do exactly what we want in all cases. +// template +// static void invoke_reference(input_type const &input, +// output_type &output, +// PostfixArgTuple &&postfix_tuple, +// std::index_sequence) +// { +// // Create host versions of the input/output: +// thrust::host_vector host_input(input.cbegin(), +// input.cend()); +// thrust::host_vector host_output(host_input.size()); +// +// // Run host synchronous algorithm to generate reference. +// thrust::exclusive_scan(host_input.cbegin(), +// host_input.cend(), +// host_output.begin(), +// std::get( +// THRUST_FWD(postfix_tuple))...); +// +// // Copy back to device. +// output = host_output; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::exclusive_scan::mixin::invoke_async::basic +// // +// // Invoke the async algorithm for a single overload as described by +// // the prefix and postfix tuples. These tuples contains instances of any +// // additional arguments to pass to the algorithm. The tuple/index_sequence +// // pattern is used to support the "no extra args" overload, since the +// // parameter pack expansion will do exactly what we want in all cases. +// // Prefix args are included here (but not for invoke_reference) to allow +// // the test framework to change the execution policy. +// // This method must return an event or future. +// template +// static auto invoke_async(PrefixArgTuple &&prefix_tuple, +// std::index_sequence, +// input_type const &input, +// output_type &output, +// PostfixArgTuple &&postfix_tuple, +// std::index_sequence) +// { +// output.resize(input.size()); +// auto e = thrust::async::exclusive_scan( +// std::get(THRUST_FWD(prefix_tuple))..., +// input.cbegin(), +// input.cend(), +// output.begin(), +// std::get(THRUST_FWD(postfix_tuple))...); +// return e; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::compare_outputs::assert_equal_quiet +// // +// // Wait on and validate the event/future (usually with TEST_EVENT_WAIT / +// // TEST_FUTURE_VALUE_RETRIEVAL), then check that the reference output +// // matches the testing output. +// template +// static void compare_outputs(EventType &e, +// output_type const &ref, +// output_type const &test) +// { +// TEST_EVENT_WAIT(e); +// ASSERT_EQUAL_QUIET(ref, test); +// } +// }; +// ``` +// +// Similar invokers with slight tweaks are used in other +// async/exclusive_scan/*.cu tests. + +// clang-format on + +namespace testing +{ +namespace async +{ +namespace mixin +{ + +//------------------------------------------------------------------------------ +namespace input +{ + +template +struct device_vector +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::sequence(input.begin(), + input.end(), + static_cast(1), + static_cast(1)); + return input; + } +}; + +template +struct counting_iterator_from_0 +{ + struct input_type + { + using iterator = thrust::counting_iterator; + + std::size_t num_values; + + iterator begin() const { return iterator{static_cast(0)}; } + iterator cbegin() const { return iterator{static_cast(0)}; } + + iterator end() const { return iterator{static_cast(num_values)}; } + iterator cend() const { return iterator{static_cast(num_values)}; } + + std::size_t size() const { return num_values; } + }; + + static input_type generate_input(std::size_t num_values) + { + return {num_values}; + } +}; + +template +struct counting_iterator_from_1 +{ + struct input_type + { + using iterator = thrust::counting_iterator; + + std::size_t num_values; + + iterator begin() const { return iterator{static_cast(1)}; } + iterator cbegin() const { return iterator{static_cast(1)}; } + + iterator end() const { return iterator{static_cast(1 + num_values)}; } + iterator cend() const { return iterator{static_cast(1 + num_values)}; } + + std::size_t size() const { return num_values; } + }; + + static input_type generate_input(std::size_t num_values) + { + return {num_values}; + } +}; + +template +struct constant_iterator_1 +{ + struct input_type + { + using iterator = thrust::constant_iterator; + + std::size_t num_values; + + iterator begin() const { return iterator{static_cast(1)}; } + iterator cbegin() const { return iterator{static_cast(1)}; } + + iterator end() const + { + return iterator{static_cast(1)} + num_values; + } + iterator cend() const + { + return iterator{static_cast(1)} + num_values; + } + + std::size_t size() const { return num_values; } + }; + + static input_type generate_input(std::size_t num_values) + { + return {num_values}; + } +}; + +} // namespace input + +//------------------------------------------------------------------------------ +namespace output +{ + +template +struct device_vector +{ + using output_type = thrust::device_vector; + + template + static output_type generate_output(std::size_t num_values, + InputType& /* unused */) + { + return output_type(num_values); + } +}; + +template +struct device_vector_reuse_input +{ + using output_type = thrust::device_vector&; + + template + static output_type generate_output(std::size_t /*num_values*/, + InputType& input) + { + return input; + } +}; + +struct discard_iterator +{ + struct output_type + { + using iterator = thrust::discard_iterator<>; + + iterator begin() const { return thrust::make_discard_iterator(); } + iterator cbegin() const { return thrust::make_discard_iterator(); } + }; + + template + static output_type generate_output(std::size_t /* num_values */, + InputType& /* input */) + { + return output_type{}; + } +}; + +} // namespace output + +//------------------------------------------------------------------------------ +namespace postfix_args +{ +/* Defined per algorithm. Example: + * + * // Defines several overloads: + * // algorithm([policy,] input, output) // no postfix args + * // algorithm([policy,] input, output, initial_value) + * // algorithm([policy,] input, output, initial_value, binary_op) + * template > + * struct all_overloads + * { + * using postfix_args_type = std::tuple< // List any extra arg overloads: + * std::tuple<>, // - no extra args + * std::tuple, // - initial_value + * std::tuple // - initial_value, binary_op + * >; + * + * static postfix_args_type generate_postfix_args() + * { + * return postfix_args_type{ + * std::tuple<>{}, // no extra args + * std::make_tuple(initial_value_type{42}), // initial_value + * // initial_value, binary_op: + * std::make_tuple(initial_value_Type{57}, alternate_binary_op{}) + * } + * }; + * + */ +} + +//------------------------------------------------------------------------------ +namespace invoke_reference +{ + +/* Defined per algorithm. Example: + * + * template + * struct host_synchronous + * { + * template + * static void invoke_reference(InputType const& input, + * OutputType& output, + * PostfixArgTuple&& postfix_tuple, + * std::index_sequence) + * { + * // Create host versions of the input/output: + * thrust::host_vector host_input(input.cbegin(), + * input.cend()); + * thrust::host_vector host_output(host_input.size()); + * + * // Run host synchronous algorithm to generate reference. + * // Be sure to call a backend that doesn't use the same underlying + * // implementation. + * thrust::exclusive_scan(host_input.cbegin(), + * host_input.cend(), + * host_output.begin(), + * std::get( + * THRUST_FWD(postfix_tuple))...); + * + * // Copy back to device. + * output = host_output; + * } + * }; + * + */ + +// Used to save time when testing unverifiable invocations (discard_iterators) +struct noop +{ + template + static void invoke_reference(Ts&&...) + {} +}; + +} // namespace invoke_reference + +//------------------------------------------------------------------------------ +namespace invoke_async +{ + +/* Defined per algorithm. Example: + * + * struct basic + * { + * template + * static auto invoke_async(PrefixArgTuple&& prefix_tuple, + * std::index_sequence, + * InputType const& input, + * OutputType& output, + * PostfixArgTuple&& postfix_tuple, + * std::index_sequence) + * { + * auto e = thrust::async::exclusive_scan( + * std::get(THRUST_FWD(prefix_tuple))..., + * input.cbegin(), + * input.cend(), + * output.begin(), + * std::get(THRUST_FWD(postfix_tuple))...); + * return e; + * } + * }; + */ + +} // namespace invoke_async + +//------------------------------------------------------------------------------ +namespace compare_outputs +{ + +namespace detail +{ + +void basic_event_validation(thrust::device_event& e) +{ + TEST_EVENT_WAIT(e); +} + +template +void basic_event_validation(thrust::device_future& f) +{ + TEST_FUTURE_VALUE_RETRIEVAL(f); +} + +} // namespace detail + +struct assert_equal +{ + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + detail::basic_event_validation(e); + ASSERT_EQUAL(ref, test); + } +}; + +struct assert_almost_equal +{ + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + detail::basic_event_validation(e); + ASSERT_ALMOST_EQUAL(ref, test); + } +}; + +// Does an 'almost_equal' comparison for floating point types. Since fp +// addition is non-associative, this is sometimes necessary. +struct assert_almost_equal_if_fp +{ +private: + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::false_type /* is_floating_point */) + { + detail::basic_event_validation(e); + ASSERT_EQUAL(ref, test); + } + + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::true_type /* is_floating_point */) + { + detail::basic_event_validation(e); + ASSERT_ALMOST_EQUAL(ref, test); + } + +public: + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + using value_type = typename OutputType::value_type; + compare_outputs_impl(e, ref, test, std::is_floating_point{}); + } +}; + +struct assert_equal_quiet +{ + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + detail::basic_event_validation(e); + ASSERT_EQUAL_QUIET(ref, test); + } +}; + +// Does an 'almost_equal' comparison for floating point types, since fp +// addition is non-associative +struct assert_almost_equal_if_fp_quiet +{ +private: + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::false_type /* is_floating_point */) + { + detail::basic_event_validation(e); + ASSERT_EQUAL_QUIET(ref, test); + } + + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::true_type /* is_floating_point */) + { + detail::basic_event_validation(e); + ASSERT_ALMOST_EQUAL(ref, test); + } + +public: + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + using value_type = typename OutputType::value_type; + compare_outputs_impl(e, ref, test, std::is_floating_point{}); + } +}; + +// Used to save time when testing unverifiable invocations (discard_iterators). +// Just does basic validation of the future/event. +struct noop +{ + template + static void compare_outputs(EventType &e, Ts&&...) + { + detail::basic_event_validation(e); + } +}; + +} // namespace compare_outputs + +} // namespace mixin +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/async/test_policy_overloads.h b/testing/async/test_policy_overloads.h new file mode 100644 index 000000000..b7bf1ab94 --- /dev/null +++ b/testing/async/test_policy_overloads.h @@ -0,0 +1,410 @@ +#pragma once + +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include + +#include + +#include + +// TODO Cover these cases from testing/async_reduce.cu: +// - [x] test_async_reduce_after ("after_future" in test_policy_overloads) +// - [ ] test_async_reduce_on_then_after (KNOWN_FAILURE, see #1195) +// - [ ] all the child variants (e.g. with allocator) too +// - [ ] test_async_copy_then_reduce (Need to figure out how to fit this in) +// - [ ] test_async_reduce_caching (only useful when returning future) + +namespace testing +{ + +namespace async +{ + +// Tests that policies are handled correctly for all overloads of an async +// algorithm. +// +// The AlgoDef parameter type defines an async algorithm, its overloads, and +// abstracts its invocation. See the async/mixins.h for a documented example of +// this interface and some convenience mixins that can be used to construct a +// definition quickly. +// +// The AlgoDef interface is used to run several tests of the algorithm, +// exhaustively testing all overloads for algorithm correctness and proper +// policy handling. +// +// ## Basic tests +// +// In the basic tests, each overload is called repeatedly with: +// 1) No policy +// 2) thrust::device +// 3) thrust::device(thrust::device_allocator) +// 4) thrust::device.on(stream) +// 5) thrust::device(thrust::device_allocator).on(stream) +// +// The output of the async algorithm is compared against a reference output, +// and the returned event/future is tested to make sure it holds a reference to +// the expected stream. +// +// ## After Future tests +// +// The after_future tests check that the future/event returned from an algorithm +// behaves properly when consumed by a policy's `.after` method. +template +struct test_policy_overloads +{ + using algo_def = AlgoDef; + using input_type = typename algo_def::input_type; + using output_type = typename algo_def::output_type; + using postfix_args_type = typename algo_def::postfix_args_type; + + static constexpr std::size_t num_postfix_arg_sets = + std::tuple_size::value; + + // Main entry point; call this from a unit test function. + static void run(std::size_t num_values) + { + test_postfix_overloads(num_values); + } + +private: + template + using size_const = std::integral_constant; + + //---------------------------------------------------------------------------- + // Recursively call sub tests for each overload set in postfix_args: + template + static void test_postfix_overloads(std::size_t const num_values, + size_const = {}) + { + static_assert(PostfixIdx < num_postfix_arg_sets, "Internal error."); + + run_basic_policy_tests(num_values); + run_after_future_tests(num_values); + + // Recurse to test next round of overloads: + test_postfix_overloads(num_values, size_const{}); + } + + static void test_postfix_overloads(std::size_t const, + size_const) + { + // terminal case, no-op + } + + //---------------------------------------------------------------------------- + // For the specified postfix overload set, test the algorithm with several + // different policy configurations. + template + static void run_basic_policy_tests(std::size_t const num_values) + { + // When a policy uses the default stream, the algorithm implementation + // should spawn a new stream in the returned event: + auto using_default_stream = [](auto& e) { + ASSERT_NOT_EQUAL(thrust::cuda_cub::default_stream(), + e.stream().native_handle()); + }; + + // When a policy uses a non-default stream, the implementation should pass + // the stream through to the output: + thrust::system::cuda::detail::unique_stream test_stream{}; + auto using_test_stream = [&test_stream](auto& e) { + ASSERT_EQUAL(test_stream.native_handle(), e.stream().native_handle()); + }; + + // Test the different types of policies: + basic_policy_test("(no policy)", + std::make_tuple(), + using_default_stream, + num_values); + + basic_policy_test("thrust::device", + std::make_tuple(thrust::device), + using_default_stream, + num_values); + + basic_policy_test( + "thrust::device(thrust::device_allocator{})", + std::make_tuple(thrust::device(thrust::device_allocator{})), + using_default_stream, + num_values); + + basic_policy_test("thrust::device.on(test_stream.get())", + std::make_tuple( + thrust::device.on(test_stream.get())), + using_test_stream, + num_values); + + basic_policy_test( + "thrust::device(thrust::device_allocator{}).on(test_stream.get())", + std::make_tuple( + thrust::device(thrust::device_allocator{}).on(test_stream.get())), + using_test_stream, + num_values); + } + + // Invoke the algorithm multiple times with the provided policy and validate + // the results. + template + static void basic_policy_test(std::string const &policy_desc, + PrefixArgTuple &&prefix_tuple_ref, + ValidateEvent const &validate, + std::size_t num_values) + try + { + // Sink the prefix tuple into a const local so it can be safely passed to + // multiple invocations without worrying about potential modifications. + using prefix_tuple_type = thrust::remove_cvref_t; + prefix_tuple_type const prefix_tuple = THRUST_FWD(prefix_tuple_ref); + + using postfix_tuple_type = + std::tuple_element_t; + postfix_tuple_type const postfix_tuple = get_postfix_tuple(); + + // Generate index sequences for the tuples: + constexpr auto prefix_tuple_size = std::tuple_size{}; + constexpr auto postfix_tuple_size = std::tuple_size{}; + using prefix_index_seq = std::make_index_sequence; + using postfix_index_seq = std::make_index_sequence; + + // Use unique, non-const inputs for each invocation to support in-place + // algo_def configurations. + input_type input_a = algo_def::generate_input(num_values); + input_type input_b = algo_def::generate_input(num_values); + input_type input_c = algo_def::generate_input(num_values); + input_type input_d = algo_def::generate_input(num_values); + input_type input_ref = algo_def::generate_input(num_values); + + output_type output_a = algo_def::generate_output(num_values, input_a); + output_type output_b = algo_def::generate_output(num_values, input_b); + output_type output_c = algo_def::generate_output(num_values, input_c); + output_type output_d = algo_def::generate_output(num_values, input_d); + output_type output_ref = algo_def::generate_output(num_values, input_ref); + + // Invoke multiple overlapping async algorithms, capturing their outputs + // and events/futures: + auto e_a = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_a, + output_a, + postfix_tuple, + postfix_index_seq{}); + auto e_b = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_b, + output_b, + postfix_tuple, + postfix_index_seq{}); + auto e_c = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_c, + output_c, + postfix_tuple, + postfix_index_seq{}); + auto e_d = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_d, + output_d, + postfix_tuple, + postfix_index_seq{}); + + // Let reference calc overlap with async testing: + algo_def::invoke_reference(input_ref, + output_ref, + postfix_tuple, + postfix_index_seq{}); + + // These wait on the e_X events: + algo_def::compare_outputs(e_a, output_ref, output_a); + algo_def::compare_outputs(e_b, output_ref, output_b); + algo_def::compare_outputs(e_c, output_ref, output_c); + algo_def::compare_outputs(e_d, output_ref, output_d); + + validate(e_a); + validate(e_b); + validate(e_c); + validate(e_d); + } + catch (unittest::UnitTestException &exc) + { + // Append some identifying information to the exception to help with + // debugging: + using overload_t = std::tuple_element_t; + + std::string const overload_desc = + unittest::demangle(typeid(overload_t).name()); + std::string const input_desc = + unittest::demangle(typeid(input_type).name()); + std::string const output_desc = + unittest::demangle(typeid(output_type).name()); + + exc << "\n" + << " - algo_def::description = " << algo_def::description() << "\n" + << " - test = basic_policy\n" + << " - policy = " << policy_desc << "\n" + << " - input_type = " << input_desc << "\n" + << " - output_type = " << output_desc << "\n" + << " - tuple of trailing arguments = " << overload_desc << "\n" + << " - num_values = " << num_values; + throw; + } + + //---------------------------------------------------------------------------- + // Test .after(event/future) handling: + template + static void run_after_future_tests(std::size_t const num_values) + try + { + using postfix_tuple_type = + std::tuple_element_t; + postfix_tuple_type const postfix_tuple = get_postfix_tuple(); + + // Generate index sequences for the tuples. Prefix size always = 1 here, + // since the async algorithms are always invoked with a single prefix + // arg (the execution policy) here. + constexpr auto postfix_tuple_size = std::tuple_size{}; + using prefix_index_seq = std::make_index_sequence<1>; + using postfix_index_seq = std::make_index_sequence; + + // Use unique, non-const inputs for each invocation to support in-place + // algo_def configurations. + input_type input_a = algo_def::generate_input(num_values); + input_type input_b = algo_def::generate_input(num_values); + input_type input_c = algo_def::generate_input(num_values); + input_type input_tmp = algo_def::generate_input(num_values); + input_type input_ref = algo_def::generate_input(num_values); + + output_type output_a = algo_def::generate_output(num_values, input_a); + output_type output_b = algo_def::generate_output(num_values, input_b); + output_type output_c = algo_def::generate_output(num_values, input_c); + output_type output_tmp = algo_def::generate_output(num_values, input_tmp); + output_type output_ref = algo_def::generate_output(num_values, input_ref); + + auto e_a = algo_def::invoke_async(std::make_tuple(thrust::device), + prefix_index_seq{}, + input_a, + output_a, + postfix_tuple, + postfix_index_seq{}); + ASSERT_EQUAL(true, e_a.valid_stream()); + auto const stream_a = e_a.stream().native_handle(); + + // Execution on default stream should create a new stream in the result: + ASSERT_NOT_EQUAL_QUIET(thrust::cuda_cub::default_stream(), stream_a); + + //-------------------------------------------------------------------------- + // Test event consumption when the event is an rvalue. + //-------------------------------------------------------------------------- + // Using `forward_as_tuple` instead of `make_tuple` to explicitly control + // value categories. + // Explicitly order this invocation after e_a: + auto e_b = + algo_def::invoke_async(std::forward_as_tuple(thrust::device.after(e_a)), + prefix_index_seq{}, + input_b, + output_b, + postfix_tuple, + postfix_index_seq{}); + ASSERT_EQUAL(true, e_b.valid_stream()); + auto const stream_b = e_b.stream().native_handle(); + + // Second invocation should use same stream as before: + ASSERT_EQUAL_QUIET(stream_a, stream_b); + + // Verify that double consumption of e_a produces an exception: + ASSERT_THROWS_EQUAL(auto x = algo_def::invoke_async( + std::forward_as_tuple(thrust::device.after(e_a)), + prefix_index_seq{}, + input_tmp, + output_tmp, + postfix_tuple, + postfix_index_seq{}); + THRUST_UNUSED_VAR(x), + thrust::event_error, + thrust::event_error(thrust::event_errc::no_state)); + + //-------------------------------------------------------------------------- + // Test event consumption when the event is an lvalue + //-------------------------------------------------------------------------- + // Explicitly order this invocation after e_b: + auto policy_after_e_b = thrust::device.after(e_b); + auto policy_after_e_b_tuple = std::forward_as_tuple(policy_after_e_b); + auto e_c = + algo_def::invoke_async(policy_after_e_b_tuple, + prefix_index_seq{}, + input_c, + output_c, + postfix_tuple, + postfix_index_seq{}); + ASSERT_EQUAL(true, e_c.valid_stream()); + auto const stream_c = e_c.stream().native_handle(); + + // Should use same stream as e_b: + ASSERT_EQUAL_QUIET(stream_b, stream_c); + + // Verify that double consumption of e_b produces an exception: + ASSERT_THROWS_EQUAL( + auto x = algo_def::invoke_async(policy_after_e_b_tuple, + prefix_index_seq{}, + input_tmp, + output_tmp, + postfix_tuple, + postfix_index_seq{}); + THRUST_UNUSED_VAR(x), + thrust::event_error, + thrust::event_error(thrust::event_errc::no_state)); + + // Let reference calc overlap with async testing: + algo_def::invoke_reference(input_ref, + output_ref, + postfix_tuple, + postfix_index_seq{}); + + // Validate results + // Use e_c for all three checks -- e_a and e_b will not pass the event + // checks since their streams were stolen by dependencies. + algo_def::compare_outputs(e_c, output_ref, output_a); + algo_def::compare_outputs(e_c, output_ref, output_b); + algo_def::compare_outputs(e_c, output_ref, output_c); + } + catch (unittest::UnitTestException &exc) + { + // Append some identifying information to the exception to help with + // debugging: + using postfix_t = std::tuple_element_t; + + std::string const postfix_desc = + unittest::demangle(typeid(postfix_t).name()); + std::string const input_desc = + unittest::demangle(typeid(input_type).name()); + std::string const output_desc = + unittest::demangle(typeid(output_type).name()); + + exc << "\n" + << " - algo_def::description = " << algo_def::description() << "\n" + << " - test = after_future\n" + << " - input_type = " << input_desc << "\n" + << " - output_type = " << output_desc << "\n" + << " - tuple of trailing arguments = " << postfix_desc << "\n" + << " - num_values = " << num_values; + throw; + } + + //---------------------------------------------------------------------------- + // Various helper functions: + template + static auto get_postfix_tuple() + { + return std::get(algo_def::generate_postfix_args()); + } +}; + +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/event.cu b/testing/event.cu index 5833d4145..581426919 100644 --- a/testing/event.cu +++ b/testing/event.cu @@ -1,6 +1,6 @@ #include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include #include diff --git a/testing/future.cu b/testing/future.cu index 137558860..eb1ab582a 100644 --- a/testing/future.cu +++ b/testing/future.cu @@ -1,6 +1,6 @@ #include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include #include diff --git a/testing/unittest/testframework.h b/testing/unittest/testframework.h index 117908dd9..82481362e 100644 --- a/testing/unittest/testframework.h +++ b/testing/unittest/testframework.h @@ -440,6 +440,22 @@ class TEST##UnitTest : public UnitTest { \ }; \ TEST##UnitTest TEST##Instance +// Macro to create instances of a test for several array sizes. +#define DECLARE_SIZED_UNITTEST(TEST) \ +class TEST##UnitTest : public UnitTest { \ + public: \ + TEST##UnitTest() : UnitTest(#TEST) {} \ + void run() \ + { \ + std::vector sizes = get_test_sizes(); \ + for(size_t i = 0; i != sizes.size(); ++i) \ + { \ + TEST(sizes[i]); \ + } \ + } \ +}; \ +TEST##UnitTest TEST##Instance + // Macro to create instances of a test for several data types and array sizes #define DECLARE_VARIABLE_UNITTEST(TEST) \ class TEST##UnitTest : public UnitTest { \ diff --git a/testing/unittest/util_async.h b/testing/unittest/util_async.h index 984cc61c6..9a3454efd 100644 --- a/testing/unittest/util_async.h +++ b/testing/unittest/util_async.h @@ -1,9 +1,9 @@ #pragma once #include -#include +#include -#if THRUST_CPP_DIALECT >= 2011 +#if THRUST_CPP_DIALECT >= 2014 #include @@ -73,5 +73,4 @@ auto test_future_value_retrieval( } // namespace unittest -#endif // THRUST_CPP_DIALECT >= 2011 - +#endif // THRUST_CPP_DIALECT >= 2014 diff --git a/thrust/async/scan.h b/thrust/async/scan.h new file mode 100644 index 000000000..5c20f8481 --- /dev/null +++ b/thrust/async/scan.h @@ -0,0 +1,345 @@ +/* + * Copyright 2008-2020 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file async/scan.h + * \brief Functions for asynchronously computing prefix scans. + */ + +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include +#include + +#include + +#include +#include +#include + +#include + +namespace thrust +{ + +namespace async +{ + +// Fallback implementations used when no overloads are found via ADL: +namespace unimplemented +{ + +template +event +async_inclusive_scan(thrust::execution_policy&, + ForwardIt, + Sentinel, + OutputIt, + BinaryOp) +{ + THRUST_STATIC_ASSERT_MSG( + (thrust::detail::depend_on_instantiation::value), + "this algorithm is not implemented for the specified system" + ); + return {}; +} + +template +event +async_exclusive_scan(thrust::execution_policy&, + ForwardIt, + Sentinel, + OutputIt, + InitialValueType, + BinaryOp) +{ + THRUST_STATIC_ASSERT_MSG( + (thrust::detail::depend_on_instantiation::value), + "this algorithm is not implemented for the specified system" + ); + return {}; +} + +} // namespace unimplemented + +namespace inclusive_scan_detail +{ + +// Include fallback implementation for ADL failures +using thrust::async::unimplemented::async_inclusive_scan; + +// Implementation of the thrust::async::inclusive_scan CPO. +struct inclusive_scan_fn final +{ + template + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(op) + ) + ) + + template + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + thrust::plus<>{} + ) + ) + + template >>> + auto operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(op) + ) + ) + + template + auto operator()(ForwardIt&& first, Sentinel&& last, OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + thrust::plus<>{} + ) + ) +}; + +} // namespace inclusive_scan_detail + +THRUST_INLINE_CONSTANT inclusive_scan_detail::inclusive_scan_fn inclusive_scan{}; + +namespace exclusive_scan_detail +{ + +// Include fallback implementation for ADL failures +using thrust::async::unimplemented::async_exclusive_scan; + +// Implementation of the thrust::async::exclusive_scan CPO. +struct exclusive_scan_fn final +{ + template + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op) + ) + ) + + template + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + thrust::plus<>{} + ) + ) + + template + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + iterator_value_t>{}, + thrust::plus<>{} + ) + ) + + template >>> + auto + operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op) + ) + ) + + template >>> + auto + operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + thrust::plus<>{} + ) + ) + + template + auto operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + iterator_value_t>{}, + thrust::plus<>{} + ) + ) +}; + +} // namespace exclusive_scan_detail + +THRUST_INLINE_CONSTANT exclusive_scan_detail::exclusive_scan_fn exclusive_scan{}; + +} // namespace async + +} // end namespace thrust + +#endif diff --git a/thrust/detail/event_error.h b/thrust/detail/event_error.h index 114d4763f..cd4d8e7d9 100644 --- a/thrust/detail/event_error.h +++ b/thrust/detail/event_error.h @@ -20,10 +20,9 @@ #pragma once #include -#include -#include +#include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include #include @@ -162,5 +161,5 @@ inline bool operator<(event_error const& lhs, event_error const& rhs) noexcept } // end namespace thrust -#endif +#endif // C++14 diff --git a/thrust/future.h b/thrust/future.h index 12bebf8c6..25a231fbe 100644 --- a/thrust/future.h +++ b/thrust/future.h @@ -21,10 +21,9 @@ #pragma once #include -#include -#include +#include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include #include diff --git a/thrust/system/cuda/detail/async/exclusive_scan.h b/thrust/system/cuda/detail/async/exclusive_scan.h new file mode 100644 index 000000000..1ac46ecb5 --- /dev/null +++ b/thrust/system/cuda/detail/async/exclusive_scan.h @@ -0,0 +1,199 @@ +/****************************************************************************** + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +// TODO specialize for thrust::plus to use e.g. ExclusiveSum instead of ExcScan +// - Note that thrust::plus<> is transparent, cub::Sum is not. This should be +// fixed in CUB first). +// - Need to check if CUB actually optimizes for sums before putting in effort + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +unique_eager_event +async_exclusive_scan_n(execution_policy& policy, + ForwardIt first, + Size n, + OutputIt out, + InitialValueType init, + BinaryOp op) +{ + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; + + auto const device_alloc = get_async_device_allocator(policy); + unique_eager_event ev; + + // Determine temporary device storage requirements. + cudaError_t status; + size_t tmp_size = 0; + { + THRUST_INDEX_TYPE_DISPATCH2(status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + n, + (nullptr, + tmp_size, + first, + out, + op, + init, + n_fixed, + nullptr, + THRUST_DEBUG_SYNC_FLAG)); + thrust::cuda_cub::throw_on_error(status, + "after determining tmp storage " + "requirements for exclusive_scan"); + } + + // Allocate temporary storage. + auto content = uninitialized_allocate_unique_n( + device_alloc, tmp_size + ); + void* const tmp_ptr = raw_pointer_cast(content.get()); + + // Set up stream with dependencies. + cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy); + + if (thrust::cuda_cub::default_stream() != user_raw_stream) + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple( + std::move(content), + unique_stream(nonowning, user_raw_stream) + ), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + else + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple(std::move(content)), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + + // Run scan. + { + THRUST_INDEX_TYPE_DISPATCH2(status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + n, + (tmp_ptr, + tmp_size, + first, + out, + op, + init, + n_fixed, + user_raw_stream, + THRUST_DEBUG_SYNC_FLAG)); + thrust::cuda_cub::throw_on_error(status, + "after dispatching exclusive_scan kernel"); + } + + return ev; +} + +}}} // namespace system::cuda::detail + +namespace cuda_cub +{ + +// ADL entry point. +template +auto async_exclusive_scan(execution_policy& policy, + ForwardIt first, + Sentinel&& last, + OutputIt&& out, + InitialValueType &&init, + BinaryOp&& op) +THRUST_RETURNS( + thrust::system::cuda::detail::async_exclusive_scan_n( + policy, + first, + distance(first, THRUST_FWD(last)), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op) + ) +) + +} // namespace cuda_cub + +} // namespace thrust + +#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#endif // C++14 + diff --git a/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/system/cuda/detail/async/inclusive_scan.h new file mode 100644 index 000000000..6b3dcef91 --- /dev/null +++ b/thrust/system/cuda/detail/async/inclusive_scan.h @@ -0,0 +1,194 @@ +/****************************************************************************** + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +// TODO specialize for thrust::plus to use e.g. InclusiveSum instead of IncScan +// - Note that thrust::plus<> is transparent, cub::Sum is not. This should be +// fixed in CUB first). +// - Need to check if CUB actually optimizes for sums before putting in effort + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +unique_eager_event +async_inclusive_scan_n(execution_policy& policy, + ForwardIt first, + Size n, + OutputIt out, + BinaryOp op) +{ + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; + + auto const device_alloc = get_async_device_allocator(policy); + unique_eager_event ev; + + // Determine temporary device storage requirements. + cudaError_t status; + size_t tmp_size = 0; + { + THRUST_INDEX_TYPE_DISPATCH2(status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + n, + (nullptr, + tmp_size, + first, + out, + op, + cub::NullType{}, + n_fixed, + nullptr, + THRUST_DEBUG_SYNC_FLAG)); + thrust::cuda_cub::throw_on_error(status, + "after determining tmp storage " + "requirements for inclusive_scan"); + } + + // Allocate temporary storage. + auto content = uninitialized_allocate_unique_n( + device_alloc, tmp_size + ); + void* const tmp_ptr = raw_pointer_cast(content.get()); + + // Set up stream with dependencies. + cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy); + + if (thrust::cuda_cub::default_stream() != user_raw_stream) + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple( + std::move(content), + unique_stream(nonowning, user_raw_stream) + ), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + else + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple(std::move(content)), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + + // Run scan. + { + THRUST_INDEX_TYPE_DISPATCH2(status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + n, + (tmp_ptr, + tmp_size, + first, + out, + op, + cub::NullType{}, + n_fixed, + user_raw_stream, + THRUST_DEBUG_SYNC_FLAG)); + thrust::cuda_cub::throw_on_error(status, + "after dispatching inclusive_scan kernel"); + } + + return ev; +} + +}}} // namespace system::cuda::detail + +namespace cuda_cub +{ + +// ADL entry point. +template +auto async_inclusive_scan(execution_policy& policy, + ForwardIt first, + Sentinel&& last, + OutputIt&& out, + BinaryOp&& op) +THRUST_RETURNS( + thrust::system::cuda::detail::async_inclusive_scan_n( + policy, + first, + distance(first, THRUST_FWD(last)), + THRUST_FWD(out), + THRUST_FWD(op) + ) +) + +} // namespace cuda_cub + +} // namespace thrust + +#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#endif // C++14 + diff --git a/thrust/system/cuda/detail/async/scan.h b/thrust/system/cuda/detail/async/scan.h new file mode 100644 index 000000000..4a9f31681 --- /dev/null +++ b/thrust/system/cuda/detail/async/scan.h @@ -0,0 +1,33 @@ +/****************************************************************************** + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include + +#include +#include diff --git a/thrust/system/cuda/detail/future.inl b/thrust/system/cuda/detail/future.inl index b01b20b75..ee23b0eab 100644 --- a/thrust/system/cuda/detail/future.inl +++ b/thrust/system/cuda/detail/future.inl @@ -9,10 +9,9 @@ #pragma once #include -#include -#include +#include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include #include @@ -1370,5 +1369,5 @@ THRUST_DECLTYPE_RETURNS(std::move(dependency)) } // end namespace thrust -#endif +#endif // C++14 diff --git a/thrust/system/cuda/future.h b/thrust/system/cuda/future.h index fc2986f8b..e42437e93 100644 --- a/thrust/system/cuda/future.h +++ b/thrust/system/cuda/future.h @@ -6,10 +6,9 @@ #pragma once #include -#include -#include +#include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include #include @@ -71,5 +70,5 @@ unique_eager_future_type( #include -#endif +#endif // C++14 diff --git a/thrust/system/detail/adl/async/scan.h b/thrust/system/detail/adl/async/scan.h new file mode 100644 index 000000000..a2a90618b --- /dev/null +++ b/thrust/system/detail/adl/async/scan.h @@ -0,0 +1,34 @@ +/* + * Copyright 2008-2020 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The purpose of this header is to #include the async/scan.h header of the +// sequential, host, and device systems. It should be #included in any code +// which uses ADL to dispatch async scans. + +#pragma once + +#include + +//#include + +//#define __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/scan.h> +//#include __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER +//#undef __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER + +#define __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/scan.h> +#include __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER +#undef __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER + diff --git a/thrust/system/detail/generic/scan.inl b/thrust/system/detail/generic/scan.inl index 300b697b2..83d272c3e 100644 --- a/thrust/system/detail/generic/scan.inl +++ b/thrust/system/detail/generic/scan.inl @@ -61,9 +61,7 @@ __host__ __device__ { // Use the input iterator's value type per https://wg21.link/P0571 using ValueType = typename thrust::iterator_value::type; - - // assume 0 as the initialization value - return thrust::exclusive_scan(exec, first, last, result, ValueType(0)); + return thrust::exclusive_scan(exec, first, last, result, ValueType{}); } // end exclusive_scan()