From 7caa6926481eafe3f9c5bd85e9582a41ad9349e5 Mon Sep 17 00:00:00 2001
From: Allison Vacanti
Date: Fri, 31 Jul 2020 17:22:32 -0400
Subject: [PATCH] WIP Adding async scan algorithms, iterate on async testing.

---
 dependencies/cub                              |   2 +-
 testing/CMakeLists.txt                        |   1 +
 testing/async/CMakeLists.txt                  |  82 +++
 testing/async/exclusive_scan/basic.cu         |  42 ++
 .../async/exclusive_scan/counting_iterator.cu |  49 ++
 .../async/exclusive_scan/discard_output.cu    |  41 ++
 testing/async/exclusive_scan/in_place.cu      |  42 ++
 testing/async/exclusive_scan/mixed_types.cu   | 103 ++++
 testing/async/exclusive_scan/mixin.h          | 143 +++++
 testing/async/exclusive_scan/using_vs_adl.cu  | 182 ++++++
 testing/async/mixin.h                         | 523 ++++++++++++++++++
 testing/async/test_policy_overloads.h         | 255 +++++++++
 testing/unittest/util_async.h                 |   7 +-
 thrust/async/scan.h                           | 355 ++++++++++++
 .../system/cuda/detail/async/exclusive_scan.h | 180 ++++++
 .../system/cuda/detail/async/inclusive_scan.h | 173 ++++++
 thrust/system/cuda/detail/async/scan.h        |  33 ++
 thrust/system/detail/adl/async/scan.h         |  34 ++
 thrust/system/detail/generic/scan.inl         |   4 +-
 19 files changed, 2243 insertions(+), 8 deletions(-)
 create mode 100644 testing/async/CMakeLists.txt
 create mode 100644 testing/async/exclusive_scan/basic.cu
 create mode 100644 testing/async/exclusive_scan/counting_iterator.cu
 create mode 100644 testing/async/exclusive_scan/discard_output.cu
 create mode 100644 testing/async/exclusive_scan/in_place.cu
 create mode 100644 testing/async/exclusive_scan/mixed_types.cu
 create mode 100644 testing/async/exclusive_scan/mixin.h
 create mode 100644 testing/async/exclusive_scan/using_vs_adl.cu
 create mode 100644 testing/async/mixin.h
 create mode 100644 testing/async/test_policy_overloads.h
 create mode 100644 thrust/async/scan.h
 create mode 100644 thrust/system/cuda/detail/async/exclusive_scan.h
 create mode 100644 thrust/system/cuda/detail/async/inclusive_scan.h
 create mode 100644 thrust/system/cuda/detail/async/scan.h
 create mode 100644 thrust/system/detail/adl/async/scan.h

diff --git a/dependencies/cub b/dependencies/cub
index 2749cb0c7b..99ac8fe71e 160000
--- a/dependencies/cub
+++ b/dependencies/cub
@@ -1 +1 @@
-Subproject commit 2749cb0c7bc5a72c806d7ca0b8e4d702dbe017e5
+Subproject commit 99ac8fe71e937d57c594c38fe2d3ee9fd99e1a1b
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
index fdfc04e97b..244d839fed 100644
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -151,6 +151,7 @@ foreach(thrust_target IN LISTS THRUST_TARGETS)
 endforeach()
 
 # Add specialized tests:
+add_subdirectory(async)
 add_subdirectory(cpp)
 add_subdirectory(cuda)
 add_subdirectory(omp)
diff --git a/testing/async/CMakeLists.txt b/testing/async/CMakeLists.txt
new file mode 100644
index 0000000000..d744f09e6e
--- /dev/null
+++ b/testing/async/CMakeLists.txt
@@ -0,0 +1,82 @@
+# The async tests perform a large amount of codegen, making them expensive to
+# build and test. To keep compilation and runtimes manageable, the tests are
+# broken up into many files per algorithm to enable parallelism during
+# compilation and testing. The structure of these test directories is:
+#
+# thrust/testing/async/<algo>/<unit>.cu
+#
+# These generate executables and CTest tests named
+# ${config_prefix}.test.async.<algo>.<unit>
+
+# The async tests only support CUDA-enabled configs.
Create a list of valid +# thrust targets: +set(cuda_configs) +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + if (config_device STREQUAL CUDA) + list(APPEND cuda_configs ${thrust_target}) + endif() +endforeach() + +list(LENGTH cuda_configs num_cuda_configs) +if (num_cuda_configs EQUAL 0) + return() # No valid configs found, nothing to do. +endif() + +# Process a single algorithm directory, adding all .cu/cpp files as tests for +# each valid backend. algo_name is the name of the subdir ( +# above) and is used for naming the executable/targets. +function(thrust_add_async_test_dir algo_name) + file(GLOB test_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + "${algo_name}/*.cu" + "${algo_name}/*.cpp" + ) + + # Per-algorithm, all-config metatarget: thrust.all.async.[algo].all + set(algo_meta_target thrust.all.async.${algo_name}.all) + add_custom_target(${algo_meta_target}) + + foreach(thrust_target IN LISTS cuda_configs) + # Per-algorithm, per-config metatarget: thrust.[config].async.[algo].all + thrust_get_target_property(config_prefix ${thrust_target} PREFIX) + set(algo_config_meta_target ${config_prefix}.async.${algo_name}.all) + add_custom_target(${algo_config_meta_target}) + add_dependencies(${algo_meta_target} ${algo_config_meta_target}) + + foreach(test_src IN LISTS test_srcs) + get_filename_component(test_name "${test_src}" NAME_WLE) + string(PREPEND test_name async.${algo_name}.) + message("Adding test ${config_prefix}.${test_name} at ${test_src}.") + + thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target}) + if(THRUST_ENABLE_TESTS_WITH_RDC) + thrust_enable_rdc_for_cuda_target(${test_target}) + endif() + + add_dependencies(${algo_config_meta_target} ${test_target}) + endforeach() + endforeach() +endfunction() + +# Grab all algorithm subdirectories: +set(test_dirs) +file(GLOB contents + CONFIGURE_DEPENDS + "${CMAKE_CURRENT_LIST_DIR}/*" +) +message("Contents: ${contents}") +foreach(test_dir IN LISTS contents) + if(IS_DIRECTORY "${test_dir}") + list(APPEND test_dirs "${test_dir}") + message("Found test_dir: ${test_dir}") + endif() +endforeach() + +# Process all test dirs: +foreach(test_dir IN LISTS test_dirs) + get_filename_component(algo_name "${test_dir}" NAME_WLE) + message("Processing algorithm ${algo_name} from ${test_dir}.") + thrust_add_async_test_dir(${algo_name}) +endforeach() diff --git a/testing/async/exclusive_scan/basic.cu b/testing/async/exclusive_scan/basic.cu new file mode 100644 index 0000000000..ac79e25a9f --- /dev/null +++ b/testing/async/exclusive_scan/basic.cu @@ -0,0 +1,42 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +template > +struct basic_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "basic invocation with device vectors"; + } +}; + +template +struct TestBasic +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestBasic, NumericTypes); + +#endif // C++14 diff --git 
a/testing/async/exclusive_scan/counting_iterator.cu b/testing/async/exclusive_scan/counting_iterator.cu new file mode 100644 index 0000000000..467f8a7175 --- /dev/null +++ b/testing/async/exclusive_scan/counting_iterator.cu @@ -0,0 +1,49 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include +#include + +template > +struct invoker + : testing::async::mixin::input::counting_iterator + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "fancy input iterator (counting_iterator)"; + } +}; + +template +struct TestCountingIterator +{ + void operator()(std::size_t num_values) const + { + num_values = unittest::truncate_to_max_representable(num_values); + testing::async::test_policy_overloads>::run(num_values); + } +}; +// Use built-in types only, counting_iterator doesn't seem to be compatible with +// the custom_numeric. +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestCountingIterator, + BuiltinNumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/discard_output.cu b/testing/async/exclusive_scan/discard_output.cu new file mode 100644 index 0000000000..353c88e739 --- /dev/null +++ b/testing/async/exclusive_scan/discard_output.cu @@ -0,0 +1,41 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Compilation test with discard iterators. No runtime validation is actually +// performed, other than testing whether the algorithm completes without +// exception. 
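+//
+// For reference, the invocation under test boils down to something like the
+// following sketch (iterator names are illustrative):
+//
+//   auto e = thrust::async::exclusive_scan(input.cbegin(),
+//                                          input.cend(),
+//                                          thrust::make_discard_iterator());
+//   TEST_EVENT_WAIT(e); // only successful completion is checked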
+ +template > +struct discard_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::discard_iterator + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::noop +{ + static std::string description() { return "discard output"; } +}; + +template +struct TestDiscard +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestDiscard, NumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/in_place.cu b/testing/async/exclusive_scan/in_place.cu new file mode 100644 index 0000000000..71ced40b7f --- /dev/null +++ b/testing/async/exclusive_scan/in_place.cu @@ -0,0 +1,42 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +template > +struct basic_inplace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector_reuse_input + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous< + input_value_type> + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "basic in-place invocation with device vectors"; + } +}; + +template +struct TestBasicInPlace +{ + void operator()(std::size_t num_values) const + { + using invoker = basic_inplace_invoker; + testing::async::test_policy_overloads::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestBasicInPlace, NumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/mixed_types.cu b/testing/async/exclusive_scan/mixed_types.cu new file mode 100644 index 0000000000..05584e7a75 --- /dev/null +++ b/testing/async/exclusive_scan/mixed_types.cu @@ -0,0 +1,103 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Test using mixed int/float types for: +// - input_value_type | (int, float) +// - output_value_type | (int, float) +// - initial_value_type | (int, float, ) +// - thrust::plus T-type | (int, float, void>) +// +// The initial_value_type and thrust::plus types are covered by the +// mixin::postfix_args::scan_mixed_types_overloads component. +// +// The testing/scan.cu TestMixedTypes test spells out the expected behavior, +// which is defined by https://wg21.link/P0571. + +template +struct mixed_type_input_generator +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::sequence(input.begin(), + input.end(), + // fractional values are chosen deliberately to test + // casting orders and accumulator types: + static_cast(1.5), + static_cast(1)); + return input; + } +}; + +// A fractional value is used to ensure that a different result is obtained when +// using float vs. int. 
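+// For example, an exclusive scan over inputs {1.5, 2.5, ...} with initial
+// value 5.5 yields {5.5, 7.0, ...} when accumulating in float, but
+// {5, 6, ...} once the same values are truncated to int.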
+template +struct mixed_types_postfix_args +{ + using postfix_args_type = std::tuple< // Overloads to test: + std::tuple<>, // - no extra args + std::tuple, // - initial_value + std::tuple>, // - initial_value, plus<> + std::tuple>, // - initial_value, plus + std::tuple> // - initial_value, plus + >; + + static postfix_args_type generate_postfix_args() + { + return {{}, + {static_cast(5.5)}, + {static_cast(5.5), thrust::plus<>{}}, + {static_cast(5.5), thrust::plus{}}, + {static_cast(5.5), thrust::plus{}}}; + } +}; + +template +struct invoker + : mixed_type_input_generator + , testing::async::mixin::output::device_vector + , mixed_types_postfix_args + , testing::async::exclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "mixed input/output/initial type tests"; + } +}; + +void TestScanMixedTypes() +{ + // 10 values are enough to check the behavior we want to test while staying + // small enough to reason about. + constexpr std::size_t num_values = 10; + + // invoker template params are input_value_type, output_vt, initial_vt: + using testing::async::test_policy_overloads; + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + // We all float down here + test_policy_overloads>::run(num_values); +} +DECLARE_UNITTEST(TestScanMixedTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/mixin.h b/testing/async/exclusive_scan/mixin.h new file mode 100644 index 0000000000..cf4671d976 --- /dev/null +++ b/testing/async/exclusive_scan/mixin.h @@ -0,0 +1,143 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include + +// TODO Finish implementing tests. Draw from other async algos, as well as +// the older scan tests. + +// TODO compare_outputs::noop should still test the stream/event. Maybe rename. + +// From testing/scan.cu: +// - TestInclusiveScanDispatchExplicit +// - TestInclusiveScanDispatchImplicit +// - TestScanWithLargeTypes +// - TestInclusiveScanWithBigIndexes +// - TestInclusiveScanWithUserDefinedType + +// From testing/async_reduce.cu: +// - test_async_reduce_after (can this be worked into the framework? See below) +// - test_async_reduce_on_then_after (can this be worked into the framework?) +// - all the child variants (e.g. with allocator) too +// - test_async_reduce_caching +// - test_async_copy_then_reduce + +// For the `after` tests, I think we can add them to test_policy_overloads: +// 1) Reverse the nested iteration over prefix/postfix args. +// - Currently we iterate prefix, then postfix. +// - The `after` tests won't use any prefix tuple, they'll be hardcoded into +// a test_configuration-esque function. +// - This will avoid calling the same `after` tests multiple times. 
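+//
+// A hardcoded `after` check could look roughly like this (sketch only, not
+// implemented in this patch; `in`, `tmp`, and `out` are illustrative):
+//
+//   auto e0 = thrust::async::exclusive_scan(thrust::device,
+//                                           in.cbegin(), in.cend(),
+//                                           tmp.begin());
+//   auto e1 = thrust::async::exclusive_scan(thrust::device.after(e0),
+//                                           tmp.cbegin(), tmp.cend(),
+//                                           out.begin());
+//   TEST_EVENT_WAIT(e1);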
+ +namespace testing +{ +namespace async +{ +namespace exclusive_scan +{ + +namespace mixin +{ + +//------------------------------------------------------------------------------ +namespace postfix_args +{ + +template > +struct all_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<>, // - no extra args + std::tuple, // - initial_value + std::tuple // - initial_value, binary_op + >; + + static postfix_args_type generate_postfix_args() + { + return {{}, {42}, {42, alternate_binary_op{}}}; + } +}; + +} // namespace postfix_args + +//------------------------------------------------------------------------------ +namespace invoke_reference +{ + +template +struct host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + input.cend()); + thrust::host_vector host_output(host_input.size()); + + // Run host synchronous algorithm to generate reference. + thrust::exclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get( + THRUST_FWD(postfix_tuple))...); + + // Copy back to device. + output = host_output; + } +}; + +} // namespace invoke_reference + +//------------------------------------------------------------------------------ +namespace invoke_async +{ + +struct basic +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + auto e = thrust::async::exclusive_scan( + std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +} // namespace mixin +} // namespace exclusive_scan +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/async/exclusive_scan/using_vs_adl.cu b/testing/async/exclusive_scan/using_vs_adl.cu new file mode 100644 index 0000000000..4e7dc7b7de --- /dev/null +++ b/testing/async/exclusive_scan/using_vs_adl.cu @@ -0,0 +1,182 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Verify what happens when calling the algorithm without any namespace +// qualifiers: +// - If the async entry point is available in the global namespace due to a +// using statement, the async algorithm should be called. +// - Otherwise, ADL should resolve the call to the synchronous algo in the +// thrust:: namespace. + +namespace invoke_reference +{ + +template +struct adl_host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + input.cend()); + thrust::host_vector host_output(host_input.size()); + + using OutIter = thrust::remove_cvref_t; + + // ADL should resolve this to the synchronous `thrust::` algorithm. + // This is checked by ensuring that the call returns an output iterator. + OutIter result = + exclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + (void)result; + + // Copy back to device. 
+ output = host_output; + } +}; + +} // namespace invoke_reference + +namespace invoke_async +{ + +struct using_namespace +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. + using namespace thrust::async; + thrust::device_event e = + exclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +struct using_cpo +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. + using thrust::async::exclusive_scan; + thrust::device_event e = + exclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +template > +struct using_namespace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_namespace + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "importing async CPO with `using namespace thrust::async`"; + } +}; + +template +struct TestUsingNamespace +{ + void operator()(std::size_t num_values) const + { + using invoker = using_namespace_invoker; + testing::async::test_policy_overloads::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestUsingNamespace, NumericTypes); + +template > +struct using_cpo_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_cpo + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "importing async CPO with " + "`using namespace thrust::async::exclusive_scan`"; + } +}; + +template +struct TestUsingCPO +{ + void operator()(std::size_t num_values) const + { + using invoker = using_cpo_invoker; + testing::async::test_policy_overloads::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestUsingCPO, NumericTypes); + +#endif // C++14 diff --git a/testing/async/mixin.h b/testing/async/mixin.h new file mode 100644 index 0000000000..8d9a07585e --- /dev/null +++ b/testing/async/mixin.h @@ -0,0 +1,523 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include +#include + +#include + +#include +#include + +#include +#include + +// This file contains a set of mix-in classes that define an algorithm +// definition for use with test_policy_overloads. 
The algorithm +// definition describes the details of a thrust::async algorithm invocation: +// +// - Input type and initialization +// - Output type and initialization (supports in-place, too) +// - Postfix arguments that define the algorithm's overload set +// - Abstracted invocation of the async algorithm +// - Abstracted invocation of a reference algorithm +// - Validation of async vs. reference output +// - A description string. +// +// This definition is used by test_policy_overloads to test each overload +// against a reference while injecting a variety of execution policies. This +// validates that each overload behaves correctly according to some reference. +// +// Since much of the algorithm definition is generic and may be reused in +// multiple tests with slight changes, a mix-in system is used to simplify +// the creation of algorithm definitions. The following namespace hierarchy is +// used to organize these generic components: +// +// * testing::async::mixin:: +// ** ::input - Input types/values (device vectors, counting iterators, etc) +// ** ::output - Output types/values (device vectors, inplace device vectors, +// discard iterators, etc) +// ** ::postfix_args - Algorithm specific overload sets +// ** ::invoke_reference - Algorithm specific reference invocation +// ** ::invoke_async - Algorithm specific async algo invocation +// ** ::compare_outputs - Compare output values. +// +// Each algorithm should define its own `mixins.h` header to declare algorithm +// specific mixins (e.g. postfix_args, invoke_reference, and invoke_async) +// in a testing::async::::mixins namespace structure. +// +// For example, the test.async.exclusive_scan.basic test uses the following +// algorithm definition from mix-ins: +// +// ``` +// #include +// #include +// #include +// template > +// struct basic_invoker +// : testing::async::mixin::input::device_vector +// , testing::async::mixin::output::device_vector +// , testing::async::exclusive_scan::mixin::postfix_args:: +// all_overloads +// , testing::async::exclusive_scan::mixin::invoke_reference:: +// host_synchronous +// , testing::async::exclusive_scan::mixin::invoke_async::basic +// , testing::async::mixin::compare_outputs::assert_equal_quiet +// { +// static std::string description() +// { +// return "basic invocation with device vectors"; +// } +// }; +// +// ... +// +// testing::async::test_policy_overloads>::run(num_values); +// ``` +// +// The basic_invoker class expands to something similar to the following: +// +// ``` +// template > +// struct basic_invoker +// { +// public: +// +// static std::string description() +// { +// return "basic invocation with device vectors"; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::input::device_vector +// using input_type = thrust::device_vector; +// +// // Generate an instance of the input: +// static input_type generate_input(std::size_t num_values) +// { +// input_type input(num_values); +// thrust::sequence(input.begin(), input.end(), 25, 3); +// return input; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::output::device_vector +// using output_type = thrust::device_vector; +// +// // Generate an instance of the output: +// // Might be more complicated, eg. 
fancy iterators, etc +// static output_type generate_output(std::size_t num_values) +// { +// return output_type(num_values); +// } +// +// //------------------------------------------------------------------------- +// // +// testing::async::mixin::exclusive_scan::mixin::postfix_args::all_overloads +// // +// using postfix_args_type = std::tuple< // List any extra +// arg overloads: +// std::tuple<>, // - no extra args +// std::tuple, // - initial_value +// std::tuple // - initial_value, +// binary_op +// >; +// +// // Create instances of the extra arguments to use when invoking the +// // algorithm: +// static postfix_args_type generate_postfix_args() +// { +// return { +// {}, // no extra args +// {42}, // initial_value +// {57, alternate_binary_op{}} // initial_value, binary_op +// }; +// } +// +// //------------------------------------------------------------------------- +// // +// testing::async::mixin::exclusive_scan::mixin::invoke_reference::host_synchronous +// // +// // Invoke a reference implementation for a single overload as described by +// // postfix_tuple. This tuple contains instances of any additional +// arguments +// // to pass to the algorithm. The tuple/index_sequence pattern is used to +// // support the "no extra args" overload, since the parameter pack +// expansion +// // will do exactly what we want in all cases. +// template +// static void invoke_reference(input_type const &input, +// output_type &output, +// PostfixArgTuple &&postfix_tuple, +// std::index_sequence) +// { +// // Create host versions of the input/output: +// thrust::host_vector host_input(input.cbegin(), +// input.cend()); +// thrust::host_vector host_output(host_input.size()); +// +// // Run host synchronous algorithm to generate reference. +// thrust::exclusive_scan(host_input.cbegin(), +// host_input.cend(), +// host_output.begin(), +// std::get( +// THRUST_FWD(postfix_tuple))...); +// +// // Copy back to device. +// output = host_output; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::exclusive_scan::mixin::invoke_async::basic +// // +// // Invoke the async algorithm for a single overload as described by +// // the prefix and postfix tuples. These tuples contains instances of any +// // additional arguments to pass to the algorithm. The tuple/index_sequence +// // pattern is used to support the "no extra args" overload, since the +// // parameter pack expansion will do exactly what we want in all cases. +// // Prefix args are included here (but not for invoke_reference) to allow +// the +// // test framework to change the execution policy. +// // This method must return an event or future. +// template +// static auto invoke_async(PrefixArgTuple &&prefix_tuple, +// std::index_sequence, +// input_type const &input, +// output_type &output, +// PostfixArgTuple &&postfix_tuple, +// std::index_sequence) +// { +// output.resize(input.size()); +// auto e = thrust::async::exclusive_scan( +// std::get(THRUST_FWD(prefix_tuple))..., +// input.cbegin(), +// input.cend(), +// output.begin(), +// std::get(THRUST_FWD(postfix_tuple))...); +// return e; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::compare_outputs::assert_equal_quiet +// // +// // Wait on and validate the event/future (usually with TEST_EVENT_WAIT / +// // TEST_FUTURE_VALUE_RETRIEVAL), then check that the reference output +// // matches the testing output. 
+// template +// static void compare_outputs(EventType &e, +// output_type const &ref, +// output_type const &test) +// { +// TEST_EVENT_WAIT(e); +// ASSERT_EQUAL_QUIET(ref, test); +// } +// }; +// ``` +// +// Similar invokers with slight tweaks are used in other +// async/exclusive_scan/*.cu tests. + +namespace testing +{ +namespace async +{ +namespace mixin +{ + +//------------------------------------------------------------------------------ +namespace input +{ + +// TODO it'd be nice to specify a lambda expression that'd replace the call to +// thrust::sequence when desired. +template +struct device_vector +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::sequence(input.begin(), + input.end(), + static_cast(1.5), + static_cast(1)); + return input; + } +}; + +template +struct counting_iterator +{ + struct input_type + { + using iterator = thrust::counting_iterator; + + std::size_t num_values; + + iterator begin() { return iterator{static_cast(0)}; } + iterator begin() const { return iterator{static_cast(0)}; } + iterator cbegin() const { return iterator{static_cast(0)}; } + + iterator end() { return iterator{static_cast(0)} + num_values; } + iterator end() const + { + return iterator{static_cast(0)} + num_values; + } + iterator cend() const + { + return iterator{static_cast(0)} + num_values; + } + }; + + static input_type generate_input(std::size_t num_values) + { + return {num_values}; + } +}; + +} // namespace input + +//------------------------------------------------------------------------------ +namespace output +{ + +template +struct device_vector +{ + using output_type = thrust::device_vector; + + template + static output_type generate_output(std::size_t num_values, + InputType& /* unused */) + { + return output_type(num_values); + } +}; + +template +struct device_vector_reuse_input +{ + using output_type = thrust::device_vector&; + + template + static output_type generate_output(std::size_t /*num_values*/, + InputType& input) + { + return input; + } +}; + +struct discard_iterator +{ + struct output_type + { + using iterator = thrust::discard_iterator<>; + + iterator begin() { return thrust::make_discard_iterator(); } + iterator begin() const { return thrust::make_discard_iterator(); } + iterator cbegin() const { return thrust::make_discard_iterator(); } + }; + + template + static output_type generate_output(std::size_t /* num_values */, + InputType& /* input */) + { + return output_type{}; + } +}; + +} // namespace output + +//------------------------------------------------------------------------------ +namespace postfix_args +{ +/* Defined per algorithm. Example: + * + * // Defines several overloads: + * // algorithm([policy,] input, output) // no postfix args + * // algorithm([policy,] input, output, initial_value) + * // algorithm([policy,] input, output, initial_value, binary_op) + * template > struct all_overloads + * { + * using postfix_args_type = std::tuple< // List any extra arg + * overloads: std::tuple<>, // - no extra args + * std::tuple, // - initial_value + * std::tuple // - initial_value, binary_op + * >; + * + * static postfix_args_type generate_postfix_args() + * { + * return {{}, {42}, {42, alternate_binary_op{}}}; + * } + * }; + * + */ +} + +//------------------------------------------------------------------------------ +namespace invoke_reference +{ + +/* Defined per algorithm. 
Example: + * + * template + * struct host_synchronous + * { + * template + * static void invoke_reference(InputType const& input, + * OutputType& output, + * PostfixArgTuple&& postfix_tuple, + * std::index_sequence) + * { + * // Create host versions of the input/output: + * thrust::host_vector host_input(input.cbegin(), + * input.cend()); + * thrust::host_vector host_output(host_input.size()); + * + * // Run host synchronous algorithm to generate reference. + * // Be sure to call a backend that doesn't use the same underlying + * // implementation. + * thrust::exclusive_scan(host_input.cbegin(), + * host_input.cend(), + * host_output.begin(), + * std::get( + * THRUST_FWD(postfix_tuple))...); + * + * // Copy back to device. + * output = host_output; + * } + * }; + * + */ + +// Used to save time when testing unverifiable invocations (discard_iterators) +struct noop +{ + template + static void invoke_reference(Ts&&...) + {} +}; + +} // namespace invoke_reference + +//------------------------------------------------------------------------------ +namespace invoke_async +{ + +/* Defined per algorithm. Example: + * + * struct basic + * { + * template + * static auto invoke_async(PrefixArgTuple&& prefix_tuple, + * std::index_sequence, + * InputType const& input, + * OutputType& output, + * PostfixArgTuple&& postfix_tuple, + * std::index_sequence) + * { + * auto e = thrust::async::exclusive_scan( + * std::get(THRUST_FWD(prefix_tuple))..., + * input.cbegin(), + * input.cend(), + * output.begin(), + * std::get(THRUST_FWD(postfix_tuple))...); + * return e; + * } + * }; + */ + +} // namespace invoke_async + +//------------------------------------------------------------------------------ +namespace compare_outputs +{ + +struct assert_equal_quiet +{ + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + TEST_EVENT_WAIT(e); + ASSERT_EQUAL(ref, test); + } +}; + +// Does an 'almost_equal' comparison for floating point types, since fp +// addition is non-associative +struct assert_maybe_fuzzy_equal_quiet +{ +private: + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::false_type /* is_floating_point */) + { + TEST_EVENT_WAIT(e); + ASSERT_EQUAL(ref, test); + } + + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::true_type /* is_floating_point */) + { + TEST_EVENT_WAIT(e); + ASSERT_ALMOST_EQUAL(ref, test); + } + +public: + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + using value_type = typename OutputType::value_type; + compare_outputs_impl(e, ref, test, std::is_floating_point{}); + } +}; + +// Used to save time when testing unverifiable invocations (discard_iterators) +struct noop +{ + template + static void compare_outputs(Ts&&... ts) + {} +}; + +} // namespace compare_outputs + +} // namespace mixin +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/async/test_policy_overloads.h b/testing/async/test_policy_overloads.h new file mode 100644 index 0000000000..9999237b2e --- /dev/null +++ b/testing/async/test_policy_overloads.h @@ -0,0 +1,255 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include + +#include + +#include + +namespace testing +{ + +namespace async +{ + +// Tests that policies are handled correctly for all overloads of an async +// algorithm. 
+// +// Specifically, each overload is called with: +// 1) No policy +// 2) thrust::device +// 3) thrust::device(thrust::device_allocator) +// 4) thrust::device.on(stream) +// 5) thrust::device(thrust::device_allocator).on(stream) +// +// For each test, the returned event/future is tested to make sure it holds a +// reference to the proper stream. +// +// The AlgoDef type defines an async algorithm, its overloads, and abstracts its +// invocation. See the async/mixins.h for a documented example of this interface +// and some convenience mixins for constructing a definition quickly. +template +struct test_policy_overloads +{ + using algo_def = AlgoDef; + using input_type = typename algo_def::input_type; + using output_type = typename algo_def::output_type; + using postfix_args_type = typename algo_def::postfix_args_type; + + // Main entry point; call this from a unit test function. + static void run(std::size_t num_values) + { + // When a policy uses the default stream, the algorithm implementation + // should spawn a new stream in the returned event. This lambda validates + // this: + auto using_default_stream = [](auto& e) { + ASSERT_NOT_EQUAL(thrust::cuda_cub::default_stream(), + e.stream().native_handle()); + }; + + // Lambda that verifies non-default streams are passed through to the + // event/future: + thrust::system::cuda::detail::unique_stream test_stream{}; + auto using_test_stream = [&test_stream](auto& e) { + ASSERT_EQUAL(test_stream.native_handle(), e.stream().native_handle()); + }; + + // Test the different types of policies: + test_policy("(no policy)", + std::make_tuple(), + using_default_stream, + num_values); + test_policy("thrust::device", + std::make_tuple(thrust::device), + using_default_stream, + num_values); + test_policy("thrust::device(thrust::device_allocator{})", + std::make_tuple( + thrust::device(thrust::device_allocator{})), + using_default_stream, + num_values); + test_policy("thrust::device.on(test_stream.get())", + std::make_tuple(thrust::device.on(test_stream.get())), + using_test_stream, + num_values); + test_policy( + "thrust::device(thrust::device_allocator{}).on(test_stream.get())", + std::make_tuple( + thrust::device(thrust::device_allocator{}).on(test_stream.get())), + using_test_stream, + num_values); + } + +private: + template + static void test_policy(std::string const& policy_desc, + PolicyTuple&& policy_tuple, + ValidateEvent&& validate, + std::size_t num_values) + { + constexpr std::size_t num_postfix_arg_sets = + std::tuple_size::value; + + // Test the current policy across all overloads of the algorithm: + iterate_postfix_args<0, num_postfix_arg_sets>{}( + policy_desc, + THRUST_FWD(policy_tuple), + algo_def::generate_postfix_args(), + THRUST_FWD(validate), + num_values); + } + + // Iterate through postfix arg sets, calling all overloads they define + // with the provided policy_tuple. 
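+  // (This is effectively a compile-time loop over the tuple indices; the
+  // recursive struct is used because C++14 has neither std::apply nor fold
+  // expressions.)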
+ template + struct iterate_postfix_args + { + template + void operator()(std::string const& policy_desc, + PolicyTuple&& policy_tuple, + postfix_args_type&& postfix_args, + ValidateEvent&& validate, + std::size_t num_values) + { + try + { + test_configuration(policy_tuple, + std::get(std::move(postfix_args)), + validate, + num_values); + } + catch (unittest::UnitTestException& exc) + { + // Append some identifying information to the exception: + std::string const overload_desc = unittest::demangle( + typeid(typename std::tuple_element::type) + .name()); + + std::string const input_desc = + unittest::demangle(typeid(input_type).name()); + std::string const output_desc = + unittest::demangle(typeid(output_type).name()); + + exc << "\n" + << " - testcase = " << algo_def::description() << "\n" + << " - policy = " << policy_desc << "\n" + << " - input_type = " << input_desc << "\n" + << " - output_type = " << output_desc << "\n" + << " - tuple of trailing arguments = " << overload_desc << "\n" + << " - num_values = " << num_values; + throw; + } + + // Recurse + iterate_postfix_args{}( + policy_desc, + THRUST_FWD(policy_tuple), + std::move(postfix_args), + THRUST_FWD(validate), + num_values); + } + }; + + // Terminal specialization + template + struct iterate_postfix_args + { + template + void operator()(Ts&&...) + {} + }; + + // Actually invoke the algorithms with the supplied prefix/postfix args + // and do the validations: + template + static void test_configuration(PrefixArgTuple&& prefix_tuple_ref, + PostfixArgTuple&& postfix_tuple_ref, + ValidateEvent const& validate, + std::size_t num_values) + { + using prefix_tuple_type = thrust::remove_cvref_t; + using postfix_tuple_type = thrust::remove_cvref_t; + + // Sink these tuples into const locals so they can be safely passed to + // multiple invocations without worrying about potential modifications. + prefix_tuple_type const prefix_tuple = THRUST_FWD(prefix_tuple_ref); + postfix_tuple_type const postfix_tuple = THRUST_FWD(postfix_tuple_ref); + + // Generate index sequences for the tuples: + constexpr auto prefix_tuple_size = std::tuple_size{}; + constexpr auto postfix_tuple_size = std::tuple_size{}; + using prefix_index_seq = std::make_index_sequence; + using postfix_index_seq = std::make_index_sequence; + + // Use unique, non-const inputs for each invocation to support in-place + // algo_def configurations. 
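+    // (generate_output receives the matching input so that in-place
+    // configurations such as output::device_vector_reuse_input can alias it.)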
+ input_type input_a = algo_def::generate_input(num_values); + input_type input_b = algo_def::generate_input(num_values); + input_type input_c = algo_def::generate_input(num_values); + input_type input_d = algo_def::generate_input(num_values); + input_type input_ref = algo_def::generate_input(num_values); + + output_type output_a = algo_def::generate_output(num_values, input_a); + output_type output_b = algo_def::generate_output(num_values, input_b); + output_type output_c = algo_def::generate_output(num_values, input_c); + output_type output_d = algo_def::generate_output(num_values, input_d); + output_type output_ref = algo_def::generate_output(num_values, input_ref); + + // Invoke multiple overlapping async algorithms, capturing their outputs + // and events/futures: + auto e_a = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_a, + output_a, + postfix_tuple, + postfix_index_seq{}); + auto e_b = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_b, + output_b, + postfix_tuple, + postfix_index_seq{}); + auto e_c = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_c, + output_c, + postfix_tuple, + postfix_index_seq{}); + auto e_d = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_d, + output_d, + postfix_tuple, + postfix_index_seq{}); + + // Let reference calc overlap with async testing: + algo_def::invoke_reference(input_ref, + output_ref, + postfix_tuple, + postfix_index_seq{}); + + algo_def::compare_outputs(e_a, output_ref, output_a); + algo_def::compare_outputs(e_b, output_ref, output_b); + algo_def::compare_outputs(e_c, output_ref, output_c); + algo_def::compare_outputs(e_d, output_ref, output_d); + + validate(e_a); + validate(e_b); + validate(e_c); + validate(e_d); + } +}; + +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/unittest/util_async.h b/testing/unittest/util_async.h index 984cc61c6b..9a3454efd5 100644 --- a/testing/unittest/util_async.h +++ b/testing/unittest/util_async.h @@ -1,9 +1,9 @@ #pragma once #include -#include +#include -#if THRUST_CPP_DIALECT >= 2011 +#if THRUST_CPP_DIALECT >= 2014 #include @@ -73,5 +73,4 @@ auto test_future_value_retrieval( } // namespace unittest -#endif // THRUST_CPP_DIALECT >= 2011 - +#endif // THRUST_CPP_DIALECT >= 2014 diff --git a/thrust/async/scan.h b/thrust/async/scan.h new file mode 100644 index 0000000000..d7adfac983 --- /dev/null +++ b/thrust/async/scan.h @@ -0,0 +1,355 @@ +/* + * Copyright 2008-2020 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file async/scan.h + * \brief Functions for asynchronously computing prefix scans. 
+ */ + +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include +#include + +#include + +#include +#include +#include + +#include + +namespace thrust +{ + +namespace async +{ + +// Fallback implementations used when no overloads are found via ADL: +namespace unimplemented +{ + +template +event +async_inclusive_scan(thrust::execution_policy&, + ForwardIt, + Sentinel, + OutputIt, + BinaryOp) +{ + THRUST_STATIC_ASSERT_MSG( + (thrust::detail::depend_on_instantiation::value), + "this algorithm is not implemented for the specified system" + ); + return {}; +} + +template +event +async_exclusive_scan(thrust::execution_policy&, + ForwardIt, + Sentinel, + OutputIt, + InitialValueType, + BinaryOp) +{ + THRUST_STATIC_ASSERT_MSG( + (thrust::detail::depend_on_instantiation::value), + "this algorithm is not implemented for the specified system" + ); + return {}; +} + +} // namespace unimplemented + +namespace inclusive_scan_detail +{ + +// Include fallback implementation for ADL failures +using thrust::async::unimplemented::async_inclusive_scan; + +// Implementation of the thrust::async::inclusive_scan CPO. +struct inclusive_scan_fn final +{ + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(op) + ) + ) + + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + thrust::plus<>{} + ) + ) + + template >>> + THRUST_NODISCARD + auto operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(op) + ) + ) + + template + THRUST_NODISCARD + auto operator()(ForwardIt&& first, Sentinel&& last, OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + thrust::plus<>{} + ) + ) +}; + +} // namespace inclusive_scan_detail + +THRUST_INLINE_CONSTANT inclusive_scan_detail::inclusive_scan_fn inclusive_scan{}; + +namespace exclusive_scan_detail +{ + +// Include fallback implementation for ADL failures +using thrust::async::unimplemented::async_exclusive_scan; + +// Implementation of the thrust::async::exclusive_scan CPO. +struct exclusive_scan_fn final +{ + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init, + BinaryOp&& op) const + // ADL dispatch. 
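+    // (THRUST_RETURNS wraps the expression below in a deduced-return
+    // function body, so each overload returns whatever event type the
+    // backend's async_exclusive_scan produces.)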
+ THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op) + ) + ) + + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + thrust::plus<>{} + ) + ) + + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + iterator_value_t>{}, + thrust::plus<>{} + ) + ) + + template >>> + THRUST_NODISCARD + auto + operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op) + ) + ) + + template >>> + THRUST_NODISCARD + auto + operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + thrust::plus<>{} + ) + ) + + template + THRUST_NODISCARD + auto operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + iterator_value_t>{}, + thrust::plus<>{} + ) + ) +}; + +} // namespace exclusive_scan_detail + +THRUST_INLINE_CONSTANT exclusive_scan_detail::exclusive_scan_fn exclusive_scan{}; + +} // namespace async + +} // end namespace thrust + +#endif diff --git a/thrust/system/cuda/detail/async/exclusive_scan.h b/thrust/system/cuda/detail/async/exclusive_scan.h new file mode 100644 index 0000000000..022b7177f4 --- /dev/null +++ b/thrust/system/cuda/detail/async/exclusive_scan.h @@ -0,0 +1,180 @@ +/****************************************************************************** + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +// TODO specialize for thrust::plus to use e.g. ExclusiveSum instead of ExcScan +// - Note that thrust::plus<> is transparent, cub::Sum is not. This should be +// fixed in CUB first). +// - Need to check if CUB actually optimizes for sums before putting in effort + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +auto async_exclusive_scan_n(execution_policy& policy, + ForwardIt first, + Size n, + OutputIt out, + InitialValueType init, + BinaryOp op) + -> unique_eager_event +{ + auto const device_alloc = get_async_device_allocator(policy); + unique_eager_event ev; + + // Determine temporary device storage requirements. + size_t tmp_size = 0; + thrust::cuda_cub::throw_on_error( + cub::DeviceScan::ExclusiveScan( + nullptr, + tmp_size, + first, + out, + op, + init, + n, + nullptr, // Null stream, just for sizing. + THRUST_DEBUG_SYNC_FLAG + ), + "after exclusive_scan sizing" + ); + + // Allocate temporary storage. + auto content = uninitialized_allocate_unique_n( + device_alloc, tmp_size + ); + void* const tmp_ptr = raw_pointer_cast(content.get()); + + // Set up stream with dependencies. + cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy); + + if (thrust::cuda_cub::default_stream() != user_raw_stream) + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple( + std::move(content), + unique_stream(nonowning, user_raw_stream) + ), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + else + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple(std::move(content)), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + + // Run scan. + thrust::cuda_cub::throw_on_error( + cub::DeviceScan::ExclusiveScan( + tmp_ptr, + tmp_size, + first, + out, + op, + init, + n, + ev.stream().native_handle(), + THRUST_DEBUG_SYNC_FLAG + ), + "after exclusive_scan launch" + ); + + return std::move(ev); +} + +}}} // namespace system::cuda::detail + +namespace cuda_cub +{ + +// ADL entry point. 
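+// The thrust::async::exclusive_scan CPO (thrust/async/scan.h) dispatches here
+// via ADL when given a CUDA execution policy. A minimal usage sketch, with
+// `n` and `stream` supplied by the caller:
+//
+//   thrust::device_vector<int> in(n), out(n);
+//   auto e = thrust::async::exclusive_scan(thrust::device.on(stream),
+//                                          in.begin(), in.end(),
+//                                          out.begin(), 0);
+//   e.wait();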
+template +auto async_exclusive_scan(execution_policy& policy, + ForwardIt first, + Sentinel&& last, + OutputIt&& out, + InitialValueType &&init, + BinaryOp&& op) +THRUST_RETURNS( + thrust::system::cuda::detail::async_exclusive_scan_n( + policy, + first, + distance(first, THRUST_FWD(last)), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op) + ) +) + +} // cuda_cub + +} // end namespace thrust + +#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#endif + diff --git a/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/system/cuda/detail/async/inclusive_scan.h new file mode 100644 index 0000000000..5f37c19cff --- /dev/null +++ b/thrust/system/cuda/detail/async/inclusive_scan.h @@ -0,0 +1,173 @@ +/****************************************************************************** + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +// TODO specialize for thrust::plus to use e.g. InclusiveSum instead of IncScan +// - Note that thrust::plus<> is transparent, cub::Sum is not. This should be +// fixed in CUB first). +// - Need to check if CUB actually optimizes for sums before putting in effort + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +auto async_inclusive_scan_n(execution_policy& policy, + ForwardIt first, + Size n, + OutputIt out, + BinaryOp op) + -> unique_eager_event +{ + auto const device_alloc = get_async_device_allocator(policy); + unique_eager_event ev; + + // Determine temporary device storage requirements. + size_t tmp_size = 0; + thrust::cuda_cub::throw_on_error( + cub::DeviceScan::InclusiveScan( + nullptr, + tmp_size, + first, + out, + op, + n, + nullptr, // Null stream, just for sizing. 
+ THRUST_DEBUG_SYNC_FLAG + ), + "after inclusive_scan sizing" + ); + + // Allocate temporary storage. + auto content = uninitialized_allocate_unique_n( + device_alloc, tmp_size + ); + void* const tmp_ptr = raw_pointer_cast(content.get()); + + // Set up stream with dependencies. + cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy); + + if (thrust::cuda_cub::default_stream() != user_raw_stream) + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple( + std::move(content), + unique_stream(nonowning, user_raw_stream) + ), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + else + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple(std::move(content)), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + + // Run scan. + thrust::cuda_cub::throw_on_error( + cub::DeviceScan::InclusiveScan( + tmp_ptr, + tmp_size, + first, + out, + op, + n, + ev.stream().native_handle(), + THRUST_DEBUG_SYNC_FLAG + ), + "after inclusive_scan launch" + ); + + return std::move(ev); +} + +}}} // namespace system::cuda::detail + +namespace cuda_cub +{ + +// ADL entry point. +template +auto async_inclusive_scan(execution_policy& policy, + ForwardIt first, + Sentinel&& last, + OutputIt&& out, + BinaryOp&& op) +THRUST_RETURNS( + thrust::system::cuda::detail::async_inclusive_scan_n( + policy, + first, + distance(first, THRUST_FWD(last)), + THRUST_FWD(out), + THRUST_FWD(op) + ) +) + +} // cuda_cub + +} // end namespace thrust + +#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#endif + diff --git a/thrust/system/cuda/detail/async/scan.h b/thrust/system/cuda/detail/async/scan.h new file mode 100644 index 0000000000..7d993e6641 --- /dev/null +++ b/thrust/system/cuda/detail/async/scan.h @@ -0,0 +1,33 @@ +/****************************************************************************** + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/system/cuda/detail/async/exclusive_scan.h>
+#include <thrust/system/cuda/detail/async/inclusive_scan.h>
diff --git a/thrust/system/detail/adl/async/scan.h b/thrust/system/detail/adl/async/scan.h
new file mode 100644
index 0000000000..a2a90618b4
--- /dev/null
+++ b/thrust/system/detail/adl/async/scan.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2008-2020 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// The purpose of this header is to #include the async/scan.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async scans.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+//#include <thrust/system/detail/sequential/async/scan.h>
+
+//#define __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/scan.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/scan.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER
diff --git a/thrust/system/detail/generic/scan.inl b/thrust/system/detail/generic/scan.inl
index 300b697b26..83d272c3e9 100644
--- a/thrust/system/detail/generic/scan.inl
+++ b/thrust/system/detail/generic/scan.inl
@@ -61,9 +61,7 @@ __host__ __device__
 {
   // Use the input iterator's value type per https://wg21.link/P0571
   using ValueType = typename thrust::iterator_value<InputIterator>::type;
-
-  // assume 0 as the initialization value
-  return thrust::exclusive_scan(exec, first, last, result, ValueType(0));
+  return thrust::exclusive_scan(exec, first, last, result, ValueType{});
 }
 // end exclusive_scan()
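
Usage sketch for the new entry points (illustrative only; assumes the CUDA
device system and C++14, with `n` and `stream` supplied by the caller):

    #include <thrust/async/scan.h>
    #include <thrust/device_vector.h>

    thrust::device_vector<float> in(n, 1.f), out(n);

    // Returns immediately with an event; the scan runs on `stream`.
    auto e = thrust::async::inclusive_scan(thrust::device.on(stream),
                                           in.begin(), in.end(),
                                           out.begin());
    e.wait(); // synchronize before reading `out`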