From 7caa6926481eafe3f9c5bd85e9582a41ad9349e5 Mon Sep 17 00:00:00 2001
From: Allison Vacanti
Date: Fri, 31 Jul 2020 17:22:32 -0400
Subject: [PATCH] WIP Adding async scan algorithms, iterate on async testing.

---
 dependencies/cub                              |   2 +-
 testing/CMakeLists.txt                        |   1 +
 testing/async/CMakeLists.txt                  |  82 +++
 testing/async/exclusive_scan/basic.cu         |  42 ++
 .../async/exclusive_scan/counting_iterator.cu |  49 ++
 .../async/exclusive_scan/discard_output.cu    |  41 ++
 testing/async/exclusive_scan/in_place.cu      |  42 ++
 testing/async/exclusive_scan/mixed_types.cu   | 103 ++++
 testing/async/exclusive_scan/mixin.h          | 143 +++++
 testing/async/exclusive_scan/using_vs_adl.cu  | 182 ++++++
 testing/async/mixin.h                         | 523 ++++++++++++++++++
 testing/async/test_policy_overloads.h         | 255 +++++++++
 testing/unittest/util_async.h                 |   7 +-
 thrust/async/scan.h                           | 355 ++++++++++++
 .../system/cuda/detail/async/exclusive_scan.h | 180 ++++++
 .../system/cuda/detail/async/inclusive_scan.h | 173 ++++++
 thrust/system/cuda/detail/async/scan.h        |  33 ++
 thrust/system/detail/adl/async/scan.h         |  34 ++
 thrust/system/detail/generic/scan.inl         |   4 +-
 19 files changed, 2243 insertions(+), 8 deletions(-)
 create mode 100644 testing/async/CMakeLists.txt
 create mode 100644 testing/async/exclusive_scan/basic.cu
 create mode 100644 testing/async/exclusive_scan/counting_iterator.cu
 create mode 100644 testing/async/exclusive_scan/discard_output.cu
 create mode 100644 testing/async/exclusive_scan/in_place.cu
 create mode 100644 testing/async/exclusive_scan/mixed_types.cu
 create mode 100644 testing/async/exclusive_scan/mixin.h
 create mode 100644 testing/async/exclusive_scan/using_vs_adl.cu
 create mode 100644 testing/async/mixin.h
 create mode 100644 testing/async/test_policy_overloads.h
 create mode 100644 thrust/async/scan.h
 create mode 100644 thrust/system/cuda/detail/async/exclusive_scan.h
 create mode 100644 thrust/system/cuda/detail/async/inclusive_scan.h
 create mode 100644 thrust/system/cuda/detail/async/scan.h
 create mode 100644 thrust/system/detail/adl/async/scan.h

diff --git a/dependencies/cub b/dependencies/cub
index 2749cb0c7b..99ac8fe71e 160000
--- a/dependencies/cub
+++ b/dependencies/cub
@@ -1 +1 @@
-Subproject commit 2749cb0c7bc5a72c806d7ca0b8e4d702dbe017e5
+Subproject commit 99ac8fe71e937d57c594c38fe2d3ee9fd99e1a1b
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
index fdfc04e97b..244d839fed 100644
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -151,6 +151,7 @@ foreach(thrust_target IN LISTS THRUST_TARGETS)
 endforeach()
 
 # Add specialized tests:
+add_subdirectory(async)
 add_subdirectory(cpp)
 add_subdirectory(cuda)
 add_subdirectory(omp)
diff --git a/testing/async/CMakeLists.txt b/testing/async/CMakeLists.txt
new file mode 100644
index 0000000000..d744f09e6e
--- /dev/null
+++ b/testing/async/CMakeLists.txt
@@ -0,0 +1,82 @@
+# The async tests perform a large amount of codegen, making them expensive to
+# build and test. To keep compilation and runtimes manageable, the tests are
+# broken up into many files per algorithm to enable parallelism during
+# compilation and testing. The structure of these test directories is:
+#
+# thrust/testing/async/<algo>/<unit>.cu
+#
+# These generate executables and CTest tests named
+# ${config_prefix}.test.async.<algo>.<unit>
+
+# The async tests only support CUDA-enabled configs.
Create a list of valid +# thrust targets: +set(cuda_configs) +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + if (config_device STREQUAL CUDA) + list(APPEND cuda_configs ${thrust_target}) + endif() +endforeach() + +list(LENGTH cuda_configs num_cuda_configs) +if (num_cuda_configs EQUAL 0) + return() # No valid configs found, nothing to do. +endif() + +# Process a single algorithm directory, adding all .cu/cpp files as tests for +# each valid backend. algo_name is the name of the subdir ( +# above) and is used for naming the executable/targets. +function(thrust_add_async_test_dir algo_name) + file(GLOB test_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + "${algo_name}/*.cu" + "${algo_name}/*.cpp" + ) + + # Per-algorithm, all-config metatarget: thrust.all.async.[algo].all + set(algo_meta_target thrust.all.async.${algo_name}.all) + add_custom_target(${algo_meta_target}) + + foreach(thrust_target IN LISTS cuda_configs) + # Per-algorithm, per-config metatarget: thrust.[config].async.[algo].all + thrust_get_target_property(config_prefix ${thrust_target} PREFIX) + set(algo_config_meta_target ${config_prefix}.async.${algo_name}.all) + add_custom_target(${algo_config_meta_target}) + add_dependencies(${algo_meta_target} ${algo_config_meta_target}) + + foreach(test_src IN LISTS test_srcs) + get_filename_component(test_name "${test_src}" NAME_WLE) + string(PREPEND test_name async.${algo_name}.) + message("Adding test ${config_prefix}.${test_name} at ${test_src}.") + + thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target}) + if(THRUST_ENABLE_TESTS_WITH_RDC) + thrust_enable_rdc_for_cuda_target(${test_target}) + endif() + + add_dependencies(${algo_config_meta_target} ${test_target}) + endforeach() + endforeach() +endfunction() + +# Grab all algorithm subdirectories: +set(test_dirs) +file(GLOB contents + CONFIGURE_DEPENDS + "${CMAKE_CURRENT_LIST_DIR}/*" +) +message("Contents: ${contents}") +foreach(test_dir IN LISTS contents) + if(IS_DIRECTORY "${test_dir}") + list(APPEND test_dirs "${test_dir}") + message("Found test_dir: ${test_dir}") + endif() +endforeach() + +# Process all test dirs: +foreach(test_dir IN LISTS test_dirs) + get_filename_component(algo_name "${test_dir}" NAME_WLE) + message("Processing algorithm ${algo_name} from ${test_dir}.") + thrust_add_async_test_dir(${algo_name}) +endforeach() diff --git a/testing/async/exclusive_scan/basic.cu b/testing/async/exclusive_scan/basic.cu new file mode 100644 index 0000000000..ac79e25a9f --- /dev/null +++ b/testing/async/exclusive_scan/basic.cu @@ -0,0 +1,42 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +template > +struct basic_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "basic invocation with device vectors"; + } +}; + +template +struct TestBasic +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestBasic, NumericTypes); + +#endif // C++14 diff --git 
a/testing/async/exclusive_scan/counting_iterator.cu b/testing/async/exclusive_scan/counting_iterator.cu new file mode 100644 index 0000000000..467f8a7175 --- /dev/null +++ b/testing/async/exclusive_scan/counting_iterator.cu @@ -0,0 +1,49 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include +#include + +template > +struct invoker + : testing::async::mixin::input::counting_iterator + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "fancy input iterator (counting_iterator)"; + } +}; + +template +struct TestCountingIterator +{ + void operator()(std::size_t num_values) const + { + num_values = unittest::truncate_to_max_representable(num_values); + testing::async::test_policy_overloads>::run(num_values); + } +}; +// Use built-in types only, counting_iterator doesn't seem to be compatible with +// the custom_numeric. +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestCountingIterator, + BuiltinNumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/discard_output.cu b/testing/async/exclusive_scan/discard_output.cu new file mode 100644 index 0000000000..353c88e739 --- /dev/null +++ b/testing/async/exclusive_scan/discard_output.cu @@ -0,0 +1,41 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Compilation test with discard iterators. No runtime validation is actually +// performed, other than testing whether the algorithm completes without +// exception. 
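+//
+// For reference, the invocation under test boils down to something like the
+// following sketch (iterator names are illustrative):
+//
+//   auto e = thrust::async::exclusive_scan(input.cbegin(),
+//                                          input.cend(),
+//                                          thrust::make_discard_iterator());
+//   TEST_EVENT_WAIT(e); // only successful completion is checked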
+ +template > +struct discard_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::discard_iterator + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::noop +{ + static std::string description() { return "discard output"; } +}; + +template +struct TestDiscard +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestDiscard, NumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/in_place.cu b/testing/async/exclusive_scan/in_place.cu new file mode 100644 index 0000000000..71ced40b7f --- /dev/null +++ b/testing/async/exclusive_scan/in_place.cu @@ -0,0 +1,42 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +template > +struct basic_inplace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector_reuse_input + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous< + input_value_type> + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "basic in-place invocation with device vectors"; + } +}; + +template +struct TestBasicInPlace +{ + void operator()(std::size_t num_values) const + { + using invoker = basic_inplace_invoker; + testing::async::test_policy_overloads::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestBasicInPlace, NumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/mixed_types.cu b/testing/async/exclusive_scan/mixed_types.cu new file mode 100644 index 0000000000..05584e7a75 --- /dev/null +++ b/testing/async/exclusive_scan/mixed_types.cu @@ -0,0 +1,103 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Test using mixed int/float types for: +// - input_value_type | (int, float) +// - output_value_type | (int, float) +// - initial_value_type | (int, float, ) +// - thrust::plus T-type | (int, float, void>) +// +// The initial_value_type and thrust::plus types are covered by the +// mixin::postfix_args::scan_mixed_types_overloads component. +// +// The testing/scan.cu TestMixedTypes test spells out the expected behavior, +// which is defined by https://wg21.link/P0571. + +template +struct mixed_type_input_generator +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::sequence(input.begin(), + input.end(), + // fractional values are chosen deliberately to test + // casting orders and accumulator types: + static_cast(1.5), + static_cast(1)); + return input; + } +}; + +// A fractional value is used to ensure that a different result is obtained when +// using float vs. int. 
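+// For example, an exclusive scan over inputs {1.5, 2.5, ...} with initial
+// value 5.5 yields {5.5, 7.0, ...} when accumulating in float, but
+// {5, 6, ...} once the same values are truncated to int.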
+template +struct mixed_types_postfix_args +{ + using postfix_args_type = std::tuple< // Overloads to test: + std::tuple<>, // - no extra args + std::tuple, // - initial_value + std::tuple>, // - initial_value, plus<> + std::tuple>, // - initial_value, plus + std::tuple> // - initial_value, plus + >; + + static postfix_args_type generate_postfix_args() + { + return {{}, + {static_cast(5.5)}, + {static_cast(5.5), thrust::plus<>{}}, + {static_cast(5.5), thrust::plus{}}, + {static_cast(5.5), thrust::plus{}}}; + } +}; + +template +struct invoker + : mixed_type_input_generator + , testing::async::mixin::output::device_vector + , mixed_types_postfix_args + , testing::async::exclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "mixed input/output/initial type tests"; + } +}; + +void TestScanMixedTypes() +{ + // 10 values are enough to check the behavior we want to test while staying + // small enough to reason about. + constexpr std::size_t num_values = 10; + + // invoker template params are input_value_type, output_vt, initial_vt: + using testing::async::test_policy_overloads; + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + // We all float down here + test_policy_overloads>::run(num_values); +} +DECLARE_UNITTEST(TestScanMixedTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/mixin.h b/testing/async/exclusive_scan/mixin.h new file mode 100644 index 0000000000..cf4671d976 --- /dev/null +++ b/testing/async/exclusive_scan/mixin.h @@ -0,0 +1,143 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include + +// TODO Finish implementing tests. Draw from other async algos, as well as +// the older scan tests. + +// TODO compare_outputs::noop should still test the stream/event. Maybe rename. + +// From testing/scan.cu: +// - TestInclusiveScanDispatchExplicit +// - TestInclusiveScanDispatchImplicit +// - TestScanWithLargeTypes +// - TestInclusiveScanWithBigIndexes +// - TestInclusiveScanWithUserDefinedType + +// From testing/async_reduce.cu: +// - test_async_reduce_after (can this be worked into the framework? See below) +// - test_async_reduce_on_then_after (can this be worked into the framework?) +// - all the child variants (e.g. with allocator) too +// - test_async_reduce_caching +// - test_async_copy_then_reduce + +// For the `after` tests, I think we can add them to test_policy_overloads: +// 1) Reverse the nested iteration over prefix/postfix args. +// - Currently we iterate prefix, then postfix. +// - The `after` tests won't use any prefix tuple, they'll be hardcoded into +// a test_configuration-esque function. +// - This will avoid calling the same `after` tests multiple times. 
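+//
+// A hardcoded `after` check could look roughly like this (sketch only, not
+// implemented in this patch; `in`, `tmp`, and `out` are illustrative):
+//
+//   auto e0 = thrust::async::exclusive_scan(thrust::device,
+//                                           in.cbegin(), in.cend(),
+//                                           tmp.begin());
+//   auto e1 = thrust::async::exclusive_scan(thrust::device.after(e0),
+//                                           tmp.cbegin(), tmp.cend(),
+//                                           out.begin());
+//   TEST_EVENT_WAIT(e1);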
+ +namespace testing +{ +namespace async +{ +namespace exclusive_scan +{ + +namespace mixin +{ + +//------------------------------------------------------------------------------ +namespace postfix_args +{ + +template > +struct all_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<>, // - no extra args + std::tuple, // - initial_value + std::tuple // - initial_value, binary_op + >; + + static postfix_args_type generate_postfix_args() + { + return {{}, {42}, {42, alternate_binary_op{}}}; + } +}; + +} // namespace postfix_args + +//------------------------------------------------------------------------------ +namespace invoke_reference +{ + +template +struct host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + input.cend()); + thrust::host_vector host_output(host_input.size()); + + // Run host synchronous algorithm to generate reference. + thrust::exclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get( + THRUST_FWD(postfix_tuple))...); + + // Copy back to device. + output = host_output; + } +}; + +} // namespace invoke_reference + +//------------------------------------------------------------------------------ +namespace invoke_async +{ + +struct basic +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + auto e = thrust::async::exclusive_scan( + std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +} // namespace mixin +} // namespace exclusive_scan +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/async/exclusive_scan/using_vs_adl.cu b/testing/async/exclusive_scan/using_vs_adl.cu new file mode 100644 index 0000000000..4e7dc7b7de --- /dev/null +++ b/testing/async/exclusive_scan/using_vs_adl.cu @@ -0,0 +1,182 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Verify what happens when calling the algorithm without any namespace +// qualifiers: +// - If the async entry point is available in the global namespace due to a +// using statement, the async algorithm should be called. +// - Otherwise, ADL should resolve the call to the synchronous algo in the +// thrust:: namespace. + +namespace invoke_reference +{ + +template +struct adl_host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + input.cend()); + thrust::host_vector host_output(host_input.size()); + + using OutIter = thrust::remove_cvref_t; + + // ADL should resolve this to the synchronous `thrust::` algorithm. + // This is checked by ensuring that the call returns an output iterator. + OutIter result = + exclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + (void)result; + + // Copy back to device. 
+ output = host_output; + } +}; + +} // namespace invoke_reference + +namespace invoke_async +{ + +struct using_namespace +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. + using namespace thrust::async; + thrust::device_event e = + exclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +struct using_cpo +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. + using thrust::async::exclusive_scan; + thrust::device_event e = + exclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +template > +struct using_namespace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_namespace + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "importing async CPO with `using namespace thrust::async`"; + } +}; + +template +struct TestUsingNamespace +{ + void operator()(std::size_t num_values) const + { + using invoker = using_namespace_invoker; + testing::async::test_policy_overloads::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestUsingNamespace, NumericTypes); + +template > +struct using_cpo_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_cpo + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "importing async CPO with " + "`using namespace thrust::async::exclusive_scan`"; + } +}; + +template +struct TestUsingCPO +{ + void operator()(std::size_t num_values) const + { + using invoker = using_cpo_invoker; + testing::async::test_policy_overloads::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestUsingCPO, NumericTypes); + +#endif // C++14 diff --git a/testing/async/mixin.h b/testing/async/mixin.h new file mode 100644 index 0000000000..8d9a07585e --- /dev/null +++ b/testing/async/mixin.h @@ -0,0 +1,523 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include +#include + +#include + +#include +#include + +#include +#include + +// This file contains a set of mix-in classes that define an algorithm +// definition for use with test_policy_overloads. 
The algorithm +// definition describes the details of a thrust::async algorithm invocation: +// +// - Input type and initialization +// - Output type and initialization (supports in-place, too) +// - Postfix arguments that define the algorithm's overload set +// - Abstracted invocation of the async algorithm +// - Abstracted invocation of a reference algorithm +// - Validation of async vs. reference output +// - A description string. +// +// This definition is used by test_policy_overloads to test each overload +// against a reference while injecting a variety of execution policies. This +// validates that each overload behaves correctly according to some reference. +// +// Since much of the algorithm definition is generic and may be reused in +// multiple tests with slight changes, a mix-in system is used to simplify +// the creation of algorithm definitions. The following namespace hierarchy is +// used to organize these generic components: +// +// * testing::async::mixin:: +// ** ::input - Input types/values (device vectors, counting iterators, etc) +// ** ::output - Output types/values (device vectors, inplace device vectors, +// discard iterators, etc) +// ** ::postfix_args - Algorithm specific overload sets +// ** ::invoke_reference - Algorithm specific reference invocation +// ** ::invoke_async - Algorithm specific async algo invocation +// ** ::compare_outputs - Compare output values. +// +// Each algorithm should define its own `mixins.h` header to declare algorithm +// specific mixins (e.g. postfix_args, invoke_reference, and invoke_async) +// in a testing::async::::mixins namespace structure. +// +// For example, the test.async.exclusive_scan.basic test uses the following +// algorithm definition from mix-ins: +// +// ``` +// #include +// #include +// #include +// template > +// struct basic_invoker +// : testing::async::mixin::input::device_vector +// , testing::async::mixin::output::device_vector +// , testing::async::exclusive_scan::mixin::postfix_args:: +// all_overloads +// , testing::async::exclusive_scan::mixin::invoke_reference:: +// host_synchronous +// , testing::async::exclusive_scan::mixin::invoke_async::basic +// , testing::async::mixin::compare_outputs::assert_equal_quiet +// { +// static std::string description() +// { +// return "basic invocation with device vectors"; +// } +// }; +// +// ... +// +// testing::async::test_policy_overloads>::run(num_values); +// ``` +// +// The basic_invoker class expands to something similar to the following: +// +// ``` +// template > +// struct basic_invoker +// { +// public: +// +// static std::string description() +// { +// return "basic invocation with device vectors"; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::input::device_vector +// using input_type = thrust::device_vector; +// +// // Generate an instance of the input: +// static input_type generate_input(std::size_t num_values) +// { +// input_type input(num_values); +// thrust::sequence(input.begin(), input.end(), 25, 3); +// return input; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::output::device_vector +// using output_type = thrust::device_vector; +// +// // Generate an instance of the output: +// // Might be more complicated, eg. 
fancy iterators, etc +// static output_type generate_output(std::size_t num_values) +// { +// return output_type(num_values); +// } +// +// //------------------------------------------------------------------------- +// // +// testing::async::mixin::exclusive_scan::mixin::postfix_args::all_overloads +// // +// using postfix_args_type = std::tuple< // List any extra +// arg overloads: +// std::tuple<>, // - no extra args +// std::tuple, // - initial_value +// std::tuple // - initial_value, +// binary_op +// >; +// +// // Create instances of the extra arguments to use when invoking the +// // algorithm: +// static postfix_args_type generate_postfix_args() +// { +// return { +// {}, // no extra args +// {42}, // initial_value +// {57, alternate_binary_op{}} // initial_value, binary_op +// }; +// } +// +// //------------------------------------------------------------------------- +// // +// testing::async::mixin::exclusive_scan::mixin::invoke_reference::host_synchronous +// // +// // Invoke a reference implementation for a single overload as described by +// // postfix_tuple. This tuple contains instances of any additional +// arguments +// // to pass to the algorithm. The tuple/index_sequence pattern is used to +// // support the "no extra args" overload, since the parameter pack +// expansion +// // will do exactly what we want in all cases. +// template +// static void invoke_reference(input_type const &input, +// output_type &output, +// PostfixArgTuple &&postfix_tuple, +// std::index_sequence) +// { +// // Create host versions of the input/output: +// thrust::host_vector host_input(input.cbegin(), +// input.cend()); +// thrust::host_vector host_output(host_input.size()); +// +// // Run host synchronous algorithm to generate reference. +// thrust::exclusive_scan(host_input.cbegin(), +// host_input.cend(), +// host_output.begin(), +// std::get( +// THRUST_FWD(postfix_tuple))...); +// +// // Copy back to device. +// output = host_output; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::exclusive_scan::mixin::invoke_async::basic +// // +// // Invoke the async algorithm for a single overload as described by +// // the prefix and postfix tuples. These tuples contains instances of any +// // additional arguments to pass to the algorithm. The tuple/index_sequence +// // pattern is used to support the "no extra args" overload, since the +// // parameter pack expansion will do exactly what we want in all cases. +// // Prefix args are included here (but not for invoke_reference) to allow +// the +// // test framework to change the execution policy. +// // This method must return an event or future. +// template +// static auto invoke_async(PrefixArgTuple &&prefix_tuple, +// std::index_sequence, +// input_type const &input, +// output_type &output, +// PostfixArgTuple &&postfix_tuple, +// std::index_sequence) +// { +// output.resize(input.size()); +// auto e = thrust::async::exclusive_scan( +// std::get(THRUST_FWD(prefix_tuple))..., +// input.cbegin(), +// input.cend(), +// output.begin(), +// std::get(THRUST_FWD(postfix_tuple))...); +// return e; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::compare_outputs::assert_equal_quiet +// // +// // Wait on and validate the event/future (usually with TEST_EVENT_WAIT / +// // TEST_FUTURE_VALUE_RETRIEVAL), then check that the reference output +// // matches the testing output. 
+// template +// static void compare_outputs(EventType &e, +// output_type const &ref, +// output_type const &test) +// { +// TEST_EVENT_WAIT(e); +// ASSERT_EQUAL_QUIET(ref, test); +// } +// }; +// ``` +// +// Similar invokers with slight tweaks are used in other +// async/exclusive_scan/*.cu tests. + +namespace testing +{ +namespace async +{ +namespace mixin +{ + +//------------------------------------------------------------------------------ +namespace input +{ + +// TODO it'd be nice to specify a lambda expression that'd replace the call to +// thrust::sequence when desired. +template +struct device_vector +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::sequence(input.begin(), + input.end(), + static_cast(1.5), + static_cast(1)); + return input; + } +}; + +template +struct counting_iterator +{ + struct input_type + { + using iterator = thrust::counting_iterator; + + std::size_t num_values; + + iterator begin() { return iterator{static_cast(0)}; } + iterator begin() const { return iterator{static_cast(0)}; } + iterator cbegin() const { return iterator{static_cast(0)}; } + + iterator end() { return iterator{static_cast(0)} + num_values; } + iterator end() const + { + return iterator{static_cast(0)} + num_values; + } + iterator cend() const + { + return iterator{static_cast(0)} + num_values; + } + }; + + static input_type generate_input(std::size_t num_values) + { + return {num_values}; + } +}; + +} // namespace input + +//------------------------------------------------------------------------------ +namespace output +{ + +template +struct device_vector +{ + using output_type = thrust::device_vector; + + template + static output_type generate_output(std::size_t num_values, + InputType& /* unused */) + { + return output_type(num_values); + } +}; + +template +struct device_vector_reuse_input +{ + using output_type = thrust::device_vector&; + + template + static output_type generate_output(std::size_t /*num_values*/, + InputType& input) + { + return input; + } +}; + +struct discard_iterator +{ + struct output_type + { + using iterator = thrust::discard_iterator<>; + + iterator begin() { return thrust::make_discard_iterator(); } + iterator begin() const { return thrust::make_discard_iterator(); } + iterator cbegin() const { return thrust::make_discard_iterator(); } + }; + + template + static output_type generate_output(std::size_t /* num_values */, + InputType& /* input */) + { + return output_type{}; + } +}; + +} // namespace output + +//------------------------------------------------------------------------------ +namespace postfix_args +{ +/* Defined per algorithm. Example: + * + * // Defines several overloads: + * // algorithm([policy,] input, output) // no postfix args + * // algorithm([policy,] input, output, initial_value) + * // algorithm([policy,] input, output, initial_value, binary_op) + * template > struct all_overloads + * { + * using postfix_args_type = std::tuple< // List any extra arg + * overloads: std::tuple<>, // - no extra args + * std::tuple, // - initial_value + * std::tuple // - initial_value, binary_op + * >; + * + * static postfix_args_type generate_postfix_args() + * { + * return {{}, {42}, {42, alternate_binary_op{}}}; + * } + * }; + * + */ +} + +//------------------------------------------------------------------------------ +namespace invoke_reference +{ + +/* Defined per algorithm. 
Example: + * + * template + * struct host_synchronous + * { + * template + * static void invoke_reference(InputType const& input, + * OutputType& output, + * PostfixArgTuple&& postfix_tuple, + * std::index_sequence) + * { + * // Create host versions of the input/output: + * thrust::host_vector host_input(input.cbegin(), + * input.cend()); + * thrust::host_vector host_output(host_input.size()); + * + * // Run host synchronous algorithm to generate reference. + * // Be sure to call a backend that doesn't use the same underlying + * // implementation. + * thrust::exclusive_scan(host_input.cbegin(), + * host_input.cend(), + * host_output.begin(), + * std::get( + * THRUST_FWD(postfix_tuple))...); + * + * // Copy back to device. + * output = host_output; + * } + * }; + * + */ + +// Used to save time when testing unverifiable invocations (discard_iterators) +struct noop +{ + template + static void invoke_reference(Ts&&...) + {} +}; + +} // namespace invoke_reference + +//------------------------------------------------------------------------------ +namespace invoke_async +{ + +/* Defined per algorithm. Example: + * + * struct basic + * { + * template + * static auto invoke_async(PrefixArgTuple&& prefix_tuple, + * std::index_sequence, + * InputType const& input, + * OutputType& output, + * PostfixArgTuple&& postfix_tuple, + * std::index_sequence) + * { + * auto e = thrust::async::exclusive_scan( + * std::get(THRUST_FWD(prefix_tuple))..., + * input.cbegin(), + * input.cend(), + * output.begin(), + * std::get(THRUST_FWD(postfix_tuple))...); + * return e; + * } + * }; + */ + +} // namespace invoke_async + +//------------------------------------------------------------------------------ +namespace compare_outputs +{ + +struct assert_equal_quiet +{ + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + TEST_EVENT_WAIT(e); + ASSERT_EQUAL(ref, test); + } +}; + +// Does an 'almost_equal' comparison for floating point types, since fp +// addition is non-associative +struct assert_maybe_fuzzy_equal_quiet +{ +private: + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::false_type /* is_floating_point */) + { + TEST_EVENT_WAIT(e); + ASSERT_EQUAL(ref, test); + } + + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::true_type /* is_floating_point */) + { + TEST_EVENT_WAIT(e); + ASSERT_ALMOST_EQUAL(ref, test); + } + +public: + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + using value_type = typename OutputType::value_type; + compare_outputs_impl(e, ref, test, std::is_floating_point{}); + } +}; + +// Used to save time when testing unverifiable invocations (discard_iterators) +struct noop +{ + template + static void compare_outputs(Ts&&... ts) + {} +}; + +} // namespace compare_outputs + +} // namespace mixin +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/async/test_policy_overloads.h b/testing/async/test_policy_overloads.h new file mode 100644 index 0000000000..9999237b2e --- /dev/null +++ b/testing/async/test_policy_overloads.h @@ -0,0 +1,255 @@ +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include + +#include + +#include + +namespace testing +{ + +namespace async +{ + +// Tests that policies are handled correctly for all overloads of an async +// algorithm. 
+// +// Specifically, each overload is called with: +// 1) No policy +// 2) thrust::device +// 3) thrust::device(thrust::device_allocator) +// 4) thrust::device.on(stream) +// 5) thrust::device(thrust::device_allocator).on(stream) +// +// For each test, the returned event/future is tested to make sure it holds a +// reference to the proper stream. +// +// The AlgoDef type defines an async algorithm, its overloads, and abstracts its +// invocation. See the async/mixins.h for a documented example of this interface +// and some convenience mixins for constructing a definition quickly. +template +struct test_policy_overloads +{ + using algo_def = AlgoDef; + using input_type = typename algo_def::input_type; + using output_type = typename algo_def::output_type; + using postfix_args_type = typename algo_def::postfix_args_type; + + // Main entry point; call this from a unit test function. + static void run(std::size_t num_values) + { + // When a policy uses the default stream, the algorithm implementation + // should spawn a new stream in the returned event. This lambda validates + // this: + auto using_default_stream = [](auto& e) { + ASSERT_NOT_EQUAL(thrust::cuda_cub::default_stream(), + e.stream().native_handle()); + }; + + // Lambda that verifies non-default streams are passed through to the + // event/future: + thrust::system::cuda::detail::unique_stream test_stream{}; + auto using_test_stream = [&test_stream](auto& e) { + ASSERT_EQUAL(test_stream.native_handle(), e.stream().native_handle()); + }; + + // Test the different types of policies: + test_policy("(no policy)", + std::make_tuple(), + using_default_stream, + num_values); + test_policy("thrust::device", + std::make_tuple(thrust::device), + using_default_stream, + num_values); + test_policy("thrust::device(thrust::device_allocator{})", + std::make_tuple( + thrust::device(thrust::device_allocator{})), + using_default_stream, + num_values); + test_policy("thrust::device.on(test_stream.get())", + std::make_tuple(thrust::device.on(test_stream.get())), + using_test_stream, + num_values); + test_policy( + "thrust::device(thrust::device_allocator{}).on(test_stream.get())", + std::make_tuple( + thrust::device(thrust::device_allocator{}).on(test_stream.get())), + using_test_stream, + num_values); + } + +private: + template + static void test_policy(std::string const& policy_desc, + PolicyTuple&& policy_tuple, + ValidateEvent&& validate, + std::size_t num_values) + { + constexpr std::size_t num_postfix_arg_sets = + std::tuple_size::value; + + // Test the current policy across all overloads of the algorithm: + iterate_postfix_args<0, num_postfix_arg_sets>{}( + policy_desc, + THRUST_FWD(policy_tuple), + algo_def::generate_postfix_args(), + THRUST_FWD(validate), + num_values); + } + + // Iterate through postfix arg sets, calling all overloads they define + // with the provided policy_tuple. 
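+  // (This is effectively a compile-time loop over the tuple indices; the
+  // recursive struct is used because C++14 has neither std::apply nor fold
+  // expressions.)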
+ template + struct iterate_postfix_args + { + template + void operator()(std::string const& policy_desc, + PolicyTuple&& policy_tuple, + postfix_args_type&& postfix_args, + ValidateEvent&& validate, + std::size_t num_values) + { + try + { + test_configuration(policy_tuple, + std::get(std::move(postfix_args)), + validate, + num_values); + } + catch (unittest::UnitTestException& exc) + { + // Append some identifying information to the exception: + std::string const overload_desc = unittest::demangle( + typeid(typename std::tuple_element::type) + .name()); + + std::string const input_desc = + unittest::demangle(typeid(input_type).name()); + std::string const output_desc = + unittest::demangle(typeid(output_type).name()); + + exc << "\n" + << " - testcase = " << algo_def::description() << "\n" + << " - policy = " << policy_desc << "\n" + << " - input_type = " << input_desc << "\n" + << " - output_type = " << output_desc << "\n" + << " - tuple of trailing arguments = " << overload_desc << "\n" + << " - num_values = " << num_values; + throw; + } + + // Recurse + iterate_postfix_args{}( + policy_desc, + THRUST_FWD(policy_tuple), + std::move(postfix_args), + THRUST_FWD(validate), + num_values); + } + }; + + // Terminal specialization + template + struct iterate_postfix_args + { + template + void operator()(Ts&&...) + {} + }; + + // Actually invoke the algorithms with the supplied prefix/postfix args + // and do the validations: + template + static void test_configuration(PrefixArgTuple&& prefix_tuple_ref, + PostfixArgTuple&& postfix_tuple_ref, + ValidateEvent const& validate, + std::size_t num_values) + { + using prefix_tuple_type = thrust::remove_cvref_t; + using postfix_tuple_type = thrust::remove_cvref_t; + + // Sink these tuples into const locals so they can be safely passed to + // multiple invocations without worrying about potential modifications. + prefix_tuple_type const prefix_tuple = THRUST_FWD(prefix_tuple_ref); + postfix_tuple_type const postfix_tuple = THRUST_FWD(postfix_tuple_ref); + + // Generate index sequences for the tuples: + constexpr auto prefix_tuple_size = std::tuple_size{}; + constexpr auto postfix_tuple_size = std::tuple_size{}; + using prefix_index_seq = std::make_index_sequence; + using postfix_index_seq = std::make_index_sequence; + + // Use unique, non-const inputs for each invocation to support in-place + // algo_def configurations. 
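+    // (generate_output receives the matching input so that in-place
+    // configurations such as output::device_vector_reuse_input can alias it.)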
+ input_type input_a = algo_def::generate_input(num_values); + input_type input_b = algo_def::generate_input(num_values); + input_type input_c = algo_def::generate_input(num_values); + input_type input_d = algo_def::generate_input(num_values); + input_type input_ref = algo_def::generate_input(num_values); + + output_type output_a = algo_def::generate_output(num_values, input_a); + output_type output_b = algo_def::generate_output(num_values, input_b); + output_type output_c = algo_def::generate_output(num_values, input_c); + output_type output_d = algo_def::generate_output(num_values, input_d); + output_type output_ref = algo_def::generate_output(num_values, input_ref); + + // Invoke multiple overlapping async algorithms, capturing their outputs + // and events/futures: + auto e_a = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_a, + output_a, + postfix_tuple, + postfix_index_seq{}); + auto e_b = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_b, + output_b, + postfix_tuple, + postfix_index_seq{}); + auto e_c = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_c, + output_c, + postfix_tuple, + postfix_index_seq{}); + auto e_d = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_d, + output_d, + postfix_tuple, + postfix_index_seq{}); + + // Let reference calc overlap with async testing: + algo_def::invoke_reference(input_ref, + output_ref, + postfix_tuple, + postfix_index_seq{}); + + algo_def::compare_outputs(e_a, output_ref, output_a); + algo_def::compare_outputs(e_b, output_ref, output_b); + algo_def::compare_outputs(e_c, output_ref, output_c); + algo_def::compare_outputs(e_d, output_ref, output_d); + + validate(e_a); + validate(e_b); + validate(e_c); + validate(e_d); + } +}; + +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/unittest/util_async.h b/testing/unittest/util_async.h index 984cc61c6b..9a3454efd5 100644 --- a/testing/unittest/util_async.h +++ b/testing/unittest/util_async.h @@ -1,9 +1,9 @@ #pragma once #include -#include +#include -#if THRUST_CPP_DIALECT >= 2011 +#if THRUST_CPP_DIALECT >= 2014 #include @@ -73,5 +73,4 @@ auto test_future_value_retrieval( } // namespace unittest -#endif // THRUST_CPP_DIALECT >= 2011 - +#endif // THRUST_CPP_DIALECT >= 2014 diff --git a/thrust/async/scan.h b/thrust/async/scan.h new file mode 100644 index 0000000000..d7adfac983 --- /dev/null +++ b/thrust/async/scan.h @@ -0,0 +1,355 @@ +/* + * Copyright 2008-2020 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file async/scan.h + * \brief Functions for asynchronously computing prefix scans. 
+ */ + +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include +#include + +#include + +#include +#include +#include + +#include + +namespace thrust +{ + +namespace async +{ + +// Fallback implementations used when no overloads are found via ADL: +namespace unimplemented +{ + +template +event +async_inclusive_scan(thrust::execution_policy&, + ForwardIt, + Sentinel, + OutputIt, + BinaryOp) +{ + THRUST_STATIC_ASSERT_MSG( + (thrust::detail::depend_on_instantiation::value), + "this algorithm is not implemented for the specified system" + ); + return {}; +} + +template +event +async_exclusive_scan(thrust::execution_policy&, + ForwardIt, + Sentinel, + OutputIt, + InitialValueType, + BinaryOp) +{ + THRUST_STATIC_ASSERT_MSG( + (thrust::detail::depend_on_instantiation::value), + "this algorithm is not implemented for the specified system" + ); + return {}; +} + +} // namespace unimplemented + +namespace inclusive_scan_detail +{ + +// Include fallback implementation for ADL failures +using thrust::async::unimplemented::async_inclusive_scan; + +// Implementation of the thrust::async::inclusive_scan CPO. +struct inclusive_scan_fn final +{ + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(op) + ) + ) + + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + thrust::plus<>{} + ) + ) + + template >>> + THRUST_NODISCARD + auto operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(op) + ) + ) + + template + THRUST_NODISCARD + auto operator()(ForwardIt&& first, Sentinel&& last, OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + thrust::plus<>{} + ) + ) +}; + +} // namespace inclusive_scan_detail + +THRUST_INLINE_CONSTANT inclusive_scan_detail::inclusive_scan_fn inclusive_scan{}; + +namespace exclusive_scan_detail +{ + +// Include fallback implementation for ADL failures +using thrust::async::unimplemented::async_exclusive_scan; + +// Implementation of the thrust::async::exclusive_scan CPO. +struct exclusive_scan_fn final +{ + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init, + BinaryOp&& op) const + // ADL dispatch. 
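+    // (THRUST_RETURNS wraps the expression below in a deduced-return
+    // function body, so each overload returns whatever event type the
+    // backend's async_exclusive_scan produces.)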
+ THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op) + ) + ) + + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + thrust::plus<>{} + ) + ) + + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + iterator_value_t>{}, + thrust::plus<>{} + ) + ) + + template >>> + THRUST_NODISCARD + auto + operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op) + ) + ) + + template >>> + THRUST_NODISCARD + auto + operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + thrust::plus<>{} + ) + ) + + template + THRUST_NODISCARD + auto operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + iterator_value_t>{}, + thrust::plus<>{} + ) + ) +}; + +} // namespace exclusive_scan_detail + +THRUST_INLINE_CONSTANT exclusive_scan_detail::exclusive_scan_fn exclusive_scan{}; + +} // namespace async + +} // end namespace thrust + +#endif diff --git a/thrust/system/cuda/detail/async/exclusive_scan.h b/thrust/system/cuda/detail/async/exclusive_scan.h new file mode 100644 index 0000000000..022b7177f4 --- /dev/null +++ b/thrust/system/cuda/detail/async/exclusive_scan.h @@ -0,0 +1,180 @@ +/****************************************************************************** + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +// TODO specialize for thrust::plus to use e.g. ExclusiveSum instead of ExcScan +// - Note that thrust::plus<> is transparent, cub::Sum is not. This should be +// fixed in CUB first). +// - Need to check if CUB actually optimizes for sums before putting in effort + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +auto async_exclusive_scan_n(execution_policy& policy, + ForwardIt first, + Size n, + OutputIt out, + InitialValueType init, + BinaryOp op) + -> unique_eager_event +{ + auto const device_alloc = get_async_device_allocator(policy); + unique_eager_event ev; + + // Determine temporary device storage requirements. + size_t tmp_size = 0; + thrust::cuda_cub::throw_on_error( + cub::DeviceScan::ExclusiveScan( + nullptr, + tmp_size, + first, + out, + op, + init, + n, + nullptr, // Null stream, just for sizing. + THRUST_DEBUG_SYNC_FLAG + ), + "after exclusive_scan sizing" + ); + + // Allocate temporary storage. + auto content = uninitialized_allocate_unique_n( + device_alloc, tmp_size + ); + void* const tmp_ptr = raw_pointer_cast(content.get()); + + // Set up stream with dependencies. + cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy); + + if (thrust::cuda_cub::default_stream() != user_raw_stream) + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple( + std::move(content), + unique_stream(nonowning, user_raw_stream) + ), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + else + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple(std::move(content)), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + + // Run scan. + thrust::cuda_cub::throw_on_error( + cub::DeviceScan::ExclusiveScan( + tmp_ptr, + tmp_size, + first, + out, + op, + init, + n, + ev.stream().native_handle(), + THRUST_DEBUG_SYNC_FLAG + ), + "after exclusive_scan launch" + ); + + return std::move(ev); +} + +}}} // namespace system::cuda::detail + +namespace cuda_cub +{ + +// ADL entry point. 
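+// The thrust::async::exclusive_scan CPO (thrust/async/scan.h) dispatches here
+// via ADL when given a CUDA execution policy. A minimal usage sketch, with
+// `n` and `stream` supplied by the caller:
+//
+//   thrust::device_vector<int> in(n), out(n);
+//   auto e = thrust::async::exclusive_scan(thrust::device.on(stream),
+//                                          in.begin(), in.end(),
+//                                          out.begin(), 0);
+//   e.wait();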
+template +auto async_exclusive_scan(execution_policy& policy, + ForwardIt first, + Sentinel&& last, + OutputIt&& out, + InitialValueType &&init, + BinaryOp&& op) +THRUST_RETURNS( + thrust::system::cuda::detail::async_exclusive_scan_n( + policy, + first, + distance(first, THRUST_FWD(last)), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op) + ) +) + +} // cuda_cub + +} // end namespace thrust + +#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#endif + diff --git a/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/system/cuda/detail/async/inclusive_scan.h new file mode 100644 index 0000000000..5f37c19cff --- /dev/null +++ b/thrust/system/cuda/detail/async/inclusive_scan.h @@ -0,0 +1,173 @@ +/****************************************************************************** + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +// TODO specialize for thrust::plus to use e.g. InclusiveSum instead of IncScan +// - Note that thrust::plus<> is transparent, cub::Sum is not. This should be +// fixed in CUB first). +// - Need to check if CUB actually optimizes for sums before putting in effort + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +auto async_inclusive_scan_n(execution_policy& policy, + ForwardIt first, + Size n, + OutputIt out, + BinaryOp op) + -> unique_eager_event +{ + auto const device_alloc = get_async_device_allocator(policy); + unique_eager_event ev; + + // Determine temporary device storage requirements. + size_t tmp_size = 0; + thrust::cuda_cub::throw_on_error( + cub::DeviceScan::InclusiveScan( + nullptr, + tmp_size, + first, + out, + op, + n, + nullptr, // Null stream, just for sizing. 
+ THRUST_DEBUG_SYNC_FLAG + ), + "after inclusive_scan sizing" + ); + + // Allocate temporary storage. + auto content = uninitialized_allocate_unique_n( + device_alloc, tmp_size + ); + void* const tmp_ptr = raw_pointer_cast(content.get()); + + // Set up stream with dependencies. + cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy); + + if (thrust::cuda_cub::default_stream() != user_raw_stream) + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple( + std::move(content), + unique_stream(nonowning, user_raw_stream) + ), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + else + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple(std::move(content)), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + + // Run scan. + thrust::cuda_cub::throw_on_error( + cub::DeviceScan::InclusiveScan( + tmp_ptr, + tmp_size, + first, + out, + op, + n, + ev.stream().native_handle(), + THRUST_DEBUG_SYNC_FLAG + ), + "after inclusive_scan launch" + ); + + return std::move(ev); +} + +}}} // namespace system::cuda::detail + +namespace cuda_cub +{ + +// ADL entry point. +template +auto async_inclusive_scan(execution_policy& policy, + ForwardIt first, + Sentinel&& last, + OutputIt&& out, + BinaryOp&& op) +THRUST_RETURNS( + thrust::system::cuda::detail::async_inclusive_scan_n( + policy, + first, + distance(first, THRUST_FWD(last)), + THRUST_FWD(out), + THRUST_FWD(op) + ) +) + +} // cuda_cub + +} // end namespace thrust + +#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#endif + diff --git a/thrust/system/cuda/detail/async/scan.h b/thrust/system/cuda/detail/async/scan.h new file mode 100644 index 0000000000..7d993e6641 --- /dev/null +++ b/thrust/system/cuda/detail/async/scan.h @@ -0,0 +1,33 @@ +/****************************************************************************** + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/system/cuda/detail/async/exclusive_scan.h>
+#include <thrust/system/cuda/detail/async/inclusive_scan.h>
diff --git a/thrust/system/detail/adl/async/scan.h b/thrust/system/detail/adl/async/scan.h
new file mode 100644
index 0000000000..a2a90618b4
--- /dev/null
+++ b/thrust/system/detail/adl/async/scan.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2008-2020 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// The purpose of this header is to #include the async/scan.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async scans.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+//#include <thrust/system/detail/sequential/async/scan.h>
+
+//#define __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/scan.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/scan.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER
diff --git a/thrust/system/detail/generic/scan.inl b/thrust/system/detail/generic/scan.inl
index 300b697b26..83d272c3e9 100644
--- a/thrust/system/detail/generic/scan.inl
+++ b/thrust/system/detail/generic/scan.inl
@@ -61,9 +61,7 @@ __host__ __device__
 {
   // Use the input iterator's value type per https://wg21.link/P0571
   using ValueType = typename thrust::iterator_value<InputIterator>::type;
-
-  // assume 0 as the initialization value
-  return thrust::exclusive_scan(exec, first, last, result, ValueType(0));
+  return thrust::exclusive_scan(exec, first, last, result, ValueType{});
 }
 // end exclusive_scan()
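
Usage sketch for the new entry points (illustrative only; assumes the CUDA
device system and C++14, with `n` and `stream` supplied by the caller):

    #include <thrust/async/scan.h>
    #include <thrust/device_vector.h>

    thrust::device_vector<float> in(n, 1.f), out(n);

    // Returns immediately with an event; the scan runs on `stream`.
    auto e = thrust::async::inclusive_scan(thrust::device.on(stream),
                                           in.begin(), in.end(),
                                           out.begin());
    e.wait(); // synchronize before reading `out`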